1.创建mysql表保存mysql以及hive的配置信息
该配置信息表包括但不限于以下字段:
-- Registry of MySQL -> Hive import jobs: one row describes one source table,
-- how to reach it, and which Hive table it is loaded into.
-- IF NOT EXISTS keeps the script safe to re-run.
-- NOTE(review): mysql_password is stored in clear text; restrict access to
-- this table or move the credential to a secret store.
-- NOTE(review): mysql_port is kept as VARCHAR because existing readers treat
-- every column as a string.
CREATE TABLE IF NOT EXISTS `t_auto_mysql2hive_info` (
    `id` INT UNSIGNED AUTO_INCREMENT PRIMARY KEY COMMENT '自增id',
    `author` VARCHAR(64) NOT NULL DEFAULT '' COMMENT 'author',
    `remarks` VARCHAR(256) NOT NULL DEFAULT '' COMMENT '注释',
    `mysql_host` VARCHAR(64) NOT NULL DEFAULT '' COMMENT 'mysql_host',
    `mysql_port` VARCHAR(64) NOT NULL DEFAULT '' COMMENT 'mysql_port',
    `mysql_user` VARCHAR(64) NOT NULL DEFAULT '' COMMENT 'mysql_user',
    `mysql_password` VARCHAR(64) NOT NULL DEFAULT '' COMMENT 'mysql_password',
    `mysql_database` VARCHAR(64) NOT NULL DEFAULT '' COMMENT 'mysql库',
    `mysql_table` VARCHAR(64) NOT NULL DEFAULT '' COMMENT 'mysql表',
    `mysql_field` VARCHAR(64) NOT NULL DEFAULT '' COMMENT 'mysql增量字段',
    `hive_database` VARCHAR(64) NOT NULL DEFAULT '' COMMENT 'hive库',
    `hive_table` VARCHAR(64) NOT NULL DEFAULT '' COMMENT 'hive表',
    -- Integer flags use numeric defaults rather than quoted strings.
    `import_type` INT NOT NULL DEFAULT 0 COMMENT '导入方式0-增量,1-全量',
    `execution_cycle` VARCHAR(32) NOT NULL DEFAULT 'day' COMMENT '脚本执行周期,0-全量执行一次,1-全量执行完成一次,hour-小时级,day-天级,week-周级,month-月级',
    `is_mesos` INT NOT NULL DEFAULT 0 COMMENT '是否配置调度,0-未配置,1-已配置',
    `created_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
    `updated_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间'
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='自动导入hive表的记录表';
2.向配置信息表中插入一条需要导入表的配置
表test需要每天增量导入hive
-- Sample job: load db_test.test into Hive incrementally, once per day,
-- using updated_at as the incremental column.
INSERT INTO t_auto_mysql2hive_info (
    author,
    remarks,
    mysql_host,
    mysql_port,
    mysql_user,
    mysql_password,
    mysql_database,
    mysql_table,
    mysql_field,
    hive_database,
    hive_table,
    import_type,
    execution_cycle,
    created_at
)
VALUES (
    'biubiubiu',
    '测试表',
    '10.0.0.1',
    '3306',
    'root',
    'root',
    'db_test',
    'test',
    'updated_at',
    'db_tmp',
    'tmp_db_test_t_test_d',
    0,
    'day',
    CURRENT_TIMESTAMP
);
3.写一个方法获取所需导入表配置信息(shell脚本)
# Loads one import job from t_auto_mysql2hive_info into shell variables.
#
# Arguments:
#   $1 - id of the row to load; must consist of digits only.
# Sets (global): author, remarks, mysql_host, mysql_port, mysql_user,
#   mysql_password, mysql_database, mysql_table, mysql_field, hive_database,
#   hive_table, created_at (date part only, YYYY-MM-DD), import_type,
#   execution_cycle.
# Returns:
#   0 on success; 1 when the id is not numeric, the query fails, or no row
#   matches.
function get_mysql_config(){
    local config_id=$1
    local tab=$'\t'
    local sep=$'\x1f'
    local config_sql config_row
    local -a fields

    # The id is spliced into the SQL text, so accept nothing but digits.
    if [[ ! ${config_id} =~ ^[0-9]+$ ]]; then
        echo "get_mysql_config: id must be an unsigned integer, got '${config_id}'" >&2
        return 1
    fi

    config_sql="select author,remarks,mysql_host,mysql_port,mysql_user,mysql_password,mysql_database,mysql_table,mysql_field,hive_database,hive_table,created_at,import_type,execution_cycle from t_auto_mysql2hive_info where id = ${config_id}"

    # NOTE(review): the connection settings of the metadata database are
    # hard-coded here. The user flag must be spelled -uroot; "-root" is read
    # by mysql as a bundle of unrelated short options.
    config_row=$(mysql -h 10.0.0.1 -uroot -proot test -N -e "${config_sql}") || return 1
    if [[ -z ${config_row} ]]; then
        echo "get_mysql_config: no configuration row with id ${config_id}" >&2
        return 1
    fi

    # Batch output separates columns with tabs. Tab is IFS whitespace, so
    # splitting on it would merge consecutive separators and drop empty
    # columns; a non-whitespace separator keeps every column in its slot and
    # also keeps values that contain spaces (remarks, created_at) intact.
    config_row=${config_row//${tab}/${sep}}
    IFS=${sep} read -r -a fields <<< "${config_row}"

    author=${fields[0]}
    remarks=${fields[1]}
    mysql_host=${fields[2]}
    mysql_port=${fields[3]}
    mysql_user=${fields[4]}
    mysql_password=${fields[5]}
    mysql_database=${fields[6]}
    mysql_table=${fields[7]}
    mysql_field=${fields[8]}
    hive_database=${fields[9]}
    hive_table=${fields[10]}
    # Keep only the date part of "YYYY-MM-DD hh:mm:ss".
    created_at=${fields[11]:0:10}
    import_type=${fields[12]}
    execution_cycle=${fields[13]}
}
4.写一个方法获取mysql表对应列名及字段类型及注释
(为了避免MySQL表中的注释含有特殊字符而影响字符串切分,这里不在shell中逐列解析查询结果,而是基于查询结果直接拼出建表用的字段定义,例如 " id int comment '用户id' ";各字段定义之间以逗号分隔,最后一个字段后不带逗号。暂时没有想到更好的拼接建表语句的办法。)
# Prints the Hive column-definition list for the configured MySQL table.
#
# Reads (global): mysql_host, mysql_port, mysql_user, mysql_password,
#   mysql_database, mysql_table.
# Output: one "<name> <type> comment '<text>'" definition per line, separated
#   by commas, with no comma after the last one. The same text is left in the
#   global variable `column`.
# Returns: 0 on success; 1 when the query fails or the table has no columns.
function get_table_column() {
    local sq="'"
    local nl=$'\n'
    local schema_lit table_lit column_sql column_defs

    # Double any single quote so the names stay inside their SQL literals.
    schema_lit=${mysql_database//${sq}/${sq}${sq}}
    table_lit=${mysql_table//${sq}/${sq}${sq}}

    # Each definition is assembled inside MySQL so that the type mapping only
    # ever touches data_type. Replacing type names in the finished text would
    # also rewrite column names and comments that happen to contain them.
    # Types not listed in the CASE are passed through unchanged.
    # table_schema is filtered so same-named tables in other databases are not
    # mixed in; ordinal_position keeps the source column order.
    # Single quotes are removed from comments because they would terminate the
    # quoted comment in the generated DDL.
    column_sql="
        SELECT CONCAT(
                   column_name, ' ',
                   CASE
                       WHEN data_type IN ('varchar', 'timestamp', 'datetime',
                                          'tinytext', 'text', 'mediumtext', 'longtext')
                           THEN 'string'
                       ELSE data_type
                   END,
                   ' comment ''', REPLACE(column_comment, '''', ''), ''''
               )
        FROM information_schema.columns
        WHERE table_schema = '${schema_lit}'
          AND table_name = '${table_lit}'
        ORDER BY ordinal_position"

    # Pass the configured port only when one is set.
    column_defs=$(mysql -h "${mysql_host}" ${mysql_port:+-P "${mysql_port}"} \
        -u"${mysql_user}" -p"${mysql_password}" "${mysql_database}" \
        -N -e "${column_sql}") || return 1

    if [[ -z ${column_defs} ]]; then
        echo "get_table_column: no columns found for ${mysql_database}.${mysql_table}" >&2
        return 1
    fi

    # Turn the line breaks between rows into ",<newline>" separators.
    column=${column_defs//${nl}/,${nl}}
    # Quoted so that the text is not glob-expanded or re-split.
    echo "${column}"
}
5.拼接建表语句并在hive中执行
(根据业务需求,增量导入表为变化表,历史数据可能发生改变,需在数仓第二层合并取最新数据)
# Prints the DDL of a statdate-partitioned ORC table.
#   $1 - fully qualified Hive table name
#   $2 - column definition list
function _build_orc_table_ddl() {
    printf '%s\n' \
        "CREATE TABLE IF NOT EXISTS ${1}(" \
        "${2}" \
        ")" \
        "PARTITIONED BY (" \
        "statdate string)" \
        "ROW FORMAT SERDE" \
        "'org.apache.hadoop.hive.ql.io.orc.OrcSerde'" \
        "STORED AS INPUTFORMAT" \
        "'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'" \
        "OUTPUTFORMAT" \
        "'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'"
}

# Creates the Hive tables that receive the imported MySQL table.
#
# The landing table ${hive_database}.${hive_table} is always created. Unless
# import_type is 1 (full import), the table db_middle.m1_${mysql_table}_dt is
# created as well, with the same columns.
#
# Reads (global): hive_database, hive_table, mysql_table, import_type.
# Calls: get_table_column for the column list; hive to run each statement.
# Returns: 0 on success; non-zero when the column list cannot be obtained or
#   a hive statement fails.
function create_hive_table() {
    local columns landing_ddl middle_ddl

    # Fetch the column list once and reuse it for both tables.
    columns=$(get_table_column) || return 1
    if [ -z "${columns}" ]; then
        echo "create_hive_table: empty column list for ${mysql_table}" >&2
        return 1
    fi

    landing_ddl=$(_build_orc_table_ddl "${hive_database}.${hive_table}" "${columns}")
    # Quoted so that the statement is printed without glob expansion.
    echo "${landing_ddl}"
    hive -e "${landing_ddl}" || return 1

    # Quoted operands: an empty import_type must not break the test itself.
    if [ "${import_type}" = "1" ]; then
        echo "全量导入,无中间表"
        return 0
    fi

    middle_ddl=$(_build_orc_table_ddl "db_middle.m1_${mysql_table}_dt" "${columns}")
    echo "${middle_ddl}"
    hive -e "${middle_ddl}"
}