1,张三,US,CA
2,李四,US,CB
3,王五,CA,BB
4,赵六,CA,BC
5,老刘,AA,AA

2. 用ROW_NUMBER()方法实现初始装载和定期装载
(1)建立初始装载脚本init_row_number.sql,内容如下:
USE test;

-- Staging table: raw comma-delimited rows loaded verbatim from the source file.
DROP TABLE IF EXISTS tbl_stg;
CREATE TABLE tbl_stg (
    id   INT,
    name STRING,
    cty  STRING,
    st   STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';

-- Dimension table holding SCD2 history: sk is the surrogate key,
-- version/effective_date/expiry_date track row versions. Bucketed ORC with
-- 'transactional'='true' is required so the periodic load can UPDATE/DELETE.
DROP TABLE IF EXISTS tbl_dim;
CREATE TABLE tbl_dim (
    sk             INT,
    id             INT,
    name           STRING,
    cty            STRING,
    st             STRING,
    version        INT,
    effective_date DATE,
    expiry_date    DATE
)
CLUSTERED BY (id) INTO 8 BUCKETS
STORED AS ORC
TBLPROPERTIES ('transactional'='true');

-- Load the initial data file into staging.
LOAD DATA LOCAL INPATH '/home/grid/BigDataDWTest/a.txt' INTO TABLE tbl_stg;

-- Initial load of the dimension. Surrogate keys are ROW_NUMBER() offset by the
-- current MAX(sk) (COALESCE -> 0 on an empty table). Columns are listed
-- explicitly instead of tbl_stg.* so a staging schema change cannot silently
-- shift values into the wrong dimension columns.
INSERT INTO tbl_dim
SELECT
    ROW_NUMBER() OVER (ORDER BY stg.id) + mx.sk_max,
    stg.id,
    stg.name,
    stg.cty,
    stg.st,
    1,
    CAST('1900-01-01' AS DATE),
    CAST('2200-01-01' AS DATE)
FROM tbl_stg stg
CROSS JOIN (SELECT COALESCE(MAX(sk), 0) AS sk_max FROM tbl_dim) mx;
(2)执行初始装载
hive -S -f /home/grid/BigDataDWTest/init_row_number.sql

(3)修改数据文件a.txt,内容如下:
1,张,U,C
3,王五,CA,BB
4,赵六,AC,CB
5,刘,AA,AA
6,老杨,DD,DD

说明:
(4)建立定期装载脚本scd_row_number.sql,内容如下:
USE test;

-- Date variables: pre_date = yesterday, used as the expiry date of superseded
-- versions; max_date = the sentinel expiry date marking the current version.
SET hivevar:pre_date = DATE_ADD(CURRENT_DATE(),-1);
SET hivevar:max_date = CAST('2200-01-01' AS DATE);

-- Replace staging contents with the refreshed source data.
LOAD DATA LOCAL INPATH '/home/grid/BigDataDWTest/a.txt' OVERWRITE INTO TABLE tbl_stg;

-- Step 1: expire the current version of rows that were deleted from the source
-- (no staging match) or whose SCD2 attribute (name) changed.
UPDATE tbl_dim
SET expiry_date = ${hivevar:pre_date}
WHERE sk IN
(SELECT a.sk
 FROM (SELECT sk, id, name
       FROM tbl_dim
       WHERE expiry_date = ${hivevar:max_date}) a
 LEFT JOIN tbl_stg b ON a.id = b.id
 WHERE b.id IS NULL OR a.name <> b.name);

-- Step 2: insert the new current version for name changes (SCD2).
-- The inner query picks rows just expired in step 1 (expiry_date = pre_date)
-- that do not already have an open current version (t3.sk IS NULL).
INSERT INTO tbl_dim
SELECT
    ROW_NUMBER() OVER (ORDER BY t1.id) + t2.sk_max,
    t1.id,
    t1.name,
    t1.cty,
    t1.st,
    t1.version,
    t1.effective_date,
    t1.expiry_date
FROM
(SELECT
     t2.id                   id,
     t2.name                 name,
     t2.cty                  cty,
     t2.st                   st,
     t1.version + 1          version,
     ${hivevar:pre_date}     effective_date,
     ${hivevar:max_date}     expiry_date
 FROM tbl_dim t1
 INNER JOIN tbl_stg t2
     ON t1.id = t2.id
    AND t1.name <> t2.name
    AND t1.expiry_date = ${hivevar:pre_date}
 LEFT JOIN tbl_dim t3
     ON t1.id = t3.id
    AND t3.expiry_date = ${hivevar:max_date}
 WHERE t3.sk IS NULL) t1
CROSS JOIN (SELECT COALESCE(MAX(sk), 0) sk_max FROM tbl_dim) t2;

-- Step 3: SCD1 for cty/st. Hive's UPDATE does not yet support subqueries on
-- the SET side, so the changed rows are staged in a temp table and the update
-- is emulated with DELETE + INSERT. Because SCD1 keeps no history, every
-- version of a changed row is rewritten, not only the current one.
DROP TABLE IF EXISTS tmp;
CREATE TABLE tmp AS
SELECT a.sk, a.id, a.name, b.cty, b.st, a.version, a.effective_date, a.expiry_date
FROM tbl_dim a
INNER JOIN tbl_stg b
    ON a.id = b.id                       -- explicit join (was an implicit comma join)
WHERE (a.cty <> b.cty OR a.st <> b.st);
DELETE FROM tbl_dim WHERE sk IN (SELECT sk FROM tmp);
INSERT INTO tbl_dim SELECT * FROM tmp;

-- Step 4: brand-new rows — ids present in staging but absent from the dimension.
INSERT INTO tbl_dim
SELECT
    ROW_NUMBER() OVER (ORDER BY t1.id) + t2.sk_max,
    t1.id,
    t1.name,
    t1.cty,
    t1.st,
    1,
    ${hivevar:pre_date},
    ${hivevar:max_date}
FROM
(SELECT t1.*
 FROM tbl_stg t1
 LEFT JOIN tbl_dim t2 ON t1.id = t2.id
 WHERE t2.sk IS NULL) t1
CROSS JOIN (SELECT COALESCE(MAX(sk), 0) sk_max FROM tbl_dim) t2;
(5)执行定期装载
hive -S -f /home/grid/BigDataDWTest/scd_row_number.sql

查询维度表结果如图1所示。
-- Inspect the dimension after the load; SELECT * is intentional here (ad-hoc
-- display query), and the ORDER BY makes the output deterministic.
SELECT *
FROM tbl_dim
ORDER BY id, version;
hive -S -f /home/grid/BigDataDWTest/scd_row_number.sql
hdfs dfs -put /home/grid/hive/lib/hive-contrib-2.0.0.jar /user
初始装载脚本init_UDFRowSequence.sql,内容如下:
USE test;

-- Register the contrib UDF that emits a monotonically increasing sequence
-- number per task; used below in place of ROW_NUMBER().
ADD JAR hdfs:///user/hive-contrib-2.0.0.jar;
CREATE TEMPORARY FUNCTION row_sequence AS 'org.apache.hadoop.hive.contrib.udf.UDFRowSequence';

-- Staging table: raw comma-delimited rows loaded verbatim from the source file.
DROP TABLE IF EXISTS tbl_stg;
CREATE TABLE tbl_stg (
    id   INT,
    name STRING,
    cty  STRING,
    st   STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';

-- Dimension table holding SCD2 history: sk is the surrogate key,
-- version/effective_date/expiry_date track row versions. Bucketed ORC with
-- 'transactional'='true' is required so the periodic load can UPDATE/DELETE.
DROP TABLE IF EXISTS tbl_dim;
CREATE TABLE tbl_dim (
    sk             INT,
    id             INT,
    name           STRING,
    cty            STRING,
    st             STRING,
    version        INT,
    effective_date DATE,
    expiry_date    DATE
)
CLUSTERED BY (id) INTO 8 BUCKETS
STORED AS ORC
TBLPROPERTIES ('transactional'='true');

-- Load the initial data file into staging.
LOAD DATA LOCAL INPATH '/home/grid/BigDataDWTest/a.txt' INTO TABLE tbl_stg;

-- Initial load of the dimension: surrogate key = row_sequence() offset by the
-- current MAX(sk) (COALESCE -> 0 on an empty table). Columns are listed
-- explicitly instead of tbl_stg.* so a staging schema change cannot silently
-- shift values into the wrong dimension columns.
INSERT INTO tbl_dim
SELECT
    mx.sk_max + row_sequence(),
    stg.id,
    stg.name,
    stg.cty,
    stg.st,
    1,
    CAST('1900-01-01' AS DATE),
    CAST('2200-01-01' AS DATE)
FROM tbl_stg stg
CROSS JOIN (SELECT COALESCE(MAX(sk), 0) AS sk_max FROM tbl_dim) mx;
定期装载脚本scd_UDFRowSequence.sql,内容如下:
USE test;

-- Register the contrib UDF that emits a monotonically increasing sequence
-- number per task; used below in place of ROW_NUMBER().
ADD JAR hdfs:///user/hive-contrib-2.0.0.jar;
CREATE TEMPORARY FUNCTION row_sequence AS 'org.apache.hadoop.hive.contrib.udf.UDFRowSequence';

-- Date variables: pre_date = yesterday, used as the expiry date of superseded
-- versions; max_date = the sentinel expiry date marking the current version.
SET hivevar:pre_date = DATE_ADD(CURRENT_DATE(),-1);
SET hivevar:max_date = CAST('2200-01-01' AS DATE);

-- Replace staging contents with the refreshed source data.
LOAD DATA LOCAL INPATH '/home/grid/BigDataDWTest/a.txt' OVERWRITE INTO TABLE tbl_stg;

-- Step 1: expire the current version of rows that were deleted from the source
-- (no staging match) or whose SCD2 attribute (name) changed.
UPDATE tbl_dim
SET expiry_date = ${hivevar:pre_date}
WHERE sk IN
(SELECT a.sk
 FROM (SELECT sk, id, name
       FROM tbl_dim
       WHERE expiry_date = ${hivevar:max_date}) a
 LEFT JOIN tbl_stg b ON a.id = b.id
 WHERE b.id IS NULL OR a.name <> b.name);

-- Step 2: insert the new current version for name changes (SCD2).
-- The inner query picks rows just expired in step 1 (expiry_date = pre_date)
-- that do not already have an open current version (t3.sk IS NULL).
INSERT INTO tbl_dim
SELECT
    t2.sk_max + row_sequence(),
    t1.id,
    t1.name,
    t1.cty,
    t1.st,
    t1.version,
    t1.effective_date,
    t1.expiry_date
FROM
(SELECT
     t2.id                   id,
     t2.name                 name,
     t2.cty                  cty,
     t2.st                   st,
     t1.version + 1          version,
     ${hivevar:pre_date}     effective_date,
     ${hivevar:max_date}     expiry_date
 FROM tbl_dim t1
 INNER JOIN tbl_stg t2
     ON t1.id = t2.id
    AND t1.name <> t2.name
    AND t1.expiry_date = ${hivevar:pre_date}
 LEFT JOIN tbl_dim t3
     ON t1.id = t3.id
    AND t3.expiry_date = ${hivevar:max_date}
 WHERE t3.sk IS NULL) t1
CROSS JOIN (SELECT COALESCE(MAX(sk), 0) sk_max FROM tbl_dim) t2;

-- Step 3: SCD1 for cty/st. Hive's UPDATE does not yet support subqueries on
-- the SET side, so the changed rows are staged in a temp table and the update
-- is emulated with DELETE + INSERT. Because SCD1 keeps no history, every
-- version of a changed row is rewritten, not only the current one.
DROP TABLE IF EXISTS tmp;
CREATE TABLE tmp AS
SELECT a.sk, a.id, a.name, b.cty, b.st, a.version, a.effective_date, a.expiry_date
FROM tbl_dim a
INNER JOIN tbl_stg b
    ON a.id = b.id                       -- explicit join (was an implicit comma join)
WHERE (a.cty <> b.cty OR a.st <> b.st);
DELETE FROM tbl_dim WHERE sk IN (SELECT sk FROM tmp);
INSERT INTO tbl_dim SELECT * FROM tmp;

-- Step 4: brand-new rows — ids present in staging but absent from the dimension.
INSERT INTO tbl_dim
SELECT
    t2.sk_max + row_sequence(),
    t1.id,
    t1.name,
    t1.cty,
    t1.st,
    1,
    ${hivevar:pre_date},
    ${hivevar:max_date}
FROM
(SELECT t1.*
 FROM tbl_stg t1
 LEFT JOIN tbl_dim t2 ON t1.id = t2.id
 WHERE t2.sk IS NULL) t1
CROSS JOIN (SELECT COALESCE(MAX(sk), 0) sk_max FROM tbl_dim) t2;