gifshow.com 2019/01/01 5
yy.com 2019/01/01 4
huya.com 2019/01/01 1
gifshow.com 2019/01/20 6
gifshow.com 2019/02/01 8
yy.com 2019/01/20 5
gifshow.com 2019/02/02 7
gifshow.com 2019-01 11.0 11.0
gifshow.com 2019-02 15.0 26.0
huya.com 2019-01 1.0 1.0
yy.com 2019-01 9.0 9.0
/**
 * Hive UDF that shortens a slash-separated date string to "year-month".
 *
 * <p>Example: {@code "2019/01/20"} becomes {@code "2019-01"}.
 */
public class DateFormat extends UDF {
    /**
     * Joins the first two slash-separated parts of {@code date} with a dash.
     *
     * @param date slash-separated date text, e.g. {@code "2019/01/20"}; may be null
     * @return the first two parts joined by "-", or {@code null} when
     *         {@code date} is null or has fewer than two slash-separated parts
     */
    public String evaluate(String date) {
        // A NULL column value presumably arrives here as null; answer NULL
        // instead of failing the whole query with a NullPointerException.
        if (date == null) {
            return null;
        }
        String[] parts = date.split("/");
        // Input without a "/" has no month part; avoid indexing past the array.
        if (parts.length < 2) {
            return null;
        }
        return parts[0] + "-" + parts[1];
    }
}
-- Temporary function: add the local jar, then register the UDF class under
-- the name date_format_new.
add jar /home/hadoop/lib/hdfs-train-1.0.jar;
CREATE TEMPORARY FUNCTION date_format_new AS "www.immoc.hive.udf.DateFormat";
-- Permanent function: register the same class, loading the jar from HDFS.
CREATE FUNCTION date_format_new AS "www.immoc.hive.udf.DateFormat" USING JAR 'hdfs://bigdata:9000/lib/hdfs-train-1.0.jar';
time _c1
2019/01/01 2019-01
2019/01/01 2019-01
2019/01/01 2019-01
2019/01/20 2019-01
2019/02/01 2019-02
2019/01/20 2019-01
2019/02/02 2019-02
-- Total traffic per domain and month; date_format_new turns a
-- 'yyyy/MM/dd' time value into 'yyyy-MM'.
SELECT
    domain,
    date_format_new(time) AS date,
    SUM(traffic) AS traffic
FROM visits
GROUP BY
    domain,
    date_format_new(time);
#结果
gifshow.com 2019-01 11.0
gifshow.com 2019-02 15.0
huya.com 2019-01 1.0
yy.com 2019-01 9.0
-- Join condition: t1.domain = t2.domain
-- Self-join the per-domain monthly totals so that every monthly row of a
-- domain (t1) is paired with every monthly row of the same domain (t2).
SELECT
    t1.*,
    t2.*
FROM (
    SELECT
        domain,
        date_format_new(time) AS date,
        SUM(traffic) AS traffic
    FROM visits
    GROUP BY
        domain,
        date_format_new(time)
) AS t1
INNER JOIN (
    SELECT
        domain,
        date_format_new(time) AS date,
        SUM(traffic) AS traffic
    FROM visits
    GROUP BY
        domain,
        date_format_new(time)
) AS t2
    ON t1.domain = t2.domain;
# 结果
gifshow.com 2019-01 11.0 gifshow.com 2019-01 11.0
gifshow.com 2019-01 11.0 gifshow.com 2019-02 15.0
gifshow.com 2019-02 15.0 gifshow.com 2019-01 11.0
gifshow.com 2019-02 15.0 gifshow.com 2019-02 15.0
huya.com 2019-01 1.0 huya.com 2019-01 1.0
yy.com 2019-01 9.0 yy.com 2019-01 9.0
-- Deliberately failing example: t1.* expands to domain, date and traffic,
-- but only t1.domain and t1.date are listed in GROUP BY, so Hive rejects
-- the statement (Error 10025, quoted after the query).
select t1.*,sum(t2.traffic) as total from
(select domain,date_format_new(time) as date,sum(traffic) as traffic from visits group by domain,date_format_new(time)) as t1
join
(select domain,date_format_new(time) as date,sum(traffic) as traffic from visits group by domain,date_format_new(time)) as t2
on
t1.domain = t2.domain
group by t1.domain,t1.date;
# 报错:FAILED: SemanticException [Error 10025]: Expression not in GROUP BY key traffic
# 原因:t1.* 中包含 traffic 字段,所以需要将 t1.traffic 也加到 group by 之后:
-- Attach to every monthly row of a domain the sum of that domain's traffic
-- over all of its months. t1.traffic is a grouping key only because t1.*
-- selects it.
WITH monthly AS (
    SELECT
        domain,
        date_format_new(time) AS date,
        SUM(traffic) AS traffic
    FROM visits
    GROUP BY
        domain,
        date_format_new(time)
)
SELECT
    t1.*,
    SUM(t2.traffic) AS total
FROM monthly AS t1
INNER JOIN monthly AS t2
    ON t1.domain = t2.domain
GROUP BY
    t1.domain,
    t1.date,
    t1.traffic;
# 结果,
gifshow.com 2019-01 11.0 26.0
gifshow.com 2019-02 15.0 26.0
huya.com 2019-01 1.0 1.0
yy.com 2019-01 9.0 9.0
# 一个分组
gifshow.com 2019-01 11.0 gifshow.com 2019-01 11.0
gifshow.com 2019-01 11.0 gifshow.com 2019-02 15.0
# 一个分组
gifshow.com 2019-02 15.0 gifshow.com 2019-01 11.0
gifshow.com 2019-02 15.0 gifshow.com 2019-02 15.0
# 一个分组
huya.com 2019-01 1.0 huya.com 2019-01 1.0
# 一个分组
yy.com 2019-01 9.0 yy.com 2019-01 9.0
可以清楚地发现,在统计一月份这个分组的数据时,多出来了一条二月的数据。根据 SQL 语句的执行顺序,where 在 group by 之前执行,所以可以加一个 where 条件,在分组之前把这条数据过滤掉。
加上条件 where t1.date >= t2.date 之后,分组情况如下:
gifshow.com 2019-01 11.0 gifshow.com 2019-01 11.0
gifshow.com 2019-02 15.0 gifshow.com 2019-01 11.0
gifshow.com 2019-02 15.0 gifshow.com 2019-02 15.0
huya.com 2019-01 1.0 huya.com 2019-01 1.0
yy.com 2019-01 9.0 yy.com 2019-01 9.0
-- Cumulative traffic per domain by month: each monthly row (t1) is paired
-- only with rows of the same domain whose month is not later (t2), and the
-- paired traffic is summed. The WHERE filter runs before GROUP BY. Months
-- are compared as 'yyyy-MM' strings.
WITH monthly AS (
    SELECT
        domain,
        date_format_new(time) AS date,
        SUM(traffic) AS traffic
    FROM visits
    GROUP BY
        domain,
        date_format_new(time)
)
SELECT
    t1.*,
    SUM(t2.traffic) AS total
FROM monthly AS t1
INNER JOIN monthly AS t2
    ON t1.domain = t2.domain
WHERE t1.date >= t2.date
GROUP BY
    t1.domain,
    t1.date,
    t1.traffic;
# 结果
gifshow.com 2019-01 11.0 11.0
gifshow.com 2019-02 15.0 26.0
huya.com 2019-01 1.0 1.0
yy.com 2019-01 9.0 9.0