得到visits模型
hadoop jar /export/data/mapreduce/web_log.jar cn.itcast.bigdata.weblog.clickstream.ClickStreamVisit
对于日志数据的分析,Hive也分为三层:ods层、dw层、app层
-- Three warehouse layers for the web log analysis: ODS (raw loads), DW (wide detail), APP (metrics).
create database if not exists web_log_ods;
create database if not exists web_log_dw;
create database if not exists web_log_app;
-- ODS table holding the pre-processed raw access log, one row per HTTP request.
drop table if exists web_log_ods.ods_weblog_origin;
create table web_log_ods.ods_weblog_origin(
valid string , --validity flag for the record
remote_addr string, --visitor IP address
remote_user string, --visitor user info
time_local string, --request time
request string, --requested URL
status string, --HTTP response status code
body_bytes_sent string, --response bytes sent
http_referer string, --referrer URL
http_user_agent string --visitor user-agent (terminal info)
)
partitioned by (dt string)
row format delimited fields terminated by '\001';
-- ODS pageview model: one row per page view within a session (clickstream output).
drop table if exists web_log_ods.ods_click_pageviews;
create table web_log_ods.ods_click_pageviews(
session string, --session id
remote_addr string, --visitor IP address
remote_user string, --visitor user info
time_local string, --request time
request string, --requested URL
visit_step string, --step number within the visit
page_staylong string, --page stay time (seconds)
http_referer string, --referrer URL
http_user_agent string,--visitor user-agent (terminal info)
body_bytes_sent string,--response bytes sent
status string --HTTP response status code
)
partitioned by (dt string)
row format delimited fields terminated by '\001';
-- ODS visits model: one row per session, summarising entry/exit page and time.
drop table if exists web_log_ods.ods_click_stream_visits;
create table web_log_ods.ods_click_stream_visits(
session string, --session id
remote_addr string, --visitor IP address
inTime string, --session start time
outTime string, --session end time
inPage string, --session entry page
outPage string, --session exit page
referal string, --referrer URL
pageVisits int --number of pages visited in the session
)
partitioned by (dt string)
row format delimited fields terminated by '\001';
-- Load the MapReduce pre-processing outputs from HDFS into the ODS tables.
-- NOTE: "load data inpath" (without LOCAL) moves the files out of the source HDFS directory.
-- Fixed: the pageviews statement was missing the space between the path literal
-- and OVERWRITE, which is a Hive syntax error.
load data inpath '/output/web_log/pre_web_log' overwrite into table web_log_ods.ods_weblog_origin partition(dt='2021-02-01');
load data inpath '/output/web_log/pageviews' overwrite into table web_log_ods.ods_click_pageviews partition(dt='2021-02-01');
load data inpath '/output/web_log/visits' overwrite into table web_log_ods.ods_click_stream_visits partition(dt='2021-02-01');
事实表的数据中,有些属性共同组成了一个字段(糅合在一起),比如年月日时分秒构成了时间,当需要根据某一属性进行分组统计的时候,需要截取拼接之类的操作,效率极低。
为了分析方便,可以将事实表中的一个字段切割提取出多个属性,构成新的字段。因为字段变多了,所以称为宽表,原来的称为窄表。
又因为宽表的信息更加清晰明细,所以也可以称之为明细表。
-- DW wide ("detail") table: the origin log plus split-out time parts and parsed
-- referrer fields, so groupings by hour/day/host need no string surgery.
-- Fixed: added "if exists" so the drop does not fail on a first run (consistent
-- with the other drop statements in this script).
drop table if exists web_log_dw.dw_weblog_detail;
create table web_log_dw.dw_weblog_detail(
valid string, --validity flag
remote_addr string, --visitor IP address
remote_user string, --visitor user info
time_local string, --full access timestamp
daystr string, --access date
timestr string, --access time of day
month string, --access month
day string, --access day
hour string, --access hour
request string, --requested URL
status string, --HTTP response status code
body_bytes_sent string, --bytes sent
http_referer string, --referrer URL
ref_host string, --referrer host
ref_path string, --referrer path
ref_query string, --referrer query string
ref_query_id string, --value of the "id" query parameter
http_user_agent string --visitor user-agent
)
partitioned by(dt string)
row format delimited fields terminated by '\001';
通过查询插入数据到明细宽表 dw_weblog_detail中,这里需要借助Hive中的内置函数parse_url_tuple对url进行解析,将以下sql存入: /export/data/hive_sql/web_log_detail.sql中
-- Populate the wide table: split time_local into date/time/month/day/hour parts
-- and parse http_referer into host/path/query/id via parse_url_tuple (a UDTF,
-- hence the LATERAL VIEW).
-- Fixed: the time-of-day alias was "tmstr"; renamed to "timestr" to match the
-- target column name (the INSERT itself maps by position, so behavior is unchanged).
insert into table web_log_dw.dw_weblog_detail partition(dt='2021-02-01')
select c.valid,c.remote_addr,c.remote_user,c.time_local,
substring(c.time_local,1,10) as daystr,
substring(c.time_local,12) as timestr,
substring(c.time_local,6,2) as month,
substring(c.time_local,9,2) as day,
substring(c.time_local,12,2) as hour,
c.request,c.status,c.body_bytes_sent,c.http_referer,c.ref_host,c.ref_path,c.ref_query,c.ref_query_id,c.http_user_agent
from
(SELECT
a.valid,a.remote_addr,a.remote_user,a.time_local,
a.request,a.status,a.body_bytes_sent,a.http_referer,a.http_user_agent,b.ref_host,b.ref_path,b.ref_query,b.ref_query_id
-- strip the surrounding double quotes from the referrer before parsing the URL
FROM web_log_ods.ods_weblog_origin a LATERAL VIEW
parse_url_tuple(regexp_replace(http_referer, "\"", ""), 'HOST', 'PATH','QUERY', 'QUERY:id') b as ref_host, ref_path, ref_query,
ref_query_id) c;
执行sql抽取转换字段到明细宽表 dw_weblog_detail:hive -f '/export/data/hive_sql/web_log_detail.sql'
--Page views (PV): total page requests for the day.
--Fixed: "valid" is declared string, so compare against the string literal
--'true' rather than the boolean literal true (boolean comparison against a
--string column yields NULL rows in Hive and silently drops them).
select count(*) as pvs from web_log_dw.dw_weblog_detail where valid = 'true' and dt='2021-02-01';
--Unique visitors (UV): distinct client IPs for the day.
select count(distinct remote_addr) as uvs from web_log_dw.dw_weblog_detail where valid = 'true' and dt='2021-02-01';
--Visits (VV): number of sessions for the day.
--Fixed: fully qualify the ODS database (the bare table name would resolve
--against the current database) and terminate the statement.
select count(session) as vvs from web_log_ods.ods_click_stream_visits where dt='2021-02-01';
--Persist the daily basic metrics (PV / UV / VV) into a single APP-layer row.
drop table if exists web_log_app.app_webflow_basic_info;
create table web_log_app.app_webflow_basic_info(date_val string,pvs bigint,uvs bigint,vvs bigint) partitioned by(dt string);
--Allow the Cartesian product: both subqueries return exactly one row, so the
--unconditional join just glues them into one row. (This is a Spark SQL setting;
--presumably the script is run via Spark SQL — confirm, since plain Hive strict
--mode would need a different setting.)
set spark.sql.crossJoin.enabled=true;
insert into table web_log_app.app_webflow_basic_info partition(dt='2021-02-01')
-- NOTE(review): unlike the standalone PV/UV queries above, this PV/UV count has
-- no filter on the valid flag — confirm whether that difference is intentional.
select '2021-02-01',a.*,b.* from
(
select count(*) as pvs,count(distinct remote_addr) as uvs from web_log_dw.dw_weblog_detail where dt='2021-02-01'
) a
join
(
select count(session) as vvs from web_log_ods.ods_click_stream_visits where dt='2021-02-01'
) b;
--Hourly page views within the processing batch (one day).
drop table web_log_app.app_pvs_everyhour_oneday;
create table web_log_app.app_pvs_everyhour_oneday(month string,day string,hour string,pvs bigint) partitioned by(dt string);
insert into table web_log_app.app_pvs_everyhour_oneday partition(dt='2021-02-01')
select
    d.month as month,
    d.day as day,
    d.hour as hour,
    count(*) as pvs
from web_log_dw.dw_weblog_detail d
where d.dt='2021-02-01'
group by d.month,d.day,d.hour;
--Daily page views, across all loaded dates (no partition filter by design).
drop table web_log_app.app_pvs_everyday;
create table web_log_app.app_pvs_everyday(pvs bigint,month string,day string);
insert into table web_log_app.app_pvs_everyday
select
    count(*) as pvs,
    d.month as month,
    d.day as day
from web_log_dw.dw_weblog_detail d
group by d.month,d.day;
--Composite metric: average pages per visitor (average visit depth) for the day:
--total page requests (pvs) / distinct visitors (uvs).
drop table web_log_app.app_avgpv_user_everyday;
create table web_log_app.app_avgpv_user_everyday(
day string,
avgpv string);
--Option 1: derive it from the pre-aggregated basic-info row.
--Fixed: the source columns are named pvs/uvs (not pv/uv — the original query
--would fail with "invalid column"), and the partitioned source table must be
--filtered to the target day to avoid mixing batches.
insert into table web_log_app.app_avgpv_user_everyday
select '2021-02-01',pvs/uvs from web_log_app.app_webflow_basic_info where dt='2021-02-01';
--Option 2: recompute from the detail table (per-IP page counts, then averaged).
insert into table web_log_app.app_avgpv_user_everyday
select '2021-02-01',sum(b.pvs)/count(b.remote_addr) from
(select remote_addr,count(*) as pvs from web_log_dw.dw_weblog_detail where dt='2021-02-01' group by remote_addr) b;
--Average visit (session) duration = total stay time / number of visits;
--reflects how engaging the site is to visitors.
--Step 1: total stay time per session.
select
    session,
    sum(page_staylong) as web_staylong
from web_log_ods.ods_click_pageviews
where dt='2021-02-01'
group by session;
--Step 2: average the per-session totals over all sessions.
select
    sum(s.web_staylong)/count(s.session)
from
(
    select session, sum(page_staylong) as web_staylong
    from web_log_ods.ods_click_pageviews
    where dt='2021-02-01'
    group by session
) s;
--Top-10 hottest pages of the day.
drop table web_log_app.app_hotpages_everyday;
create table web_log_app.app_hotpages_everyday(day string,url string,pvs string);
--Approach 1: aggregate per URL, order by request count, keep the first 10 rows.
insert into table web_log_app.app_hotpages_everyday
select '2021-02-01',t.request,t.request_counts from
(
    select request as request,count(request) as request_counts
    from web_log_dw.dw_weblog_detail
    where dt='2021-02-01'
    group by request
    having request is not null
) t
order by t.request_counts desc limit 10;
--Approach 2: window ranking.
--Fixed: the outer query used "select *", which also projected the rank column
--rn — four columns into the three-column target table. Project the three
--target columns explicitly instead.
--NOTE: RANK() assigns ties the same rank, so more than 10 rows can satisfy
--rn <= 10; use ROW_NUMBER() if exactly 10 rows are required.
insert into table web_log_app.app_hotpages_everyday
select b.day_val,b.request,b.request_counts
from
(
    SELECT
        '2021-02-01' as day_val,
        a.request,
        a.request_counts,
        RANK() OVER( ORDER BY a.request_counts desc) AS rn
    FROM
    (
        select request as request,count(request) as request_counts
        from web_log_dw.dw_weblog_detail where dt='2021-02-01' group by request having request is not null
    )a
)b
where b.rn <= 10
;
转化,指网站业务流程中的一个封闭渠道,引导用户按照流程最终实现业务目标(比如商品成交);在这个渠道中,我们希望访问者一路向前,不要回头也不要离开,直到完成转化目标。漏斗模型则是指进入渠道的用户在各环节递进过程中逐渐流失的形象描述。
在一条指定的业务流程中,求出各个步骤的完成人数及相对上一个步骤的百分比。
定义好业务流程中的页面标识,下例中的步骤为:
Step1、 /item
Step2、 /category
Step3、 /index
Step4、 /order
load data local inpath '/export/data/hivedatas/click-part-r-00000' overwrite into table web_log_ods.ods_click_pageviews2 partition(dt='2021-02-01');
---1、查询每一个步骤的总访问人数
UNION ALL 将多个SELECT语句的结果集合并为一个独立的结果集
-- Funnel step counts: distinct visitors per step, stacked with UNION ALL.
-- NOTE(review): the LIKE patterns for step3 ('/order%') and step4 ('/index%')
-- are swapped relative to the step definitions above (Step3 /index, Step4 /order)
-- — confirm which ordering reflects the real business flow before relying on it.
create table web_log_app.app_oute_numbs as
select 'step1' as step,count(distinct remote_addr) as numbs from web_log_ods.ods_click_pageviews where dt='2021-02-01' and request like '/item%'
union all
select 'step2' as step,count(distinct remote_addr) as numbs from web_log_ods.ods_click_pageviews where dt='2021-02-01' and request like '/category%'
union all
select 'step3' as step,count(distinct remote_addr) as numbs from web_log_ods.ods_click_pageviews where dt='2021-02-01' and request like '/order%'
union all
select 'step4' as step,count(distinct remote_addr) as numbs from web_log_ods.ods_click_pageviews where dt='2021-02-01' and request like '/index%';
查询结果:
+---------------------+----------------------+--+
| dw_oute_numbs.step | dw_oute_numbs.numbs |
+---------------------+----------------------+--+
| step1 | 1029 |
| step2 | 1029 |
| step3 | 1028 |
| step4 | 1018 |
+---------------------+----------------------+--+
--2、查询每一步骤相对于路径起点人数的比例
--Self-join of the step table with itself (a deliberate Cartesian product:
--no ON clause), pairing every step with every other step.
select
    lhs.step as rnstep,
    lhs.numbs as rnnumbs,
    rhs.step as rrstep,
    rhs.numbs as rrnumbs
from web_log_app.app_oute_numbs lhs
inner join
web_log_app.app_oute_numbs rhs;
自join后结果如下图所示:
+---------+----------+---------+----------+--+
| rnstep | rnnumbs | rrstep | rrnumbs |
+---------+----------+---------+----------+--+
| step1 | 1029 | step1 | 1029 |
| step2 | 1029 | step1 | 1029 |
| step3 | 1028 | step1 | 1029 |
| step4 | 1018 | step1 | 1029 |
| step1 | 1029 | step2 | 1029 |
| step2 | 1029 | step2 | 1029 |
| step3 | 1028 | step2 | 1029 |
| step4 | 1018 | step2 | 1029 |
| step1 | 1029 | step3 | 1028 |
| step2 | 1029 | step3 | 1028 |
| step3 | 1028 | step3 | 1028 |
| step4 | 1018 | step3 | 1028 |
| step1 | 1029 | step4 | 1018 |
| step2 | 1029 | step4 | 1018 |
| step3 | 1028 | step4 | 1018 |
| step4 | 1018 | step4 | 1018 |
+---------+----------+---------+----------+--+
--Each step's count divided by the step1 count = ratio relative to the funnel start.
--Fixed: the original had a stray semicolon inside the subquery, a dangling
--"tmp" token after the closing parenthesis, and "select *" over a self-join,
--which produces ambiguous duplicate column names (step/numbs twice) so the
--outer references tmp.rnstep/tmp.rnnumbs/tmp.rrnumbs could never resolve.
select tmp.rnstep,tmp.rnnumbs/tmp.rrnumbs as abs_rate
from
(
    select
        rn.step as rnstep,
        rn.numbs as rnnumbs,
        rr.step as rrstep,
        rr.numbs as rrnumbs
    from web_log_app.app_oute_numbs rn
    inner join web_log_app.app_oute_numbs rr
    -- pin the right side to the funnel entry step
    where rr.step = 'step1'
) tmp;
+---------+----------+---------+----------+--+
| rnstep | rnnumbs | rrstep | rrnumbs |
+---------+----------+---------+----------+--+
| step1 | 1029 | step1 | 1029 |
| step2 | 1029 | step1 | 1029 |
| step3 | 1028 | step1 | 1029 |
| step4 | 1018 | step1 | 1029 |
--3、查询每一步骤相对于上一步骤的漏出率
--Pair each step with its immediate successor via a self-join: keep only rows
--where the left step's number equals the right step's number minus one
--(the digit is extracted from the 5th character of 'stepN' and cast to int).
select
    cur.step as rnstep,
    cur.numbs as rnnumbs,
    nxt.step as rrstep,
    nxt.numbs as rrnumbs
from web_log_app.app_oute_numbs cur
inner join
web_log_app.app_oute_numbs nxt
where cast(substr(cur.step,5,1) as int)=cast(substr(nxt.step,5,1) as int)-1;
注意:cast为Hive内置函数 类型转换
select cast(1 as float); --1.0
select cast('2016-05-22' as date); --2016-05-22
+---------+----------+---------+----------+--+
| rnstep | rnnumbs | rrstep | rrnumbs |
+---------+----------+---------+----------+--+
| step1 | 1029 | step2 | 1029 |
| step2 | 1029 | step3 | 1028 |
| step3 | 1028 | step4 | 1018 |
+---------+----------+---------+----------+--+
--然后就可以非常简单的计算出每一步相对上一步的漏出率
--Per-step ratio against the previous step: the adjacent-step pairing from the
--previous query, with the division done in the outer select.
--NOTE(review): the text calls this the drop-off ("漏出") rate, but
--rrnumbs/rnnumbs is the fraction retained into the next step — confirm which
--metric is actually wanted.
select pair.rrstep as step,pair.rrnumbs/pair.rnnumbs as abs_rate
from
(
    select
        rn.step as rnstep,
        rn.numbs as rnnumbs,
        rr.step as rrstep,
        rr.numbs as rrnumbs
    from web_log_app.app_oute_numbs rn
    inner join
    web_log_app.app_oute_numbs rr
) pair
where cast(substr(pair.rnstep,5,1) as int)=cast(substr(pair.rrstep,5,1) as int)-1;
网上学习资料一大堆,但如果学到的知识不成体系,遇到问题时只是浅尝辄止,不再深入研究,那么很难做到真正的技术提升。
需要这份系统化资料的朋友,可以戳这里获取
一个人可以走的很快,但一群人才能走的更远!不论你是正从事IT行业的老鸟或是对IT行业感兴趣的新人,都欢迎加入我们的的圈子(技术交流、学习资源、职场吐槽、大厂内推、面试辅导),让我们一起学习成长!
select tmp.rrstep as step,tmp.rrnumbs/tmp.rnnumbs as abs_rate
from
(
select rn.step as rnstep,rn.numbs as rnnumbs,rr.step as rrstep,rr.numbs as rrnumbs from web_log_app.app_oute_numbs rn
inner join
web_log_app.app_oute_numbs rr) tmp
where cast(substr(tmp.rnstep,5,1) as int)=cast(substr(tmp.rrstep,5,1) as int)-1;
[外链图片转存中...(img-dzhQiYqY-1714415734373)]
[外链图片转存中...(img-1ut6Y8Da-1714415734373)]
**网上学习资料一大堆,但如果学到的知识不成体系,遇到问题时只是浅尝辄止,不再深入研究,那么很难做到真正的技术提升。**
**[需要这份系统化资料的朋友,可以戳这里获取](https://bbs.csdn.net/topics/618545628)**
**一个人可以走的很快,但一群人才能走的更远!不论你是正从事IT行业的老鸟或是对IT行业感兴趣的新人,都欢迎加入我们的的圈子(技术交流、学习资源、职场吐槽、大厂内推、面试辅导),让我们一起学习成长!**