数据说明
1.订单表 orders(下文查询均使用表名 orders)
order_id string,
user_id string,
eval_set string,
order_number string,
order_dow string,
order_hour_of_day string,
days_since_prior_order string
2.prior表
order_id,订单id
product_id,商品id
add_to_cart_order,订单支付先后的位置
reordered,当前商品是否为重复下单的行为
-- DDL for the prior order-detail table: one row per product in an order.
-- Fix: the original note was missing the closing parenthesis/terminator;
-- also made idempotent with IF NOT EXISTS so the script can be re-run.
create table if not exists priors(
order_id string,           -- FK to orders.order_id
product_id string,
add_to_cart_order string,  -- position in which the product was added to the cart
reordered string           -- '1' if the user has ordered this product before
);
1.每个用户的订单数
-- 1. Number of distinct orders per user.
-- Fix: "[orders]" is SQL Server bracket quoting and is invalid in Hive;
-- the table is plain `orders`. Keywords lowered to match the file's style.
select
    user_id,
    count(distinct order_id) as num
from orders
group by user_id;
number of mappers: 1; number of reducers: 1
原因(order表就103M,不到256M,只启动了一个mapreduce)
-- Same result via max(order_number): order_number counts a user's orders.
-- Fix: normalized mixed-case identifiers/keywords (USER_ID, CAST) to the
-- lowercase convention used elsewhere in the file; added terminator.
-- order_number is stored as string, so cast to int for a numeric max.
select
    user_id,
    max(cast(order_number as int)) as num
from orders
group by user_id;
查看 set dfs.block.size=128M
调整 set mapred.max.split.size=512000000;(原笔记误写为分号,应为等号)
再次看mapreduce个数(第一段代码):
number of mappers: 2; number of reducers: 3
原因:set hive.exec.reducers.bytes.per.reducer 是256M,所以有3个reducer
set mapred.max.split.size=512000000 所以有2个map
conclusion:
如果把 mapred.max.split.size 调小(例如 120M),split 数变多,map 数也会相应增加;调大则 map 数减少。
2. 每个用户平均每个订单是多少商品
-- Step 1: how many orders each user has placed.
select
    user_id,
    count(order_id) as order_count
from orders
group by user_id
order by user_id;
-- Step 2: how many products each order contains.
select
    order_id,
    count(product_id) as order2product_count
from order_products__prior
group by order_id;
-- Step 3: join per-order product counts back to orders to get each
-- user's average number of products per order.
-- Fix: column alias "avg" shadows the built-in aggregate and is a
-- reserved word in newer Hive versions -> renamed to avg_products.
select
    od.user_id,
    avg(cast(t.order2product_count as float)) as avg_products
from orders od
join (
    -- products per order from the prior detail table
    select
        order_id,
        count(product_id) as order2product_count
    from priors
    group by order_id
) t
on od.order_id = t.order_id
group by od.user_id
order by od.user_id;
3. 每个用户在一周中的购买订单的分布情况
-- Distribution of a user's orders across the days of the week
-- (pivoted: one indicator column per day, summed per user).
select
    user_id,
    sum(case when order_dow = '0' then 1 else 0 end) as dow_0,
    sum(case when order_dow = '1' then 1 else 0 end) as dow_1,
    sum(case when order_dow = '2' then 1 else 0 end) as dow_2,
    sum(case when order_dow = '3' then 1 else 0 end) as dow_3,
    sum(case when order_dow = '4' then 1 else 0 end) as dow_4,
    sum(case when order_dow = '5' then 1 else 0 end) as dow_5,
    sum(case when order_dow = '6' then 1 else 0 end) as dow_6
from orders
group by user_id
limit 20;
4. 一个用户平均每个月购买多少个商品(30天一个月)平均每30天
-- Average products per order, per user.
-- Step 1: order count per user.
select
    user_id,
    count(order_id) as order_count
from orders
group by user_id
order by user_id;
-- Step 2: product count per order.
select
    order_id,
    count(product_id) as order2product_count
from order_products__prior
group by order_id;
-- Step 3: join the per-order counts to orders and average per user.
-- Fix: column alias "avg" shadows the built-in aggregate and is a
-- reserved word in newer Hive versions -> renamed to avg_products.
select
    od.user_id,
    avg(cast(t.order2product_count as float)) as avg_products
from orders od
join (
    -- products per order from the prior detail table
    select
        order_id,
        count(product_id) as order2product_count
    from order_products__prior
    group by order_id
) t
on od.order_id = t.order_id
group by od.user_id
order by od.user_id;
-------------
-- Weekly day-of-week order distribution per user (same pivot as above,
-- without the row limit).
select
    user_id,
    sum(case when order_dow = '0' then 1 else 0 end) as dow_0,
    sum(case when order_dow = '1' then 1 else 0 end) as dow_1,
    sum(case when order_dow = '2' then 1 else 0 end) as dow_2,
    sum(case when order_dow = '3' then 1 else 0 end) as dow_3,
    sum(case when order_dow = '4' then 1 else 0 end) as dow_4,
    sum(case when order_dow = '5' then 1 else 0 end) as dow_5,
    sum(case when order_dow = '6' then 1 else 0 end) as dow_6
from orders
group by user_id
-- How many products a user buys per month on average (1 month = 30 days).
-- Step 1: total number of products each user has purchased.
select
    od.user_id,
    sum(cast(t.order2product_count as float)) as asum
from orders od
join (
    -- products per order
    select
        order_id,
        count(product_id) as order2product_count
    from order_products__prior
    group by order_id
) t
on od.order_id = t.order_id
group by od.user_id
order by od.user_id;
-- Step 2: how many 30-day "months" each user's order history spans
-- (sum of the day gaps between consecutive orders, rounded up).
select
    user_id,
    ceiling(sum(cast(days_since_prior_order as float)) / 30) as sum1
from orders
group by user_id
having ceiling(sum(cast(days_since_prior_order as float)) / 30) <> 0
order by user_id;
-- Step 3: join per-user product totals to per-user month counts to get
-- the average number of products bought per 30-day month.
-- Fixes: alias "sum" shadows the built-in aggregate (reserved in newer
-- Hive) -> renamed to months; the result column was unnamed -> aliased.
select
    t1.user_id,
    t1.asum / t2.months as products_per_month
from (
    -- total products purchased per user
    select
        od.user_id,
        sum(cast(t.order2product_count as float)) as asum
    from orders od
    join (
        select
            order_id,
            count(product_id) as order2product_count
        from order_products__prior
        group by order_id
    ) t
    on od.order_id = t.order_id
    group by od.user_id
) t1
join (
    -- 30-day months spanned per user; drop users with zero span
    select
        user_id,
        ceiling(sum(cast(days_since_prior_order as float)) / 30) as months
    from orders
    group by user_id
    having ceiling(sum(cast(days_since_prior_order as float)) / 30) <> 0
) t2
on t1.user_id = t2.user_id
order by t1.user_id;
5.每个用户最喜欢的三个商品(组内排序)
-- Top-3 favourite products per user: count purchases of each product per
-- user, rank within the user with row_number(), keep the first three.
-- Fix: column alias "count" shadows the built-in aggregate and is a
-- reserved word in Hive 2.x+ -> renamed to purchase_cnt.
select *
from (
    select
        od.user_id,
        opp.product_id,
        count(*) as purchase_cnt,
        -- row_number gives a strict 1..n ranking (no ties duplicated),
        -- so "<= 3" returns exactly three rows per user when available
        row_number() over (partition by od.user_id order by count(*) desc) as row1
    from orders od
    join order_products__prior opp
    on od.order_id = opp.order_id
    group by od.user_id, opp.product_id
) t
where t.row1 <= 3
order by user_id;
hive后续:
1.hive优化
2.UDF