{"movie":"1193","rate":"5","timeStamp":"978300760","uid":"1"}
1、创建一张表,表中只有一个字段,这一个字段就是一条json数据。
2、通过get_json_obejct()
select get_json_obejct(data,'$.movie') movie,
get_json_obejct(data,'$.rate') rate,
get_json_obejct(data,'$.timeStamp') ts,
get_json_obejct(data,'$.uid') uid
from t_json
hive窗口聚合函数:
sum() over()
max() over()
avg() over()
min() over()
统计每个cookieid截止到当日总pv量:
sum(pv) over(partition by cookieid order by createtime)
sum(pv) over(partition by cookieid order by createtime rows between unbounded preceding and current row)
sum(pv) over(partition by cookieid order by createtime rows between 1 preceding and 2 following)
案例1实现:
create table t_access(
userid string,
dt string,
pv int
)
row format delimited
fields terminated by ",";
load data local inpath '/root/access.txt' into table t_access;
先计算每个用户每个月的总访问次数
select userid,dt,sum(pv) month_pv
from t_access
group by userid,dt;
接着计算每个用户截止到当月的最大访问次数和总访问次数。
select userid,dt,month_pv,
max(month_pv) over(partition by userid order by dt) max_pv,
sum(month_pv) over(partition by userid order by dt) sum_pv
from (
select userid,dt,sum(pv) month_pv
from t_access
group by userid,dt
) t1;
案例2:
create table t_store(
dt string,
brand string,
price bigint
)
row format delimited
fields terminated by ",";
load data local inpath '/root/data.txt' into table t_store;
select brand,month,month_price,
sum(month_price) over(partition by brand order by month) sum_price,
max(month_price) over(partition by brand order by month) max_price
from(
select brand,date_format(dt,'yyyy-MM') month,sum(price) month_price
from t_store
group by brand,date_format(dt,'yyyy-MM')
) t1;
窗口分片操作:
场景:取窗口内的前几分之几。
create table cookie2(cookieid string, createtime string, pv int)
row format delimited
fields terminated by ',';
load data local inpath "/root/cookie2.txt" into table cookie2;
select cookieid,createtime,pv,
ntile(2) over(partition by cookieid order by createtime) nt2,
ntile(3) over(partition by cookieid order by createtime) nt3
from cookie2;
窗口排序:
分组TopN:计算每个cookeid pv最大前三名。
100 99 98 98 97 96
row_number() over() 1 2 3 4 5 6
rank() over() 1 2 3 3 5 6
dense_rank() over() 1 2 3 3 4 5
select *
from(
select cookieid,pv,
row_number() over(partition by cookieid order by pv desc) rn
from cookie2
) temp
where rn <= 3;



