栏目分类:
子分类:
返回
名师互学网用户登录
快速导航关闭
当前搜索
当前分类
子分类
实用工具
热门搜索
名师互学网 > IT > 前沿技术 > 大数据 > 大数据系统

Hive 高阶函数

Hive 高阶函数

1. 行列转换 table:t1
abc
2014B9
2015A8
2014A10
2015B7

输出结果

acol_acol_b
2014109
201487

参考

-- **多行转多列**
select 
    a,
    max(case when b="A" then c end) col_A,
    max(case when b="B" then c end) col_B
from t1 
group by a;
-- **多列转多行**
with t1_2 as (
    select a,
           max(case when b = "A" then c end) col_A,
           max(case when b = "B" then c end) col_B
    from t1
    group by a
)
select a,
       b,
       c
from (
         select a, "A" as b, col_a as c
         from t1_2
         union all
         select a, "B" as b, col_b as c
         from t1_2
     ) tmp
2. 排名中取最值 table:t2
abc
2014A3
2014B1
2014C2
2015A4
2015D3

输出结果:按a分组取b字段最小时对应的c字段

amin_c
20143
20154

参考

select
  a,
  c as min_c
from
(
      select
        a,
        b,
        c,
        row_number() over(partition by a order by b) as rn 
      from t2 
)a
where rn = 1;

输出结果:按a分组取b字段最小和最大时对应的c字段

amin_cmax_c
201432
201543

参考

select
  a,
  min(if(asc_rn = 1, c, null)) as min_c,
  max(if(desc_rn = 1, c, null)) as max_c
from
(
      select
        a,
        b,
        c,
        row_number() over(partition by a order by b) as asc_rn,
        row_number() over(partition by a order by b desc) as desc_rn 
      from t2 
)a
where asc_rn = 1 or desc_rn = 1
group by a;

输出结果:按a分组取b字段前两小和前两大时对应的c字段

amin_cmax_c
20143 , 12 , 1
20154 , 33 , 4

参考

select
  tmp1.a as a,
  min_c,
  max_c
from 
(
  select 
    a,
    concat_ws(',', collect_list(c)) as min_c
  from
    (
     select
       a,
       b,
       c,
       row_number() over(partition by a order by b) as asc_rn
     from t2
     )a
    where asc_rn <= 2 
    group by a 
)tmp1 
join 
(
  select 
    a,
    concat_ws(',', collect_list(c)) as max_c
  from
    (
     select
        a,
        b,
        c,
        row_number() over(partition by a order by b desc) as desc_rn 
     from t2
    )a
    where desc_rn <= 2
    group by a 
)tmp2 
on tmp1.a = tmp2.a; 
三、窗口大小控制 t3
abc
2014A3
2014B1
2014C2
2015A4
2015D3

输出结果:按a分组按b字段排序,对c取前后各一行的和

absum_c
2014A1
2014B5
2014C1
2015A3
2015D4

参考

select 
  a,
  b,
  lag(c,1,0) over(partition by a order by b)+lead(c,1,0) over(partition by a order by b) as sum_c
from t3;

输出结果:按a分组按b字段排序,对c取平均值

select
  a,
  b,
  case when lag_c is null then c
  else (c+lag_c)/2 end as avg_c
from
 (
 select
   a,
   b,
   c,
   lag(c,1) over(partition by a order by b) as lag_c
  from t3
 )temp;
四、产生连续数值

输出结果:不借助其他任何外表,实现产生连续数值
1
2
3
4
5

10
参考一:

select
id_start+pos as id
from(
    select
    1 as id_start,
    9 as id_end
) m  lateral view posexplode(split(space(id_end-id_start), '')) t as pos, val

参考二:

select
  row_number() over() as id
from  
  (select split(space(9), ' ') as x) t
lateral view
explode(x) ex;
五、数据扩充与收缩 t6

已知数据
a
3
2
4
输出结果:

ab
33、2、1
22、1
44、3、2、1

参考

select  
  t.a,
  concat_ws('、',collect_set(cast(t.rn as string))) as b
from
(  
  select  
    t6.a,
    b.rn
  from t6
  left join
  ( 
   select
     row_number() over() as rn
   from  
   (select split(space(5), ' ') as x) t -- space(5)可根据t6表的最大值灵活调整
   lateral view
   explode(x) pe
  ) b
  on 1 = 1
  where t6.a >= b.rn
  order by t6.a, b.rn desc 
) t
group by  t.a;

输出结果:数据扩充,排除偶数

ab
33、1
21
43、1

参考

select  
  t.a,
  concat_ws('、',collect_set(cast(t.rn as string))) as b
from
(  
  select  
    t6.a,
    b.rn
  from t6
  left join
  ( 
   select
     row_number() over() as rn
   from  
   (select split(space(5), ' ') as x) t
   lateral view
   explode(x) pe
  ) b
  on 1 = 1
  where t6.a >= b.rn and b.rn % 2 = 1
  order by t6.a, b.rn desc 
) t
group by  t.a;

输出结果:如果a字段有重复,如何实现字符串累计拼接

ab
22
32、3
32、3、3
42、3、3、4

参考:

select 
  a,
  b
from 
(
 select  
   t.a,
   t.rn,
   concat_ws('、',collect_list(cast(t.a1 as string))) as b
 from
  (   
    select  
     a.a,
     a.rn,
     b.a1
    from
    (
     select  
       a,
       row_number() over(order by a ) as rn 
     from t6
    ) a
    left join
    (   
     select  a as a1,
     row_number() over(order by a ) as rn  
     from t6
    ) b
    on 1 = 1
    where a.a >= b.a1 and a.rn >= b.rn 
    order by a.a, b.a1 
  ) t
  group by  t.a,t.rn
  order by t.a,t.rn
) tt; 

数据展开:如何将字符串"1-5,16,11-13,9"扩展成"1,2,3,4,5,16,11,12,13,9"?
参考

select  
  concat_ws(',',collect_list(cast(rn as string)))
from
(
  select  
   a.rn,
   b.num,
   b.pos
  from
   (
    select
     row_number() over() as rn
    from (select split(space(20), ' ') as x) t -- space(20)可灵活调整
    lateral view
    explode(x) pe
   ) a lateral view outer 
   posexplode(split('1-5,16,11-13,9', ',')) b as pos, num
   where a.rn between cast(split(num, '-')[0] as int) and cast(split(num, '-')[1] as int) or a.rn = num
   order by pos, rn 
) t;
六:合并与拆分 t7
ab
2014A
2014B
2015B
2015D

输出结果:

2014A、 B
2015B、D

参考:

select
  a,
  concat_ws('、', collect_set(t.b)) b
from t7
group by a;

将分组合并的结果拆分出来

select
  t.a,
  d
from
(
 select
  a,
  concat_ws('、', collect_set(t7.b)) b
 from t7
 group by a
)t
lateral view 
explode(split(t.b, '、')) table_tmp as d;
七、时间函数-构造时间
datestring日期
d_weekstring年内第几周
weeksint周几
w_startstring周开始日
w_endstring周结束日
d_monthint第几月
m_startstring月开始日
m_endstring月结束日
d_quarterint第几季
q_startstring季开始日
q_endstring季结束日
d_yearint年份
y_startstring年开始日
y_endstring年结束日

参考答案

drop table if exists dim_date;
create table if not exists dim_date(
    `date` string comment '日期',
    d_week string comment '年内第几周',
    weeks string comment '周几',
    w_start string comment '周开始日',
    w_end string comment '周结束日',
    d_month string comment '第几月',
    m_start string comment '月开始日',
    m_end string comment '月结束日',
    d_quarter int comment '第几季',
    q_start string comment '季开始日',
    q_end string comment '季结束日',
    d_year int comment '年份',
    y_start string comment '年开始日',
    y_end string comment '年结束日'
);
--自然月: 指每月的1号到那个月的月底,它是按照阳历来计算的。就是从每月1号到月底,不管这个月有30天,31天,29天或者28天,都算是一个自然月。

insert overwrite table dim_date
select `date`
     , d_week --年内第几周
     , case weekid
           when 0 then '周日'
           when 1 then '周一'
           when 2 then '周二'
           when 3 then '周三'
           when 4 then '周四'
           when 5 then '周五'
           when 6 then '周六'
    end  as weeks -- 周
     , date_add(next_day(`date`,'MO'),-7) as w_start --周一
     , date_add(next_day(`date`,'MO'),-1) as w_end   -- 周日_end
     -- 月份日期
     , concat('第', monthid, '月')  as d_month
     , m_start
     , m_end
     -- 季节
     , quarterid as d_quart
     , concat(d_year, '-', substr(concat('0', (quarterid - 1) * 3 + 1), -2), '-01') as q_start --季开始日
     , date_sub(concat(d_year, '-', substr(concat('0', (quarterid) * 3 + 1), -2), '-01'), 1) as q_end   --季结束日
     -- 年
     , d_year
     , y_start
     , y_end
from (
         select `date`
              , pmod(datediff(`date`, '2012-01-01'), 7)                  as weekid    --获取周几
              , cast(substr(`date`, 6, 2) as int)                        as monthid   --获取月份
              , case
                    when cast(substr(`date`, 6, 2) as int) <= 3 then 1
                    when cast(substr(`date`, 6, 2) as int) <= 6 then 2
                    when cast(substr(`date`, 6, 2) as int) <= 9 then 3
                    when cast(substr(`date`, 6, 2) as int) <= 12 then 4
             end                                                       as quarterid --获取季节 可以直接使用 quarter(`date`)
              , substr(`date`, 1, 4)                                     as d_year    -- 获取年份
              , trunc(`date`, 'YYYY')                                    as y_start   --年开始日
              , date_sub(trunc(add_months(`date`, 12), 'YYYY'), 1) as y_end     --年结束日
              , date_sub(`date`, dayofmonth(`date`) - 1)                 as m_start   --当月第一天
              , last_day(date_sub(`date`, dayofmonth(`date`) - 1))          m_end     --当月最后一天
              , weekofyear(`date`)                                       as d_week    --年内第几周
         from (
                    -- '2021-04-01'是开始日期, '2022-03-31'是截止日期
                  select date_add('2021-04-01', t0.pos) as `date`
                  from (
                           select posexplode(
                                          split(
                                                  repeat('o', datediff(
                                                          from_unixtime(unix_timestamp('2022-03-31', 'yyyy-mm-dd'),
                                                                        'yyyy-mm-dd'),
                                                          '2021-04-01')), 'o'
                                              )
                                      )
                       ) t0
              ) t1
     ) t2;
八、N指标–累计去重
time_iduser_id
2018-01-01 10:00:00001
2018-01-01 11:03:00002
2018-01-01 13:18:00001
2018-01-02 08:34:00004
2018-01-02 10:08:00002
2018-01-02 10:40:00003
2018-01-02 14:21:00002
2018-01-02 15:39:00004
2018-01-03 08:34:00005
2018-01-03 10:08:00003
2018-01-03 10:40:00001
2018-01-03 14:21:00005

输出结果:累计去重

日期当日活跃人数月累计活跃人数_截至当日
date_iduser_cnt_actuser_cnt_act_month
2018-01-0122
2018-01-0234
2018-01-0335

参考:

SELECt  tt1.date_id
       ,tt2.user_cnt_act
       ,tt1.user_cnt_act_month
FROM
(   -- ④ 按照t.date_id分组求出user_cnt_act_month,得到tt1
 SELECt  t.date_id
        ,COUNT(user_id) AS user_cnt_act_month
 FROM
 (   -- ③ 表a和表b进行笛卡尔积,按照a.date_id,b.user_id分组,保证截止到当日的用户唯一,得出表t。
  SELECt  a.date_id
         ,b.user_id
  FROM
  (   -- ① 按照日期分组,取出date_id字段当主表的维度字段 得出表a
   SELECt  from_unixtime(unix_timestamp(time_id),'yyyy-MM-dd') AS date_id
   FROM test.temp_tanhaidi_20211213_1
   GROUP BY  from_unixtime(unix_timestamp(time_id),'yyyy-MM-dd')
  ) a
  INNER JOIN
  (   -- ② 按照date_id、user_id分组,保证每天每个用户只有一条记录,得出表b
   SELECt  from_unixtime(unix_timestamp(time_id),'yyyy-MM-dd') AS date_id
          ,user_id
   FROM test.temp_tanhaidi_20211213_1
   GROUP BY  from_unixtime(unix_timestamp(time_id),'yyyy-MM-dd')
            ,user_id
  ) b
  ON 1 = 1
  WHERe a.date_id >= b.date_id
  GROUP BY  a.date_id
           ,b.user_id
 ) t
 GROUP BY  t.date_id
) tt1
LEFT JOIN
(   -- ⑥ 按照date_id分组求出user_cnt_act,得到tt2
 SELECt  date_id
        ,COUNT(user_id) AS user_cnt_act
 FROM
 (   -- ⑤ 按照日期分组,取出date_id字段当主表的维度字段 得出表a
  SELECt  from_unixtime(unix_timestamp(time_id),'yyyy-MM-dd') AS date_id
         ,user_id
  FROM test.temp_tanhaidi_20211213_1
  GROUP BY  from_unixtime(unix_timestamp(time_id),'yyyy-MM-dd')
           ,user_id
 ) a
 GROUP BY date_id
) tt2
ON tt2.date_id = tt1.date_id
转载请注明:文章转载自 www.mshxw.com
本文地址:https://www.mshxw.com/it/747115.html
我们一直用心在做
关于我们 文章归档 网站地图 联系我们

版权所有 (c)2021-2022 MSHXW.COM

ICP备案号:晋ICP备2021003244-6号