| a | b | c |
|---|---|---|
| 2014 | B | 9 |
| 2015 | A | 8 |
| 2014 | A | 10 |
| 2015 | B | 7 |
输出结果
| a | col_a | col_b |
|---|---|---|
| 2014 | 10 | 9 |
| 2014 | 8 | 7 |
参考
-- **多行转多列**
select
a,
max(case when b="A" then c end) col_A,
max(case when b="B" then c end) col_B
from t1
group by a;
-- **多列转多行**
with t1_2 as (
select a,
max(case when b = "A" then c end) col_A,
max(case when b = "B" then c end) col_B
from t1
group by a
)
select a,
b,
c
from (
select a, "A" as b, col_a as c
from t1_2
union all
select a, "B" as b, col_b as c
from t1_2
) tmp
2. 排名中取最值 table:t2
| a | b | c |
|---|---|---|
| 2014 | A | 3 |
| 2014 | B | 1 |
| 2014 | C | 2 |
| 2015 | A | 4 |
| 2015 | D | 3 |
输出结果:按a分组取b字段最小时对应的c字段
| a | min_c |
|---|---|
| 2014 | 3 |
| 2015 | 4 |
参考
select
a,
c as min_c
from
(
select
a,
b,
c,
row_number() over(partition by a order by b) as rn
from t2
)a
where rn = 1;
输出结果:按a分组取b字段最小和最大时对应的c字段
| a | min_c | max_c |
|---|---|---|
| 2014 | 3 | 2 |
| 2015 | 4 | 3 |
参考
select
a,
min(if(asc_rn = 1, c, null)) as min_c,
max(if(desc_rn = 1, c, null)) as max_c
from
(
select
a,
b,
c,
row_number() over(partition by a order by b) as asc_rn,
row_number() over(partition by a order by b desc) as desc_rn
from t2
)a
where asc_rn = 1 or desc_rn = 1
group by a;
输出结果:按a分组取b字段前两小和前两大时对应的c字段
| a | min_c | max_c |
|---|---|---|
| 2014 | 3 , 1 | 2 , 1 |
| 2015 | 4 , 3 | 3 , 4 |
参考
select
tmp1.a as a,
min_c,
max_c
from
(
select
a,
concat_ws(',', collect_list(c)) as min_c
from
(
select
a,
b,
c,
row_number() over(partition by a order by b) as asc_rn
from t2
)a
where asc_rn <= 2
group by a
)tmp1
join
(
select
a,
concat_ws(',', collect_list(c)) as max_c
from
(
select
a,
b,
c,
row_number() over(partition by a order by b desc) as desc_rn
from t2
)a
where desc_rn <= 2
group by a
)tmp2
on tmp1.a = tmp2.a;
三、窗口大小控制 t3
| a | b | c |
|---|---|---|
| 2014 | A | 3 |
| 2014 | B | 1 |
| 2014 | C | 2 |
| 2015 | A | 4 |
| 2015 | D | 3 |
输出结果:按a分组按b字段排序,对c取前后各一行的和
| a | b | sum_c |
|---|---|---|
| 2014 | A | 1 |
| 2014 | B | 5 |
| 2014 | C | 1 |
| 2015 | A | 3 |
| 2015 | D | 4 |
参考
select a, b, lag(c,1,0) over(partition by a order by b)+lead(c,1,0) over(partition by a order by b) as sum_c from t3;
输出结果:按a分组按b字段排序,对c取平均值
select a, b, case when lag_c is null then c else (c+lag_c)/2 end as avg_c from ( select a, b, c, lag(c,1) over(partition by a order by b) as lag_c from t3 )temp;四、产生连续数值
输出结果:不借助其他任何外表,实现产生连续数值
1
2
3
4
5
…
10
参考一:
select
id_start+pos as id
from(
select
1 as id_start,
9 as id_end
) m lateral view posexplode(split(space(id_end-id_start), '')) t as pos, val
参考二:
select row_number() over() as id from (select split(space(9), ' ') as x) t lateral view explode(x) ex;五、数据扩充与收缩 t6
已知数据
a
3
2
4
输出结果:
| a | b |
|---|---|
| 3 | 3、2、1 |
| 2 | 2、1 |
| 4 | 4、3、2、1 |
参考
select
t.a,
concat_ws('、',collect_set(cast(t.rn as string))) as b
from
(
select
t6.a,
b.rn
from t6
left join
(
select
row_number() over() as rn
from
(select split(space(5), ' ') as x) t -- space(5)可根据t6表的最大值灵活调整
lateral view
explode(x) pe
) b
on 1 = 1
where t6.a >= b.rn
order by t6.a, b.rn desc
) t
group by t.a;
输出结果:数据扩充,排除偶数
| a | b |
|---|---|
| 3 | 3、1 |
| 2 | 1 |
| 4 | 3、1 |
参考
select
t.a,
concat_ws('、',collect_set(cast(t.rn as string))) as b
from
(
select
t6.a,
b.rn
from t6
left join
(
select
row_number() over() as rn
from
(select split(space(5), ' ') as x) t
lateral view
explode(x) pe
) b
on 1 = 1
where t6.a >= b.rn and b.rn % 2 = 1
order by t6.a, b.rn desc
) t
group by t.a;
输出结果:如果a字段有重复,如何实现字符串累计拼接
| a | b |
|---|---|
| 2 | 2 |
| 3 | 2、3 |
| 3 | 2、3、3 |
| 4 | 2、3、3、4 |
参考:
select
a,
b
from
(
select
t.a,
t.rn,
concat_ws('、',collect_list(cast(t.a1 as string))) as b
from
(
select
a.a,
a.rn,
b.a1
from
(
select
a,
row_number() over(order by a ) as rn
from t6
) a
left join
(
select a as a1,
row_number() over(order by a ) as rn
from t6
) b
on 1 = 1
where a.a >= b.a1 and a.rn >= b.rn
order by a.a, b.a1
) t
group by t.a,t.rn
order by t.a,t.rn
) tt;
数据展开:如何将字符串"1-5,16,11-13,9"扩展成"1,2,3,4,5,16,11,12,13,9"?
参考
select
concat_ws(',',collect_list(cast(rn as string)))
from
(
select
a.rn,
b.num,
b.pos
from
(
select
row_number() over() as rn
from (select split(space(20), ' ') as x) t -- space(20)可灵活调整
lateral view
explode(x) pe
) a lateral view outer
posexplode(split('1-5,16,11-13,9', ',')) b as pos, num
where a.rn between cast(split(num, '-')[0] as int) and cast(split(num, '-')[1] as int) or a.rn = num
order by pos, rn
) t;
六:合并与拆分 t7
| a | b |
|---|---|
| 2014 | A |
| 2014 | B |
| 2015 | B |
| 2015 | D |
输出结果:
| 2014 | A、 B |
|---|---|
| 2015 | B、D |
参考:
select
a,
concat_ws('、', collect_set(t.b)) b
from t7
group by a;
将分组合并的结果拆分出来
select
t.a,
d
from
(
select
a,
concat_ws('、', collect_set(t7.b)) b
from t7
group by a
)t
lateral view
explode(split(t.b, '、')) table_tmp as d;
七、时间函数-构造时间
| date | string | 日期 |
|---|---|---|
| d_week | string | 年内第几周 |
| weeks | int | 周几 |
| w_start | string | 周开始日 |
| w_end | string | 周结束日 |
| d_month | int | 第几月 |
| m_start | string | 月开始日 |
| m_end | string | 月结束日 |
| d_quarter | int | 第几季 |
| q_start | string | 季开始日 |
| q_end | string | 季结束日 |
| d_year | int | 年份 |
| y_start | string | 年开始日 |
| y_end | string | 年结束日 |
参考答案
drop table if exists dim_date;
create table if not exists dim_date(
`date` string comment '日期',
d_week string comment '年内第几周',
weeks string comment '周几',
w_start string comment '周开始日',
w_end string comment '周结束日',
d_month string comment '第几月',
m_start string comment '月开始日',
m_end string comment '月结束日',
d_quarter int comment '第几季',
q_start string comment '季开始日',
q_end string comment '季结束日',
d_year int comment '年份',
y_start string comment '年开始日',
y_end string comment '年结束日'
);
--自然月: 指每月的1号到那个月的月底,它是按照阳历来计算的。就是从每月1号到月底,不管这个月有30天,31天,29天或者28天,都算是一个自然月。
insert overwrite table dim_date
select `date`
, d_week --年内第几周
, case weekid
when 0 then '周日'
when 1 then '周一'
when 2 then '周二'
when 3 then '周三'
when 4 then '周四'
when 5 then '周五'
when 6 then '周六'
end as weeks -- 周
, date_add(next_day(`date`,'MO'),-7) as w_start --周一
, date_add(next_day(`date`,'MO'),-1) as w_end -- 周日_end
-- 月份日期
, concat('第', monthid, '月') as d_month
, m_start
, m_end
-- 季节
, quarterid as d_quart
, concat(d_year, '-', substr(concat('0', (quarterid - 1) * 3 + 1), -2), '-01') as q_start --季开始日
, date_sub(concat(d_year, '-', substr(concat('0', (quarterid) * 3 + 1), -2), '-01'), 1) as q_end --季结束日
-- 年
, d_year
, y_start
, y_end
from (
select `date`
, pmod(datediff(`date`, '2012-01-01'), 7) as weekid --获取周几
, cast(substr(`date`, 6, 2) as int) as monthid --获取月份
, case
when cast(substr(`date`, 6, 2) as int) <= 3 then 1
when cast(substr(`date`, 6, 2) as int) <= 6 then 2
when cast(substr(`date`, 6, 2) as int) <= 9 then 3
when cast(substr(`date`, 6, 2) as int) <= 12 then 4
end as quarterid --获取季节 可以直接使用 quarter(`date`)
, substr(`date`, 1, 4) as d_year -- 获取年份
, trunc(`date`, 'YYYY') as y_start --年开始日
, date_sub(trunc(add_months(`date`, 12), 'YYYY'), 1) as y_end --年结束日
, date_sub(`date`, dayofmonth(`date`) - 1) as m_start --当月第一天
, last_day(date_sub(`date`, dayofmonth(`date`) - 1)) m_end --当月最后一天
, weekofyear(`date`) as d_week --年内第几周
from (
-- '2021-04-01'是开始日期, '2022-03-31'是截止日期
select date_add('2021-04-01', t0.pos) as `date`
from (
select posexplode(
split(
repeat('o', datediff(
from_unixtime(unix_timestamp('2022-03-31', 'yyyy-mm-dd'),
'yyyy-mm-dd'),
'2021-04-01')), 'o'
)
)
) t0
) t1
) t2;
八、N指标–累计去重
| time_id | user_id |
|---|---|
| 2018-01-01 10:00:00 | 001 |
| 2018-01-01 11:03:00 | 002 |
| 2018-01-01 13:18:00 | 001 |
| 2018-01-02 08:34:00 | 004 |
| 2018-01-02 10:08:00 | 002 |
| 2018-01-02 10:40:00 | 003 |
| 2018-01-02 14:21:00 | 002 |
| 2018-01-02 15:39:00 | 004 |
| 2018-01-03 08:34:00 | 005 |
| 2018-01-03 10:08:00 | 003 |
| 2018-01-03 10:40:00 | 001 |
| 2018-01-03 14:21:00 | 005 |
输出结果:累计去重
| 日期 | 当日活跃人数 | 月累计活跃人数_截至当日 |
|---|---|---|
| date_id | user_cnt_act | user_cnt_act_month |
| 2018-01-01 | 2 | 2 |
| 2018-01-02 | 3 | 4 |
| 2018-01-03 | 3 | 5 |
参考:
SELECt tt1.date_id
,tt2.user_cnt_act
,tt1.user_cnt_act_month
FROM
( -- ④ 按照t.date_id分组求出user_cnt_act_month,得到tt1
SELECt t.date_id
,COUNT(user_id) AS user_cnt_act_month
FROM
( -- ③ 表a和表b进行笛卡尔积,按照a.date_id,b.user_id分组,保证截止到当日的用户唯一,得出表t。
SELECt a.date_id
,b.user_id
FROM
( -- ① 按照日期分组,取出date_id字段当主表的维度字段 得出表a
SELECt from_unixtime(unix_timestamp(time_id),'yyyy-MM-dd') AS date_id
FROM test.temp_tanhaidi_20211213_1
GROUP BY from_unixtime(unix_timestamp(time_id),'yyyy-MM-dd')
) a
INNER JOIN
( -- ② 按照date_id、user_id分组,保证每天每个用户只有一条记录,得出表b
SELECt from_unixtime(unix_timestamp(time_id),'yyyy-MM-dd') AS date_id
,user_id
FROM test.temp_tanhaidi_20211213_1
GROUP BY from_unixtime(unix_timestamp(time_id),'yyyy-MM-dd')
,user_id
) b
ON 1 = 1
WHERe a.date_id >= b.date_id
GROUP BY a.date_id
,b.user_id
) t
GROUP BY t.date_id
) tt1
LEFT JOIN
( -- ⑥ 按照date_id分组求出user_cnt_act,得到tt2
SELECt date_id
,COUNT(user_id) AS user_cnt_act
FROM
( -- ⑤ 按照日期分组,取出date_id字段当主表的维度字段 得出表a
SELECt from_unixtime(unix_timestamp(time_id),'yyyy-MM-dd') AS date_id
,user_id
FROM test.temp_tanhaidi_20211213_1
GROUP BY from_unixtime(unix_timestamp(time_id),'yyyy-MM-dd')
,user_id
) a
GROUP BY date_id
) tt2
ON tt2.date_id = tt1.date_id



