面试和工作中经常遇到,用sql/hive求出连续N天登录或者连续N天销售额破万的记录等。
数据准备--新建表与导入数据
use test;
create table if not exists log_info
(uid string,
log_date string)
row format delimited fields terminated by ',';
insert into table log_info values('A','20210901'),
('A','20210902'),('A','20210903')
,('B','20210901'),('B','20210903')
,('v','20210830'),('K','20210902')
,('C','20210903'),('B','20210902')
,('D','20210905');
hive> select * from log_info;
OK
A 20210901
A 20210902
A 20210903
B 20210901
B 20210903
v 20210830
K 20210902
C 20210903
B 20210902
D 20210905
思路一
--先将uid对组内的登录时间升序排名rank1,然后将日期与rank1相减。
--因为这两个标签都是按照时间递增,如果连续的话,值应该是为同一值。
select uid,count(1) as cnt
from
(select *
,row_number()over(partition by uid order by log_date asc) as rank
,date_sub(concat_ws('-',substr(log_date,1,4),substr(log_date,5,2),substr(log_date,7,2)),row_number()over(partition by uid order by log_date asc)) as rank2
from log_info
)a
group by uid
having cnt>=3;
--结果
OK
A 3
B 3
思路二
--使用lag和lead窗口函数,分别向上、向下取该uid的上次和下次登录时间,然后将该表join回原表,取出uid相等的行
select a.*,b.uid,c.uid
from
(select *
,lag(log_date,1)over(partition by uid order by log_date asc) as up_dt
,lead(log_date,1)over(partition by uid order by log_date asc) as down_dt
from log_info
)a join log_info b on a.up_dt=b.log_date
join log_info c on a.down_dt=c.log_date
where a.uid=b.uid and b.uid=c.uid;
--结果
OK
A 20210902 20210901 20210903 A A
B 20210902 20210901 20210903 B B



