pk, 20210801 pk, 20210802 pk, 20210803 pk, 20210804 pk, 20210806 pk, 20210807 pk, 20210808 pk, 20210811 pk, 20210812 ruoze,20210730 ruoze,20210731 ruoze,20210801 ruoze,20210804 ruoze,20210806最终结果
name cnt start_date end_date pk 4 20210801 20210804 ruoze 3 20210730 20210801建表语句
create table user_login(
name string,
login_date string
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
导入数据到Hive
load data local inpath '/home/hadoop/data/user_login.txt' overwrite into table user_login;思路
特别注意:login_date 是string 类型,原数据有脏数据,去除空格。(用户pk的login_date前面有空格)
- 用户分组,登录日期递增排序,找到每行的排序值rn
- 登录日期减去rn -1, 按照得到的日期进行分组,求取count
- 最后rank() 排序=1, 找到最大的连续登录天数
select t3.name as name, t3.cnt as cnt, t3.start_date as start_date, t3.end_date as end_date from ( select t2.name, t2.cnt, t2.start_date, t2.end_date, rank() over(partition by name order by t2.cnt desc) as rn from( select t1.name, count(1) as cnt, min(t1.login_date) as start_date, max(t1.login_date) as end_date, datediff(t1.from_day, lag(t1.from_day, 1, t1.from_day) over(partition by t1.name order by t1.from_day)) interval_day from ( select name, trim(login_date) as login_date, date_sub(to_date(from_unixtime(UNIX_TIMESTAMP(trim(login_date), 'yyyyMMdd'))), ROW_NUMBER() over(partition by name order by login_date asc)) as from_day from -user_login ) as t1 group by t1.name, t1.from_day) t2 ) t3 where t3.rn = 1每一步的结果 1.1 第一步结果
执行结果如下
可以看到连续登录他们的from_day 都是相同的。
| name | login_date | from_day |
|---|---|---|
| pk | 20210801 | 2021-08-01 |
| pk | 20210802 | 2021-08-01 |
| pk | 20210803 | 2021-08-01 |
| pk | 20210804 | 2021-08-01 |
| pk | 20210806 | 2021-08-02 |
| pk | 20210807 | 2021-08-02 |
| pk | 20210808 | 2021-08-02 |
| pk | 20210811 | 2021-08-04 |
| pk | 20210812 | 2021-08-04 |
| ruoze | 20210730 | 2021-07-30 |
| ruoze | 20210731 | 2021-07-30 |
| ruoze | 20210801 | 2021-07-30 |
| ruoze | 20210804 | 2021-08-01 |
| ruoze | 20210806 | 2021-08-02 |
select name, trim(login_date) as login_date, date_sub(to_date(from_unixtime(UNIX_TIMESTAMP(trim(login_date), 'yyyyMMdd'))), ROW_NUMBER() over(partition by name order by login_date asc)-1) as from_day from user_login
-
功能:计算日期计算结果
语法: date_sub (string startdate, int days)
返回值: string
例如:date_sub(‘20211001’, 1) 结果:20210930
-
功能:去除空格
语法:trim([column])
返回值: [column]类型
例如:trim(’ 20211001’, 1) 结果:20211001
-
功能:排序 row_number 窗口函数
语法:ROW_NUMBER() over(partition by [column] order by [column] asc/desc)
返回值: int
例如:表user_login按name 分组,login_date 降序排列
name login_date pk 2021-08-01 pk 2021-08-02 ruoze 2021-07-30
select *, ROW_NUMBER() over(partition by name order by login_date desc) as rn from user_login
结果:
name login_date rn pk 2021-08-02 1 pk 2021-08-01 2 ruoze 2021-07-30 3
-
转化UNIX时间戳 from_unixtime
语法: from_unixtime(bigint unixtime[, stringformat])
返回值: string
例如:from_unixtime(1627776000);
结果:2021-08-01 00:00:00
-
返回string日期以某个格式形式的unixtime: unix_timestamp
语法: unix_timestamp(string date,string pattern) to_date
返回值: string(返回的是时间戳格式)
例如:select UNIX_TIMESTAMP(‘20210801’, ‘yyyyMMdd’) from dual
结果:1627776000
-
返回日期时间字段中的日期部分 to_date
语法: to_date(string timestamp)
返回值: string
例如:select to_date(‘2021-08-01 00:00:00’) from dual
结果:2021-08-01
-
求取连续登录天数,count() 函数
-
利用该分组,找到每个连续登录的开始结束时间
执行结果如下
| name | cnt | start_date | end_date |
|---|---|---|---|
| pk | 4 | 20210801 | 20210804 |
| pk | 3 | 20210806 | 20210808 |
| pk | 2 | 20210811 | 20210812 |
| ruoze | 3 | 20210730 | 20210801 |
| ruoze | 1 | 20210804 | 20210804 |
| ruoze | 1 | 20210806 | 20210806 |
select t1.name, count(1) as cnt, min(t1.login_date) as start_date, max(t1.login_date) as end_date from t1 --上一步的结果表 group by t1.name, t1.from_day3.1 第三步结果
对第二步的结果进行排序(倒序),按用户分组
| name | cnt | start_date | end_date | rank |
|---|---|---|---|---|
| pk | 4 | 20210801 | 20210804 | 1 |
| pk | 3 | 20210806 | 20210808 | 2 |
| pk | 2 | 20210811 | 20210812 | 3 |
| ruoze | 3 | 20210730 | 20210801 | 1 |
| ruoze | 1 | 20210804 | 20210804 | 2 |
| ruoze | 1 | 20210806 | 20210806 | 2 |
select t2.name, t2.cnt, t2.start_date, t2.end_date, rank() over(partition by name order by t2.cnt desc) as rank from t2 --第二步结果集4.1 第四步结果
rank=1 找出来就okk啦!!!!胜利在望
name cnt start_date end_date pk 4 20210801 20210804 ruoze 3 20210730 202108014.2 第四步SQL
select t3.name as name, t3.cnt as cnt, t3.start_date as start_date, t3.end_date as end_date from t3 --第三步结果 where t3.rank = 1知识点小结:
- 日期转化函数
- 窗口函数
- 聚合函数 group by (很容易出错)



