目标:统计当日、当周、当月活跃用户
活跃用户指的是在统计周期内有过通话记录的用户
1)建表语句
hive (calllogs)>
DROP TABLE IF EXISTS dws_au_detail_day;
-- Daily active-user detail: one row per msisdn per day partition.
CREATE TABLE dws_au_detail_day(
    sys_id              string,
    service_name        string,
    home_province_code  string,
    visit_province_code string,
    channel_code        string,
    service_code        string,
    imsi                string,
    msisdn              string
)
COMMENT '活跃用户按天明细'
PARTITIONED BY (`dt` string)
STORED AS parquet
LOCATION '/warehouse/calllogs/dws/dws_au_detail_day/';
2)数据导入
以用户手机号为key进行聚合
hive (calllogs)>
set hive.exec.dynamic.partition.mode=nonstrict;
-- One row per phone number; collect_set(col)[0] keeps one arbitrary value per group.
insert overwrite table dws_au_detail_day partition(dt)
select
    collect_set(sys_id)[0]              as sys_id,
    collect_set(service_name)[0]        as service_name,
    collect_set(home_province_code)[0]  as home_province_code,
    collect_set(visit_province_code)[0] as visit_province_code,
    collect_set(channel_code)[0]        as channel_code,
    collect_set(service_code)[0]        as service_code,
    collect_set(imsi)[0]                as imsi,
    msisdn,
    '2021-12-07'
from dwd_calllogs_cdr
where dt = '2021-12-07'
group by msisdn;
3)查询导入结果
hive (calllogs)> select * from dws_au_detail_day limit 5; hive (calllogs)> select count(*) from dws_au_detail_day;1.1.2 每周活跃用户明细
1)建表语句
hive (calllogs)>
DROP TABLE IF EXISTS dws_au_detail_week;
-- Weekly active-user detail: one row per msisdn per week partition.
CREATE TABLE dws_au_detail_week(
    sys_id              string,
    service_name        string,
    home_province_code  string,
    visit_province_code string,
    channel_code        string,
    service_code        string,
    imsi                string,
    msisdn              string,
    monday_date         string,
    sunday_date         string
)
COMMENT '活跃用户按周明细'
PARTITIONED BY (`week_dt` string)
STORED AS parquet
LOCATION '/warehouse/calllogs/dws/dws_au_detail_week/';
2)数据导入
以用户手机号为key按周进行聚合
hive (calllogs)>
set hive.exec.dynamic.partition.mode=nonstrict;
-- One row per phone number for the ISO week containing the target date.
insert overwrite table dws_au_detail_week partition(week_dt)
select
collect_set(sys_id)[0] sys_id,
collect_set(service_name)[0] service_name,
collect_set(home_province_code)[0] home_province_code,
collect_set(visit_province_code)[0] visit_province_code,
collect_set(channel_code)[0] channel_code,
collect_set(service_code)[0] service_code,
collect_set(imsi)[0] imsi,
msisdn,
date_add(next_day('2021-12-07','MO'),-7),
date_add(next_day('2021-12-07','MO'),-1),
-- Fix: week_dt must use '_' as separator. The loader script and the ADS query
-- (where week_dt = concat(..., '_', ...)) both use '_'; the previous '-' here
-- created partitions the ADS layer could never match.
concat(date_add(next_day('2021-12-07','MO'),-7),'_',date_add(next_day('2021-12-07','MO'),-1))
from dwd_calllogs_cdr
where dt >= date_add(next_day('2021-12-07','MO'),-7) and dt <= date_add(next_day('2021-12-07','MO'),-1)
group by msisdn;
3)查询导入结果
hive (calllogs)> select * from dws_au_detail_week limit 5; hive (calllogs)> select count(*) from dws_au_detail_week;1.1.3 每月活跃用户明细
1)建表语句
hive (calllogs)>
DROP TABLE IF EXISTS dws_au_detail_month;
-- Monthly active-user detail: one row per msisdn per month partition.
CREATE TABLE dws_au_detail_month(
    sys_id              string,
    service_name        string,
    home_province_code  string,
    visit_province_code string,
    channel_code        string,
    service_code        string,
    imsi                string,
    msisdn              string
)
COMMENT '活跃用户按月明细'
PARTITIONED BY (`month_dt` string)
STORED AS parquet
LOCATION '/warehouse/calllogs/dws/dws_au_detail_month/';
2)数据导入
以用户手机号为key按月进行聚合
hive (calllogs)>
set hive.exec.dynamic.partition.mode=nonstrict;
-- One row per phone number for the calendar month of the target date.
insert overwrite table dws_au_detail_month partition(month_dt)
select
    collect_set(sys_id)[0]              as sys_id,
    collect_set(service_name)[0]        as service_name,
    collect_set(home_province_code)[0]  as home_province_code,
    collect_set(visit_province_code)[0] as visit_province_code,
    collect_set(channel_code)[0]        as channel_code,
    collect_set(service_code)[0]        as service_code,
    collect_set(imsi)[0]                as imsi,
    msisdn,
    date_format('2021-12-07','yyyy-MM')
from dwd_calllogs_cdr
-- NOTE(review): wrapping the partition column dt in date_format() may defeat
-- partition pruning; a range predicate on dt would likely scan less — verify plan.
where date_format(dt,'yyyy-MM') = date_format('2021-12-07','yyyy-MM')
group by msisdn;
3)查询导入结果
hive (calllogs)> select * from dws_au_detail_month limit 5; hive (calllogs)> select count(*) from dws_au_detail_month;
1.1.4 DWS层活跃用户加载数据脚本
1)在hadoop101的/home/hadoop/bin目录下创建脚本
[hadoop@hadoop101 bin]$ vim calllogs_dws_au.sh
在脚本中编写如下内容
#!/bin/bash
# Load the three DWS active-user detail tables (day / week / month).
# Usage: calllogs_dws_au.sh [yyyy-MM-dd]   (defaults to yesterday)
DB=calllogs
hive=/opt/modules/hive/bin/hive
# Use the supplied date, otherwise default to yesterday.
# Fix: "$1" must be quoted — unquoted, `[ -n $1 ]` expands to `[ -n ]`, which is
# always true, so the default-date branch was unreachable.
if [ -n "$1" ]; then
    calllogs_date=$1
else
    calllogs_date=$(date -d "-1 day" +%F)
fi
sql="
set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table ${DB}.dws_au_detail_day partition(dt)
select
collect_set(sys_id)[0] sys_id,
collect_set(service_name)[0] service_name,
collect_set(home_province_code)[0] home_province_code,
collect_set(visit_province_code)[0] visit_province_code,
collect_set(channel_code)[0] channel_code,
collect_set(service_code)[0] service_code,
collect_set(imsi)[0] imsi,
msisdn,
'$calllogs_date'
from ${DB}.dwd_calllogs_cdr
where dt='$calllogs_date'
group by msisdn;
insert overwrite table ${DB}.dws_au_detail_week partition(week_dt)
select
collect_set(sys_id)[0] sys_id,
collect_set(service_name)[0] service_name,
collect_set(home_province_code)[0] home_province_code,
collect_set(visit_province_code)[0] visit_province_code,
collect_set(channel_code)[0] channel_code,
collect_set(service_code)[0] service_code,
collect_set(imsi)[0] imsi,
msisdn,
date_add(next_day('$calllogs_date','MO'),-7),
date_add(next_day('$calllogs_date','MO'),-1),
concat(date_add(next_day('$calllogs_date','MO'),-7),'_',date_add(next_day('$calllogs_date','MO'),-1))
from ${DB}.dwd_calllogs_cdr
where dt >= date_add(next_day('$calllogs_date','MO'),-7) and dt <= date_add(next_day('$calllogs_date','MO'),-1)
group by msisdn;
insert overwrite table ${DB}.dws_au_detail_month partition(month_dt)
select
collect_set(sys_id)[0] sys_id,
collect_set(service_name)[0] service_name,
collect_set(home_province_code)[0] home_province_code,
collect_set(visit_province_code)[0] visit_province_code,
collect_set(channel_code)[0] channel_code,
collect_set(service_code)[0] service_code,
collect_set(imsi)[0] imsi,
msisdn,
date_format('$calllogs_date','yyyy-MM')
from ${DB}.dwd_calllogs_cdr
where date_format(dt,'yyyy-MM') = date_format('$calllogs_date','yyyy-MM')
group by msisdn;
"
$hive -e "$sql"
2)增加脚本执行权限
[hadoop@hadoop101 bin]$ chmod +x calllogs_dws_au.sh
3)脚本使用
[hadoop@hadoop101 modules]$ calllogs_dws_au.sh 2021-12-08
4)查询结果
hive (calllogs)> select count(*) from dws_au_detail_day; hive (calllogs)> select count(*) from dws_au_detail_week; hive (calllogs)> select count(*) from dws_au_detail_month ;
5)脚本执行时间
企业开发中一般在每日凌晨0点30分~1点执行
目标:统计当日、当周、当月活跃用户数
1)建表语句
hive (calllogs)>
DROP TABLE IF EXISTS ads_au_count;
-- Daily ADS summary row: active-user counts for day / week / month.
CREATE EXTERNAL TABLE ads_au_count(
    `dt`          string COMMENT '统计日期',
    `day_count`   bigint COMMENT '当日用户数量',
    `wk_count`    bigint COMMENT '当周用户数量',
    `mn_count`    bigint COMMENT '当月用户数量',
    `is_weekend`  string COMMENT 'Y,N是否是周末,用于得到本周最终结果',
    `is_monthend` string COMMENT 'Y,N是否是月末,用于得到本月最终结果'
)
COMMENT '每日活跃用户数量'
STORED AS parquet
LOCATION '/warehouse/calllogs/ads/ads_au_count/';
2)导入数据
hive (calllogs)>
-- Combine the day / week / month counts for the target date into one ADS row.
insert overwrite table ads_au_count
select
    '2021-12-07' dt,
    daycount.ct,
    wkcount.ct,
    mncount.ct,
    if(date_add(next_day('2021-12-07','MO'),-1)='2021-12-07','Y','N'),
    if(last_day('2021-12-07')='2021-12-07','Y','N')
from
(
    select '2021-12-07' dt, count(*) ct
    from dws_au_detail_day
    where dt='2021-12-07'
) daycount
join
(
    select '2021-12-07' dt, count(*) ct
    from dws_au_detail_week
    where week_dt=concat(date_add(next_day('2021-12-07','MO'),-7),'_',date_add(next_day('2021-12-07','MO'),-1))
) wkcount on daycount.dt=wkcount.dt
join
(
    select '2021-12-07' dt, count(*) ct
    from dws_au_detail_month
    where month_dt=date_format('2021-12-07','yyyy-MM')
) mncount on daycount.dt=mncount.dt;
3)查询导入结果
hive (calllogs)> select * from ads_au_count;1.2.1 活跃用户数ADS层加载数据脚本
1)在hadoop101的/home/hadoop/bin目录下创建脚本
[hadoop@hadoop101 bin]$ vim calllogs_ads_au.sh
在脚本中编写如下内容
#!/bin/bash
# Load the ADS active-user count row (day / week / month) for a given date.
# Usage: calllogs_ads_au.sh [yyyy-MM-dd]   (defaults to yesterday)
DB=calllogs
hive=/opt/modules/hive/bin/hive
# Use the supplied date, otherwise default to yesterday.
# Fix: "$1" must be quoted — unquoted, `[ -n $1 ]` is always true when no
# argument is given, so the default-date branch was unreachable.
if [ -n "$1" ]; then
    calllogs_date=$1
else
    calllogs_date=$(date -d "-1 day" +%F)
fi
sql="
set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table ${DB}.ads_au_count
select
'$calllogs_date' dt,
daycount.ct,
wkcount.ct,
mncount.ct,
if(date_add(next_day('$calllogs_date','MO'),-1)='$calllogs_date','Y','N'),
if(last_day('$calllogs_date')='$calllogs_date','Y','N')
from
(
select '$calllogs_date' dt, count(*) ct
from ${DB}.dws_au_detail_day
where dt='$calllogs_date'
) daycount
join
(
select '$calllogs_date' dt, count(*) ct
from ${DB}.dws_au_detail_week
where week_dt=concat(date_add(next_day('$calllogs_date','MO'),-7),'_',date_add(next_day('$calllogs_date','MO'),-1))
) wkcount on daycount.dt=wkcount.dt
join
(
select '$calllogs_date' dt, count(*) ct
from ${DB}.dws_au_detail_month
where month_dt=date_format('$calllogs_date','yyyy-MM')
) mncount on daycount.dt=mncount.dt;
"
$hive -e "$sql"
2)增加脚本执行权限
[hadoop@hadoop101 bin]$ chmod +x calllogs_ads_au.sh
3)脚本使用
[hadoop@hadoop101 modules]$ calllogs_ads_au.sh 2021-12-08
4)脚本执行时间
企业开发中一般在每日凌晨0点30分~1点执行
5)查询导入结果
hive (calllogs)> select * from ads_au_count ;2 新增用户
新增用户包括日新增用户、周新增用户、月新增用户。
2.1 DWS层 2.1.1每日新增用户明细表1)建表语句
hive (calllogs)>
DROP TABLE IF EXISTS `dws_new_user_day`;
-- Cumulative table of first-seen users; rows are only ever appended.
CREATE TABLE `dws_new_user_day` (
    sys_id              string,
    service_name        string,
    home_province_code  string,
    visit_province_code string,
    channel_code        string,
    service_code        string,
    imsi                string,
    msisdn              string,
    create_date         string
)
COMMENT '每日新增用户信息'
STORED AS parquet
LOCATION '/warehouse/calllogs/dws/dws_new_user_day/';
2)导入数据
用每日活跃用户表 left join 每日新增用户表,关联条件是msisdn相等。如果是当日新增的用户,则其在每日新增用户表中尚无记录,关联结果为null。
hive (calllogs)>
-- New users for the day: active today but never recorded in dws_new_user_day before.
insert into table dws_new_user_day
select
    aud.sys_id,
    aud.service_name,
    aud.home_province_code,
    aud.visit_province_code,
    aud.channel_code,
    aud.service_code,
    aud.imsi,
    aud.msisdn,
    '2021-12-07'
from dws_au_detail_day aud
left join dws_new_user_day nud on aud.msisdn = nud.msisdn
where aud.dt = '2021-12-07'
  and nud.msisdn is null;
3)查询导入数据
hive (calllogs)> select count(*) from dws_new_user_day ;2.1.2 DWS层每日新增用户数据加载脚本
1)在hadoop101的/home/hadoop/bin目录下创建脚本
[hadoop@hadoop101 bin]$ vim calllogs.dws_nu.sh
在脚本中编写如下内容
#!/bin/bash
# Append the day's new users (active that day, never seen before) to dws_new_user_day.
# Usage: calllogs.dws_nu.sh [yyyy-MM-dd]   (defaults to yesterday)
DB=calllogs
hive=/opt/modules/hive/bin/hive
# Use the supplied date, otherwise default to yesterday.
# Fix: "$1" must be quoted — unquoted, `[ -n $1 ]` is always true when no
# argument is given, so the default-date branch was unreachable.
if [ -n "$1" ]; then
    calllogs_date=$1
else
    calllogs_date=$(date -d "-1 day" +%F)
fi
sql="
set hive.exec.dynamic.partition.mode=nonstrict;
insert into table ${DB}.dws_new_user_day
select
aud.sys_id,
aud.service_name,
aud.home_province_code,
aud.visit_province_code,
aud.channel_code,
aud.service_code,
aud.imsi,
aud.msisdn,
'$calllogs_date'
from ${DB}.dws_au_detail_day aud left join ${DB}.dws_new_user_day nud on aud.msisdn=nud.msisdn
where aud.dt='$calllogs_date' and nud.msisdn is null;
"
$hive -e "$sql"
2)增加脚本执行权限
[hadoop@hadoop101 bin]$ chmod +x calllogs.dws_nu.sh
3)脚本使用
[hadoop@hadoop101 modules]$ calllogs.dws_nu.sh 2021-12-08
4)脚本执行时间
企业开发中一般在每日凌晨0点30分~1点执行
5)查询导入结果
hive (calllogs)> select * from dws_new_user_day ;2.2 ADS层 2.2.1每日新增用户表
1)建表语句
hive (calllogs)>
DROP TABLE IF EXISTS `ads_new_user_count`;
-- Fix: the field delimiter was 't' (the literal letter t); a tab-separated
-- text table needs the escape sequence '\t'.
CREATE TABLE `ads_new_user_count` (
    `create_date`    string comment '创建时间',
    `new_user_count` BIGINT comment '新增用户数量'
)
COMMENT '每日新增用户数'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/warehouse/calllogs/ads/ads_new_user_count/';
2)导入数据
hive (calllogs)>
-- Count the day's new users and append one summary row.
insert into table ads_new_user_count
select
    create_date,
    count(*)
from dws_new_user_day
where create_date = '2021-12-07'
group by create_date;
3)查询导入数据
hive (calllogs)> select * from ads_new_user_count;2.2.2每日新增用户表加载数据脚本
1)在hadoop101的/home/hadoop/bin目录下创建脚本
[hadoop@hadoop101 bin]$ vim calllogs.ads_nu.sh
在脚本中编写如下内容
#!/bin/bash
# Load the daily new-user count into the ADS layer.
# Usage: calllogs.ads_nu.sh [yyyy-MM-dd]   (defaults to yesterday)
DB=calllogs
hive=/opt/modules/hive/bin/hive
# Use the supplied date, otherwise default to yesterday.
# Fix: "$1" must be quoted — unquoted, `[ -n $1 ]` is always true when no
# argument is given, so the default-date branch was unreachable.
if [ -n "$1" ]; then
    calllogs_date=$1
else
    calllogs_date=$(date -d "-1 day" +%F)
fi
sql="
set hive.exec.dynamic.partition.mode=nonstrict;
insert into table ${DB}.ads_new_user_count
select create_date, count(*) from ${DB}.dws_new_user_day
where create_date='$calllogs_date'
group by create_date ;
"
$hive -e "$sql"
2)增加脚本执行权限
[hadoop@hadoop101 bin]$ chmod +x calllogs.ads_nu.sh
3)脚本使用
[hadoop@hadoop101 bin]$ calllogs.ads_nu.sh 2021-12-08
4)脚本执行时间
企业开发中一般在每日凌晨0点30分~1点执行
5)查询导入结果
hive (calllogs)> select * from ads_new_user_count;3 留存用户 3.1 用户留存概念
留存用户:某段时间内的新增用户(活跃用户),经过一段时间后,又继续使用应用的被认作是留存用户;
留存率:留存用户占当时新增用户(活跃用户)的比例即是留存率。
例如,2月10日新增用户100,这100人在2月11日启动过应用的有30人,2月12日启动过应用的有25人,2月13日启动过应用的有32人;
则2月10日新增用户次日的留存率是30/100 = 30%,两日留存率是25/100=25%,三日留存率是32/100=32%。
分析:假设今天是11日,要统计前1天也就是10日新增用户的留存率,公式如下:
10日新增用户的留存率= 10日的新增用户且11日活跃的 / 10日的新增用户
1)分母获取
(1)创建每日新增用户明细表
(2)通过每日新增用户明细表获取10日新增用户
2)分子获取
(1)10日新增的用户明细 join 11日活跃用户明细表=每日留存用户表
(2)对每日留存用户表count就得到了10日新增用户在11日的留存用户
3)留存率计算
(1) 10日的新增用户 且 11日活跃的用户表 与10日新增用户join,算出留存用户
1)建表语句
hive (calllogs)>
DROP TABLE IF EXISTS `dws_user_retention_day`;
-- Retained-user detail: for each stat date dt, new users from N days ago
-- who are still active, with retention_day = N.
CREATE TABLE `dws_user_retention_day` (
    sys_id              string,
    service_name        string,
    home_province_code  string,
    visit_province_code string,
    channel_code        string,
    service_code        string,
    imsi                string,
    msisdn              string,
    `create_date`       string comment '设备新增时间',
    `retention_day`     int comment '截止当前日期留存天数'
)
COMMENT '每日用户留存情况'
PARTITIONED BY (`dt` string)
STORED AS parquet
LOCATION '/warehouse/calllogs/dws/dws_user_retention_day/';
2)导入数据(每天计算前1天的新用户访问留存明细)
hive (calllogs)>
-- 1-day retention: users who were new yesterday and are active today.
insert overwrite table dws_user_retention_day partition(dt="2021-12-07")
select
    nud.sys_id,
    nud.service_name,
    nud.home_province_code,
    nud.visit_province_code,
    nud.channel_code,
    nud.service_code,
    nud.imsi,
    nud.msisdn,
    nud.create_date,
    1 retention_day
from dws_au_detail_day aud
join dws_new_user_day nud on aud.msisdn = nud.msisdn
where aud.dt = '2021-12-07'
  and nud.create_date = date_add('2021-12-07', -1);
3)查询导入数据(每天计算前1天的新用户访问留存明细)
hive (calllogs)> select count(*) from dws_user_retention_day;3.3.2 1,2,3,n天留存用户明细表
1)导入数据(每天计算前1,2,3,n天的新用户访问留存明细)
hive (calllogs)>
-- Rebuild the day's retention partition: one union-all branch per lookback
-- window (1, 2 and 3 days), each joining today's active users against the
-- new-user table for the corresponding create_date.
insert overwrite table dws_user_retention_day partition(dt="2021-12-07")
-- 1-day retention: new yesterday, active today.
select
nud.sys_id,
nud.service_name,
nud.home_province_code,
nud.visit_province_code,
nud.channel_code,
nud.service_code,
nud.imsi,
nud.msisdn,
nud.create_date,
1 retention_day
from dws_au_detail_day aud join dws_new_user_day nud on aud.msisdn =nud.msisdn
where aud.dt='2021-12-07' and nud.create_date=date_add('2021-12-07',-1)
union all
-- 2-day retention: new two days ago, active today.
select
nud.sys_id,
nud.service_name,
nud.home_province_code,
nud.visit_province_code,
nud.channel_code,
nud.service_code,
nud.imsi,
nud.msisdn,
nud.create_date,
2 retention_day
from dws_au_detail_day aud join dws_new_user_day nud on aud.msisdn =nud.msisdn
where aud.dt='2021-12-07' and nud.create_date=date_add('2021-12-07',-2)
union all
-- 3-day retention: new three days ago, active today.
select
nud.sys_id,
nud.service_name,
nud.home_province_code,
nud.visit_province_code,
nud.channel_code,
nud.service_code,
nud.imsi,
nud.msisdn,
nud.create_date,
3 retention_day
from dws_au_detail_day aud join dws_new_user_day nud on aud.msisdn =nud.msisdn
where aud.dt='2021-12-07' and nud.create_date=date_add('2021-12-07',-3);
2)查询导入数据(每天计算前1,2,3天的新用户访问留存明细)
hive (calllogs)> select retention_day, count(*) from dws_user_retention_day group by retention_day;3.3.3 1,2,3,n天留存用户明细数据加载脚本
1)在hadoop101的/home/hadoop/bin目录下创建脚本
[hadoop@hadoop101 bin]$ vim calllogs_dws_user_retention.sh
在脚本中编写如下内容
#!/bin/bash
# Rebuild the day's retention-detail partition (1/2/3-day lookbacks).
# Usage: calllogs_dws_user_retention.sh [yyyy-MM-dd]   (defaults to yesterday)
DB=calllogs
hive=/opt/modules/hive/bin/hive
# Use the supplied date, otherwise default to yesterday.
# Fix: "$1" must be quoted — unquoted, `[ -n $1 ]` is always true when no
# argument is given, so the default-date branch was unreachable.
if [ -n "$1" ]; then
    calllogs_date=$1
else
    calllogs_date=$(date -d "-1 day" +%F)
fi
sql="
set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table ${DB}.dws_user_retention_day partition(dt='$calllogs_date')
select
nud.sys_id,
nud.service_name,
nud.home_province_code,
nud.visit_province_code,
nud.channel_code,
nud.service_code,
nud.imsi,
nud.msisdn,
nud.create_date,
1 retention_day
from ${DB}.dws_au_detail_day aud join ${DB}.dws_new_user_day nud on aud.msisdn =nud.msisdn
where aud.dt='$calllogs_date' and nud.create_date=date_add('$calllogs_date',-1)
union all
select
nud.sys_id,
nud.service_name,
nud.home_province_code,
nud.visit_province_code,
nud.channel_code,
nud.service_code,
nud.imsi,
nud.msisdn,
nud.create_date,
2 retention_day
from ${DB}.dws_au_detail_day aud join ${DB}.dws_new_user_day nud on aud.msisdn =nud.msisdn
where aud.dt='$calllogs_date' and nud.create_date=date_add('$calllogs_date',-2)
union all
select
nud.sys_id,
nud.service_name,
nud.home_province_code,
nud.visit_province_code,
nud.channel_code,
nud.service_code,
nud.imsi,
nud.msisdn,
nud.create_date,
3 retention_day
from ${DB}.dws_au_detail_day aud join ${DB}.dws_new_user_day nud on aud.msisdn =nud.msisdn
where aud.dt='$calllogs_date' and nud.create_date=date_add('$calllogs_date',-3);
"
$hive -e "$sql"
2)增加脚本执行权限
[hadoop@hadoop101 bin]$ chmod +x calllogs_dws_user_retention.sh
3)脚本使用
[hadoop@hadoop101 bin]$ calllogs_dws_user_retention.sh 2021-12-08
4)脚本执行时间
企业开发中一般在每日凌晨0点30分~1点执行
5)查询导入结果
hive (calllogs)> select retention_day, count(*) from dws_user_retention_day group by retention_day;3.4 ADS层 3.4.1留存用户数
1)建表语句
hive (calllogs)>
DROP TABLE IF EXISTS `ads_user_retention_day_count`;
-- Retained-user counts grouped by create_date and retention window.
CREATE TABLE `ads_user_retention_day_count` (
    `create_date`   string comment '用户新增日期',
    `retention_day` int comment '截止当前日期留存天数',
    retention_count bigint comment '留存数量'
)
COMMENT '每日用户留存情况'
STORED AS parquet
LOCATION '/warehouse/calllogs/ads/ads_user_retention_day_count/';
2)导入数据
hive (calllogs)>
-- Aggregate the day's retention detail into per-(create_date, window) counts.
insert into table ads_user_retention_day_count
select
    create_date,
    retention_day,
    count(*) retention_count
from dws_user_retention_day
where dt = '2021-12-07'
group by create_date, retention_day;
3)查询导入数据
hive (calllogs)> select * from ads_user_retention_day_count;3.4.2 留存用户比率
1)建表语句
hive (calllogs)>
DROP TABLE IF EXISTS `ads_user_retention_day_rate`;
CREATE TABLE `ads_user_retention_day_rate` (
    `stat_date`       string comment '统计日期',
    `create_date`     string comment '用户新增日期',
    `retention_day`   int comment '截止当前日期留存天数',
    `retention_count` bigint comment '留存数量',
    `new_user_count`  string comment '当日用户新增数量',
    `retention_ratio` decimal(10,2) comment '留存率'
)
COMMENT '每日用户留存率情况'
STORED AS parquet
-- Fix: location previously pointed at .../ads_user_retention_day_count/,
-- sharing a data directory with that table (drop/overwrite of either would
-- corrupt the other). Each table needs its own location.
LOCATION '/warehouse/calllogs/ads/ads_user_retention_day_rate/';
2)导入数据
hive (calllogs)>
-- Retention rate = retained users / new users on the create date, in percent.
insert into table ads_user_retention_day_rate
select
    '2021-12-07',
    ur.create_date,
    ur.retention_day,
    ur.retention_count,
    nuc.new_user_count,
    ur.retention_count / nuc.new_user_count * 100
from
(
    select
        create_date,
        retention_day,
        count(*) retention_count
    from `dws_user_retention_day`
    where dt = '2021-12-07'
    group by create_date, retention_day
) ur
join ads_new_user_count nuc on nuc.create_date = ur.create_date;
3)查询导入数据
hive (calllogs)>select * from ads_user_retention_day_rate;



