栏目分类:
子分类:
返回
名师互学网用户登录
快速导航关闭
当前搜索
当前分类
子分类
实用工具
热门搜索
名师互学网 > IT > 前沿技术 > 大数据 > 数据挖掘与分析

hive sql 计算网页用户留存率和AB实验显著性判断

hive sql 计算网页用户留存率和AB实验显著性判断

计算用户留存率: 怎么计算留存率?


sql代码(HUE上运行):
SELECt
      a.dt1 `创作日期`,
      a.cnt `创作用户量`,
      concat((round((a.cnt1/a.cnt)*100, 2)), '%') `次日首页留存率`,
      concat((round((a.cnt2/a.cnt)*100, 2)), '%') `3日首页留存率`,
      concat((round((a.cnt3/a.cnt)*100, 2)), '%') `7日首页留存率`
from (
SELECt t1.dt1,
      count(distinct if(datediff(t2.dt2,t1.dt1)=0,t2.device_id,null)) cnt,
       count(distinct if(datediff(t2.dt2,t1.dt1)=1,t2.device_id,null)) cnt1,
         count(distinct if(datediff(t2.dt2,t1.dt1)=3,t2.device_id,null)) cnt2,
           count(distinct if(datediff(t2.dt2,t1.dt1)=7,t2.device_id,null)) cnt3
      
FROM
  (SELECt dt dt1,
          device_id
  FROM dws_zdm.dws_zdm_hive_traffic_home_feed_h
 WHERe dt BETWEEN '${start_date}' AND'${end_date}'---时间可筛选 一个个来 '2021-03-01' '2021-03-02' '2021-03-03' '2021-03-04' '2021-03-05'
--where dt ='${start_date}'
GROUP BY dt1,
         device_id)t1
LEFT JOIN
  (SELECt dt dt2,
          device_id
  FROM dws_zdm.dws_zdm_hive_traffic_home_feed_h
  WHERe dt >= '${start_date}'
  GROUP BY dt2,
            device_id)t2
ON t1.device_id = t2.device_id
WHERe t2.dt2 >= t1.dt1
and ( datediff(t2.dt2,t1.dt1)=0 or  datediff(t2.dt2,t1.dt1)=1 or  datediff(t2.dt2,t1.dt1)=3 or  datediff(t2.dt2,t1.dt1)=7 )
GROUP BY t1.dt1
ORDER BY t1.dt1)a
数据显示

AB实验显著性判断
-- 计算一段时间内的用户留存率
cache table tmp_1
SELECt
      a.dt1 dt,
      a.user_type  user_type,
      round((a.cnt1/a.cnt)*100, 2) liucun1,
      round((a.cnt2/a.cnt)*100, 2) liucun3,
      round((a.cnt7/a.cnt)*100, 2) liucun7
from (
SELECt t1.dt1,
       case when t1.abtest regexp '${shiyan_value}' then '实验组'
       when t1.abtest regexp '${duizhao_value}' then '对照组'
     end user_type,
      count(distinct if(datediff(t2.dt2,t1.dt1)=0,t2.device_id,null)) cnt,
       count(distinct if(datediff(t2.dt2,t1.dt1)=1,t2.device_id,null)) cnt1,
         count(distinct if(datediff(t2.dt2,t1.dt1)=3,t2.device_id,null)) cnt2,
           count(distinct if(datediff(t2.dt2,t1.dt1)=7,t2.device_id,null)) cnt7
      
FROM
  (SELECt dt dt1,
  regexp_extract(abtest_collection,'(^55|,55)_([^,]+)',2) abtest,
          device_id
  FROM dws_zdm.dws_zdm_hive_traffic_home_feed_h
 WHERe dt BETWEEN '${start_date}' AND'${end_date}'
 and project_id = '2'
and expose >0
GROUP BY 1,2,3)t1
LEFT JOIN
  (SELECt dt dt2,
       device_id
  FROM dws_zdm.dws_zdm_hive_traffic_home_feed_h
  WHERe dt >= '${start_date}'
  and project_id = '2'
and expose >0
  GROUP BY 1,2)t2
ON t1.device_id = t2.device_id
WHERe t2.dt2 >= t1.dt1
and ( datediff(t2.dt2,t1.dt1)=0 or  datediff(t2.dt2,t1.dt1)=1 or  datediff(t2.dt2,t1.dt1)=3 or  datediff(t2.dt2,t1.dt1)=7 )
GROUP BY 1,2
ORDER BY 1,2)a
where a.user_type is not null 


-- 计算均值和标准差
cache table tmp_2
SELECt user_type,
       count(user_type) sample_num,
       avg(liucun1) liucun1_avg,
       avg(liucun3) liucun3_avg,
       avg(liucun7) liucun7_avg,
       stddev(liucun1) liucun1_std,
       stddev(liucun3) liucun3_std,
       stddev(liucun7) liucun7_std
from tmp_1
group by 1


-- 假设检验
select  sample_num_shiyan `实验组样本量`,
        sample_num_duizhao `对照组样本量`,
       liucun1_avg_shiyan `实验组日均次日留存率`,
       liucun1_avg_duizhao `对照组日均次日留存率`,
        (liucun1_avg_shiyan/liucun1_avg_duizhao)-1 `日均次日留存率变化度`,
        if(abs(liucun1_avg_shiyan-liucun1_avg_duizhao)/
        sqrt(liucun1_std_shiyan*liucun1_std_shiyan/sample_num_shiyan+
        liucun1_std_duizhao*liucun1_std_duizhao/sample_num_duizhao)>1.96,'显著','不显著') `日均次日留存率显著性`,

        liucun3_avg_shiyan `实验组日均3日留存率`,
        liucun3_avg_duizhao `对照组日均3日留存率`,
        (liucun3_avg_shiyan/liucun3_avg_duizhao)-1 `日均3日留存率变化度`,
        if(abs(liucun3_avg_shiyan-liucun3_avg_duizhao)/
        sqrt(liucun3_std_shiyan*liucun3_std_shiyan/sample_num_shiyan+
        liucun3_std_duizhao*liucun3_std_duizhao/sample_num_duizhao)>1.96,'显著','不显著') `日均3日留存率显著性`,

        liucun7_avg_shiyan `实验组日均7日留存率`,
        liucun7_avg_duizhao `对照组日均7日留存率`,
        (liucun7_avg_shiyan/liucun7_avg_duizhao)-1 `日均7日留存率变化度`,
        if(abs(liucun7_avg_shiyan-liucun7_avg_duizhao)/
        sqrt(liucun7_std_shiyan*liucun7_std_shiyan/sample_num_shiyan+
        liucun7_std_duizhao*liucun7_std_duizhao/sample_num_duizhao)>1.96,'显著','不显著') `日均7日留存率显著性`
      
from 
(
select sum(if(user_type='实验组',sample_num,0))  sample_num_shiyan,	
       sum(if(user_type='对照组',sample_num,0))  sample_num_duizhao,	
       sum(if(user_type='实验组',liucun1_avg,0))  liucun1_avg_shiyan,	
       sum(if(user_type='对照组',liucun1_avg,0))  liucun1_avg_duizhao,	
       sum(if(user_type='实验组',liucun1_std,0))  liucun1_std_shiyan,	
       sum(if(user_type='对照组',liucun1_std,0))  liucun1_std_duizhao,	
       sum(if(user_type='实验组',liucun3_avg,0))  liucun3_avg_shiyan,
       sum(if(user_type='对照组',liucun3_avg,0))  liucun3_avg_duizhao,	
       sum(if(user_type='实验组',liucun3_std,0))  liucun3_std_shiyan,	
       sum(if(user_type='对照组',liucun3_std,0))  liucun3_std_duizhao,
       sum(if(user_type='实验组',liucun7_avg,0))  liucun7_avg_shiyan,	
       sum(if(user_type='对照组',liucun7_avg,0))  liucun7_avg_duizhao,	
       sum(if(user_type='实验组',liucun7_std,0))  liucun7_std_shiyan,	
       sum(if(user_type='对照组',liucun7_std,0))  liucun7_std_duizhao
from   tmp_2
)t
数据显示

转载请注明:文章转载自 www.mshxw.com
本文地址:https://www.mshxw.com/it/278749.html
我们一直用心在做
关于我们 文章归档 网站地图 联系我们

版权所有 (c)2021-2022 MSHXW.COM

ICP备案号:晋ICP备2021003244-6号