(1).用户资料表(mp_user) 字段 字段描述 字段类型 user_id 用户ID String user_name 用户姓名 String user_status 用户状态 int area_id 地区ID int device_number 手机号码 String cust_id 客户ID String cust_sex 客户性别 String cust_addr 客户地址 String (2).地区ID对应的地区名称表(mp_area) 字段 字段描述 字段类型 area_id 地区ID int area_name 地区名称 String (3)流量详单表(mp_gprs) 字段 字段描述 字段类型 device_number 手机号码 String lac 位置区编码 String ci CI号码 String imei 终端类型 String service_type 流量类型 String start_time 开始时间 String end_time 结束时间 String duration 时长(秒) Int send_bytes 上行流量(MB) decimal(20,3) recv_bytes 下行流量(MB) decimal(20,3)数据举例
mp_use.txt 用户资料表
883090|钱七|1|4|13074737671|37867|男|威海
mp_area.txt 地区名称表
0|北京
mp_gprs.txt 流量详单表
13027810045|59146|10742|013717002416820|912|2017-02-12 20:40:33|2017-02-12 23:12:20|1| 74|287
create external table mp_user(
user_id string,
user_name string,
user_status int,
area_id int,
device_number string,
cust_id string,
cust_sex string,
cust_addr string)
row format delimited fields terminated by '|';
create external table mp_area(area_id int,area_name string)
row format delimited fields terminated by '|';
create external table mp_gprs(
device_number string,
lac string,
ci string,
imei string,
service_type string,
start_time string,
end_time string,
duration int,
send_bytes decimal(20,3),
recv_bytes decimal(20,3)
)row format delimited fields terminated by '|';
加载数据
load data local inpath '/usr/local/data/mp_use.txt' into table mp_user; load data local inpath '/usr/local/data/mp_area.txt' into table mp_area; load data local inpath '/usr/local/data/mp_gprs.txt' into table mp_gprs;统计各用户使用通话时长合计,按通话时长合计降序排列
输出的字段 包含 通话时长计算方式 结束时间-开始时间 字段 字段描述 字段类型 user_name 用户姓名 String user_status 用户状态 int cust_sex 客户性别 String area_name 地区名称 String device_number 手机号码 String ci CI号码 String total_time 通话时长(秒) Int
解答
关于这里的解答,mp_gprs中的字段含义,开始时间和结束时间,时长,我不确定应该用哪个,如果说是时长duration字段,每次通话时长就一两秒就很奇怪。但如果是用开始时间和结束时间来算通话时长,有感觉duration这个字段没什么用了。所以我干脆两种方式的解答都写了,看到这篇博客的朋友可以根据自己的理解来做,我的答案仅作参考
//首先连接user表和area表,并且提取相应字段,结果放入t1表中 select user_name,user_status,cust_sex,area_name,device_number from mp_user join mp_area on mp_user.area_id=mp_area.area_id; with t1 as (select user_name,user_status,cust_sex,area_name,device_number from mp_user join mp_area on mp_user.area_id=mp_area.area_id)
//然后改变一下gprs表,将里面的starttime和endtime改为unix时间戳格式,结果放入t2 select device_number,ci,unix_timestamp(start_time,'yyyy-MM-dd HH:mm:ss') as s_time, unix_timestamp(end_time,'yyyy-MM-dd HH:mm:ss') as e_time from mp_gprs; with t2 as (select device_number,ci,unix_timestamp(start_time,'yyyy-MM-dd HH:mm:ss') as s_time, unix_timestamp(end_time,'yyyy-MM-dd HH:mm:ss') as e_time from mp_gprs)
//两表连接求最终结果 select t1.*,ci,(e_time-s_time) as total_time from t1 join t2 on t1.device_number=t2.device_number;
//汇总 with t1 as (select user_name,user_status,cust_sex,area_name,device_number from mp_user join mp_area on mp_user.area_id=mp_area.area_id), t2 as (select device_number,ci,unix_timestamp(start_time,'yyyy-MM-dd HH:mm:ss') as s_time, unix_timestamp(end_time,'yyyy-MM-dd HH:mm:ss') as e_time from mp_gprs) select t1.*,ci,(e_time-s_time) as total_time from t1 join t2 on t1.device_number=t2.device_number;
像上面这样求出来的数据,会有大量的重复行,因此还需要一步合并
//合并 with t1 as (select user_name,user_status,cust_sex,area_name,device_number from mp_user join mp_area on mp_user.area_id=mp_area.area_id), t2 as (select device_number,ci,unix_timestamp(start_time,'yyyy-MM-dd HH:mm:ss') as s_time, unix_timestamp(end_time,'yyyy-MM-dd HH:mm:ss') as e_time from mp_gprs) select * from ( select t1.*,ci,(e_time-s_time) as total_time from t1 join t2 on t1.device_number=t2.device_number) as t group by user_name,user_status,cust_sex,area_name,device_number,ci,total_time;
//排序 with t1 as (select user_name,user_status,cust_sex,area_name,device_number from mp_user join mp_area on mp_user.area_id=mp_area.area_id), t2 as (select device_number,ci,unix_timestamp(start_time,'yyyy-MM-dd HH:mm:ss') as s_time, unix_timestamp(end_time,'yyyy-MM-dd HH:mm:ss') as e_time from mp_gprs) select * from ( select t1.*,ci,(e_time-s_time) as total_time from t1 join t2 on t1.device_number=t2.device_number) as t group by user_name,user_status,cust_sex,area_name,device_number,ci,total_time order by total_time desc;
最终结果为
//首先连接user表和area表,并且提取相应字段,结果放入t1表中 select user_name,user_status,cust_sex,area_name,device_number from mp_user join mp_area on mp_user.area_id=mp_area.area_id; with t1 as (select user_name,user_status,cust_sex,area_name,device_number from mp_user join mp_area on mp_user.area_id=mp_area.area_id)
//对mp_gprs表求出每个手机号码的时长总和,结果放入t2表中 select device_number,ci,sum(duration) as sum from mp_gprs group by device_number,ci; with t2 as (select device_number,ci,sum(duration) as sum from mp_gprs group by device_number,ci)
//连接t1表和t2表,获取相应的字段信息 select t1.*,ci,sum from t1 join mp_gprs on t1.device_number=mp_gprs.device_number
//汇总 with t1 as (select user_name,user_status,cust_sex,area_name,device_number from mp_user join mp_area on mp_user.area_id=mp_area.area_id), t2 as (select device_number,ci,sum(duration) as sum from mp_gprs group by device_number,ci) select t1.*,ci,sum from t1 join t2 on t1.device_number=t2.device_number order by sum desc;
结果为
输出的字段 包含 用户姓名,时间(天),总流量
//首先获取每个手机号码的日期和每天总流量 select * from (select device_number,substring(start_time,0,10) as sub,(send_bytes+recv_bytes) as sum from mp_gprs) as t group by device_number,sub,sum;
//考虑到上行流量与下行流量可能为null的情况,替换为0 select * from (select device_number,substring(start_time,0,10) as use_day,(if(send_bytes is null,0,send_bytes)+if(recv_bytes is null,0,recv_bytes)) as total_bytes from mp_gprs) as t group by device_number,use_day,total_bytes;
//然后连接user表,获取用户姓名,顺便加上order by排序子句,然后用limit求取前五名 with t1 as (select device_number,use_day,total_bytes from (select device_number,substring(start_time,0,10) as use_day,(if(send_bytes is null,0,send_bytes)+if(recv_bytes is null,0,recv_bytes)) as total_bytes from mp_gprs) as t group by device_number,use_day,total_bytes) select user_name,use_day,total_bytes from t1 join mp_user on t1.device_number=mp_user.device_number order by total_bytes desc limit 5;
最终结果为



