(1)unix_timestamp() //获取当前时间戳,默认格式年月日时分秒
eg:select unix_timestamp('2021-09-26','yyyy-MM-dd'); //转成时间戳
(2)form_unixtime() //时间戳转换成时间,默认格式年月日时分秒,指定就给具体格式
eg:select from_unixtime(55489464);
(3)获取时间
year:获取年
select year('2021-09-27 20:50:00')
month:获取月
day:获取日
hour:获取时
minute:获取分
second:获取秒
(4)获取这是月多少天
eg:select dayofmonth('2021-11-35');
(5)月份加减
select add_months('2021-09-27',-3)
(6)日期相差datediff
eg:select datediff('2021-09-26','2021-09-27');
(7)格式化日期
eg:select date_format('2021-09-26','yyyy-MM-dd')
(8)时间加减
eg:select date_add('2021-09-26',6)
2、其他取整函数
(1)向上取整ceil select ceil(5.3); (2)四舍五入round select round(5.6); (3)向下取整floor select floor(3.9);3、其他字符串函数,条件函数,聚合函数
(1)转成大写upper
eg:select upper(‘sfasd’);
(2)转成小写lower
eg:select lower(‘FDSF’);
(3)获取长度length
eg:select length('fsdgsdgaesdgs');
(4)前后去空格trim
eg:select trim(' fesd ');
获取指定长度字符,多了就截取,少了就拿后面的补
(5)向左补齐,到指定长r度lpad
eg:select lpad('dcc',4,'fgdgsdf');
(6)向右补齐,到指定长度rpad
eg:select rpad('dcc',4,'fgdgsdf');
(7)regexp_replace 正则替换,由于日期必须是-,可以替换
eg:select regexp_replace('2021-09-29','-',':');
(8)if函数
eg:select *,if(y is null,0,y)from vels;
(9)case when 函数:case when 。。。end
eg:select *,case when y is null then 0 else y end from vels;
(10)concat,concat_ws,
拼接字符串
(11)split
聚合函数:sum,count,avg,min,max
select split(words,‘,’)from words;
(12)null值判断:is null ,is not null
(13)json数据的解析 get_json_object----$代表所有
数据:{"name":"zhangsan","age":18,"score":[{"course_name":"math","score":100},{"course_name":"english","score":60}]}
查询:select get_json_object(json,'$.score[1].course_name') from json;
4、集合操作
(1)size 返回集合 元素个数 (2)map_keys:返回map中key (3)map_values:返回map中的value (4)array_contains:判断是否包含某个元素 (5)sort_array:排序array中元素5、hive中的wordcount统计
使用split切分一行行数据,再使用explode函数将数据炸开
创建表:
create table words(
words string
)row format delimited fields terminated by '|';
数据:
hello,java,hello,java,scala,python
hbase,hadoop,hadoop,hdfs,hive,hive
hbase,hadoop,hadoop,hdfs,hive,hive
wordcount的计算:
select t.word,count(t.word) from (select explode(split(words,',')) as word from words) as t group by t.word
6、窗口函数rank()–后面跟着over一起使用
rank():排序相同时会重复,总数不会变 1 1 3 dense_rank()排序相同时会重复,总数会减少 1 1 2 row_number()会根据顺序计算 1 2 3 eg:select *,row_number()over(partition by clazz order by score desc) from new_score; percent_rank():(rank的结果-1)/(分区内数据的个数-1) cume_dist():小于等于当前行rank的个数/分区内总个数 分组求TopN: select * from (select *,row_number()over(partition by id order by scores desc) rk from score) as t1 where rk<=3 就是每个id为一个区,然后按成绩scores排序,分别开窗,每组编号,7、行列转换
使用lateral view,产生一个支持别名表的虚拟表,explode函数 参数仅接受array和map类型,不支持两个一起用
lists表中原始数据: www ["12","25","69"] zss ["36","54","96"] lss ["25","45","96"]行—>列
select * from lists lateral view explode(score)t as c; //explode先炸开,查询出类似一个视图,联合,t是查询生成视图表的表名,c是生成视图表的一个字段(行转列就只有一个字段) 结果: www ["12","25","69"] 12 www ["12","25","69"] 25 www ["12","25","69"] 69 zss ["36","54","96"] 36 zss ["36","54","96"] 54 zss ["36","54","96"] 96 lss ["25","45","96"] 25 lss ["25","45","96"] 45 lss ["25","45","96"] 96 转map: select * from maps lateral view explode(score) t as x,y; //map是kv的,转换成列有2个字段,需要给2个列—>行:collect_list()
表中原始数据: www 12 www 25 www 69 zss 36 zss 54 zss 96 lss 25 lss 45 lss 96 使用上面的生成的表:
select tt.name,collect_list(tt.c) from (select name,c from lists lateral view explo de(score) t as c)as tt group by tt.name; //需要使用group by 分组 结果: lss ["25","45","96"] www ["12","25","69"] zss ["36","54","96"]8、开窗函数over()
再使用聚合函数时,既要最后的结果,又要中间的结果,使用开窗函数,通过开窗函数可以看到聚合函数的执行过程(配合聚合函数一起使用)
over(partition by clazz) //不用group by 直接使用partition进行分组,其实就是将数据按照partition指定的字段分开处理
1)查询操作: 窗口帧:用于从分区中选择指定的多条记录,供窗口函数处理 over():指定分析函数工作的数据窗口大小,这个数据窗口大小可能会随着行的变化而变化 current row :当前行 n preceding:往前n行数据 n following:往后n行数据 unbounded:起点 unbounded preceding 表示从前面的起点 unbounded following 表示到后面的终点 lag(col, n,defalut_val):往前第n行数据 lead(col, n,default_val):往后第n行数据 ntile(n):将数据排序后分成n个组,打上组的编号,方便求前xx%的数据 select *,t.c from(select *,ntile(5)over(order by score desc)as c from new_score)as t where t.c<=1; first_value:取分组内排序后,截止到当前行,第一个值 last_value:取分组内排序后,截止到当前行,最后一个值,对于并列的排名,取最后一个 rows标识行,range表述数据的在整个partition范围内的数据范围 rows必须跟在order by子句之后 over()开窗函数:分组,将前面的聚合函数传给over(),没有参数,就是所有,每一个name都有自己的组,就是表示当前开窗的窗口的大小范围 两个小技巧: 查询使用多个开窗函数over()时可以使用变量代替:window ...as... select * ,max(age) over x ,min(age) over x ,sum(age) over x from students window x as (partition by clazz order by age desc rows between 2 preceding and 2 following) 关联多张表的数据:with...as with s1 as (select * from score), s2 as (select * from students) select * from s1 left join s2 on s1.id=s2.id9、例题理解1
1)创建表
hive> create table business(name string,orderdate string,cost int)
> row format delimited fields terminated by ',';
2)上传表数据
dfs,2021-09-15,10 tgh,2021-09-14,26 itye,2021-08-12,15 dvc,2021-09-24,36 uyr,2021-04-03,65 oitt,2021-05-14,87 fkgi,2021-09-13,45 kifg,2021-09-05,100 kirif,2021-03-25,35 yfre,2021-03-14,64 fitut,2021-04-15,14 k9hi,2021-08-17,11 fgre,2021-03-18,64 hjtut,2021-04-19,14 hjhi,2021-08-23,11
查询在2019年9月份购买过的顾客及总人数 (使用substring取日期)
select name,count(*) over() from business where substring(orderdate,0,7)='2021-09' group by name;查询顾客的购买明细及其购买总额
over()开窗函数传入参数,就是按指定参数分组 select name,orderdate,cost,sum(cost) over(partition by name) from business;
查询顾客的购买明细及其月购买总额
select name,orderdate,cost,sum(cost) over(partition by name,month(orderdate)) from business;
上述场景,将每个 顾客的cost按照日期进行累加
指明窗口到当前行
select name,orderdate,cost,sum(cost) over(partition by name order by orderdate) from business
按人分开,cost相加可以指明到当前行,每个人的前一行和后一行cost相加
select name,orderdate,cost,sum(cost) over(partition by name order by orderdate rows between 1 PRECEDING and 1 FOLLOWING) from business ;
查询每个顾客上次的购买时间
select name,orderdate,cost,lag(orderdate,1) over(partition by name order by orderdate) from business;
查询前20%时间的订单信息
select name,orderdate,cost,ntile(5) over(order by orderdate) groupId from business; //t1表 select name,orderdate,cost from t1 where groupId=1; 整合后:20%就是分5份 select name,orderdate,cost from (select name,orderdate,cost,ntile(5) over(order by orderdate) groupId from business) t1 where groupId=1;10、例题理解2 range(区域)和rows(行)
1)建表
create table new_score(
id int
,score int
,clazz string
,department string
) row format delimited fields terminated by ",";
2)数据
224 80 class2 department2
225 85 class2 department2
.......
3)窗口帧格式
注意:
UNBOUNDED:无界限
CURRENT ROW:当前行
rows格式:按照行的记录取值(前2行后2行)
select *,max(score)over(partition by clazz,order by score rows between 2 preceding and
2 following )from new_score
range格式: 如果当前值在80,取值就会落在范围在80-2=78和80+2=82组件之内的行(全区partition)
select *,max(score)over(partition by clazz,order by score)range between 2 preceding and 2 following from new_score
总结就是:rows就是决定开窗的行数,range就是开窗数据的范围
select id
,score
,clazz
,department
,lag(id,2) over (partition by clazz order by score desc) as lag_num
,LEAD(id,2) over (partition by clazz order by score desc) as lead_num
,FIRST_VALUE(id) over (partition by clazz order by score desc) as first_v_num
,LAST_VALUE(id) over (partition by clazz order by score desc) as last_v_num
,NTILE(3) over (partition by clazz order by score desc) as ntile_num
from new_score;
id score clazz department lag_num lead_num first_v_num last_v_num ntile_num
114 94 class1 department1 NULL 213 114 213 1
214 94 class1 department2 NULL 211 114 213 1
213 94 class1 department2 114 115 114 213 1
211 93 class1 department2 214 212 114 115 1
115 93 class1 department1 213 215 114 115 2
212 83 class1 department2 211 112 114 212 2
215 82 class1 department2 115 113 114 215 2
112 80 class1 department1 212 216 114 112 2
113 74 class1 department1 215 111 114 216 3
216 74 class1 department2 112 NULL 114 216 3
111 69 class1 department1 113 NULL 114 111 3
221 99 class2 department2 NULL 225 221 221 1
122 86 class2 department1 NULL 224 221 122 1
225 85 class2 department2 221 123 221 225 1
224 80 class2 department2 122 222 221 224 2
123 78 class2 department1 225 121 221 222 2
222 78 class2 department2 224 223 221 222 2
121 74 class2 department1 123 124 221 223 3
223 74 class2 department2 222 NULL 221 223 3
124 70 class2 department1 121 NULL 221 124 3



