时间:20220122
preface:印象笔记,不是永恒的,博客也不是,但多备份一个平台,总是好的。
min、max:列最小最大
least、greatest:行(多个字段)的最小最大
-- 获取多列最大最小值
SELECT GREATEST(1,2,3,4); 结果:4
SELECT LEAST(1,2,3,4); 结果:1
-- 获取一列中最大最小值
SELECT MAX(col1);
SELECT MIN(col1);
类型转换
转为ascii
hex、unhex、decode:Select decode(unhex(hex(65)), 'US-ASCII');
参考:Convert Ascii value to Character in hive
encode(string src, string charset):
使用指定的字符集charset将字符串编码成二进制值,支持的字符集有:'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16',如果任一输入参数为NULL都将返回NULL
ascii('A'):返回字符串首个字符的 ASCII 数值(结果:65)
time相关
from_unixtime:
unixtime转为str_time
select from_unixtime(cast(1639817905488/1000 as bigint));
select from_unixtime(1639784618);
unix_timestamp:str_time+格式,转为unixtime。
select from_unixtime(1639784618), unix_timestamp('2021-12-18 07:43:38');
Where request_timestamp>unix_timestamp('2021-12-21 23:00:00')*1000
substr
参数:string,start(从1开始),len
select substr('2021-12-23 17:59:00', 1, 10);
2021-12-23
lifecycle:
drop table if exists xxx;
create table if not exists xxx () lifecycle 30 as
Select xxx from xx where xx;
丢掉生命周期:alter table xxxx disable lifecycle;
row_number
row_number() over() as id:生成唯一id
ROW_NUMBER() OVER(PARTITION BY COL1 ORDER BY COL2) AS COL3
两种及以上函数组合
like + concat:一个字段是否包含另外一个字段
where food_name not like concat('%', keyword, '%')
字符串拆分split
split(str, 正则表达式)
返回数组
regexp_extract:正则匹配
语法:regexp_extract(string subject,string pattern,int index)
index为
0:表示返回整个匹配到的字符串(完整匹配)
1:表示返回第一个捕获组(第一个括号)匹配到的内容
抽取中文:
select regexp_replace('2019这里china', '[^\\u4e00-\\u9fa5]+', ''); 结果:这里(注意:Unicode 范围要写成 \\u4e00-\\u9fa5;抽取中文即删掉非中文字符)
例子:
regexp_extract('IloveYou','I(.*?)(You)',1)
select regexp_extract("#35484#14_18537#6829#35137#15_18537#6918#", "#6829#35137#([0-9]+)_", 1) as c1, regexp_extract('IloveYou','I(.*?)(You)',1) as c2;
regexp_extract(tpp_bucket, "#6829#([0-9]+)#([0-9]+)_", 0) as n_bucket
select regexp_extract("#19_18537#0#243995#16_18537#", "18537#0#([0-9]+)#([0-9]+)_", 2) as c1;
不可:
原因:[13|17] 是字符类,只匹配单个字符 1、3、|、7,表达不了"13 或 17";"或"需要用分组 (13|17):
select regexp_extract('_18537#0#252249#17_18537#6867#33025#3_18537#7318', "18537#0#252249#(13|17)_", 1) as tmp;
COALESCE(REGEXP_EXTRACT(tolower(args),'rank_id(":"|=)([\da-z]+)',2),'') AS rank_id,
例子:hive中regexp_extract的用法总结_Accepted ALL lve-CSDN博客_regexp_extract()
regexp_replace正则替换
语法:regexp_replace(string A,string B,string C)
字符串A中的字符串B, 替换成字符串C
COALESCE(REGEXP_EXTRACT(tolower(args),'xxx(":"|=)([\da-z]+)',2),'') AS xxx
array_contains
group by
select userId, GROUP_CONCAT(siteId) siteIds
from xxx
group by xxx
Rank
RANK() OVER(PARTITION BY COL1 ORDER BY COL2) AS COL3
mod:
取余:select mod(29, 3)
like
if(xxx like "%#6829#35137#%", 1, 0) as isExp
rlike
from xxx and (xx REGEXP ".*#6829#35137#(11|12|13|14|15).*" or xxx REGEXP ".*#6829#32836#[1-5].*") — 注意:[11|12|13|14|15] 是字符类(只匹配单个字符),两位数的"或"要用分组 (11|12|...)
dt、ds
where ds=to_char(dateadd(to_date('${bizdate}','yyyymmdd'),-2,'dd'),'yyyymmdd')
where ds=MAX_PT('xxxxx')
分布函数(窗口函数)
PERCENT_RANK()
CUME_DIST()
文档:mysql 分布函数 PERCENT_RANK()、CUME_DIST()
Explode
LATERAL VIEW explode
select query, col from xxx LATERAL VIEW explode(split(predict," ")) t AS col where query='奶茶';
LATERAL VIEW posexplode:带有index的explode
select query, idx, col from xxx LATERAL VIEW posexplode(split(predict," ")) t AS idx, col where query='奶茶';
参考:算法人必懂的Hive知识-四道Hive面试&笔试题解析
LATERAL VIEW OUTER explode
为空,依然返回
json、array、map:复杂格式
get_json_object()
get_json_object(tag, '$.category')
get_json_object(被解析字段A,"$.[0].name") column_name
json_tuple
lateral view json_tuple(ss.col,'itemRateId','name','rating') rr as itemRateId,name,rating
{"itemRateId":"81980290","name":"需要餐具","rating":2,"ratingContent":""}
array
Array
Array
select explode(array('A','B','C'));
select tf.* from (select 0) t lateral view explode(array('A','B','C')) tf as col1;
map
str_to_map
select explode(map('A',10,'B',20,'C',30)) as (my_key,my_value);
select tf.* from (select 0) t lateral view explode(map('A',10,'B',20,'C',30)) tf as my_key,my_value;
json+array
单纯的get_json_object、json_tuple不可破,需要加上regexp_replace
不确定json里的数组多长,使用get_json_object(tag, '$.category[0]') as c不可
参考:hive如何解析json数组、如何在 Apache Hive 中解析 Json 数组
代码:
-- Parse a JSON array string into one row per element, then read fields
-- from each element with json_tuple. Steps, inside-out:
--   1. regexp_replace '},{' -> '};{'  : mark element boundaries with ';'
--      (a character that cannot appear between elements)
--   2. regexp_replace strips the outer '[' and ']'
--   3. split on ';' yields an array with one JSON object per entry
--   4. explode turns that array into rows; json_tuple extracts the fields
select json_tuple(json, 'website', 'name') as (website, name)
from
(
select explode(
split(
regexp_replace(
regexp_replace(
'[{"website":"https://blog.csdn.net/lz6363","name":"浮云"},{"website":"https://blog.csdn.net/lz6363/article/details/86606379","name":"hive中的开窗函数"},{"website":"https://blog.csdn.net/lz6363/article/details/82470278","name":"利用hive统计连续登陆的用户数"}]', '\}\,\{','};{'
),'\[|\]',''
),'\;'
)
) as json
) fuyun;
json+子json+array
{"entities": [{"entity_info": {}, "terms": [{"synonym": [], "tag": "食材", "word": "肉沫"}, {"synonym": [], "tag": "食材", "word": "茄子"}, {"synonym": [], "tag": "主体", "word": "饭"}], "entity_name": "肉沫茄子饭", "entity_type": "Food"}]}
+---------+-----+------+
| synonym | tag | word |
+---------+-----+------+
| 0 | 食材 | 肉沫 |
| 0 | 食材 | 茄子 |
| 0 | 主体 | 饭 |
+---------+-----+------+
-- Extract the nested "terms" JSON array out of item_understanding, then
-- explode it into rows and read (synonym, tag, word) from each element.
-- Inner query: fence the wanted sub-array with '-----' / '_____' markers,
-- regexp_extract the span between them, and rebuild a valid JSON array
-- string as _array.
-- Outer query: '[]' -> '0' (empty synonym lists would otherwise break the
-- bracket-stripping step), '}, {' -> '};{' to mark element boundaries,
-- strip '[' / ']', split on ';', explode to one JSON object per row.
-- NOTE(review): the derived tables have no aliases (the bare ')' before
-- ';'); Hive normally requires e.g. ') t1' — confirm this ran as-is.
select json_tuple(json, 'synonym', 'tag', 'word') as (synonym, tag, word)
from
(
select explode(
split(
regexp_replace(
regexp_replace(
regexp_replace(_array, '\[\]', '0'), '\}\, \{','};{'
),'\[|\]',''
),'\;'
)
) as json
from
(
select concat('[{"',
regexp_extract(
regexp_replace(
regexp_replace(item_understanding, '\[\{\"', '-----')
, '\"\}\]', '_____')
, '-----(synonym.*?)_____', 1)
, '"}]') as _array
from xxx
)
);
+------------+-----+------+
| food_id | tag | word |
+------------+-----+------+
| 1342782581 | 食材 | 肉沫 |
| 1342782581 | 食材 | 茄子 |
| 1342782581 | 主体 | 饭 |
+------------+-----+------+
-- Same sub-array extraction as above, but done with LATERAL VIEW so the
-- carried key column food_id stays on every exploded row.
-- Subquery a: rebuild the nested "terms" array as _array (marker fencing
-- with '-----' / '_____', then regexp_extract the span between them).
-- LATERAL VIEW: normalize _array ('[]' -> '0', '}, {' -> '};{', strip
-- brackets, split on ';') and explode to one JSON object per row as json.
-- Outer select: get_json_object pulls individual fields from each object.
-- NOTE(review): the outermost derived table has no alias before ';';
-- Hive normally requires one — confirm this ran as-is.
select food_id
, get_json_object(json, '$.tag') as tag
, get_json_object(json, '$.word') as word
from
(
select food_id, json
from
(
select food_id, concat('[{"',
regexp_extract(
regexp_replace(
regexp_replace(item_understanding, '\[\{\"', '-----')
, '\"\}\]', '_____')
, '-----(synonym.*?)_____', 1)
, '"}]') as _array
from xxx
)a
LATERAL VIEW explode(
split(
regexp_replace(
regexp_replace(
regexp_replace(_array, '\[\]', '0'), '\}\, \{','};{'
),'\[|\]',''
),'\;'
)
)t as json
);
UDTF
LATERAL VIEW udtf(expression) tableAlias AS columnAlias (',' columnAlias)*
聚合函数:
局部变量
set a=1;
博客:MySQL变量的使用 - Brambling - 博客园
-- Per (city, keyword): how many distinct queries fired, and how many of
-- those queries hit ad slot 10011 (slotId parsed out of record_list);
-- keep only keywords with at least 350 ad hits in the week.
select pagecityid, keyword,
count(DISTINCT queryid) as keywordTimes,
-- conditional distinct count: the CASE yields queryid only when the
-- slotId extracted from reserves["record_list"] is '10011', else NULL
-- (NULLs are ignored by COUNT)
count(DISTINCT case when regexp_extract(reserves["record_list"], ":slotId=(.*?):",1) ="10011" then queryid end) as keywordAsAdsWordTimes
from xxxx
where hp_cal_dt BETWEEN "2017-11-01" and "2017-11-07"
-- group by column names, not positions: positional GROUP BY is fragile
-- (silently regroups if the select list is reordered) and needs a
-- session flag in some Hive versions
group by pagecityid, keyword
having keywordAsAdsWordTimes>=350
参考:Hive中常见的字符串函数:Hive常用字符串函数 – 过往记忆



