时间:20220122
preface:印象笔记,不是永恒的,博客也不是,但多备份一个平台,总是好的。
min、max:列最小最大
least、greatest:行(多个字段)的最小最大
-- 获取多列最大最小值
SELECT GREATEST(1,2,3,4); 结果:4
SELECT LEAST(1,2,3,4); 结果:1
-- 获取一列中最大最小值
SELECT MAX(col1);
SELECT MIN(col1);
类型转换
转为ascii
hex、unhex、decode:Select decode(unhex(hex(65)), 'US-ASCII');
参考:Convert Ascii value to Character in hive
encode(string src, string charset):
使用指定的字符集charset将字符串编码成二进制值,支持的字符集有:'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16',如果任一输入参数为NULL都将返回NULL
ascii('A'):返回字符串首个字符的 ASCII 数值(结果:65)
time相关
from_unixtime:
unixtime转为str_time
select from_unixtime(cast(1639817905488/1000 as bigint));
select from_unixtime(1639784618);
unix_timestamp:str_time+格式,转为unixtime。
select from_unixtime(1639784618), unix_timestamp('2021-12-18 07:43:38');
Where request_timestamp>unix_timestamp('2021-12-21 23:00:00')*1000
substr
参数:string,start(从1开始),len
select substr('2021-12-23 17:59:00', 1, 10);
2021-12-23
lifecycle:
drop table if exists xxx;
create table if not exists xxx () lifecycle 30 as
Select xxx from xx where xx;
丢掉生命周期:alter table xxxx disable lifecycle;
row_number
row_number() over() as id:生成唯一id
ROW_NUMBER() OVER(PARTITION BY COL1 ORDER BY COL2) AS COL3
两种及以上函数组合
like + concat:一个字段是否包含另外一个字段
where food_name not like concat('%', keyword, '%')
字符串拆分split
split(str, 正则表达式)
返回数组
regexp_extract:正则匹配
语法:regexp_extract(string subject,string pattern,int index)
index为
0:表示返回整个匹配到的字符串(完整匹配)
1:表示返回第一个捕获组(第一个括号)匹配到的内容
抽取中文:
select regexp_replace('2019这里china', '[^\\u4e00-\\u9fa5]+', ''); 结果:这里(注意:Unicode 范围要写成 \\u4e00-\\u9fa5;抽取中文即删掉非中文字符)
例子:
regexp_extract('IloveYou','I(.*?)(You)',1)
select regexp_extract("#35484#14_18537#6829#35137#15_18537#6918#", "#6829#35137#([0-9]+)_", 1) as c1, regexp_extract('IloveYou','I(.*?)(You)',1) as c2;
regexp_extract(tpp_bucket, "#6829#([0-9]+)#([0-9]+)_", 0) as n_bucket
select regexp_extract("#19_18537#0#243995#16_18537#", "18537#0#([0-9]+)#([0-9]+)_", 2) as c1;
不可:
原因:[13|17] 是字符类,只匹配单个字符 1、3、|、7,表达不了"13 或 17";"或"需要用分组 (13|17):
select regexp_extract('_18537#0#252249#17_18537#6867#33025#3_18537#7318', "18537#0#252249#(13|17)_", 1) as tmp;
COALESCE(REGEXP_EXTRACT(tolower(args),'rank_id(":"|=)([\da-z]+)',2),'') AS rank_id,
例子:hive中regexp_extract的用法总结_Accepted ALL lve-CSDN博客_regexp_extract()
regexp_replace正则替换
语法:regexp_replace(string A,string B,string C)
字符串A中的字符串B, 替换成字符串C
COALESCE(REGEXP_EXTRACT(tolower(args),'xxx(":"|=)([\da-z]+)',2),'') AS xxx
array_contains
group by
select userId, GROUP_CONCAT(siteId) siteIds
from xxx
group by xxx
Rank
RANK() OVER(PARTITION BY COL1 ORDER BY COL2) AS COL3
mod:
取余:select mod(29, 3)
like
if(xxx like "%#6829#35137#%", 1, 0) as isExp
rlike
from xxx and (xx REGEXP ".*#6829#35137#(11|12|13|14|15).*" or xxx REGEXP ".*#6829#32836#[1-5].*") — 注意:[11|12|13|14|15] 是字符类(只匹配单个字符),两位数的"或"要用分组 (11|12|...)
dt、ds
where ds=to_char(dateadd(to_date('${bizdate}','yyyymmdd'),-2,'dd'),'yyyymmdd')
where ds=MAX_PT('xxxxx')
分布函数(窗口函数)
PERCENT_RANK()
CUME_DIST()
文档:mysql 分布函数 PERCENT_RANK()、CUME_DIST()
Explode
LATERAL VIEW explode
select query, col from xxx LATERAL VIEW explode(split(predict," ")) t AS col where query='奶茶';
LATERAL VIEW posexplode:带有index的explode
select query, idx, col from xxx LATERAL VIEW posexplode(split(predict," ")) t AS idx, col where query='奶茶';
参考:算法人必懂的Hive知识-四道Hive面试&笔试题解析
LATERAL VIEW OUTER explode
为空,依然返回
json、array、map:复杂格式
get_json_object()
get_json_object(tag, '$.category')
get_json_object(被解析字段A,"$.[0].name") column_name
json_tuple
lateral view json_tuple(ss.col,'itemRateId','name','rating') rr as itemRateId,name,rating
{"itemRateId":"81980290","name":"需要餐具","rating":2,"ratingContent":""}
array
Array
Array
select explode(array('A','B','C'));
select tf.* from (select 0) t lateral view explode(array('A','B','C')) tf as col1;
map
str_to_map
select explode(map('A',10,'B',20,'C',30)) as (my_key,my_value);
select tf.* from (select 0) t lateral view explode(map('A',10,'B',20,'C',30)) tf as my_key,my_value;
json+array
单纯的get_json_object、json_tuple不可破,需要加上regexp_replace
不确定json里的数组多长,使用get_json_object(tag, '$.category[0]') as c不可
参考:hive如何解析json数组、如何在 Apache Hive 中解析 Json 数组
代码:
-- Parse a JSON array string into one row per element, then read fields
-- from each element with json_tuple. Steps, inside-out:
--   1. regexp_replace '},{' -> '};{'  : mark element boundaries with ';'
--      (a character that cannot appear between elements)
--   2. regexp_replace strips the outer '[' and ']'
--   3. split on ';' yields an array with one JSON object per entry
--   4. explode turns that array into rows; json_tuple extracts the fields
select json_tuple(json, 'website', 'name') as (website, name)
from
(
select explode(
split(
regexp_replace(
regexp_replace(
'[{"website":"https://blog.csdn.net/lz6363","name":"浮云"},{"website":"https://blog.csdn.net/lz6363/article/details/86606379","name":"hive中的开窗函数"},{"website":"https://blog.csdn.net/lz6363/article/details/82470278","name":"利用hive统计连续登陆的用户数"}]', '\}\,\{','};{'
),'\[|\]',''
),'\;'
)
) as json
) fuyun;
json+子json+array
{"entities": [{"entity_info": {}, "terms": [{"synonym": [], "tag": "食材", "word": "肉沫"}, {"synonym": [], "tag": "食材", "word": "茄子"}, {"synonym": [], "tag": "主体", "word": "饭"}], "entity_name": "肉沫茄子饭", "entity_type": "Food"}]}
+---------+-----+------+
| synonym | tag | word |
+---------+-----+------+
| 0 | 食材 | 肉沫 |
| 0 | 食材 | 茄子 |
| 0 | 主体 | 饭 |
+---------+-----+------+
-- Extract the nested "terms" JSON array out of item_understanding, then
-- explode it into rows and read (synonym, tag, word) from each element.
-- Inner query: fence the wanted sub-array with '-----' / '_____' markers,
-- regexp_extract the span between them, and rebuild a valid JSON array
-- string as _array.
-- Outer query: '[]' -> '0' (empty synonym lists would otherwise break the
-- bracket-stripping step), '}, {' -> '};{' to mark element boundaries,
-- strip '[' / ']', split on ';', explode to one JSON object per row.
-- NOTE(review): the derived tables have no aliases (the bare ')' before
-- ';'); Hive normally requires e.g. ') t1' — confirm this ran as-is.
select json_tuple(json, 'synonym', 'tag', 'word') as (synonym, tag, word)
from
(
select explode(
split(
regexp_replace(
regexp_replace(
regexp_replace(_array, '\[\]', '0'), '\}\, \{','};{'
),'\[|\]',''
),'\;'
)
) as json
from
(
select concat('[{"',
regexp_extract(
regexp_replace(
regexp_replace(item_understanding, '\[\{\"', '-----')
, '\"\}\]', '_____')
, '-----(synonym.*?)_____', 1)
, '"}]') as _array
from xxx
)
);
+------------+-----+------+
| food_id | tag | word |
+------------+-----+------+
| 1342782581 | 食材 | 肉沫 |
| 1342782581 | 食材 | 茄子 |
| 1342782581 | 主体 | 饭 |
+------------+-----+------+
-- Same sub-array extraction as above, but done with LATERAL VIEW so the
-- carried key column food_id stays on every exploded row.
-- Subquery a: rebuild the nested "terms" array as _array (marker fencing
-- with '-----' / '_____', then regexp_extract the span between them).
-- LATERAL VIEW: normalize _array ('[]' -> '0', '}, {' -> '};{', strip
-- brackets, split on ';') and explode to one JSON object per row as json.
-- Outer select: get_json_object pulls individual fields from each object.
-- NOTE(review): the outermost derived table has no alias before ';';
-- Hive normally requires one — confirm this ran as-is.
select food_id
, get_json_object(json, '$.tag') as tag
, get_json_object(json, '$.word') as word
from
(
select food_id, json
from
(
select food_id, concat('[{"',
regexp_extract(
regexp_replace(
regexp_replace(item_understanding, '\[\{\"', '-----')
, '\"\}\]', '_____')
, '-----(synonym.*?)_____', 1)
, '"}]') as _array
from xxx
)a
LATERAL VIEW explode(
split(
regexp_replace(
regexp_replace(
regexp_replace(_array, '\[\]', '0'), '\}\, \{','};{'
),'\[|\]',''
),'\;'
)
)t as json
);
UDTF
LATERAL VIEW udtf(expression) tableAlias AS columnAlias (',' columnAlias)*
聚合函数:
局部变量
set a=1;
博客:MySQL变量的使用 - Brambling - 博客园
-- Per (city, keyword): how many distinct queries fired, and how many of
-- those queries hit ad slot 10011 (slotId parsed out of record_list);
-- keep only keywords with at least 350 ad hits in the week.
select pagecityid, keyword,
count(DISTINCT queryid) as keywordTimes,
-- conditional distinct count: the CASE yields queryid only when the
-- slotId extracted from reserves["record_list"] is '10011', else NULL
-- (NULLs are ignored by COUNT)
count(DISTINCT case when regexp_extract(reserves["record_list"], ":slotId=(.*?):",1) ="10011" then queryid end) as keywordAsAdsWordTimes
from xxxx
where hp_cal_dt BETWEEN "2017-11-01" and "2017-11-07"
-- group by column names, not positions: positional GROUP BY is fragile
-- (silently regroups if the select list is reordered) and needs a
-- session flag in some Hive versions
group by pagecityid, keyword
having keywordAsAdsWordTimes>=350
参考:Hive中常见的字符串函数:Hive常用字符串函数 – 过往记忆



