栏目分类:
子分类:
返回
名师互学网用户登录
快速导航关闭
当前搜索
当前分类
子分类
实用工具
热门搜索
名师互学网 > IT > 前沿技术 > 大数据 > 大数据系统

hive——常用

hive——常用

时间:20220122

preface:印象笔记,不是永恒的,博客也不是,但多备份一个平台,总是好的。

min、max:列最小最大

least、greatest:行(多个字段)的最小最大

-- 获取多列最大最小值

SELECT GREATEST(1,2,3,4);   结果:4

SELECT LEAST(1,2,3,4);   结果:1

-- 获取一列中最大最小值

SELECT MAX(col1);  

SELECT MIN(col1);

类型转换

转为ascii

hex、unhex、decode:Select decode(unhex(hex(65)), 'US-ASCII');

参考:Convert Ascii value to Character in hive

encode(string src, string charset):

使用指定的字符集charset将字符串编码成二进制值,支持的字符集有:'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16',如果任一输入参数为NULL都将返回NULL

ascii('A'):返回字符串首个字符的 ASCII 数值(结果:65)

time相关

from_unixtime:

unixtime转为str_time

select from_unixtime(cast(1639817905488/1000 as bigint));

select from_unixtime(1639784618);

unix_timestamp:str_time+格式,转为unixtime。

select from_unixtime(1639784618), unix_timestamp('2021-12-18 07:43:38');

Where request_timestamp>unix_timestamp('2021-12-21 23:00:00')*1000

substr

参数:string,start(从1开始),len

select substr('2021-12-23 17:59:00', 1, 10);

2021-12-23

lifecycle:

drop table if exists xxx;

create table if not exists xxx () lifecycle 30 as

Select xxx from xx where xx;

丢掉生命周期:alter table xxxx disable lifecycle;

row_number

row_number() over() as id:生成唯一id

ROW_NUMBER() OVER(PARTITION BY COL1 ORDER BY COL2) AS COL3

两种及以上函数组合

like + concat:一个字段是否包含另外一个字段

where food_name not like concat('%', keyword, '%')

字符串拆分split

split(str, 正则表达式)

返回数组

regexp_extract:正则匹配

语法:regexp_extract(string subject,string pattern,int index)

index为

0:返回整个正则表达式匹配到的完整字符串

1:返回第一个捕获组(第一对括号)匹配到的内容,2 为第二个捕获组,依此类推

抽取中文:

select regexp_replace('2019这里china', '[^\\u4e00-\\u9fa5]+', '');  -- 结果:这里。用字符类取反 [^...] 删掉非中文字符;注意 Hive 字符串里 \u 要写成 \\u

例子:

regexp_extract('IloveYou','I(.*?)(You)',1)

select regexp_extract("#35484#14_18537#6829#35137#15_18537#6918#", "#6829#35137#([0-9]+)_", 1) as c1, regexp_extract('IloveYou','I(.*?)(You)',1) as c2;

regexp_extract(tpp_bucket, "#6829#([0-9]+)#([0-9]+)_", 0) as n_bucket

select regexp_extract("#19_18537#0#243995#16_18537#", "18537#0#([0-9]+)#([0-9]+)_", 2) as c1;

不可:

原因:方括号 [] 是字符类,按单个字符匹配——[13|17] 只能匹配 '1'、'3'、'|'、'7' 中的一个字符,不能匹配 "13"、"17" 这样的两位序列;多位数的"或"要用分组 (13|17):

select regexp_extract('_18537#0#252249#17_18537#6867#33025#3_18537#7318', "18537#0#252249#([13|17])_", 1) as tmp;  -- 不可,应把模式改为 "18537#0#252249#(13|17)_"

COALESCE(REGEXP_EXTRACT(tolower(args),'rank_id(":"|=)([\da-z]+)',2),'') AS rank_id,

例子:hive中regexp_extract的用法总结_Accepted ALL lve-CSDN博客_regexp_extract()

regexp_replace正则替换

语法:regexp_replace(string A,string B,string C)

字符串A中的字符串B, 替换成字符串C

COALESCE(REGEXP_EXTRACT(tolower(args),'xxx(":"|=)([\da-z]+)',2),'') AS xxx

array_contains

group by 

select userId, concat_ws(',', collect_list(siteId)) as siteIds  -- 注:GROUP_CONCAT 是 MySQL 函数,Hive 中用 collect_list + concat_ws 实现

from xxx

group by xxx

Rank

RANK() OVER(PARTITION BY COL1 ORDER BY COL2) AS COL3

mod:

取余:select mod(29, 3)

like 

if(xxx like "%#6829#35137#%", 1, 0) as isExp

rlike

from xxx and (xx REGEXP ".*#6829#35137#(11|12|13|14|15).*" or xxx REGEXP ".*#6829#32836#[1-5].*")  -- 两位数的"或"必须用分组 (11|12|...),字符类 [11|12|...] 只按单字符匹配

dt、ds

where ds=to_char(dateadd(to_date('${bizdate}','yyyymmdd'),-2,'dd'),'yyyymmdd')

where ds=MAX_PT('xxxxx')

分布式函数

PERCENT_RANK()

CUME_DIST()

文档:mysql 分布函数 PERCENT_RANK()、CUME_DIST()

Explode

LATERAL VIEW explode

select query, col from xxx LATERAL VIEW explode(split(predict," ")) t AS col where query='奶茶';

LATERAL VIEW posexplode:带有index的explode

select query, idx, col from xxx LATERAL VIEW posexplode(split(predict," ")) t AS idx, col where query='奶茶';

参考:算法人必懂的Hive知识-四道Hive面试&笔试题解析

LATERAL VIEW OUTER explode

为空,依然返回

json、array、map:复杂格式

get_json_object()

get_json_object(tag, '$.category')

get_json_object(被解析字段A,"$.[0].name") column_name

json_tuple

lateral view json_tuple(ss.col,'itemRateId','name','rating') rr as itemRateId,name,rating

{"itemRateId":"81980290","name":"需要餐具","rating":2,"ratingContent":""}

array

Array

Array

select explode(array('A','B','C'));

select tf.* from (select 0) t lateral view explode(array('A','B','C')) tf as col1;

map

str_to_map

select explode(map('A',10,'B',20,'C',30)) as (my_key,my_value);

select tf.* from (select 0) t lateral view explode(map('A',10,'B',20,'C',30)) tf as my_key,my_value;

json+array

单纯的get_json_object、json_tuple不可破,需要加上regexp_replace

不确定json里的数组多长,使用get_json_object(tag, '$.category[0]') as c不可

参考:hive如何解析json数组、如何在 Apache Hive 中解析 Json 数组

代码:

-- Parse a JSON-array string in Hive (plain get_json_object/json_tuple cannot
-- iterate an array of unknown length):
--   1. regexp_replace "},{"  -> "};{"  so the object separator is distinct
--   2. regexp_replace strips the outer [ ]
--   3. split on ";" + explode => one JSON object per row
--   4. json_tuple pulls the wanted fields out of each object
select json_tuple(json, 'website', 'name') as (website, name)
from 
(
    select explode(
        split(
            regexp_replace(
                regexp_replace(
                '[{"website":"https://blog.csdn.net/lz6363","name":"浮云"},{"website":"https://blog.csdn.net/lz6363/article/details/86606379","name":"hive中的开窗函数"},{"website":"https://blog.csdn.net/lz6363/article/details/82470278","name":"利用hive统计连续登陆的用户数"}]', '\}\,\{','};{'
                ),'\[|\]',''
            ),'\;'
        )
    ) as json
) fuyun;

json+子json+array

{"entities": [{"entity_info": {}, "terms": [{"synonym": [], "tag": "食材", "word": "肉沫"}, {"synonym": [], "tag": "食材", "word": "茄子"}, {"synonym": [], "tag": "主体", "word": "饭"}], "entity_name": "肉沫茄子饭", "entity_type": "Food"}]}
+---------+-----+------+
| synonym | tag | word |
+---------+-----+------+
| 0       | 食材 | 肉沫 |
| 0       | 食材 | 茄子 |
| 0       | 主体 | 饭  |
+---------+-----+------+

-- Flatten a nested JSON structure (json -> sub-json -> array of term objects).
-- Inner query : cut the "terms" array out of item_understanding by marking its
--               [{" / "}] boundaries with sentinel strings, then re-wrap the
--               extracted piece as a standalone JSON-array string (_array).
-- Middle query: turn _array into one JSON object per row — empty arrays []
--               become the 0 placeholder, "}, {" becomes "};{" so splitting
--               on ";" keeps each object intact, then explode.
-- Outer query : json_tuple extracts synonym/tag/word from each object.
-- Fix vs. original: Hive requires an alias on every derived table in FROM;
-- both subqueries below were unnamed and the statement would not parse.
select json_tuple(json, 'synonym', 'tag', 'word') as (synonym, tag, word)
from 
(
    select explode(
            split(
                regexp_replace(
                    regexp_replace(
                    regexp_replace(_array, '\[\]', '0'), '\}\, \{','};{'
                    ),'\[|\]',''
                ),'\;'
            )
        ) as json
    from
    (
        select concat('[{"',
            regexp_extract(
                regexp_replace(
                    regexp_replace(item_understanding, '\[\{\"', '-----')
                , '\"\}\]', '_____')
            , '-----(synonym.*?)_____', 1) 
        , '"}]') as _array
        from xxx
    ) src
) flattened;

+------------+-----+------+

| food_id    | tag | word |

+------------+-----+------+

| 1342782581 | 食材 | 肉沫 |

| 1342782581 | 食材 | 茄子 |

| 1342782581 | 主体 | 饭  |

+------------+-----+------+

-- Same nested-JSON flattening as above, but LATERAL VIEW explode is used
-- instead of a bare explode() so food_id can be carried alongside each
-- exploded term object; get_json_object then reads tag/word per row.
-- Fix vs. original: the outermost derived table in FROM had no alias —
-- Hive rejects unnamed FROM-subqueries — so ") exploded" was added.
select food_id
    , get_json_object(json, '$.tag') as tag
    , get_json_object(json, '$.word') as word
from 
(
    select food_id, json
    from
    (
        -- carve the terms array out of item_understanding (see note above
        -- the previous query for the sentinel-marker technique)
        select food_id, concat('[{"',
            regexp_extract(
                regexp_replace(
                    regexp_replace(item_understanding, '\[\{\"', '-----')
                , '\"\}\]', '_____')
            , '-----(synonym.*?)_____', 1) 
        , '"}]') as _array
        from xxx 
    )a
    -- one JSON object per row: [] -> 0, "}, {" -> "};{", strip [ ], split on ";"
    LATERAL VIEW explode(
            split(
                regexp_replace(
                    regexp_replace(
                    regexp_replace(_array, '\[\]', '0'), '\}\, \{','};{'
                    ),'\[|\]',''
                ),'\;'
            )
        )t as json
) exploded;

UDTF

LATERAL VIEW udtf(expression) tableAlias AS columnAlias (',' columnAlias)*

聚合函数:

局部变量

set a=1;

博客:MySQL变量的使用 - Brambling - 博客园

select pagecityid, keyword,

        count(DISTINCT queryid) as keywordTimes,

        count(DISTINCT case when regexp_extract(reserves["record_list"], ":slotId=(.*?):",1) ="10011" then queryid end) as keywordAsAdsWordTimes

from xxxx

where hp_cal_dt BETWEEN "2017-11-01" and "2017-11-07"

group by 1,2

having keywordAsAdsWordTimes>=350

参考:Hive中常见的字符串函数:Hive常用字符串函数 – 过往记忆

转载请注明:文章转载自 www.mshxw.com
本文地址:https://www.mshxw.com/it/710291.html
我们一直用心在做
关于我们 文章归档 网站地图 联系我们

版权所有 (c)2021-2022 MSHXW.COM

ICP备案号:晋ICP备2021003244-6号