Hive_大数据系统

Hive##

**
作为第一篇博客,本章主要回顾学习的hive相关知识—hiveQL

-- Create a table backed by comma-delimited text, one row per line.
create table tabelName(order_id String,user_id String)
row format delimited fields terminated by ','
lines terminated by '\n';

-- Load a local file into the table; OVERWRITE replaces the table's current data.
load data local inpath '/home/order_id.csv' overwrite into table orders;

-- Drop the table if it already exists.
drop table if exists orders;
-- Create an external table whose data lives at the given HDFS location.
create external table orders (order_id String)
row format delimited fields terminated by ','
lines terminated by '\n'
location 'hdfs:///data';
-- Create a partitioned table. Every partition column needs a type and must
-- not repeat a regular column; list several columns for multi-level
-- partitions, e.g. partitioned by(dt String,city String).
create table orders(order_id String)
partitioned by(dt String);

-- Load a local file into one partition (omit LOCAL when the source is on HDFS).
-- Loading a local file copies it; loading from HDFS moves it.
load data local inpath '/home/orders.csv' into table orders partition (dt='2018');
-- Insert a single row (replace the placeholders with real values).
insert into table t1 values("…","…");
-- Insert many rows from a query: INTO appends to the table ...
insert into table t1 select * from orders;
-- ... while OVERWRITE replaces the table's current contents.
insert overwrite table t1 select * from orders;
-- Export query results to an HDFS directory as comma-delimited text.
insert overwrite directory '/root/data'
row format delimited fields terminated by ','
lines terminated by '\n'
select * from orders;
-- Export query results to the local disk: same statement plus the LOCAL keyword.
-- General form: insert overwrite local directory '<path>' select ...
insert overwrite local directory '/root'
row format delimited fields terminated by ','
lines terminated by '\n'
select * from orders;
-- Hive can store a table as text, SequenceFile or Parquet. Pick exactly one of:
--   stored as textfile | stored as sequencefile | stored as parquetfile
create table orders (order_id String,User_id String)
row format delimited fields terminated by ','
stored as textfile;
-- Fill a SequenceFile table by selecting from the text table.
insert into table tablename
select * from orders;
-- DATE column type, for values written like 2021-09-14 (yyyy-MM-dd).
create table orders (order_id String,birthday date)
row format delimited fields terminated by ',';
-- Columns can be declared as arrays; ARRAY requires an element type.
create table orders (order_id String,user_id array<String>)
row format delimited fields terminated by ',';
-- Columns can be declared as maps; MAP requires key and value types.
-- Sample rows:
-- 1,zhangsan,father:xiaoming#mother:xiaohuang#brother:xiaoxu,28
-- 2,lisi,father:mayun#mother:huangyi#brother:guanyu,22
-- NOTE(review): the sample rows have four fields while this schema declares
-- only two columns — confirm the intended column list.
create table orders (order_id String,user_id map<String,String>)
row format delimited fields terminated by ','
collection items terminated by '#'
map keys terminated by ':';
-- Rename a table.
alter table t1 rename to t2;
-- Rename a partition.
alter table t1_partition
partition(department='xiangsheng',sex='male',howold=20)
rename to partition(department='1',sex='1',howold=20);
-- Add a partition.
alter table t1 add partition (department='xiangsheng');
-- Drop a partition.
alter table t1 drop partition(department ='xiangsheng');
-- Rename a column and set its type; FIRST moves it to the first position.
alter table t1 change col_old_name col_new_name col_new_type first;
-- Add columns / replace the whole column list.
alter table t1 add columns (sex string,addr string);
alter table t1 replace columns (id string,age int,price float);

– 为什么where必须写在group by的前面，为什么group by后面的条件只能用having
– 因为，where是用于在真正执行查询逻辑之前过滤数据用的
– having是对group by聚合之后的结果进行再过滤；

– 基本模式
select * from 表 join on where group by having
– 常用关键词
select drop alter insert create

-- Common built-in functions
-- 1. Type cast
select cast('5' as Int) as num from orders;
-- 2. Math functions
select greatest(3,5,6) from dual; -- 6 (frequently used)
-- greatest() compares values within one row (e.g. the largest of three salary
-- columns); max()/min() work down a column instead.
select greatest(cast(s1 as double),cast(s2 as double),cast(s3 as double)) from t_fun2;

结果：
+---------+--+
| _c0 |
+---------+--+
| 2000.0 |
| 9800.0 |
+---------+--+
-- least() returns the smallest value within a row, mirroring greatest().
select least(3,5,6) from dual;    -- 3
select greatest(3,5,6) from dual; -- 6
select round(5.4) from dual;      -- 5
select round(5.1345,3) from dual; -- 5.135
select ceil(5.4) from dual;       -- 6
select ceiling(5.4) from dual;    -- 6 (ceiling is a synonym of ceil)
select floor(5.4) from dual;      -- 5
select abs(-5.4) from dual;       -- 5.4

-- 3. String functions
-- substr(str, start, len) (same as substring) extracts len characters
-- starting at position start; useful for matching or fuzzy lookups.
select substr('abcdefs',2,3)   -- 'bcd' (positions are 1-based)
from orders;
-- concat() cannot take a separator; concat_ws ("concat with separator") takes
-- the separator as its first argument.
select concat('String_A','String_B') from orders;
select concat_ws(',','String_A','String_B') from orders; -- String_A,String_B

-- Length of a string: length(string A)
select length('abcdefs');          -- 7
-- size() is not a string function: it takes a map or an array.
select size(split('a,b,c',','));   -- 3
-- split(string str, string pat) splits str around the regex pat and returns an
-- array; regex metacharacters such as . and ? must be escaped.
select split('a.b.c','\\.');       -- ["a","b","c"]
-- Case conversion
select upper('abc'),lower('ABC');  -- ABC  abc
-- 4. Date/time functions
-- Current Unix timestamp, in seconds (not milliseconds).
select unix_timestamp();
-- Unix timestamp -> string: from_unixtime(bigint unixtime[, string format])
select from_unixtime(unix_timestamp());
select from_unixtime(unix_timestamp(),'yyyy/MM/dd HH:mm:ss');
-- String -> Unix timestamp: unix_timestamp(string date, string pattern)
select unix_timestamp('2017-08-10 17:50:30');
select unix_timestamp('2017/08/10 17:50:30','yyyy/MM/dd HH:mm:ss');
-- Convert a timestamp string to its date part.
select to_date('2017-09-17 16:58:32');

-- 5. Table-generating functions
-- 5.1 explode() turns each element of an array into its own row.

-- order_value is the comma-separated column being exploded.
select user_id,order_value,order_id
from lie_col
lateral view explode(split(order_value,',')) num as order_id
limit 10;
id name subject
1 zhangsan 化学:物理:数学:语文
2 lisi 化学:数学:生物:生理:卫生
3 wangwu 化学:语文:英语:体育:生物

-- The exploded rows can be de-duplicated afterwards with DISTINCT.
-- Assumes subject is an array column — TODO confirm.
select explode(subject) from orders;

结果:
化学
物理
数学
语文
化学
数学
生物
-- 5.2 LATERAL VIEW pairs each source row with the rows explode() generates from it.
select id,name,tmp.sub
from orders
lateral view explode(subject) tmp as sub;

结果:
id name tmp.sub
1 zhangsan 化学
1 zhangsan 物理
1 zhangsan 数学
1 zhangsan 语文
2 lisi 化学
2 lisi 数学
2 lisi 生物

理解： lateral view 相当于两个表在join
左表：是原表
右表：是explode(某个集合字段)之后产生的表
而且：这个join只在同一行的数据间进行
那样，可以方便做更多的查询：
比如，查询选修了生物课的同学
-- Students who take biology: explode the subjects, then filter the rows.
select a.id,a.name,a.sub from
(select id,name,tmp.sub as sub
from t_stu_subject
lateral view explode(subjects) tmp as sub) a
where sub='生物';
-- 5.3 collect_list / collect_set: gather a group's values into one array,
-- joined here into a single comma-separated string per user.
select user_id,
concat_ws(',',collect_list(order_id)) as order_value
from col_lie
group by user_id
limit 10;
collect_list与collect_set
它们都是将分组中的某列转为一个数组返回，不同的是collect_list不去重
而collect_set去重。

-- 6. Collection functions
-- array_contains(Array, value) is used as a filter condition.
select *
from login
where dt='20130101'
and !array_contains(split('3.1,3.2,4.0,5.2',','),ver);

-- The same filter written with explicit comparisons:
-- where dt='20130101'
-- and (ver !='3.1' and ver !='3.2'
-- and ver != '4.0'
-- and ver != '5.2');
– sort_array不支持倒序排列,常与collect_list连用,group by连用
sort_array(Array) 返回排序后的数组
sort_array(collect_list(play_duration_ms))

– size(Array) 返回一个int值
– size(Map) 返回一个int值
– map_keys(Map) 返回一个数组
– map_values(Map) 返回一个数组

– 7.条件控制函数
– 1.case when
CASE [ expression ]
WHEN condition1 THEN result1
WHEN condition2 THEN result2
ELSE result
END
示例:
-- Label each user by age bracket.
select id,name,
case
when age<28 then 'youngth'
when age>27 and age<40 then 'zhongnian'
else 'old'
end
from t_user;

-- 2. if
-- if() is similar to CASE: both map the value of a single column.
-- Syntax: if(boolean testCondition, T valueTrue, T valueFalseOrNull)
-- Returns valueTrue when testCondition is TRUE, otherwise valueFalseOrNull.
select id ,if(age >25,'working','worked') from orders;

-- 8. JSON parsing
-- json_tuple() processes JSON data: the quoted arguments are keys in the JSON,
-- and the AS (...) list names the resulting columns in the same order.
-- rating_json is the table loaded from the JSON file.
select
json_tuple(json,'movie','rate','time','userid')
as (movie_id,rate,time,user_id)
from rating_json limit 10;

– url解析函数
parse_url_tuple()

– 9.分析函数 row_number() over()----用于分组topN 分组查询
需求:
有如下数据,实现查询每种性别中年龄最大的两条数据,分组查询
1,18,a,male
2,19,b,male
3,22,c,female
4,16,d,female
5,30,e,male
6,26,f,female
实现:
使用row_number() over()函数,对性别进行分组,对年龄进行倒序排列,
-- Two oldest rows per sex: number the rows within each sex by descending age,
-- then keep the first two. The alias is rn rather than rank so it does not
-- shadow the rank() function name.
select id, age, name, sex from
(select id, age, name, sex,
row_number() over(partition by sex order by age desc) as rn
from orders) tmp
where rn <= 2;

10.开窗函数只有over(), 但是over经常与row_number rank等分析函数使用,很少单独使用
基本格式:分析函数 over(partition by 列名 order by 列名 rows between 开始位置 and 结束位置)
与窗口函数over()一起使用的分析函数有如下几类:
a.聚合类(常用)
avg()、sum()、max()、min(),count(),first_value(),last_value()
count() over(partition by … order by …)：求分组后的总数。
　　max() over(partition by … order by …)：求分组后的最大值。
　　min() over(partition by … order by …)：求分组后的最小值。
　　avg() over(partition by … order by …)：求分组后的平均值。
　　lag() over(partition by … order by …)：取出前n行数据。　　
　　lead() over(partition by … order by …)：取出后n行数据。

first_value() over()和last_value() over()的使用,分别求出第一个和最后一个成绩

-- First score per class in descending score order.
select t.name,t.class,t.sroce,first_value(t.sroce) over(partition by t.class order by t.sroce desc) mm from T2_TEMP t;
-- Last score per class. With ORDER BY, the default window frame ends at the
-- current row, so last_value() would return the current row's own score; the
-- explicit frame extends the window to the end of the partition.
select t.name,t.class,t.sroce,last_value(t.sroce) over(partition by t.class order by t.sroce desc rows between unbounded preceding and unbounded following) mm from T2_TEMP t;
b.排名类(最常用)
row_number() --按照值排序时产生一个自增编号，不会重复（如：1、2、3、4、5、6）
rank() --按照值排序时产生一个自增编号，值相等时会重复，会产生空位（如：1、2、3、3、3、6）
dense_rank() --按照值排序时产生一个自增编号，值相等时会重复，不会产生空位（如：1、2、3、3、3、4）
c.其他类
lag --(列名,往前的行数,[行数为null时的默认值，不指定为null])，可以计算用户上次购买时间，或者用户下次购买时间。
lead --(列名,往后的行数,[行数为null时的默认值，不指定为null])
ntile(n) --把有序分区中的行分发到指定数据的组中，各个组有编号，编号从1开始，对于每一行，ntile返回此行所属的组的编号

11.partition by 和 group by 的区别:
partition by 不改变数据行数，原来多少行还是多少行，
group by 改变行数，只保留了group by 之后的结果,一般和聚合函数一起使用例如max、min、sum、avg、count等一块用
在求第一名成绩的时候，不能用row_number()，
因为如果同班有两个并列第一，row_number()只返回一个结果。

transform案例:

-- 1. Load rating.json into a raw single-column Hive table, rat_json.
create table rat_json(line string) row format delimited;
load data local inpath '/home/hadoop/rating.json' into table rat_json;

-- 2. Parse fields out of each JSON line and write them into the new table t_rating.
-- NOTE(review): the original note mentions four fields, but only movie and
-- rate are extracted here — confirm against t_rating's columns.
insert overwrite table t_rating
select get_json_object(line,'$.movie') as movie,get_json_object(line,'$.rate') as rate from rat_json;

Hive

大数据系统相关栏目本月热门文章