- Type conversion functions:
select cast(12.35 as int);
select cast('20190607' as int);
select cast('2020-1-15' as date);
insert overwrite table t1 select tid, cast(tage as int) from t2;
- Row-to-column conversion:
concat(str1,str2): concatenates the given strings/columns; any separator has to be written out by hand
concat_ws(sep,str1,str2): concatenates the strings with the given separator between them (see the sketch after this list)
collect_set(col): de-duplicates and aggregates a column's values into an array-type field
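A quick constant-literal sketch of the difference between concat and concat_ws (recent Hive versions allow SELECT without a FROM clause; expected results are in the comments):
select concat('a', '-', 'b');          -- a-b : the separator is written by hand
select concat_ws('|', 'a', 'b', 'c');  -- a|b|c : the separator is inserted between every argument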
create table emp(
deptno int,
ename string
)row format delimited fields terminated by ',';
load data local inpath '/export/data/hivedata/emp.data' into table emp;
--perform the conversion
select collect_set(ename) from emp;--collects every ename value into one array
select deptno, collect_set(ename) from emp group by deptno;
select deptno, concat_ws('|', collect_set(ename)) from emp group by deptno;
Explode function: select explode(names) from emp2; --turns every array element into a row of its own
Lateral view: lateral view joins the exploded rows back to the original columns
select deptno ,name from emp2 lateral view explode(names) tmp_tb as name;
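The emp2 table used above is assumed to hold an array&lt;string&gt; column named names; a minimal sketch of such a table, for reference only:
-- hypothetical emp2 backing the two queries above
create table emp2(
deptno int,
names array<string>
) row format delimited fields terminated by '\t'
collection items terminated by ',';
-- explode(names) produces one row per array element;
-- the lateral view joins those rows back to the remaining columns of emp2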
3. reflect function: allows SQL to call static Java methods
select reflect("java.lang.Math","max",col,col2) from test25;
select replace(reflect("java.util.UUID","randomUUID"),'-','');
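A constant-argument sketch of reflect (any public static method on the classpath can be called this way):
select reflect("java.lang.Math", "max", 3, 7);   -- returns 7
select reflect("java.lang.Math", "abs", -10);    -- returns 10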
- Window functions (analytic functions):
4.1 row_number, rank, dense_rank: ranking within groups
Data preparation:
cookie1,2018-04-10,1
cookie1,2018-04-11,5
cookie1,2018-04-12,7
cookie1,2018-04-13,3
cookie1,2018-04-14,2
cookie1,2018-04-15,4
cookie1,2018-04-16,4
cookie2,2018-04-10,2
cookie2,2018-04-11,3
cookie2,2018-04-12,5
cookie2,2018-04-13,6
cookie2,2018-04-14,3
cookie2,2018-04-15,9
cookie2,2018-04-16,7
create table itcast
(
cookid string,
creatime string,
pv int
) row format delimited fields terminated by ',';
load data local inpath '/export/data/hivedata/kaichuang.data' into table itcast;
select *
from itcast;
Rank each cookie's rows by page views:
select * from itcast distribute by cookid sort by pv desc;--distribute by hashes cookid to reducers; sort by only orders rows within each reducer, not globally
select *, row_number() over (partition by cookid order by pv desc) as rk from itcast;--consecutive numbers, e.g. 1 2 3 4 5 6 7 for cookie1
select *, rank() over (partition by cookid order by pv desc) from itcast;--ties share a rank and the next rank is skipped, e.g. 1 2 3 3 5 6 7
select *, dense_rank() over (partition by cookid order by pv desc) from itcast;--ties share a rank with no gaps, e.g. 1 2 3 3 4 5 6
Query the top three rows (by pv) for each cookie:
select * from (select *, dense_rank() over (partition by cookid order by pv desc) as rk from itcast) t where t.rk < 4;
4.2 SUM, AVG, MIN, MAX
select *, sum(pv) over (partition by cookid order by creatime) as sum_pv from itcast;--running total from the start of the partition to the current row
select *, sum(pv) over (partition by cookid order by creatime rows between 3 preceding and current row)
as sum_pv from itcast;--sum of the 3 rows before the current row plus the current row; with fewer than 3 preceding rows, sum whatever is available
select *, sum(pv) over (partition by cookid order by creatime rows between 3 preceding and 1 following)
as sum_pv from itcast;--3 rows before through 1 row after the current row
select *, sum(pv) over (partition by cookid order by creatime rows between current row and unbounded following)
as sum_pv from itcast;--from the current row to the end of the partition
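The same window and frame syntax applies to the other aggregates; a sketch against the itcast table above:
select *,
avg(pv) over (partition by cookid order by creatime rows between 3 preceding and current row) as avg_pv,
min(pv) over (partition by cookid) as min_pv,  -- minimum over the whole partition
max(pv) over (partition by cookid) as max_pv   -- maximum over the whole partition
from itcast;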
- User-defined functions:
5.1 Categories: UDF, one row in / one row out (e.g. lower, upper); UDAF, aggregate functions, many rows in / one row out (e.g. count, max); UDTF, one row in / many rows out (e.g. explode() with lateral view)
5.2 Writing a UDF: extend org.apache.hadoop.hive.ql.exec.UDF; implement an evaluate method; evaluate may be overloaded
Note: a UDF must have a return type and may return null, but the return type cannot be void; Hadoop writable types such as Text/LongWritable are commonly used, plain Java types are not recommended
Code:
Add the Maven dependencies:
<dependency>
    <groupId>org.apache.hive</groupId>
    <artifactId>hive-exec</artifactId>
    <version>2.1.0</version>
</dependency>
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-common</artifactId>
    <version>2.7.5</version>
</dependency>
Write the code:
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

public class MyUdf extends UDF {
    //convert the input line to lower case; null or empty input yields null
    public Text evaluate(Text line) {
        if (line != null && line.getLength() > 0) {
            String newline = line.toString().toLowerCase();
            return new Text(newline);
        } else {
            return null;
        }
    }
}
Package the project and upload the jar to /export/server/hive-2.1.0/lib/.
Add the jar in the Hive client:
add jar /export/server/hive-2.1.0/lib/<jar name>
Example:
add jar /export/server/hive-2.1.0/lib/day18_hive-1.0-SNAPSHOT.jar;
Associate a function name with the custom class (a temporary function only works in the current session; after you exit, it no longer exists):
create temporary function my_lower as 'cn.hlzq.hive.udf.MyUdf';
Invoke it:
select my_lower("HELLijJJ");--returns hellijjj
Permanent function: the custom function jar has to live on HDFS.
Upload the jar to HDFS:
hadoop fs -put day18_hive-1.0-SNAPSHOT.jar /hive_func
Create the permanent function:
create function my_lower as 'cn.hlzq.hive.udf.MyUdf' using jar 'hdfs://192.168.88.161:8020/hive_func/day18_hive-1.0-SNAPSHOT.jar';
Drop a custom function in Hive:
drop [temporary] function if exists <function name>;
5.3 Writing a UDTF (one row in, many rows out):
package cn.hlzq.hive.udtf;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import java.util.ArrayList;
import java.util.List;
public class MyUDTF extends GenericUDTF {
private final transient Object[] forwardListObj = new Object[1];//reusable one-column output row
@Override
public StructObjectInspector initialize(StructObjectInspector argOIs) throws UDFArgumentException {
//output column names
List<String> fieldNames = new ArrayList<>();
//set the column name
fieldNames.add("column_01");
List<ObjectInspector> fieldOIs = new ArrayList<>();//inspectors describing each output column's value type
//set the value type of the output column
fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
}
@Override
public void process(Object[] objects) throws HiveException {
//1: get the raw input string (first argument)
String args = objects[0].toString();
//2: get the second argument, used here as the separator
String splitKey = objects[1].toString();
//3: split the raw string on the separator
String[] fields = args.split(splitKey);
//4: iterate over the pieces and emit each one as a row
for (String field : fields) {
//put the current piece into the output object array
forwardListObj[0] = field;
//emit one output row
forward(forwardListObj);
}
}
@Override
public void close() throws HiveException {
}
}
Package and upload the jar, then register it the same way as the UDF.
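Registration and invocation follow the same pattern as the UDF above; a sketch (the function name my_split is made up here, the class and jar are the ones from this example):
add jar /export/server/hive-2.1.0/lib/day18_hive-1.0-SNAPSHOT.jar;
create temporary function my_split as 'cn.hlzq.hive.udtf.MyUDTF';
-- first argument: the string to split; second argument: the separator
select my_split('hadoop,hive,spark', ',');
-- or joined back to a table (some_table and col are placeholders):
-- select t.word from some_table lateral view my_split(col, ',') t as word;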
- Hive data compression (saves network bandwidth)
Intermediate (map output) compression:
-- 1) enable compression of Hive intermediate data
set hive.exec.compress.intermediate=true;
-- 2) enable compression of MapReduce map output
set mapreduce.map.output.compress=true;
-- 3) set the codec for map output compression
set mapreduce.map.output.compress.codec=org.apache.hadoop.io.compress.SnappyCodec;
Final output compression:
-- 1) enable compression of Hive final output
set hive.exec.compress.output=true;
-- 2) enable compression of MapReduce final output
set mapreduce.output.fileoutputformat.compress=true;
-- 3) set the codec for final output compression
set mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.SnappyCodec;
-- 4) use BLOCK compression for the final output
set mapreduce.output.fileoutputformat.compress.type=BLOCK;
-- 5) check that the exported result is a compressed file
insert overwrite local directory '/export/data/exporthive/compress' select * from score distribute by sid sort by sscore desc;
- Hive storage formats: row-oriented: TEXTFILE, SEQUENCEFILE
Column-oriented (usually the better choice): ORC, PARQUET
create table log_orc_snappy(
track_time string,
url string,
session_id string,
referer string,
ip string,
end_user_id string,
city_id string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS orc tblproperties ("orc.compress"="SNAPPY");
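Note that an ORC table is normally populated with insert ... select from a text-format staging table (load data on a plain text file would not convert it to ORC); a sketch, assuming a text-format table log_text with the same columns already exists:
insert overwrite table log_orc_snappy select * from log_text;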
- Setting the number of reducers in Hive
8.1 Manual setting: set mapreduce.job.reduces=3;
8.2 Let Hive decide: set mapreduce.job.reduces=-1; Hive then estimates the number of reducers from the size of the input files.
set hive.exec.reducers.bytes.per.reducer; -- data handled per reduce task, default 256 MB
set hive.exec.reducers.max; -- maximum number of reducers per job, default 1009
Formula: N = min(hive.exec.reducers.max, total input size / hive.exec.reducers.bytes.per.reducer)
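A quick worked example with the defaults above: for 1 GB (1024 MB) of input, N = min(1009, 1024 MB / 256 MB) = min(1009, 4) = 4 reducers.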
A global count(*) without group by, a global order by, and a Cartesian product each run with only a single reducer; group by a column where possible to avoid this bottleneck.



