
day18 Hive

1. Hive functions
    Conversion functions (cast):
select cast(12.35 as int);
select cast('20190607' as int);
select cast('2020-01-15' as date);
insert overwrite table t1 select tid, cast(tage as int) from t2;
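Note: a cast that cannot be parsed returns NULL in Hive rather than raising an error, for example:

select cast('abc' as int); -- NULL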
    2. Row-to-column conversion:

    concat(str1, str2): concatenate strings/fields
    concat_ws(sep, str1, str2): concatenate strings with a separator
    collect_set(col): deduplicate and aggregate a column's values into an array
create table emp(
    deptno int,
    ename string
)row format delimited fields terminated by ',';
load data local inpath '/export/data/hivedata/emp.data' into table emp;
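For reference, the queries below assume emp.data contains comma-separated deptno,ename pairs along these lines (hypothetical sample; the original notes do not show the file):

10,zhangsan
10,lisi
20,wangwu
20,zhaoliu
20,lisi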
-- conversion
select collect_set(ename) from emp; -- collect every ename value into a single array
select deptno, collect_set(ename) from emp group by deptno;
select deptno, concat_ws('|', collect_set(ename)) from emp group by deptno;


Explode function: select explode(names) from emp2;
Lateral view: lateral view

select deptno, name from emp2 lateral view explode(names) tmp_tb as name;
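The emp2 table is not defined in the original notes; a minimal sketch that would support these queries, assuming names is stored as array&lt;string&gt;:

create table emp2(
    deptno int,
    names array<string>
) row format delimited fields terminated by ','
collection items terminated by '|';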

3. reflect function: lets you call static Java methods from SQL.

select reflect("java.lang.Math", "max", col1, col2) from test25;
select replace(reflect("java.util.UUID", "randomUUID"), '-', '');
    4. Window functions:
    4.1 row_number, rank, dense_rank: ranking within groups
    Sample data:
cookie1,2018-04-10,1
cookie1,2018-04-11,5
cookie1,2018-04-12,7
cookie1,2018-04-13,3
cookie1,2018-04-14,2
cookie1,2018-04-15,4
cookie1,2018-04-16,4
cookie2,2018-04-10,2
cookie2,2018-04-11,3
cookie2,2018-04-12,5
cookie2,2018-04-13,6
cookie2,2018-04-14,3
cookie2,2018-04-15,9
cookie2,2018-04-16,7
create table itcast
(
    cookid   string,
    creatime string,
    pv       int
) row format delimited fields terminated by ',';
load data local inpath '/export/data/hivedata/kaichuang.data' into table itcast;
select * from itcast;

Rank rows by page views per user:

select * from itcast distribute by cookid sort by pv desc; -- distribute by hashes cookid to a reducer; sort by orders rows only within each reducer
select *, row_number() over (partition by cookid order by pv desc) as rk from itcast; -- unique consecutive numbers even for ties (cookie2: 1 2 3 4 5 6 7)

select *, rank() over (partition by cookid order by pv desc) from itcast; -- ties share a rank and leave a gap (cookie2: 1 2 3 4 5 5 7)
select *, dense_rank() over (partition by cookid order by pv desc) from itcast; -- ties share a rank with no gap (cookie2: 1 2 3 4 5 5 6)

Top three rows per cookie:

select * from (select *, dense_rank() over (partition by cookid order by pv desc) as rk from itcast) t
where t.rk < 4;

4.2 SUM, AVG, MIN, MAX

select *, sum(pv) over (partition by cookid order by creatime) as sum from itcast; -- running total from partition start to current row
select *, sum(pv) over (partition by cookid order by creatime rows between 3 preceding and current row)
    as sum from itcast; -- the 3 preceding rows plus the current row; with fewer preceding rows, sum what is available
select *, sum(pv) over (partition by cookid order by creatime rows between 3 preceding and 1 following)
    as sum from itcast; -- 3 rows preceding through 1 row following
select *, sum(pv) over (partition by cookid order by creatime rows between current row and unbounded following)
    as sum from itcast; -- from the current row to the end of the partition
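AVG, MIN and MAX from the heading take the same over(...) clause; a quick sketch on the same itcast table:

select *,
       avg(pv) over (partition by cookid order by creatime) as avg_pv,
       max(pv) over (partition by cookid order by creatime) as max_pv
from itcast;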
    5. User-defined functions:
    5.1 Categories: UDF, one row in / one row out (lower, upper); UDAF, aggregate functions, many rows in / one out (count, max); UDTF, one row in / many out (explode with lateral view)
    5.2 Writing a UDF: extend org.apache.hadoop.hive.ql.exec.UDF; implement an evaluate method; evaluate can be overloaded
    Note: a UDF must have a return type; it may return null, but the return type cannot be void. Prefer Hadoop writable types such as Text/LongWritable over plain Java types.
    Code:
    Maven dependencies:

    
<dependencies>
    <dependency>
        <groupId>org.apache.hive</groupId>
        <artifactId>hive-exec</artifactId>
        <version>2.1.0</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>2.7.5</version>
    </dependency>
</dependencies>
    

Write the code:

import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

public class MyUdf extends UDF {
    // lowercase the input line; return null for null/empty input
    public Text evaluate(Text line) {
        if (line != null && !line.toString().isEmpty()) {
            return new Text(line.toString().toLowerCase());
        } else {
            return null;
        }
    }
}

Package the project and upload the jar to /export/server/hive-2.1.0/lib/.
Add the jar in the Hive client:
add jar /export/server/hive-2.1.0/lib/<jar-name>
例子:

add jar /export/server/hive-2.1.0/lib/day18_hive-1.0-SNAPSHOT.jar;

Register the custom function (a temporary function only works in the current session; it is gone after you exit):

create temporary function my_lower as 'cn.hlzq.hive.udf.MyUdf';

Call it:

select my_lower("HELLijJJ");
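Given the evaluate method above, this should return hellijjj.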

Permanent function: upload the custom function jar to HDFS:

hadoop fs -put day18_hive-1.0-SNAPSHOT.jar /hive_func

Create the permanent function:

create function my_lower as 'cn.hlzq.hive.udf.MyUdf' using  jar 'hdfs://192.168.88.161:8020/hive_func/day18_hive-1.0-SNAPSHOT.jar';

Drop a custom function in Hive:

drop [temporary] function if exists function_name;

5.3 Custom UDTF (one row in, many rows out):

package cn.hlzq.hive.udtf;

import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

import java.util.ArrayList;
import java.util.List;

public class MyUDTF extends GenericUDTF {
    private final transient Object[] forwardListObj = new Object[1];

    @Override
    public StructObjectInspector initialize(StructObjectInspector argOIs) throws UDFArgumentException {
        // declare the output column names
        List<String> fieldNames = new ArrayList<>();
        fieldNames.add("column_01");
        // declare the matching output column types (inspectors)
        List<ObjectInspector> fieldOIs = new ArrayList<>();
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
    }

    @Override
    public void process(Object[] objects) throws HiveException {
        // 1: the raw input string
        String args = objects[0].toString();
        // 2: the second argument is the separator
        String splitKey = objects[1].toString();
        // 3: split the input on the separator
        String[] fields = args.split(splitKey);
        // 4: emit one output row per split field
        for (String field : fields) {
            forwardListObj[0] = field;
            forward(forwardListObj);
        }
    }

    @Override
    public void close() throws HiveException {
    }
}

Package and upload the jar as before.
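A sketch of registering and calling the UDTF, following the same add jar / create function steps as the UDF (my_split is a hypothetical function name, not from the original notes):

add jar /export/server/hive-2.1.0/lib/day18_hive-1.0-SNAPSHOT.jar;
create temporary function my_split as 'cn.hlzq.hive.udtf.MyUDTF';
select my_split('a,b,c', ','); -- emits three rows: a, b, c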

    6. Hive data compression (saves network bandwidth)
1) Enable compression of Hive intermediate data:
hive (default)> set hive.exec.compress.intermediate=true;

2) Enable map output compression in MapReduce:
hive (default)> set mapreduce.map.output.compress=true;

3) Set the map output compression codec:
hive (default)> set mapreduce.map.output.compress.codec=org.apache.hadoop.io.compress.SnappyCodec;
-- 1) Enable compression of Hive's final output
set hive.exec.compress.output=true;
-- 2) Enable compression of the final MapReduce output
set mapreduce.output.fileoutputformat.compress=true;
-- 3) Set the final output compression codec
set mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.SnappyCodec;
-- 4) Use block-level compression for the final output
set mapreduce.output.fileoutputformat.compress.type=BLOCK;
-- 5) Test that the output is compressed:
insert overwrite local directory '/export/data/exporthive/compress' select * from score distribute by sid sort by sscore desc;
    7. Hive storage formats: row-oriented: TEXTFILE, SEQUENCEFILE;
    column-oriented (preferred): ORC, PARQUET
create table log_orc_snappy(
    track_time string,
    url string,
    session_id string,
    referer string,
    ip string,
    end_user_id string,
    city_id string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS orc tblproperties ("orc.compress"="SNAPPY");
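Note that load data copies files into the table directory without converting them, so plain text cannot be loaded straight into an ORC table; route it through a text-format staging table instead (log_text here is a hypothetical staging table with the same columns):

insert into table log_orc_snappy select * from log_text;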
    8. Setting the number of reducers in Hive
    8.1 Manually: set mapreduce.job.reduces=3;
    8.2 Let Hive decide: set mapreduce.job.reduces=-1; Hive then estimates the reducer count from the input size.
    set hive.exec.reducers.bytes.per.reducer; -- data processed per reducer, default 256 MB
    set hive.exec.reducers.max; -- maximum number of reducers, default 1009
    Formula: N = min(hive.exec.reducers.max, total input size / hive.exec.reducers.bytes.per.reducer)
    Only one reducer can be used for: a global aggregation without group by (e.g., count(*)), order by (global ordering), and Cartesian products.
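For example, with 1 GB of input and the defaults above: N = min(1009, 1024 MB / 256 MB) = 4 reducers.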