- Type conversion functions:
select cast(12.35 as int);
select cast('20190607' as int);
select cast('2020-1-15' as date);
insert overwrite table t1 select tid, cast(tage as int) from t2;
- Row-to-column conversion:
concat(str1,str2): concatenates the given strings/columns; any separator has to be written out by hand
concat_ws(sep,str1,str2): concatenates the strings with the given separator between them (see the sketch after this list)
collect_set(col): de-duplicates and aggregates a column's values into an array-type field
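A quick constant-literal sketch of the difference between concat and concat_ws (recent Hive versions allow SELECT without a FROM clause; expected results are in the comments):
select concat('a', '-', 'b');          -- a-b : the separator is written by hand
select concat_ws('|', 'a', 'b', 'c');  -- a|b|c : the separator is inserted between every argument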
create table emp(
deptno int,
ename string
)row format delimited fields terminated by ',';
load data local inpath '/export/data/hivedata/emp.data' into table emp;
--perform the conversion
select collect_set(ename) from emp;--collects every ename value into one array
select deptno, collect_set(ename) from emp group by deptno;
select deptno, concat_ws('|', collect_set(ename)) from emp group by deptno;
Explode function: select explode(names) from emp2; --turns every array element into a row of its own
Lateral view: lateral view joins the exploded rows back to the original columns
select deptno ,name from emp2 lateral view explode(names) tmp_tb as name;
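The emp2 table used above is assumed to hold an array&lt;string&gt; column named names; a minimal sketch of such a table, for reference only:
-- hypothetical emp2 backing the two queries above
create table emp2(
deptno int,
names array<string>
) row format delimited fields terminated by '\t'
collection items terminated by ',';
-- explode(names) produces one row per array element;
-- the lateral view joins those rows back to the remaining columns of emp2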
3. reflect function: allows SQL to call static Java methods
select reflect("java.lang.Math","max",col,col2) from test25;
select replace(reflect("java.util.UUID","randomUUID"),'-','');
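A constant-argument sketch of reflect (any public static method on the classpath can be called this way):
select reflect("java.lang.Math", "max", 3, 7);   -- returns 7
select reflect("java.lang.Math", "abs", -10);    -- returns 10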
- Window functions (analytic functions):
4.1 row_number, rank, dense_rank: ranking within groups
Data preparation:
cookie1,2018-04-10,1
cookie1,2018-04-11,5
cookie1,2018-04-12,7
cookie1,2018-04-13,3
cookie1,2018-04-14,2
cookie1,2018-04-15,4
cookie1,2018-04-16,4
cookie2,2018-04-10,2
cookie2,2018-04-11,3
cookie2,2018-04-12,5
cookie2,2018-04-13,6
cookie2,2018-04-14,3
cookie2,2018-04-15,9
cookie2,2018-04-16,7
create table itcast
(
cookid string,
creatime string,
pv int
) row format delimited fields terminated by ',';
load data local inpath '/export/data/hivedata/kaichuang.data' into table itcast;
select *
from itcast;
Rank each cookie's rows by page views:
select * from itcast distribute by cookid sort by pv desc;--distribute by hashes cookid to reducers; sort by only orders rows within each reducer, not globally
select *, row_number() over (partition by cookid order by pv desc) as rk from itcast;--consecutive numbers, e.g. 1 2 3 4 5 6 7 for cookie1
select *, rank() over (partition by cookid order by pv desc) from itcast;--ties share a rank and the next rank is skipped, e.g. 1 2 3 3 5 6 7
select *, dense_rank() over (partition by cookid order by pv desc) from itcast;--ties share a rank with no gaps, e.g. 1 2 3 3 4 5 6
Query the top three rows (by pv) for each cookie:
select * from (select *, dense_rank() over (partition by cookid order by pv desc) as rk from itcast) t where t.rk < 4;
4.2 SUM, AVG, MIN, MAX
select *, sum(pv) over (partition by cookid order by creatime) as sum_pv from itcast;--running total from the start of the partition to the current row
select *, sum(pv) over (partition by cookid order by creatime rows between 3 preceding and current row)
as sum_pv from itcast;--sum of the 3 rows before the current row plus the current row; with fewer than 3 preceding rows, sum whatever is available
select *, sum(pv) over (partition by cookid order by creatime rows between 3 preceding and 1 following)
as sum_pv from itcast;--3 rows before through 1 row after the current row
select *, sum(pv) over (partition by cookid order by creatime rows between current row and unbounded following)
as sum_pv from itcast;--from the current row to the end of the partition
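The same window and frame syntax applies to the other aggregates; a sketch against the itcast table above:
select *,
avg(pv) over (partition by cookid order by creatime rows between 3 preceding and current row) as avg_pv,
min(pv) over (partition by cookid) as min_pv,  -- minimum over the whole partition
max(pv) over (partition by cookid) as max_pv   -- maximum over the whole partition
from itcast;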
- User-defined functions:
5.1 Categories: UDF, one row in / one row out (e.g. lower, upper); UDAF, aggregate functions, many rows in / one row out (e.g. count, max); UDTF, one row in / many rows out (e.g. explode() with lateral view)
5.2 Writing a UDF: extend org.apache.hadoop.hive.ql.exec.UDF; implement an evaluate method; evaluate may be overloaded
Note: a UDF must have a return type and may return null, but the return type cannot be void; Hadoop writable types such as Text/LongWritable are commonly used, plain Java types are not recommended
Code:
Add the Maven dependencies:
<dependency>
    <groupId>org.apache.hive</groupId>
    <artifactId>hive-exec</artifactId>
    <version>2.1.0</version>
</dependency>
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-common</artifactId>
    <version>2.7.5</version>
</dependency>
Write the code:
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

public class MyUdf extends UDF {
    //convert the input line to lower case; null or empty input yields null
    public Text evaluate(Text line) {
        if (line != null && line.getLength() > 0) {
            String newline = line.toString().toLowerCase();
            return new Text(newline);
        } else {
            return null;
        }
    }
}
Package the project and upload the jar to /export/server/hive-2.1.0/lib/.
Add the jar in the Hive client:
add jar /export/server/hive-2.1.0/lib/<jar name>
Example:
add jar /export/server/hive-2.1.0/lib/day18_hive-1.0-SNAPSHOT.jar;
Associate a function name with the custom class (a temporary function only works in the current session; after you exit, it no longer exists):
create temporary function my_lower as 'cn.hlzq.hive.udf.MyUdf';
Invoke it:
select my_lower("HELLijJJ");--returns hellijjj
Permanent function: the custom function jar has to live on HDFS.
Upload the jar to HDFS:
hadoop fs -put day18_hive-1.0-SNAPSHOT.jar /hive_func
Create the permanent function:
create function my_lower as 'cn.hlzq.hive.udf.MyUdf' using jar 'hdfs://192.168.88.161:8020/hive_func/day18_hive-1.0-SNAPSHOT.jar';
Drop a custom function in Hive:
drop [temporary] function if exists <function name>;
5.3 Writing a UDTF (one row in, many rows out):
package cn.hlzq.hive.udtf;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import java.util.ArrayList;
import java.util.List;
public class MyUDTF extends GenericUDTF {
private final transient Object[] forwardListObj = new Object[1];//reusable one-column output row
@Override
public StructObjectInspector initialize(StructObjectInspector argOIs) throws UDFArgumentException {
//output column names
List<String> fieldNames = new ArrayList<>();
//set the column name
fieldNames.add("column_01");
List<ObjectInspector> fieldOIs = new ArrayList<>();//inspectors describing each output column's value type
//set the value type of the output column
fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
}
@Override
public void process(Object[] objects) throws HiveException {
//1: get the raw input string (first argument)
String args = objects[0].toString();
//2: get the second argument, used here as the separator
String splitKey = objects[1].toString();
//3: split the raw string on the separator
String[] fields = args.split(splitKey);
//4: iterate over the pieces and emit each one as a row
for (String field : fields) {
//put the current piece into the output object array
forwardListObj[0] = field;
//emit one output row
forward(forwardListObj);
}
}
@Override
public void close() throws HiveException {
}
}
Package and upload the jar, then register it the same way as the UDF.
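Registration and invocation follow the same pattern as the UDF above; a sketch (the function name my_split is made up here, the class and jar are the ones from this example):
add jar /export/server/hive-2.1.0/lib/day18_hive-1.0-SNAPSHOT.jar;
create temporary function my_split as 'cn.hlzq.hive.udtf.MyUDTF';
-- first argument: the string to split; second argument: the separator
select my_split('hadoop,hive,spark', ',');
-- or joined back to a table (some_table and col are placeholders):
-- select t.word from some_table lateral view my_split(col, ',') t as word;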
- Hive data compression (saves network bandwidth)
Intermediate (map output) compression:
-- 1) enable compression of Hive intermediate data
set hive.exec.compress.intermediate=true;
-- 2) enable compression of MapReduce map output
set mapreduce.map.output.compress=true;
-- 3) set the codec for map output compression
set mapreduce.map.output.compress.codec=org.apache.hadoop.io.compress.SnappyCodec;
Final output compression:
-- 1) enable compression of Hive final output
set hive.exec.compress.output=true;
-- 2) enable compression of MapReduce final output
set mapreduce.output.fileoutputformat.compress=true;
-- 3) set the codec for final output compression
set mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.SnappyCodec;
-- 4) use BLOCK compression for the final output
set mapreduce.output.fileoutputformat.compress.type=BLOCK;
-- 5) check that the exported result is a compressed file
insert overwrite local directory '/export/data/exporthive/compress' select * from score distribute by sid sort by sscore desc;
- Hive storage formats: row-oriented: TEXTFILE, SEQUENCEFILE
Column-oriented (usually the better choice): ORC, PARQUET
create table log_orc_snappy(
track_time string,
url string,
session_id string,
referer string,
ip string,
end_user_id string,
city_id string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS orc tblproperties ("orc.compress"="SNAPPY");
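Note that an ORC table is normally populated with insert ... select from a text-format staging table (load data on a plain text file would not convert it to ORC); a sketch, assuming a text-format table log_text with the same columns already exists:
insert overwrite table log_orc_snappy select * from log_text;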
- Setting the number of reducers in Hive
8.1 Manual setting: set mapreduce.job.reduces=3;
8.2 Let Hive decide: set mapreduce.job.reduces=-1; Hive then estimates the number of reducers from the size of the input files.
set hive.exec.reducers.bytes.per.reducer; -- data handled per reduce task, default 256 MB
set hive.exec.reducers.max; -- maximum number of reducers per job, default 1009
Formula: N = min(hive.exec.reducers.max, total input size / hive.exec.reducers.bytes.per.reducer)
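A quick worked example with the defaults above: for 1 GB (1024 MB) of input, N = min(1009, 1024 MB / 256 MB) = min(1009, 4) = 4 reducers.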
A global count(*) without group by, a global order by, and a Cartesian product each run with only a single reducer; group by a column where possible to avoid this bottleneck.



