Hive自定义函数UDF、UDTF
- hive中已经自带一些函数,但数量有限,有时候需要自己定义函数,自定义函数分为以下三种:
- 1、UDF(User-Defined-Function)
一进一出
类似于:lower/upper/reverse
2、UDAF(User-Defined Aggregation Function)
聚集函数,多进一出
类似于:count/max/min
3、UDTF(User-Defined Table-Generating Functions)
一进多出
如lateral view explode()
1.自定义UDF
1.1依赖
<dependency>
    <groupId>org.apache.hive</groupId>
    <artifactId>hive-exec</artifactId>
    <version>2.1.0</version>
</dependency>
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-common</artifactId>
    <version>2.7.5</version>
</dependency>
1.2代码实现
/**
 * Hive UDF: converts the input text to upper case.
 * One row in, one row out; a NULL input yields a NULL output.
 */
public class Uppercase extends UDF {
    public Text evaluate(final Text s) {
        // Null in, null out — Hive passes NULL column values as null.
        return s == null ? null : new Text(s.toString().toUpperCase());
    }
}
1.3 函数使用
1.3.1 临时函数
# Work from Hive's lib directory and give the jar a meaningful name
cd /export/server/hive-2.1.0/lib
mv user-defined-function-1.0-SNAPSHOT.jar my_uppercase.jar
-- Register the jar for the current Hive session only
add jar /export/server/hive-2.1.0/lib/my_uppercase.jar;
-- Create a temporary function (visible only in this session)
create temporary function my_upercase as 'com.dk.udf.Uppercase';
-- Invoke the function; expected result: ABCDE
select my_upercase("abcDe");
1.3.2 永久函数
# Create a directory on HDFS to hold UDF jars
hadoop fs -mkdir /hive_func
# Upload the jar to HDFS so every Hive session can reach it
hadoop fs -put /export/server/hive-2.1.0/lib/my_uppercase.jar /hive_func
-- Create a permanent function backed by the jar on HDFS.
-- NOTE: HiveQL comments use "--"; the original used shell-style "#",
-- which is a syntax error inside the Hive CLI / beeline.
create function my_upercase2 as 'com.dk.udf.Uppercase'
using jar 'hdfs://node1:8020/hive_func/my_uppercase.jar';
-- Invoke the function; expected result: ABCDE
select my_upercase2("abcDe");
2. 自定义UDTF
2.1 单列一进多出转换
2.1.1 代码实现
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import java.util.ArrayList;
import java.util.List;
/**
 * Hive UDTF: splits a delimited string into multiple rows.
 *
 * Usage: split_string_udtf(data, delimiter) — emits one single-column
 * (string) row for every token produced by splitting {@code data} on
 * {@code delimiter}.
 */
public class SplitString extends GenericUDTF {
    // Reusable one-column output buffer; GenericUDTF permits reusing the
    // array between forward() calls.
    private final transient Object[] forwardList = new Object[1];

    @Override
    public StructObjectInspector initialize(StructObjectInspector argOIs) throws UDFArgumentException {
        // Declare the single output column name...
        List<String> fieldNames = new ArrayList<>();
        fieldNames.add("column_1");
        // ...and its type inspector (java string).
        List<ObjectInspector> inspectors = new ArrayList<>();
        inspectors.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, inspectors);
    }

    @Override
    public void process(Object[] args) throws HiveException {
        // BUG FIX: the original guarded args.length < 1 but then read args[1],
        // throwing ArrayIndexOutOfBoundsException when only one argument is
        // supplied. Both arguments are required and must be non-null.
        if (args == null || args.length < 2 || args[0] == null || args[1] == null) {
            // Emit a single NULL row instead of forwarding stale buffer contents.
            forwardList[0] = null;
            super.forward(forwardList);
            return;
        }
        // Data to split.
        String data = args[0].toString();
        // Field separator. NOTE: String.split interprets this as a regex.
        String delimiter = args[1].toString();
        for (String field : data.split(delimiter)) {
            // Emit one output row per token.
            forwardList[0] = field;
            super.forward(forwardList);
        }
    }

    @Override
    public void close() throws HiveException {
        // No resources to release.
    }
}
2.1.2 函数使用
-- Register the UDTF jar for the current session
add jar /export/server/hive-2.1.0/lib/my_split_string.jar;
-- Create a temporary function bound to the UDTF class
create temporary function split_string_udtf as 'com.dk.udtf.SplitString';
-- One input row fans out into one row per comma-separated token
select split_string_udtf("索隆,路飞,山治,乔巴", ",");
2.2 多列一进多出转换
2.2.1 代码实现
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
 * Hive UDTF: splits a two-level delimited string into (string, bigint) rows.
 *
 * Usage: my_split_map(data, recordSep, fieldSep) — first splits {@code data}
 * on {@code recordSep} into records, then splits each record on
 * {@code fieldSep} into a (name, number) pair, emitting one row per record.
 * Example: "a:1,b:2" with "," and ":" produces rows (a, 1) and (b, 2).
 */
public class SplitMapList extends GenericUDTF {
    // Reusable two-column output buffer; GenericUDTF permits reuse between forwards.
    private final transient Object[] fieldlist = new Object[2];

    @Override
    public StructObjectInspector initialize(StructObjectInspector argOIs) throws UDFArgumentException {
        // Output column names...
        List<String> fieldList = new ArrayList<>();
        fieldList.add("column_1");
        fieldList.add("column_2");
        // ...and their types: (string, bigint).
        List<ObjectInspector> inspectors = new ArrayList<>();
        inspectors.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        inspectors.add(PrimitiveObjectInspectorFactory.javaLongObjectInspector);
        return ObjectInspectorFactory.getStandardStructObjectInspector(fieldList, inspectors);
    }

    @Override
    public void process(Object[] args) throws HiveException {
        // BUG FIX: the original guarded args.length < 1 but then read args[1]
        // and args[2], throwing ArrayIndexOutOfBoundsException whenever fewer
        // than three arguments were supplied. All three are required.
        if (args == null || args.length < 3
                || args[0] == null || args[1] == null || args[2] == null) {
            // Emit a single NULL row instead of forwarding stale buffer contents.
            fieldlist[0] = null;
            fieldlist[1] = null;
            super.forward(fieldlist);
            return;
        }
        // Data to split.
        String data = args[0].toString();
        // Record separator (between pairs). NOTE: String.split treats it as a regex.
        String recordSep = args[1].toString();
        // Field separator (inside a pair).
        String fieldSep = args[2].toString();
        for (String item : data.split(recordSep)) {
            String[] beans = item.split(fieldSep);
            // Skip malformed records (missing separator) instead of throwing
            // ArrayIndexOutOfBoundsException as the original did.
            if (beans.length < 2) {
                continue;
            }
            fieldlist[0] = beans[0];
            // May throw NumberFormatException if column 2 is not numeric;
            // that surfaces as a query error, matching the original behavior.
            fieldlist[1] = Long.parseLong(beans[1]);
            super.forward(fieldlist);
        }
    }

    @Override
    public void close() throws HiveException {
        // No resources to release.
    }

    /** Local smoke test of the split logic, runnable without a Hive cluster. */
    public static void main(String[] args) {
        String data = "路飞:12000000000,索隆:8000000000,乔巴:3000000";
        String recordSep = ",";
        String fieldSep = ":";
        for (String item : data.split(recordSep)) {
            String[] beans = item.split(fieldSep);
            System.out.println(Arrays.toString(beans));
        }
    }
}
2.2.2 函数使用
-- mv user-defined-function-1.0-SNAPSHOT.jar my_split_map.jar
-- Drop any previous version first; IF EXISTS keeps this idempotent
-- (the original bare DROP fails with an error on the first run).
drop function if exists my_split_map;
-- Permanent function backed by the jar uploaded to HDFS
create function my_split_map as 'com.dk.udtf.SplitMapList'
using jar 'hdfs://node1:8020/hive_func/my_split_map.jar';
-- Expected rows: (路飞, 12000000000), (索隆, 8000000000), (乔巴, 3000000)
select my_split_map("路飞:12000000000,索隆:8000000000,乔巴:3000000", ",", ":");
2.3 删除函数命令
-- Drop a temporary (session-scoped) function
drop temporary function if exists encryptPhoneNumber;
-- Drop a permanent function; note this does NOT delete the jar from HDFS
drop function if exists my_lower2;