2. 代码 — Maven 依赖 (pom.xml): org.apache.hive:hive-exec 1.2.1; org.apache.hadoop:hadoop-client 2.7.6 (排除 log4j:log4j 与 org.slf4j:slf4j-log4j12); junit:junit 4.10 (test 作用域)。插件: maven-compiler-plugin 3.0 (source/target 1.8, 编码 UTF-8); maven-shade-plugin 2.2 (package 阶段执行 shade, 包含 *:*, 排除 META-INF 签名文件如 *.RSA)。
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.UDF;
import java.net.URI;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
public class latest_partition extends UDF {
public String evaluate(String tableName) {
StringBuffer sb = new StringBuffer();
String latesttPatition = null;
// 获取shema
String split1 = tableName.split("\.")[0];
// 获取table_name
String split2 = tableName.split("\.")[1];
// 拼接路径
String fileName = sb.append("/user/hive/warehouse/").append(split1).append(".db/").append(split2).toString();
try{
// 调用方法获取最新分区
latesttPatition = getFileList(fileName);
}catch (Exception e){
System.out.println("获取结果异常" +e.getMessage());
}
return latesttPatition;
}
// 获取最新分区
public static String getFileList(String path) throws Exception{
String res = null;
Configuration conf=new Configuration(false);
conf.set("fs.default.name", "hdfs://hacluster/");
FileSystem hdfs = FileSystem.get(URI.create(path),conf);
FileStatus[] fs = hdfs.listStatus(new Path(path));
Path[] listPath = FileUtil.stat2Paths(fs);
List list = new ArrayList();
for(Path p : listPath){
String s = p.toString();
// hdfs上有可能有非分区文件,只处理分区文件
if(s.contains("=")) {
String partition = s.split("=")[1];
list.add(partition);
}
}
if(list.size() != 0) {
res = Collections.max(list).toString();
}
return res;
}
}
大表查询最新分区往往由于各种原因,可能需要几个小时,使用该函数可以实现秒级返回数据。性能可大范围提升。
-- 优化前sql查询语句(耗时特别久,全表扫描)
SELECT MAX(dt) as latest_dt FROM table_name;
-- 优化后(通过文件系统查询,数秒返回结果)
SELECT LST_DT('schema.table_name');



