hive UDF函数取最新分区

hive UDF函数取最新分区 1.pom文件


    
    
      org.apache.hive
      hive-exec
      1.2.1
    
    
      org.apache.hadoop
      hadoop-client
      2.7.6
      
        
          log4j
          log4j
        
        
          org.slf4j
          slf4j-log4j12
        
      
    
    
      junit
      junit
      4.10
      test
    
  

  
    
      
        
          org.apache.maven.plugins
          maven-compiler-plugin
          3.0
          
            1.8
            1.8
            UTF-8
          
        
        
          org.apache.maven.plugins
          maven-shade-plugin
          2.2
          
            
              package
              
                shade
              
              
                
                  
                    *:*
                    
                      meta-INFRSA

2.代码

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.UDF;

import java.net.URI;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;



public class latest_partition extends UDF {

    public String evaluate(String tableName) {
        StringBuffer sb = new StringBuffer();
        String latesttPatition = null;
        // 获取shema
        String split1 = tableName.split("\.")[0];
        // 获取table_name
        String split2 = tableName.split("\.")[1];
        // 拼接路径
        String fileName = sb.append("/user/hive/warehouse/").append(split1).append(".db/").append(split2).toString();
        try{
        // 调用方法获取最新分区
            latesttPatition = getFileList(fileName);
        }catch (Exception e){
            System.out.println("获取结果异常" +e.getMessage());
        }
        return latesttPatition;
    }

// 获取最新分区
    public static String getFileList(String path) throws Exception{
        String res = null;
        Configuration conf=new Configuration(false);
        conf.set("fs.default.name", "hdfs://hacluster/");
        FileSystem hdfs = FileSystem.get(URI.create(path),conf);
        FileStatus[] fs = hdfs.listStatus(new Path(path));
        Path[] listPath = FileUtil.stat2Paths(fs);

        List list = new ArrayList();
        for(Path p : listPath){
            String s = p.toString();
           // hdfs上有可能有非分区文件，只处理分区文件
            if(s.contains("=")) {
                String partition = s.split("=")[1];
                list.add(partition);
            }
        }
        if(list.size() != 0) {
            res = Collections.max(list).toString();
        }
        return  res;
    }

}

大表查询最新分区往往由于各种原因，可能需要几个小时，使用该函数可以实现秒级返回数据。性能可大范围提升。

-- 优化前sql查询语句(耗时特别久，全表扫描)
SELECt MAX(dt) as latest_dt FROM table_name;
-- 优化后(通过文件系统查询，数秒返回结果)
SELECT LST_DT('schema.table_name');

hive UDF函数取最新分区

大数据系统相关栏目本月热门文章