hadoop 分词、排序、停用词、筛选、topK

题目：

输入1、2为两个input文件，输入3为stopwords文件，输入4为输出目录；对input文件按照“(space)\t\n\r\f”进行分词，输出在两个input中均出现的单词及其数量（数量取该单词在两个文件中出现次数的较小值）；并排除掉stopwords中出现的单词，结果按词频降序排序，只展示top20；

输出结果如下：

287 I
44 It
27 But
23 The
17 There
17 He
17 And
15 will
14 good
14 If
12 it.
11 great
10 things
9  This
9  well
9  room
9  feel
8  long
8  You
8  thing

代码总览

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.*;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class TopkCommonWords {
    /**
     * First-pass mapper: tokenizes every input line and emits
     * {@code (fileName + TAB + word, 1)} for each token that is not a stop word.
     *
     * <p>The stop-word list is read once per task in {@link #setup} from the path
     * stored under the {@code "stopwords"} configuration key. Tokens are split with
     * {@link StringTokenizer}'s default delimiters (space, tab, newline, carriage
     * return, form feed).
     *
     * <p>NOTE(review): only the file's base name is used in the key, so two inputs
     * with the same base name in different directories would be merged — confirm
     * the inputs always have distinct names.
     */
    public static class MapOne extends Mapper<Object, Text, Text, IntWritable> {

        private final static IntWritable one = new IntWritable(1);

        /** Words to drop; a hash set because only membership tests are needed. */
        private final Set<String> stopwords = new HashSet<>();
        /** Reused output key to avoid allocating a new Text per token. */
        private final Text outKey = new Text();
        private String localFiles;

        /**
         * Loads the stop-word file into memory.
         *
         * @param context task context supplying the job configuration
         * @throws IOException if the {@code "stopwords"} setting is absent or the file cannot be read
         */
        @Override
        public void setup(Context context) throws IOException {
            Configuration conf = context.getConfiguration();
            String[] configured = conf.getStrings("stopwords");
            if (configured == null || configured.length == 0) {
                throw new IOException("Missing required configuration value: stopwords");
            }
            localFiles = configured[0];
            // The FileSystem instance is obtained from Hadoop's shared cache, so it is
            // deliberately not closed here; only the stream opened on it is.
            FileSystem fs = FileSystem.get(URI.create(localFiles), conf);
            try (BufferedReader br = new BufferedReader(
                    new InputStreamReader(fs.open(new Path(localFiles)), "utf-8"))) {
                String line;
                while ((line = br.readLine()) != null) {
                    StringTokenizer itr = new StringTokenizer(line);
                    while (itr.hasMoreTokens()) {
                        stopwords.add(itr.nextToken());
                    }
                }
            }
        }

        /**
         * Emits one count per non-stop-word token, keyed by source file name and word.
         *
         * @param key     input key supplied by the input format (unused)
         * @param value   one line of input text
         * @param context sink for the {@code (fileName TAB word, 1)} pairs
         */
        @Override
        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            FileSplit inputSplit = (FileSplit) context.getInputSplit();
            String fileName = inputSplit.getPath().getName();
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                String word = itr.nextToken();
                if (!stopwords.contains(word)) {
                    // A real tab separates file name and word: tokens never contain
                    // whitespace, so the second job can split on it unambiguously.
                    outKey.set(fileName + "\t" + word);
                    context.write(outKey, one);
                }
            }
        }
    }

    /**
     * First-pass reducer: sums the counts received for each key and writes
     * {@code (key, total)}.
     */
    public static class ReduceOne extends Reducer<Text, IntWritable, Text, IntWritable> {

        /** Reused output value holding the total for the current key. */
        private final IntWritable result = new IntWritable();

        /**
         * Adds up all partial counts for one key.
         *
         * @param key     the key whose counts are being totalled
         * @param values  partial counts for {@code key}
         * @param context sink for the {@code (key, total)} pair
         */
        @Override
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    /**
     * Second-pass mapper: reads tab-separated {@code fileName, word, count} lines
     * and, for every word seen under two different file names, emits
     * {@code (min(count1, count2), word)}.
     *
     * <p>Counts for the first file name encountered are buffered in memory; records
     * carrying any other file name are looked up against that buffer.
     *
     * <p>NOTE(review): this only works if a single map task sees all records and
     * every record of the first file precedes those of the second — presumably
     * guaranteed by the first job writing one sorted part file; confirm for large
     * inputs that would be split across several map tasks.
     */
    public static class MapTwo extends Mapper<LongWritable, Text, IntWritable, Text> {
        /** word -> count for the first file name encountered. */
        private final Map<String, Integer> compSet = new HashMap<>();
        /** File name whose counts are being buffered; null until the first record. */
        private String curKeyFile = null;

        /**
         * Processes one {@code fileName TAB word TAB count} record.
         *
         * @param key     offset supplied by the input format (unused)
         * @param value   one line of the first job's output
         * @param context sink for {@code (smallerCount, word)} pairs
         * @throws IOException if the record has fewer than three tab-separated fields
         */
        @Override
        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] data = value.toString().split("\t");
            if (data.length < 3) {
                throw new IOException("Malformed record, expected fileName<TAB>word<TAB>count: " + value);
            }
            String keyFile = data[0];
            String keyword = data[1];
            int num = Integer.parseInt(data[2]);
            if (compSet.isEmpty()) {
                curKeyFile = keyFile;
            }
            if (keyFile.equals(curKeyFile)) {
                compSet.put(keyword, num);
            } else {
                Integer firstCount = compSet.get(keyword);
                if (firstCount != null) {
                    // The word occurs in both files: report the smaller of the two counts.
                    context.write(new IntWritable(Math.min(firstCount, num)), new Text(keyword));
                }
            }
        }
    }

    public static class ReduceTwo extends Reducer {


        private static final int maxNum = 20;
        private static List> list = new ArrayList();

        @Override
        protected void reduce(IntWritable key, Iterable values, Context context) {
            for (Text text : values) {
                Map map = new HashMap();
                map.put(key.get(), text.toString());
                list.add(map);
                if(list.size() > maxNum){
                    list.remove(list.size() - 1);
                }
            }
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            String path = context.getConfiguration().get("topKout");
            System.out.println("cleanup:" + path);
            System.out.println("size:" + list.size());
            for (Map map : list) {
                for (Map.Entry entry : map.entrySet()) {
                    context.write(new IntWritable(entry.getKey()), new Text(entry.getValue()));
                }
            }
        }
    }

    /**
     * Comparator that orders {@link IntWritable} keys from largest to smallest by
     * delegating to the parent comparator with the two operands exchanged.
     */
    public static class Sort extends IntWritable.Comparator{
        /** Object-level comparison in reverse of the parent's order. */
        @Override
        public int compare(WritableComparable a, WritableComparable b){
            return super.compare(b, a);
        }

        /** Serialized-bytes comparison in reverse of the parent's order. */
        @Override
        public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
            return super.compare(b2, s2, l2, b1, s1, l1);
        }
    }

    /**
     * Runs the two chained jobs: job1 counts words per input file (minus stop
     * words) into a temporary directory; job2 reads that directory and writes the
     * final result to the output directory.
     *
     * @param args {@code args[0]}, {@code args[1]}: the two input paths;
     *             {@code args[2]}: stop-word file; {@code args[3]}: output directory
     * @throws Exception if job setup or execution fails
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 4) {
            System.err.println("Usage: TopkCommonWords <input1> <input2> <stopwords> <output>");
            System.exit(2);
        }
        String out = args[3];
        // Derive the intermediate directory as a sibling of the output directory.
        // Trailing slashes are stripped first so the result is valid whether or not
        // the caller supplied one (an empty path would be rejected by Path).
        String base = out;
        while (base.length() > 1 && base.endsWith("/")) {
            base = base.substring(0, base.length() - 1);
        }
        String tmpout = base + "_tmp/";

        Configuration conf1 = new Configuration(true);
        conf1.setStrings("stopwords", args[2]);
        boolean success = false;
        try {
            // job1
            System.out.println("job1");
            Job job1 = Job.getInstance(conf1, "world count");
            job1.setJarByClass(TopkCommonWords.class);
            job1.setMapperClass(MapOne.class);
            job1.setReducerClass(ReduceOne.class);
            job1.setInputFormatClass(TextInputFormat.class);
            FileInputFormat.addInputPath(job1, new Path(args[0]));
            FileInputFormat.addInputPath(job1, new Path(args[1]));
            job1.setOutputKeyClass(Text.class);
            job1.setOutputValueClass(IntWritable.class);
            // One reducer so the intermediate data is a single, fully sorted file.
            job1.setNumReduceTasks(1);
            FileOutputFormat.setOutputPath(job1, new Path(tmpout));

            if (job1.waitForCompletion(true)) {
                //job2
                System.out.println("job2");
                Configuration conf2 = new Configuration(true);
                conf2.set("topKout", out);
                Job job2 = Job.getInstance(conf2, "sort");
                job2.setJarByClass(TopkCommonWords.class);
                job2.setMapperClass(MapTwo.class);
                job2.setReducerClass(ReduceTwo.class);
                job2.setInputFormatClass(TextInputFormat.class);
                job2.setSortComparatorClass(Sort.class);
                FileInputFormat.addInputPath(job2, new Path(tmpout));
                job2.setOutputKeyClass(IntWritable.class);
                job2.setOutputValueClass(Text.class);
                // The record limit is applied per reducer, so exactly one is required.
                job2.setNumReduceTasks(1);

                FileOutputFormat.setOutputPath(job2, new Path(out));
                success = job2.waitForCompletion(true);
            }
        } finally {
            // Remove the intermediate data even after a failure so that a rerun is
            // not blocked by an already-existing output directory.
            FileSystem fs = FileSystem.get(URI.create(tmpout), conf1);
            fs.delete(new Path(tmpout), true);
        }

        if (!success) {
            System.exit(1);
        }
    }
}

hadoop 分词、排序、停用词、筛选、topK

大数据系统相关栏目本月热门文章