题目:
输入1、2为两个input文件,输入3为stopwords文件,输入4为输出目录;对input文件按照“(space)\t\n\r\f”进行分词,输出两个input中均出现的单词及其数量(取该单词在两个文件中出现次数的较小值);并排除掉stopwords中出现的单词,结果按词频降序排序,只展示top 20。
输出结果如下:
287 I
44 It
27 But
23 The
17 There
17 He
17 And
15 will
14 good
14 If
12 it.
11 great
10 things
9 This
9 well
9 room
9 feel
8 long
8 You
8 thing
代码总览
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.*;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class TopkCommonWords {
public static class MapOne extends Mapper