- I. Preparation
- II. WordCount example
- III. Word-frequency example on 射雕英雄传 (The Legend of the Condor Heroes)
1. Create a new Maven project
2. Add the project dependencies to pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.hdtrain</groupId>
  <artifactId>wordcount</artifactId>
  <version>1.0-SNAPSHOT</version>

  <name>wordcount</name>
  <url>http://www.example.com</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
  </properties>

  <dependencies>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.11</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>log4j</groupId>
      <artifactId>log4j</artifactId>
      <version>1.2.17</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-hdfs</artifactId>
      <version>2.7.1</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-common</artifactId>
      <version>2.7.1</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>2.7.1</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-mapreduce-client-core</artifactId>
      <version>2.7.1</version>
    </dependency>
    <!-- IK Analyzer, used for Chinese word segmentation in Part III -->
    <dependency>
      <groupId>com.janeluo</groupId>
      <artifactId>ikanalyzer</artifactId>
      <version>2012_u6</version>
    </dependency>
  </dependencies>

  <build>
    <pluginManagement><!-- plugin versions as generated by the quickstart archetype -->
      <plugins>
        <plugin><artifactId>maven-clean-plugin</artifactId><version>3.1.0</version></plugin>
        <plugin><artifactId>maven-resources-plugin</artifactId><version>3.0.2</version></plugin>
        <plugin><artifactId>maven-compiler-plugin</artifactId><version>3.8.0</version></plugin>
        <plugin><artifactId>maven-surefire-plugin</artifactId><version>2.22.1</version></plugin>
        <plugin><artifactId>maven-jar-plugin</artifactId><version>3.0.2</version></plugin>
        <plugin><artifactId>maven-install-plugin</artifactId><version>2.5.2</version></plugin>
        <plugin><artifactId>maven-deploy-plugin</artifactId><version>2.8.2</version></plugin>
        <plugin><artifactId>maven-site-plugin</artifactId><version>3.7.1</version></plugin>
        <plugin><artifactId>maven-project-info-reports-plugin</artifactId><version>3.0.0</version></plugin>
      </plugins>
    </pluginManagement>
  </build>
</project>
3. Add a resources folder to the project
Add the configuration files core-site.xml, hdfs-site.xml and mapred-site.xml.
Mark the folder as a Resources Root so that these files end up on the classpath; a minimal example follows.
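As a rough sketch, a minimal core-site.xml only needs to point at the NameNode; the hostname and port below (node01:9000) are placeholders, not taken from the original post, and must match your own cluster:

<?xml version="1.0" encoding="UTF-8"?>
<configuration>
  <!-- Hypothetical NameNode address; replace with your cluster's fs.defaultFS -->
  <property>
    <name>fs.defaultFS</name>
    <value>hdfs://node01:9000</value>
  </property>
</configuration>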
The English text of "Harry Potter" is used as the input data.
1.WordCountJob.class
package com.hdtrain;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

// Defines the WordCount job
public class WordCountJob {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        //System.setProperty("HADOOP_USER_NAME", "root");
        // Load the configuration (picks up core-site.xml etc. from the classpath)
        Configuration configuration = new Configuration(true);
        configuration.set("mapreduce.framework.name", "local");
        // Create the job
        Job job = Job.getInstance(configuration);
        // Set the job parameters
        job.setJobName("wordcount-" + System.currentTimeMillis()); // job name
        job.setJarByClass(WordCountJob.class);                     // main class of this job
        job.setNumReduceTasks(2);                                  // two reducers, so two output files
        // Path of the input file to process
        FileInputFormat.setInputPaths(job, "/data/harry.txt");
        // Path for the results
        FileOutputFormat.setOutputPath(job, new Path("/results/wordcount-" + System.currentTimeMillis()));
        // Output types of the map phase
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // Final output types written by the reducer
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        // Mapper class
        job.setMapperClass(WordCountMapper.class);
        // Reducer class
        job.setReducerClass(WordCountReducer.class);
        // Submit the job and wait for it to finish
        job.waitForCompletion(true);
    }
}
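Note that mapreduce.framework.name is set to "local", so the job runs inside the local JVM via the LocalJobRunner instead of being submitted to a YARN cluster; the commented-out HADOOP_USER_NAME property is only needed when submitting to a remote cluster as a different HDFS user.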
2.WordCountMapper.class
package com.hdtrain;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Keep only letters, digits, whitespace and apostrophes, then split the line into words
        String[] words = value.toString().replaceAll("[^a-zA-Z0-9\\s']", "").split(" ");
        // Write each word to the context with a count of 1; the reducer sums these counts
        for (int i = 0; i < words.length; i++) {
            if (words[i].isEmpty()) {
                continue; // skip empty tokens left by consecutive spaces
            }
            context.write(new Text(words[i]), new IntWritable(1));
        }
    }
}
3.WordCountReducer.class
package com.hdtrain;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.Iterator;

public class WordCountReducer extends Reducer<Text, IntWritable, Text, LongWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // Accumulator for the total count of this word
        long count = 0;
        // Iterate over all counts emitted by the mappers for this key
        Iterator<IntWritable> iterator = values.iterator();
        while (iterator.hasNext()) {
            count += iterator.next().get();
        }
        // Write the word and its total count
        context.write(key, new LongWritable(count));
    }
}
4. Results
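With job.setNumReduceTasks(2), the output directory contains one part file per reducer, and TextOutputFormat writes each word and its count on one line, separated by a tab. The timestamp and counts below are purely illustrative, not actual results:

/results/wordcount-1589000000000/_SUCCESS
/results/wordcount-1589000000000/part-r-00000
/results/wordcount-1589000000000/part-r-00001

# an excerpt of part-r-00000 (hypothetical values):
Harry   1300
Hermione        800
wand    90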
The Chinese text of 《射雕英雄传》 is used as the input data.
1.SdyxzJob.class
package com.hdtrain;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class SdyxzJob {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // 1. Load the configuration
        Configuration configuration = new Configuration(true);
        configuration.set("mapreduce.framework.name", "local");
        // 2. Create the job
        Job job = Job.getInstance(configuration);
        // 3. Set the job parameters
        job.setJobName("射雕英雄传-" + System.currentTimeMillis());
        job.setJarByClass(SdyxzJob.class);
        job.setNumReduceTasks(2);
        // 4. Path of the input file to process
        FileInputFormat.setInputPaths(job, new Path("/data/sdyxz.txt"));
        // 5. Path for the results
        FileOutputFormat.setOutputPath(job, new Path("/results/sdyxz-" + System.currentTimeMillis()));
        // 6. Output types of the map phase
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // Final output types written by the reducer
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        // 7. Mapper class
        job.setMapperClass(SdyxzMapper.class);
        // 8. Reducer class
        job.setReducerClass(SdyxzReducer.class);
        // 9. Submit the job
        job.waitForCompletion(true);
    }
}
2.SdyxzMapper.class
package com.hdtrain;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

import java.io.IOException;
import java.io.StringReader;

public class SdyxzMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Feed the current line into the IK segmenter (true = smart mode)
        StringReader stringReader = new StringReader(value.toString());
        IKSegmenter ikSegmenter = new IKSegmenter(stringReader, true);
        // Emit each segmented word with a count of 1
        Lexeme lexeme = null;
        while ((lexeme = ikSegmenter.next()) != null) {
            context.write(new Text(lexeme.getLexemeText()), new IntWritable(1));
        }
    }
}
3.SdyxzReducer.class
package com.hdtrain;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.Iterator;

public class SdyxzReducer extends Reducer<Text, IntWritable, Text, LongWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // Sum all counts emitted for this word
        long count = 0;
        Iterator<IntWritable> iterator = values.iterator();
        while (iterator.hasNext()) {
            count += iterator.next().get();
        }
        // Write the word and its total count
        context.write(key, new LongWritable(count));
    }
}
4.IK tokenizer example
package com.hdtrain;

import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

import java.io.IOException;
import java.io.StringReader;

public class IKword {
    public static void main(String[] args) throws IOException {
        // A passage from the novel, used as sample input for the segmenter
        StringReader stringReader = new StringReader("畔一排数十株乌柏树,叶子似火烧般红,正是八月天时。村前村后的野草刚起始变黄,一抹斜阳映照之下,更增了几分萧索。两株大松树下围着一堆村民,男男女女和十几个小孩,正自聚精会神的听着一个瘦削的老者说话。");
        // true enables smart mode (coarser, more natural segmentation)
        IKSegmenter ikSegmenter = new IKSegmenter(stringReader, true);
        // Print each token on its own line
        Lexeme lexeme = null;
        while ((lexeme = ikSegmenter.next()) != null) {
            System.out.println(lexeme.getLexemeText());
        }
    }
}
5.Results
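As in the WordCount example, the output directory contains two tab-separated part files, one per reducer; each line holds one segmented word and its total count across the novel.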