MapReduce:分布式运算程序的编程框架
特点:
- 良好扩展性
- 高容错性
- PB级以上的离线处理
缺点:
- 不擅长实时计算
- 不擅长流式计算
- 不擅长DAG(有向无环图)计算
MapReduce进程:
- MrAppMaster:负责整个程序的过程调度及状态协调
- MapTask:负责Map阶段的整个数据处理流程
- ReduceTask:负责Reduce阶段的整个数据处理流程
词频统计
// Map类,继承于org.apache.hadoop.mapreduce.Mapper; public class WordCountMap extends Mapper{ Text word = new Text(); IntWritable value = new IntWritable(1); @Override protected void map(LongWritable key, Text t, Context context) throws IOException, InterruptedException { // 切分每一行的数据 String[] words = t.toString().split(" "); // 循环输出 for (String w : words){ word.set(w); context.write(word,value); } } }
// Reduce类,继承于org.apache.hadoop.mapreduce.Reducer public class WordCountReduce extends Reducer{ int sum; IntWritable count = new IntWritable(); @Override protected void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException { sum = 0; // 累加求和 for (IntWritable v: values){ sum += v.get(); } count.set(sum); context.write(key,count); } }
/**
 * Driver for the word-count job: builds the Job, wires in the mapper and
 * reducer classes, sets the input/output paths, and waits for completion.
 */
public class WordCountDriver {
    /** Input path used when no paths are supplied on the command line. */
    private static final String DEFAULT_INPUT = "hdfs://192.168.19.16:9000/mol/test.txt";
    /** Output path used when no paths are supplied on the command line. */
    private static final String DEFAULT_OUTPUT = "hdfs://192.168.19.16:9000/mol/wordcount";

    /**
     * Configures and runs the job, then exits with 0 on success or 1 on failure.
     *
     * @param args optional: {@code args[0]} input path and {@code args[1]} output path;
     *             when fewer than two arguments are given the built-in defaults are used
     * @throws Exception propagated from job setup or execution
     */
    public static void main(String[] args) throws Exception {
        // Set the user name the job is submitted as.
        System.setProperty("HADOOP_USER_NAME", "root");

        // Both paths must be supplied together; otherwise fall back to the defaults.
        boolean pathsGiven = args != null && args.length >= 2;
        String input = pathsGiven ? args[0] : DEFAULT_INPUT;
        String output = pathsGiven ? args[1] : DEFAULT_OUTPUT;

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        // Register the driver class for the job jar.
        job.setJarByClass(WordCountDriver.class);

        // Register the Map and Reduce classes.
        job.setMapperClass(WordCountMap.class);
        job.setReducerClass(WordCountReduce.class);

        // Key/value types emitted by the map phase.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Key/value types emitted by the reduce phase.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // Input and output locations.
        FileInputFormat.setInputPaths(job, new Path(input));
        FileOutputFormat.setOutputPath(job, new Path(output));

        // Submit the job and block until it finishes.
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}



