栏目分类:
子分类:
返回
名师互学网用户登录
快速导航关闭
当前搜索
当前分类
子分类
实用工具
热门搜索
名师互学网 > IT > 前沿技术 > 大数据 > 大数据系统

mapreduce-wordcount2(未完成,不保真)

mapreduce-wordcount2(未完成,不保真)

原版:
import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCount {

  public static class TokenizerMapper
       extends Mapper{

    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    public void map(Object key, Text value, Context context
                    ) throws IOException, InterruptedException {
      StringTokenizer itr = new StringTokenizer(value.toString());
      while (itr.hasMoreTokens()) {
        word.set(itr.nextToken());
        context.write(word, one);
      }
    }
  }

  public class myPartitioner extends Partitioner{
  	public int getPartition(Text key, IntWritable value, int numPartitions)
	{
		return (key.hashCode()&Integer.MAX_VALUE) % numPartitions;
	}
  }

  public static class IntSumReducer
       extends Reducer {
    private IntWritable result = new IntWritable();

    public void reduce(Text key, Iterable values,
                       Context context
                       ) throws IOException, InterruptedException {
      int sum = 0;
      for (IntWritable val : values) {
        sum += val.get();
      }
      result.set(sum);
      context.write(key, result);
    }
  }

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "word count");

    job.setJarByClass(WordCount.class);

    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);

    job.setPartitionerClass(myPartitioner.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}

增加功能之后:(未完待写…)
import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCount {

  public static class TokenizerMapper
       extends Mapper{

    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    public void map(Object key, Text value, Context context
                    ) throws IOException, InterruptedException {
      StringTokenizer itr = new StringTokenizer(value.toString());
      while (itr.hasMoreTokens()) {
      	Text world=itr.nextToken();
      	if(world.charAt(0).match([a-zA-Z])==null){//去除不以英文字母开头的所有词
      		continue;
      	}
        word.set(world);
        context.write(word, one);
      }
    }
  }
  
  //将相同key的所有值相加后乘以2输出
  public static class MyCombiner 
        extends Reducer {
        private IntWritable result = new IntWritable();
        protected void reduce(Text key, Iterable values,
                       Context context
                       ) throws IOException, InterruptedException {
            int count=0,i=0,j;
			for(i=0;i {
		public int getPartition(Text key, IntWritable value, int numPartitions) 
		{
			if(key.charAt(0).match([A-Z])!=null)
				return 0;
			if(key.charAt(0).match([a-z])!=null)
				return 1;
		}
	}

  public static class IntSumReducer
       extends Reducer {
    //private IntWritable result = new IntWritable();
    private Text result =new Text();

    public void reduce(Text key, Iterable values,
                       Context context
                       ) throws IOException, InterruptedException {
      int sum = 0;
      Text number="";
      for (IntWritable val : values) {
        sum += val.get(); //get-获取hash的key对应的value?
      }
      if(sum>=3){
      	while(sum--){
      		number=number.concat("+");
      	}
      	result.set(number);
      	context.write(key, result);
      }
    }
  }

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "word count");

    job.setJarByClass(WordCount.class);

    job.setMapperClass(TokenizerMapper.class);
    //job.setCombinerClass(IntSumReducer.class);
    job.setCombinerClass(MyCombiner.class);// 设置Map规约Combiner
    job.setReducerClass(IntSumReducer.class);

    job.setPartitionerClass(MyPartitioner.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}

  • result.set(sum);
    • 不等同于:result=sum;
    • 在实际编程中,类的许多变量我们不希望被外部直接访问,所以用 private 关键字修饰;但又需要允许受控的读写,于是提供 public 修饰的 get()/set() 方法。
  • for(:)
  • 对于for(类型名 类型 : 需要遍历的数组),先创建了对象或者变量,然后遍历数组,一个一个赋值给类型
转载请注明:文章转载自 www.mshxw.com
本文地址:https://www.mshxw.com/it/304574.html
我们一直用心在做
关于我们 文章归档 网站地图 联系我们

版权所有 (c)2021-2022 MSHXW.COM

ICP备案号:晋ICP备2021003244-6号