【大数据笔记】- Hadoop MapReduce API

一.基础环境：

本文默认你已经有一定的java基础，本机已安装java、maven、ide并配置好了相关的环境变量，已经有可用的hadoop环境，并且已经用idea新建了一个java maven项目。另外还需要一台可以执行hadoop命令的linux客户机。

以上环境有没完成的，自行去百度完成。

二.pom.xml引入包：

      
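      <dependency>
          <groupId>org.apache.hadoop</groupId>
          <artifactId>hadoop-common</artifactId>
          <version>2.7.3</version>
      </dependency>
      <dependency>
          <groupId>org.apache.hadoop</groupId>
          <artifactId>hadoop-hdfs</artifactId>
          <version>2.7.3</version>
      </dependency>
      <dependency>
          <groupId>org.apache.hadoop</groupId>
          <artifactId>hadoop-client</artifactId>
          <version>2.7.3</version>
      </dependency>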

三.准备统计文件并上传 1.新建一个文件word_test.txt

I have searched a thousand years，And I have cried a thousand tears。
I found everything I need，You are everything to me。

2.上传到hadoop

先 rz 上传到linux客户机，再执行下边命令上传到hdfs

hadoop fs -mkdir /tmp/mr_test/
hadoop fs -put ./word_test.txt /tmp/mr_test/

四.上代码(官方WordCount V1)

package com.yixin;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.util.StringTokenizer;

public class WordCount {

    public static class TokenizerMapper
            extends Mapper {

        private final static IntWritable one = new IntWritable(1);
        private final Text word = new Text();

        public void map(Object key, Text value, Context context
        ) throws IOException, InterruptedException {
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, one);
            }
        }
    }

    public static class IntSumReducer
            extends Reducer {
        private final IntWritable result = new IntWritable();

        public void reduce(Text key, Iterable values,
                           Context context
        ) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

五.打包上传执行 1.用ide或maven打包代码成jar，

2.rz上传到linux客户机

3.执行代码

hadoop jar mrtest-1.0-SNAPSHOT.jar com.yixin.WordCount /tmp/mr_test/word_test.txt /tmp/mr_test/output

4.查看结果

hadoop fs -cat /tmp/mr_test/output/*

    /**
     * Mapper for WordCount v2: optionally lower-cases each line, removes every configured
     * skip pattern from it, then emits {@code (token, 1)} per whitespace-separated token.
     *
     * <p>Configuration keys read in {@link #setup}:
     * <ul>
     *   <li>{@code wordcount.case.sensitive} (default {@code true}) - when {@code false},
     *       lines are lower-cased before tokenizing;</li>
     *   <li>{@code wordcount.skip.patterns} (default {@code true}) - when {@code true},
     *       every cache file of the job is read as a list of skip patterns.</li>
     * </ul>
     */
    public static class TokenizerMapper
            extends Mapper<Object, Text, Text, IntWritable> {

        // Job-wide counter names: every map task increments INPUT_WORDS and the framework
        // aggregates the increments into one total for the job.
        static enum CountersEnum {INPUT_WORDS}

        private final static IntWritable one = new IntWritable(1);
        // Reused for every token to avoid allocating a new Text per output record.
        private final Text word = new Text();

        private boolean caseSensitive;
        // Each entry is applied as a regular expression via String.replaceAll in map().
        private final Set<String> patternsToSkip = new HashSet<>();

        private Configuration conf;

        /**
         * Reads the job configuration and loads the skip patterns from the job's cache files.
         *
         * @param context task context giving access to the job configuration
         * @throws IOException          if the cache file list cannot be obtained
         * @throws InterruptedException declared by the overridden method
         */
        @Override
        public void setup(Context context) throws IOException,
                InterruptedException {
            conf = context.getConfiguration();
            caseSensitive = conf.getBoolean("wordcount.case.sensitive", true);
            if (conf.getBoolean("wordcount.skip.patterns", true)) {
                URI[] patternsURIs = Job.getInstance(conf).getCacheFiles();
                // getCacheFiles() yields null when the job was submitted without -skip.
                if (patternsURIs != null) {
                    for (URI patternsURI : patternsURIs) {
                        Path patternsPath = new Path(patternsURI.getPath());
                        // Only the base name is used: the file is opened relative to the
                        // task's current working directory.
                        parseSkipFile(patternsPath.getName());
                    }
                }
            }
        }

        /**
         * Adds every line of the given local file to the skip-pattern set.
         *
         * <p>Read failures are reported on stderr and otherwise ignored, so the task keeps
         * running with whatever patterns were loaded before the failure.
         *
         * @param fileName name of the pattern file, resolved against the working directory
         */
        private void parseSkipFile(String fileName) {
            // try-with-resources closes the reader even when readLine() throws.
            try (BufferedReader reader = new BufferedReader(new FileReader(fileName))) {
                String pattern;
                while ((pattern = reader.readLine()) != null) {
                    patternsToSkip.add(pattern);
                }
            } catch (IOException ioe) {
                System.err.println("Caught exception while parsing the cached file '"
                        + fileName + "' : " + StringUtils.stringifyException(ioe));
            }
        }

        /**
         * Normalizes one input line, emits {@code (token, 1)} for each remaining token and
         * increments the {@code INPUT_WORDS} counter once per emitted token.
         *
         * @param key     input key (unused)
         * @param value   one line of input text
         * @param context sink for the emitted pairs and owner of the counter
         * @throws IOException          if writing to the context fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        public void map(Object key, Text value, Context context
        ) throws IOException, InterruptedException {
            String line = (caseSensitive) ?
                    value.toString() : value.toString().toLowerCase();
            // Patterns are regular expressions: an unescaped "." would erase the whole line.
            for (String pattern : patternsToSkip) {
                line = line.replaceAll(pattern, "");
            }
            StringTokenizer itr = new StringTokenizer(line);
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, one);
                Counter counter = context.getCounter(CountersEnum.class.getName(),
                        CountersEnum.INPUT_WORDS.toString());
                counter.increment(1);
            }
        }
    }

    public static class IntSumReducer
            extends Reducer {
        private IntWritable result = new IntWritable();

        public void reduce(Text key, Iterable values,
                           Context context
        ) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    /**
     * Parses the command line, configures the WordCount v2 job and submits it.
     *
     * <p>Expected application arguments (after generic Hadoop options have been consumed by
     * {@code GenericOptionsParser}): {@code <in> <out> [-skip skipPatternFile]}. The
     * {@code -skip} file is registered as a job cache file and the
     * {@code wordcount.skip.patterns} flag is set so the mapper loads it.
     *
     * <p>Exits with status 2 on a usage error, 0 when the job succeeds and 1 when it fails.
     *
     * @param args raw command-line arguments
     * @throws Exception if job setup or submission fails
     */
    public static void main(String[] args) throws Exception {
        final String usage = "Usage: wordcount <in> <out> [-skip skipPatternFile]";

        Configuration conf = new Configuration();
        GenericOptionsParser optionParser = new GenericOptionsParser(conf, args);
        String[] remainingArgs = optionParser.getRemainingArgs();
        // Valid shapes are "<in> <out>" (2) and "<in> <out> -skip <file>" (4).
        // The previous test "!(len != 2 || len != 4)" was always false and never fired.
        if (remainingArgs.length != 2 && remainingArgs.length != 4) {
            System.err.println(usage);
            System.exit(2);
        }
        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(WordCount2.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        List<String> otherArgs = new ArrayList<>();
        for (int i = 0; i < remainingArgs.length; ++i) {
            if ("-skip".equals(remainingArgs[i])) {
                // "-skip" as the last argument has no file name to consume.
                if (i + 1 >= remainingArgs.length) {
                    System.err.println(usage);
                    System.exit(2);
                }
                job.addCacheFile(new Path(remainingArgs[++i]).toUri());
                job.getConfiguration().setBoolean("wordcount.skip.patterns", true);
            } else {
                otherArgs.add(remainingArgs[i]);
            }
        }
        // Exactly the input and output paths must remain once -skip has been consumed.
        if (otherArgs.size() != 2) {
            System.err.println(usage);
            System.exit(2);
        }
        FileInputFormat.addInputPath(job, new Path(otherArgs.get(0)));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs.get(1)));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

2.运行方式a，和v1参数一样

打包上传和v1一样不说了，运行是如下命令：

hadoop fs -rm -r /tmp/mr_test/output
hadoop jar mrtest-1.0-SNAPSHOT.jar com.yixin.WordCount2 /tmp/mr_test/word_test.txt /tmp/mr_test/output

结果：

hadoop fs -cat /tmp/mr_test/output/*

3.新建文件patterns.txt（跳过字符的规则）,并上传到集群。

新建，编辑，保存

vim patterns.txt

\.
\,
\!
to
，
。

上传：

hadoop fs -put patterns.txt /tmp/mr_test/

运行方法b，多加2个参数，跳过不需要统计的字符

hadoop fs -rm -r /tmp/mr_test/output
hadoop jar mrtest-1.0-SNAPSHOT.jar com.yixin.WordCount2 /tmp/mr_test/word_test.txt /tmp/mr_test/output -skip /tmp/mr_test/patterns.txt

结果：

hadoop fs -cat /tmp/mr_test/output/*

【大数据笔记】- Hadoop MapReduce API

大数据系统相关栏目本月热门文章