
Day 48 of Xiaopang Learning Big Data

Notes: the input/output format classes (InputFormat / OutputFormat)
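(This job uses the default TextInputFormat: the map input key is the byte offset of each line, a LongWritable, and the value is the line itself, a Text.)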

Map output is first written to a circular (ring) buffer, 100 MB by default; when the buffer reaches 80% full, its contents spill to disk.

During each spill, records are partitioned (hash partitioning by default) and sorted within each partition (quicksort).

The spill files are then merged into a single sorted map output (merge sort).
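Those two numbers are just configuration defaults. A minimal sketch, assuming the standard Hadoop 2.x property names, of how they can be set on a job's Configuration (the values below are the defaults):

import org.apache.hadoop.conf.Configuration;

// Sketch only, not from the original job: the ring-buffer size and
// spill threshold behind the "100 MB, 80%" figures above.
Configuration conf = new Configuration();
conf.setInt("mapreduce.task.io.sort.mb", 100);            // ring buffer: 100 MB
conf.setFloat("mapreduce.map.sort.spill.percent", 0.80f); // spill at 80% full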

//
// Source code recreated from a .class file by IntelliJ IDEA
// (powered by FernFlower decompiler)
//

package org.apache.hadoop.mapred.lib;

import org.apache.hadoop.classification.InterfaceAudience.Public;
import org.apache.hadoop.classification.InterfaceStability.Stable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Partitioner;

@Public
@Stable
public class HashPartitioner<K2, V2> implements Partitioner<K2, V2> {
    public HashPartitioner() {
    }

    public void configure(JobConf job) {
    }

    // Mask off the sign bit (2147483647 == Integer.MAX_VALUE) so the result
    // of hashCode() is non-negative, then spread keys across the reducers.
    public int getPartition(K2 key, V2 value, int numReduceTasks) {
        return (key.hashCode() & 2147483647) % numReduceTasks;
    }
}
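As a quick illustration (hypothetical key, not from the original post) of the arithmetic the default partitioner applies:

// With 12 reducers, the default hash routing looks like this.
int numReduceTasks = 12;
String key = "文科一班";
int partition = (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
// Equal keys always get the same partition, so they meet in one reducer.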
[root@master ~]# rz -E
rz waiting to receive.
[root@master ~]# ls
ac.sh                                        students.txt  文档
anaconda-ks.cfg                              公共          下载
dump.rdb                                     模板          音乐
initial-setup-ks.cfg                         视频          桌面
mysql57-community-release-el7-10.noarch.rpm  图片
[root@master ~]# mv students.txt /usr/local/soft/data/
[root@master ~]# cd /usr/local/soft/data/
[root@master data]# ls
new_db.sql  student.sql   theZenOfPython.txt  wordcount
score.sql   students.txt  theZen.txt          words.txt
[root@master data]# hdfs dfs -mkdir -p  /data/stu/input
[root@master data]# hdfs dfs -put students.txt /data/stu/input
[root@master data]# cd ..
[root@master soft]# cd jars/
[root@master jars]# ls
hadoop-1.0-SNAPSHOT.jar
[root@master jars]# rm hadoop-1.0-SNAPSHOT.jar
rm: remove regular file 'hadoop-1.0-SNAPSHOT.jar'? y
[root@master jars]# rz -E
rz waiting to receive.
[root@master jars]# ls
hadoop-1.0-SNAPSHOT.jar
[root@master jars]# hadoop jar hadoop-1.0-SNAPSHOT.jar com.shujia.MapReduce.Demo02ClazzCnt
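The class being run, Demo02ClazzCnt, counts students per class and routes each class to its own reducer with a custom partitioner: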
package com.shujia.MapReduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class Demo02ClazzCnt {
    // Map side
    public static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Extract the class name (5th field), e.g.:
            // 1500100007,尚孤风,23,女,文科六班
            String clazz = value.toString().split(",")[4];
            context.write(new Text(clazz), new IntWritable(1));
        }
    }

    // Reduce side
    public static class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            // Sum the 1s emitted for this class
            int cnt = 0;
            for (IntWritable value : values) {
                cnt += value.get();
            }
            context.write(key, new IntWritable(cnt));
        }
    }

    // Driver
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://master:9000");
        // Create a MapReduce job
        Job job = Job.getInstance(conf);
        // Configure the job
        job.setJobName("Demo02ClazzCnt");
        // Set the class the job runs
        job.setJarByClass(Demo02ClazzCnt.class);
        // Number of reducers (default 1); the number of output files matches it.
        // Twelve classes, so twelve reducers: one per class.
        job.setNumReduceTasks(12);
        // Use the custom partitioner below
        job.setPartitionerClass(ClassPartitioner.class);

        // Configure the map side
        // Which class runs as the mapper
        job.setMapperClass(MyMapper.class);
        // Map output key type
        job.setMapOutputKeyClass(Text.class);
        // Map output value type
        job.setMapOutputValueClass(IntWritable.class);

        // Configure the reduce side
        // Which class runs as the reducer
        job.setReducerClass(MyReducer.class);
        // Reduce output key type
        job.setOutputKeyClass(Text.class);
        // Reduce output value type
        job.setOutputValueClass(IntWritable.class);

        // Configure input and output paths
        FileInputFormat.addInputPath(job, new Path("/data/stu/input"));
        Path path = new Path("/data/stu/output");
        FileSystem fs = FileSystem.get(conf);
        // Delete the output path if it already exists;
        // otherwise job submission fails.
        if (fs.exists(path)) {
            fs.delete(path, true);
        }
        FileOutputFormat.setOutputPath(job, path);

        // Submit the job and wait for completion
        job.waitForCompletion(true);
    }
    
}
// Custom partitioner: route each of the 12 class names to its own partition
class ClassPartitioner extends Partitioner<Text, IntWritable> {

    @Override
    public int getPartition(Text key, IntWritable value, int numReduces) {
        String clazz = key.toString();
        switch (clazz) {
            case "文科一班":
                return 0;
            case "文科二班":
                return 1;
            case "文科三班":
                return 2;
            case "文科四班":
                return 3;
            case "文科五班":
                return 4;
            case "文科六班":
                return 5;
            case "理科一班":
                return 6;
            case "理科二班":
                return 7;
            case "理科三班":
                return 8;
            case "理科四班":
                return 9;
            case "理科五班":
                return 10;
            case "理科六班":
                return 11;
        }
        // Any unexpected class name falls back to partition 0
        return 0;
    }
}
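With 12 reducers and this partitioner, each class lands in its own output file, part-r-00000 through part-r-00011. The two settings must stay in sync: if the reducer count were lowered below 12, getPartition could return an index outside the valid range and the job would fail with an illegal-partition error.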
[root@master jars]# ls
hadoop-1.0-SNAPSHOT.jar
[root@master jars]# rm hadoop-1.0-SNAPSHOT.jar
rm: remove regular file 'hadoop-1.0-SNAPSHOT.jar'? y
[root@master jars]# ls
[root@master jars]# rz -E
rz waiting to receive.
[root@master jars]# hadoop jar hadoop-1.0-SNAPSHOT.jar com.shujia.MapReduce.Demo02ClazzCnt
22/03/25 21:21:05 INFO client.RMProxy: Connecting to ResourceManager at master/192.168.49.110:8032
22/03/25 21:21:05 WARN mapreduce.JobResourceUploader: Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
22/03/25 21:21:06 INFO input.FileInputFormat: Total input paths to process : 1
22/03/25 21:21:06 INFO mapreduce.JobSubmitter: number of splits:1
22/03/25 21:21:07 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1647858149677_0007
22/03/25 21:21:07 INFO impl.YarnClientImpl: Submitted application application_1647858149677_0007
22/03/25 21:21:07 INFO mapreduce.Job: The url to track the job: http://master:8088/proxy/application_1647858149677_0007/
22/03/25 21:21:07 INFO mapreduce.Job: Running job: job_1647858149677_0007
22/03/25 21:21:14 INFO mapreduce.Job: Job job_1647858149677_0007 running in uber mode : false
22/03/25 21:21:14 INFO mapreduce.Job:  map 0% reduce 0%
22/03/25 21:21:19 INFO mapreduce.Job:  map 100% reduce 0%
22/03/25 21:21:29 INFO mapreduce.Job:  map 100% reduce 8%
22/03/25 21:21:31 INFO mapreduce.Job:  map 100% reduce 17%
22/03/25 21:21:36 INFO mapreduce.Job:  map 100% reduce 25%
22/03/25 21:21:39 INFO mapreduce.Job:  map 100% reduce 33%
22/03/25 21:21:40 INFO mapreduce.Job:  map 100% reduce 42%
22/03/25 21:21:41 INFO mapreduce.Job:  map 100% reduce 67%
22/03/25 21:21:44 INFO mapreduce.Job:  map 100% reduce 92%
22/03/25 21:21:45 INFO mapreduce.Job:  map 100% reduce 100%
22/03/25 21:21:45 INFO mapreduce.Job: Job job_1647858149677_0007 completed successfully
22/03/25 21:21:46 INFO mapreduce.Job: Counters: 50
	File System Counters
		FILE: Number of bytes read=19072
		FILE: Number of bytes written=1635424
		FILE: Number of read operations=0
		FILE: Number of large read operations=0
		FILE: Number of write operations=0
		HDFS: Number of bytes read=42109
		HDFS: Number of bytes written=193
		HDFS: Number of read operations=39
		HDFS: Number of large read operations=0
		HDFS: Number of write operations=24
	Job Counters 
		Killed reduce tasks=1
		Launched map tasks=1
		Launched reduce tasks=12
		Data-local map tasks=1
		Total time spent by all maps in occupied slots (ms)=3009
		Total time spent by all reduces in occupied slots (ms)=183989
		Total time spent by all map tasks (ms)=3009
		Total time spent by all reduce tasks (ms)=183989
		Total vcore-milliseconds taken by all map tasks=3009
		Total vcore-milliseconds taken by all reduce tasks=183989
		Total megabyte-milliseconds taken by all map tasks=3081216
		Total megabyte-milliseconds taken by all reduce tasks=188404736
	Map-Reduce framework
		Map input records=1000
		Map output records=1000
		Map output bytes=17000
		Map output materialized bytes=19072
		Input split bytes=111
		Combine input records=0
		Combine output records=0
		Reduce input groups=12
		Reduce shuffle bytes=19072
		Reduce input records=1000
		Reduce output records=12
		Spilled Records=2000
		Shuffled Maps =12
		Failed Shuffles=0
		Merged Map outputs=12
		GC time elapsed (ms)=1383
		CPU time spent (ms)=8230
		Physical memory (bytes) snapshot=1330892800
		Virtual memory (bytes) snapshot=27107094528
		Total committed heap usage (bytes)=324173824
	Shuffle Errors
		BAD_ID=0
		CONNECTION=0
		IO_ERROR=0
		WRONG_LENGTH=0
		WRONG_MAP=0
		WRONG_REDUCE=0
	File Input Format Counters 
		Bytes Read=41998
	File Output Format Counters 
		Bytes Written=193
[root@master jars]# hdfs dfs -cat /data/stu/output/part-r-00000
文科一班	72
[root@master jars]# hdfs dfs -cat /data/stu/output/part-r-00001
文科二班	87
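As expected from the partition indices, each part file holds exactly one class: part-r-00000 is 文科一班, part-r-00001 is 文科二班, and so on through part-r-00011. The counters above agree: Launched reduce tasks=12 and Reduce input groups=12 match the twelve classes.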
