
Classic MapReduce Practice Exercises

Date: 2023-06-17
The Java code for each exercise is given below. 1. The HDFS directory /tmp/input/wordcount contains a set of files whose contents are separated by commas. Count how often each comma-separated element appears and write the result to the HDFS directory /tmp/<your username>.

Implementation:

package com.tledu.hadoop.mr.homework;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;

public class HomeWork1 {

    public static class HomeWork1Mapper extends Mapper<Object, Text, Text, IntWritable> {
        Text word = new Text();
        IntWritable one = new IntWritable(1);

        @Override
        protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            // Split each line on "," and emit (element, 1) for every element
            StringTokenizer str = new StringTokenizer(value.toString(), ",");
            while (str.hasMoreTokens()) {
                word.set(str.nextToken());
                context.write(word, one);
            }
        }
    }

    public static class HomeWork1Reducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        IntWritable sumRes = new IntWritable();

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            // Sum all counts for this element
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            sumRes.set(sum);
            context.write(key, sumRes);
        }
    }

    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        // 1. Create the configuration
        Configuration conf = new Configuration();
        // 2. Create the job
        Job job = Job.getInstance(conf, "homework1");
        // 3. Set the driver, mapper, combiner and reducer classes
        job.setJarByClass(HomeWork1.class);
        job.setMapperClass(HomeWork1Mapper.class);
        job.setCombinerClass(HomeWork1Reducer.class);
        job.setReducerClass(HomeWork1Reducer.class);
        // 4. Output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // 5. Input and output paths
        // Collect all input .txt files
        List<String> fileList = getTxtFileListFromPath(args[0]);
        for (String filePath : fileList) {
            FileInputFormat.addInputPath(job, new Path(filePath));
        }
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // 6. Submit the job and wait for completion
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    public static List<String> getTxtFileListFromPath(String folderPath) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path path = new Path(folderPath);
        // 1. List everything under the folder
        FileStatus[] statuses = fs.listStatus(path);
        // Paths of the .txt files collected so far
        List<String> list = new ArrayList<>();
        for (FileStatus fileStatus : statuses) {
            Path file = fileStatus.getPath();
            // Keep .txt files
            if (fileStatus.isFile() && file.getName().endsWith(".txt")) {
                list.add(file.toString());
            }
            // Recurse into subdirectories
            else if (fileStatus.isDirectory()) {
                list.addAll(getTxtFileListFromPath(file.toString()));
            }
        }
        return list;
    }
}
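As a quick sanity check on hypothetical data: an input file containing the two lines

apple,banana,apple
banana,apple

would produce the output

apple	3
banana	2

with key and count separated by a tab, the default TextOutputFormat separator. Because integer addition is associative and commutative, the reducer can safely double as the combiner, which is why setCombinerClass reuses HomeWork1Reducer here.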

2. The HDFS directory /tmp/input/wordcount contains a set of files whose contents are separated by commas; the elements are a mix of numeric values, letters, and Chinese characters. Compute the average of all numeric values that appear.

Implementation:

package com.tledu.hadoop.mr.homework;

import com.tledu.hadoop.utils.RegUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;

public class HomeWork3 {

    public static class HomeWork3Mapper extends Mapper<Object, Text, Text, DoubleWritable> {
        // Every numeric value is emitted under the same key, so a single
        // reduce call sees all of them and can compute the global average
        Text word = new Text("sum=");
        DoubleWritable number = new DoubleWritable();

        @Override
        protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            StringTokenizer str = new StringTokenizer(value.toString(), ",");
            while (str.hasMoreTokens()) {
                String text = str.nextToken();
                // Only numeric elements count towards the average
                if (RegUtils.isNumber(text)) {
                    number.set(Double.parseDouble(text));
                    context.write(word, number);
                }
            }
        }
    }

    public static class HomeWork3Reducer extends Reducer<Text, DoubleWritable, Text, DoubleWritable> {
        DoubleWritable sumRes = new DoubleWritable();

        @Override
        protected void reduce(Text key, Iterable<DoubleWritable> values, Context context) throws IOException, InterruptedException {
            double sum = 0;
            int count = 0;
            for (DoubleWritable val : values) {
                sum += val.get();
                count++;
            }
            System.out.println(sum / count); // debug output
            sumRes.set(sum / count);
            context.write(key, sumRes);
        }
    }

    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        // 1. Create the configuration
        Configuration conf = new Configuration();
        // 2. Create the job
        Job job = Job.getInstance(conf, "homework2");
        // 3. Set the driver, mapper and reducer classes
        job.setJarByClass(HomeWork3.class);
        job.setMapperClass(HomeWork3Mapper.class);
        // job.setCombinerClass(HomeWork3Reducer.class);
        job.setReducerClass(HomeWork3Reducer.class);
        // 4. Output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(DoubleWritable.class);
        // 5. Input and output paths
        // Collect all input .txt files
        List<String> fileList = getTxtFileListFromPath(args[0]);
        for (String filePath : fileList) {
            FileInputFormat.addInputPath(job, new Path(filePath));
        }
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // 6. Submit the job and wait for completion
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    public static List<String> getTxtFileListFromPath(String folderPath) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path path = new Path(folderPath);
        // 1. List everything under the folder
        FileStatus[] statuses = fs.listStatus(path);
        // Paths of the .txt files collected so far
        List<String> list = new ArrayList<>();
        for (FileStatus fileStatus : statuses) {
            Path file = fileStatus.getPath();
            // Keep .txt files
            if (fileStatus.isFile() && file.getName().endsWith(".txt")) {
                list.add(file.toString());
            }
            // Recurse into subdirectories
            else if (fileStatus.isDirectory()) {
                list.addAll(getTxtFileListFromPath(file.toString()));
            }
        }
        return list;
    }
}
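The mapper relies on RegUtils.isNumber from com.tledu.hadoop.utils, a project-local helper whose source is not shown in the original. A minimal sketch of what it might look like, assuming it only needs to recognize signed integers and decimals (the pattern is an assumption, not the original implementation):

package com.tledu.hadoop.utils;

public class RegUtils {
    // Hypothetical reconstruction: an optional sign, digits, and an
    // optional decimal part, e.g. "42", "-3.14". Adjust the pattern if
    // the data contains scientific notation or other numeric forms.
    private static final String NUMBER_PATTERN = "[+-]?\\d+(\\.\\d+)?";

    public static boolean isNumber(String text) {
        return text != null && text.matches(NUMBER_PATTERN);
    }
}

Note that the setCombinerClass line is commented out on purpose: an average of per-split averages is not the average of the whole data set, so HomeWork3Reducer cannot double as a combiner the way the word-count reducer in exercise 1 can.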

3. The HDFS directory /tmp/input/wordcount contains a set of comma-separated files, and the HDFS path /tmp/black.txt holds a blacklist file, one word per line, listing words that should not be counted. Count how often each comma-separated element appears after removing blacklisted words, and write the result to the HDFS directory /tmp/output/<your username>.

Implementation:

package com.tledu.hadoop.mr.homework;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;

public class HomeWork5 {

    public static class HomeWork5Mapper extends Mapper<Object, Text, Text, IntWritable> {
        Text word = new Text();
        IntWritable one = new IntWritable(1);
        IntWritable blackVal = new IntWritable(0);

        @Override
        protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            String path = ((FileSplit) context.getInputSplit()).getPath().toString();
            // Extract the input file name after the last slash
            int index = path.lastIndexOf("/");
            String fileName = path.substring(index + 1);
            if (fileName.contains("blacklist")) {
                // Blacklist file: emit the word with the 0 marker
                word.set(value.toString());
                context.write(word, blackVal);
            } else {
                // Data file: emit (element, 1) for every comma-separated element
                StringTokenizer str = new StringTokenizer(value.toString(), ",");
                while (str.hasMoreTokens()) {
                    word.set(str.nextToken());
                    context.write(word, one);
                }
            }
        }
    }

    public static class HomeWork5Reducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        IntWritable sumRes = new IntWritable();

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum = 0;
            boolean isBlack = false;
            for (IntWritable val : values) {
                // A value of 0 means the key appeared in the blacklist
                if (val.get() == 0) {
                    isBlack = true;
                    break;
                }
                sum += val.get();
            }
            sumRes.set(sum);
            if (!isBlack) {
                context.write(key, sumRes);
            }
        }
    }

    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        // 1. Create the configuration
        Configuration conf = new Configuration();
        GenericOptionsParser optionParser = new GenericOptionsParser(conf, args);
        String[] remainingArgs = optionParser.getRemainingArgs();
        // 2. Create the job
        Job job = Job.getInstance(conf, "homework3");
        // 3. Set the driver, mapper and reducer classes
        job.setJarByClass(HomeWork5.class);
        job.setMapperClass(HomeWork5Mapper.class);
        // job.setCombinerClass(HomeWork5Reducer.class);
        job.setReducerClass(HomeWork5Reducer.class);
        // 4. Output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // 5. Input and output paths
        // Collect all input .txt files
        List<String> fileList = getTxtFileListFromPath(remainingArgs[0]);
        for (String filePath : fileList) {
            FileInputFormat.addInputPath(job, new Path(filePath));
        }
        FileOutputFormat.setOutputPath(job, new Path(remainingArgs[1]));
        // 6. Submit the job and wait for completion
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    public static List<String> getTxtFileListFromPath(String folderPath) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path path = new Path(folderPath);
        // 1. List everything under the folder
        FileStatus[] statuses = fs.listStatus(path);
        // Paths of the .txt files collected so far
        List<String> list = new ArrayList<>();
        for (FileStatus fileStatus : statuses) {
            Path file = fileStatus.getPath();
            // Keep .txt files
            if (fileStatus.isFile() && file.getName().endsWith(".txt")) {
                list.add(file.toString());
            }
            // Recurse into subdirectories
            else if (fileStatus.isDirectory()) {
                list.addAll(getTxtFileListFromPath(file.toString()));
            }
        }
        return list;
    }
}
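The blacklist is handled with a map-side tagging trick: lines read from any input file whose name contains "blacklist" are emitted with the marker value 0, ordinary tokens with 1, and the reducer drops every key that received a 0. On hypothetical data, with a blacklist file containing the single line hello and a data file containing hello,world,world, the job outputs only world 2. Two practical caveats follow from the code as written: the file-name check is contains("blacklist"), so the /tmp/black.txt named in the problem statement would not be recognized unless the file is renamed (e.g. to blacklist.txt) or the check is relaxed to fileName.contains("black"); and getTxtFileListFromPath only collects files ending in .txt, so the blacklist file must carry that extension and live under the input directory passed as the first argument.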
