分区
将输入的csv按照员工号拆分成每个员工,每个员工存储为员工对象,之后按每个员工的不同部门存储
- pom
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.hadoop</groupId>
<artifactId>Mapreduce_partition</artifactId>
<version>1.0-SNAPSHOT</version>
<name>Mapreduce_partition</name>
<description>wunaiieq</description>
<properties>
<maven.compiler.source>8</maven.compiler.source>
<maven.compiler.target>8</maven.compiler.target>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<!--版本控制-->
<hadoop.version>2.7.3</hadoop.version>
</properties>
<dependencies>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-core</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-yarn-api</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-streaming</artifactId>
<version>${hadoop.version}</version>
</dependency>
</dependencies>
<!--构建配置-->
<build>
<plugins>
<plugin>
<!--声明-->
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId>
<version>3.3.0</version>
<!--具体配置-->
<configuration>
<archive>
<manifest>
<!--jar包的执行入口-->
<mainClass>com.hadoop.Main</mainClass>
</manifest>
</archive>
<descriptorRefs>
<!--描述符,此处为预定义的,表示创建一个包含项目所有依赖的可执行 JAR 文件;
允许自定义生成jar文件内容-->
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
</configuration>
<!--执行配置-->
<executions>
<execution>
<!--执行配置ID,可修改-->
<id>make-assembly</id>
<!--执行的生命周期-->
<phase>package</phase>
<goals>
<!--执行的目标,single表示创建一个分发包-->
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>
- main
package com.hadoop;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import java.io.IOException;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class Main {
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
Job job = Job.getInstance(new Configuration());
job.setJarByClass(Main.class);
//map
job.setMapperClass(Map_1.class);
job.setMapOutputKeyClass(IntWritable.class);//k2
job.setMapOutputValueClass(Employee.class);//v2
//指定分区规则
job.setPartitionerClass(partition.class);
//分区个数,此处的形参3传递给partition中的num
job.setNumReduceTasks(3);
//Reduce
job.setReducerClass(Reduce_1.class);
//输出
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(Employee.class);
//输入和输出
FileInputFormat.setInputPaths(job,new Path(args[0]));
FileOutputFormat.setOutputPath(job,new Path(args[1]));
//执行
job.waitForCompletion(true);
}
}
- Map_1
package com.hadoop;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
//1,ZhangSan,101,5000
public class Map_1 extends Mapper<LongWritable, Text, IntWritable, Employee> {
@Override
protected void map(LongWritable k1, Text v1, Context context)
throws IOException, InterruptedException {
//获取数据
String data = v1.toString();
//分词
String[] words =data.split(",");
Employee e=new Employee();
//设置v2的输出内容(输出内容为对象e,这里的区别是每个对象不同,以下为属性设置)
e.setId(Integer.parseInt(words[0]));
e.setName(words[1]);
e.setDepartment_id(Integer.parseInt(words[2]));
e.setSalary(Integer.parseInt(words[3]));
context.write(new IntWritable(e.getDepartment_id()),e);
}
}
- Reduce_1
package com.hadoop;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class Reduce_1 extends Reducer<IntWritable,Employee,IntWritable,Employee> {
@Override
protected void reduce(IntWritable k3, Iterable<Employee> v3,Context context)
throws IOException, InterruptedException {
for (Employee e:v3){
context.write(k3,e);
}
}
}
- partition
package com.hadoop;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Partitioner;
//分区规则,根据map输出
public class partition extends Partitioner<IntWritable,Employee> {
//k2,v2,分区个数
@Override
public int getPartition(IntWritable k2, Employee v2, int num) {
int department_Id= v2.getDepartment_id();
//按照部门号存储不同分区
if (department_Id==101){
return 1%num;
}else if (department_Id ==102){
return 2%num;
}else {
return 3%num;
}
}
}
- 效果
输出日志,显示4个输出文件
dfs输出的文件目录
存储效果