Encapsulating Objects with Serialization
Split the input CSV by employee ID so that each record becomes one employee, and store each employee as an Employee object. Each row of the input is id,name,department_id,salary.
Data processing walkthrough
- employee_noheader.csv
1,ZhangSan,101,5000
2,LiSi,102,6000
3,WangWu,101,5500
4,ZhaoLiu,103,7000
5,SunQi,102,6500
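Before running the job, the input file needs to be on HDFS. A minimal sketch, assuming the /input directory used by the run command at the end:
hdfs dfs -mkdir -p /input
hdfs dfs -put employee_noheader.csv /input/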
- pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.hadoop</groupId>
    <artifactId>Mapreduce_Writable</artifactId>
    <version>1.0-SNAPSHOT</version>
    <name>Mapreduce_Writable</name>
    <description>wunaiieq</description>
    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <!--Version management-->
        <hadoop.version>2.7.3</hadoop.version>
    </properties>
    <dependencies>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-core</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-yarn-api</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-streaming</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
    </dependencies>
    <!--Build configuration-->
    <build>
        <plugins>
            <plugin>
                <!--Plugin coordinates-->
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>3.3.0</version>
                <!--Plugin configuration-->
                <configuration>
                    <archive>
                        <manifest>
                            <!--Entry point of the executable JAR-->
                            <mainClass>com.hadoop.Main</mainClass>
                        </manifest>
                    </archive>
                    <descriptorRefs>
                        <!--Predefined descriptor: build an executable JAR that bundles all
                            project dependencies; custom descriptors can tailor the JAR contents-->
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <!--Execution configuration-->
                <executions>
                    <execution>
                        <!--Execution ID; may be renamed-->
                        <id>make-assembly</id>
                        <!--Lifecycle phase this execution is bound to-->
                        <phase>package</phase>
                        <goals>
                            <!--Goal to run; "single" assembles one distribution package-->
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
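Because the assembly plugin is bound to the package phase, a single Maven command produces the fat JAR:
mvn clean package
This writes target/Mapreduce_Writable-1.0-SNAPSHOT-jar-with-dependencies.jar (the name follows from the coordinates above), which I assume was renamed to the Mapreduce_Writable.jar used in the run command at the end.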
- Main
package com.hadoop;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;

public class Main {
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        Job job = Job.getInstance(new Configuration());
        job.setJarByClass(Main.class);
        //map stage
        job.setMapperClass(Map_1.class);
        job.setMapOutputKeyClass(IntWritable.class);//k2
        job.setMapOutputValueClass(Employee.class);//v2
        //job output types
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(Employee.class);
        //input and output paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        //run the job
        job.waitForCompletion(true);
    }
}
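No Reducer is configured, so Hadoop falls back to the identity Reducer: every (id, Employee) pair from the map stage is forwarded unchanged, and it is during this shuffle that Employee.write() and Employee.readFields() actually get exercised. If no grouping is needed at all, a map-only variant is possible (a sketch, not part of the original code):
//optional: skip the shuffle entirely and let the mappers write the final output
job.setNumReduceTasks(0);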
- Map_1
package com.hadoop;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;

//input line format: 1,ZhangSan,101,5000
public class Map_1 extends Mapper<LongWritable, Text, IntWritable, Employee> {
    @Override
    protected void map(LongWritable k1, Text v1, Context context)
            throws IOException, InterruptedException {
        //read one line of input
        String data = v1.toString();
        //split it into fields
        String[] words = data.split(",");
        Employee e = new Employee();
        //build v2: the output value is the object e itself, one distinct object per record
        e.setId(Integer.parseInt(words[0]));
        e.setName(words[1]);
        e.setDepartment_id(Integer.parseInt(words[2]));
        e.setSalary(Integer.parseInt(words[3]));
        context.write(new IntWritable(e.getId()), e);
    }
}
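The input file is deliberately header-free: a header row such as id,name,department_id,salary would make Integer.parseInt throw a NumberFormatException and fail the task. A more defensive map() is sketched below (my addition, not the original mapper); it skips blank, short, or non-numeric rows:
@Override
protected void map(LongWritable k1, Text v1, Context context)
        throws IOException, InterruptedException {
    String line = v1.toString().trim();
    if (line.isEmpty()) return;                //skip blank lines
    String[] words = line.split(",");
    if (words.length != 4) return;             //skip rows with the wrong column count
    try {
        Employee e = new Employee();
        e.setId(Integer.parseInt(words[0]));
        e.setName(words[1]);
        e.setDepartment_id(Integer.parseInt(words[2]));
        e.setSalary(Integer.parseInt(words[3]));
        context.write(new IntWritable(e.getId()), e);
    } catch (NumberFormatException ignored) {
        //skip a header row or any row whose numeric fields fail to parse
    }
}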
- Employee
package com.hadoop;
import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

//record format: 1,ZhangSan,101,5000
public class Employee implements Writable {
    private int id;
    private String name;
    private int department_id;
    private int salary;

    public int getId() {
        return id;
    }
    public void setId(int id) {
        this.id = id;
    }
    public String getName() {
        return name;
    }
    public void setName(String name) {
        this.name = name;
    }
    public int getDepartment_id() {
        return department_id;
    }
    public void setDepartment_id(int department_id) {
        this.department_id = department_id;
    }
    public int getSalary() {
        return salary;
    }
    public void setSalary(int salary) {
        this.salary = salary;
    }

    //toString, used to inspect the final output
    @Override
    public String toString() {
        return "Employee{" +
                "id=" + id +
                ", name='" + name + '\'' +
                ", department_id=" + department_id +
                ", salary=" + salary +
                '}';
    }

    //serialization and deserialization must stay consistent: same field order, same types
    @Override
    public void write(DataOutput output) throws IOException {
        //serialization: write the fields out
        output.writeInt(this.id);
        output.writeUTF(this.name);
        output.writeInt(this.department_id);
        output.writeInt(this.salary);
    }
    @Override
    public void readFields(DataInput input) throws IOException {
        //deserialization: read the fields back in the same order
        this.id = input.readInt();
        this.name = input.readUTF();
        this.department_id = input.readInt();
        this.salary = input.readInt();
    }
}
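Because write() and readFields() must process the fields in exactly the same order and with the same types, a quick local round trip is a cheap sanity check. A minimal sketch (the EmployeeRoundTrip class is hypothetical, not part of the job):
package com.hadoop;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class EmployeeRoundTrip {
    public static void main(String[] args) throws IOException {
        Employee in = new Employee();
        in.setId(1);
        in.setName("ZhangSan");
        in.setDepartment_id(101);
        in.setSalary(5000);
        //serialize with write(), exactly as the shuffle would
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        in.write(new DataOutputStream(bytes));
        //deserialize with readFields() and confirm the fields survive the round trip
        Employee out = new Employee();
        out.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));
        System.out.println(out);//Employee{id=1, name='ZhangSan', department_id=101, salary=5000}
    }
}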
- Build the JAR + deploy + run (some steps omitted)
Run
hadoop jar Mapreduce_Writable.jar /input/employee_noheader.csv /output/Writable
- Result
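Given the default identity reducer and TextOutputFormat (key, tab, value.toString()), part-r-00000 under /output/Writable should look roughly like this:
1	Employee{id=1, name='ZhangSan', department_id=101, salary=5000}
2	Employee{id=2, name='LiSi', department_id=102, salary=6000}
3	Employee{id=3, name='WangWu', department_id=101, salary=5500}
4	Employee{id=4, name='ZhaoLiu', department_id=103, salary=7000}
5	Employee{id=5, name='SunQi', department_id=102, salary=6500}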