使用 MapReduce 对 CSV 文件按分组求某一列的平均值
查询每个部门的平均工资,最后输出
数据处理过程
employee_noheader.csv(没做关于首行的处理,运行时请自行删除)
EmployeeID,EmployeeName,DepartmentID,Salary
1,ZhangSan,101,5000
2,LiSi,102,6000
3,WangWu,101,5500
4,ZhaoLiu,103,7000
5,SunQi,102,6500
- pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build for the com.hadoop MapReduce job: declares the Hadoop 2.7.3
     dependencies and uses maven-assembly-plugin to package a jar-with-dependencies
     whose manifest entry point is com.hadoop.Main. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.hadoop</groupId>
<artifactId>Mapreduce_csv_average</artifactId>
<version>1.0-SNAPSHOT</version>
<name>Mapreduce_csv_average</name>
<description>wunaiieq</description>
<properties>
<maven.compiler.source>8</maven.compiler.source>
<maven.compiler.target>8</maven.compiler.target>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<!-- Single Hadoop version shared by every Hadoop dependency below -->
<hadoop.version>2.7.3</hadoop.version>
</properties>
<dependencies>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-core</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-yarn-api</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-streaming</artifactId>
<version>${hadoop.version}</version>
</dependency>
</dependencies>
<!-- Build configuration -->
<build>
<plugins>
<plugin>
<!-- Plugin coordinates -->
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId>
<version>3.3.0</version>
<!-- Plugin configuration -->
<configuration>
<archive>
<manifest>
<!-- Entry point (Main-Class) written into the jar manifest -->
<mainClass>com.hadoop.Main</mainClass>
</manifest>
</archive>
<descriptorRefs>
<!-- Predefined descriptor: builds an executable JAR that bundles all of the
project's dependencies. A custom descriptor can be used instead to control the JAR's contents. -->
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
</configuration>
<!-- Execution configuration -->
<executions>
<execution>
<!-- Execution id; may be changed freely -->
<id>make-assembly</id>
<!-- Lifecycle phase this execution is bound to -->
<phase>package</phase>
<goals>
<!-- Goal to run; "single" creates one distribution package -->
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>
- Map_1
package com.hadoop;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
 * Mapper that turns one CSV employee record into a (department id, salary) pair.
 *
 * <p>Input lines are expected to look like {@code 1,ZhangSan,101,5000}
 * (EmployeeID,EmployeeName,DepartmentID,Salary). Lines that do not have at least
 * four comma-separated fields, or whose department/salary fields are not integers
 * (for example the CSV header row or a blank line), are skipped and counted in the
 * {@code Map_1 / SKIPPED_RECORDS} job counter instead of failing the task.
 */
public class Map_1 extends Mapper<LongWritable, Text, IntWritable, IntWritable> {
    /** Zero-based index of the DepartmentID column. */
    private static final int DEPARTMENT_COLUMN = 2;
    /** Zero-based index of the Salary column. */
    private static final int SALARY_COLUMN = 3;
    /** Counter group and name used to report records that were skipped. */
    private static final String COUNTER_GROUP = "Map_1";
    private static final String SKIPPED_COUNTER = "SKIPPED_RECORDS";

    /**
     * Parses one CSV line and emits its department id as the key and its salary as the value.
     *
     * @param k1 key supplied by the input format for this line (not used here)
     * @param v1 one line of the CSV file
     * @param context Hadoop context used to emit the output pair and update counters
     * @throws IOException if writing the output pair fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(LongWritable k1, Text v1, Context context)
            throws IOException, InterruptedException {
        // CSV fields are separated by ","
        String[] words = v1.toString().split(",");
        if (words.length <= SALARY_COLUMN) {
            // Blank or truncated line: nothing meaningful to emit.
            context.getCounter(COUNTER_GROUP, SKIPPED_COUNTER).increment(1);
            return;
        }

        int departmentId;
        int salary;
        try {
            // trim() tolerates "101, 5000"-style spacing around the separators
            departmentId = Integer.parseInt(words[DEPARTMENT_COLUMN].trim());
            salary = Integer.parseInt(words[SALARY_COLUMN].trim());
        } catch (NumberFormatException e) {
            // Header row or malformed record: skip it rather than fail the whole job.
            context.getCounter(COUNTER_GROUP, SKIPPED_COUNTER).increment(1);
            return;
        }

        // K2: department id, V2: salary
        context.write(new IntWritable(departmentId), new IntWritable(salary));
    }
}
- Reduce_1
package com.hadoop;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.io.IntWritable;
import java.io.IOException;
/**
 * Reducer that computes the average salary of one department.
 *
 * <p>The output value stays an {@link IntWritable}, so the average is the integer
 * quotient of the salary sum and the number of salaries (any fractional part is
 * discarded).
 */
public class Reduce_1 extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {
    /**
     * Sums all salaries received for a department and emits their integer average.
     *
     * @param k3 department id
     * @param v3 salaries emitted by the mapper for this department
     * @param context Hadoop context used to emit (department id, average salary)
     * @throws IOException if writing the output pair fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void reduce(IntWritable k3, Iterable<IntWritable> v3, Context context)
            throws IOException, InterruptedException {
        // long accumulators: an int sum silently overflows once the total passes ~2.1 billion
        long total = 0L;
        long count = 0L;
        for (IntWritable v : v3) {
            total += v.get();
            count++;
        }
        if (count == 0) {
            // No values to average; avoid dividing by zero.
            return;
        }
        // The average of int values always fits in an int, so the narrowing cast is safe.
        int average = (int) (total / count);
        context.write(k3, new IntWritable(average));
    }
}
- Main
package com.hadoop;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Driver that configures and submits the "average salary per department" job.
 *
 * <p>Usage: {@code hadoop jar <jar> <input path> <output path>}
 */
public class Main {
    /**
     * Builds the job from {@link Map_1} and {@link Reduce_1}, runs it, and exits with
     * status 0 on success, 1 if the job failed, or 2 if the arguments are missing.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} is the output path
     * @throws IOException if the job cannot be created or submitted
     * @throws InterruptedException if waiting for the job is interrupted
     * @throws ClassNotFoundException if a job class cannot be resolved
     */
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        if (args.length < 2) {
            // Fail with a readable message instead of an ArrayIndexOutOfBoundsException.
            System.err.println("Usage: hadoop jar <jar> <input path> <output path>");
            System.exit(2);
        }

        Job job = Job.getInstance(new Configuration());
        job.setJarByClass(Main.class);
        job.setJobName("csv-average-salary-by-department");

        // map
        job.setMapperClass(Map_1.class);
        job.setMapOutputKeyClass(IntWritable.class);   // k2: department id
        job.setMapOutputValueClass(IntWritable.class); // v2: salary

        // reduce
        job.setReducerClass(Reduce_1.class);
        job.setOutputKeyClass(IntWritable.class);      // k4: department id
        job.setOutputValueClass(IntWritable.class);    // v4: average salary

        // input and output
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // run, and propagate the job result so callers and scripts can detect failure
        boolean succeeded = job.waitForCompletion(true);
        System.exit(succeeded ? 0 : 1);
    }
}
- 运行
请自行上传至hdfs中
hadoop jar Mapreduce_csv_average-1.0-SNAPSHOT-jar-with-dependencies.jar /input/employee_noheader.csv /output/csv_average
- 效果
hdfs dfs -cat /output/csv_average/part-r-00000