Advanced MapReduce Exercises
I. Internet Traffic Statistics
The data format is as follows:
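The original sample rows are not reproduced here. Judging from the field indices the mapper below relies on (phone number at index 1, the four traffic fields at indices 6 through 9), each tab-separated input line looks roughly like this, where every value is a made-up placeholder:
<timestamp>	13800000000	<mac>	<ip>	<host>	<type>	24	27	2481	24681	<status>
Index 6 is the upstream packet count, 7 the downstream packet count, 8 the total upstream bytes, and 9 the total downstream bytes.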
II. Requirement: Compute Sums
For each phone number, compute the sum of upstream packet counts, the sum of downstream packet counts, the total upstream traffic, and the total downstream traffic.
1. Approach
Use the phone number as the key and the four fields (upstream packet count, downstream packet count, total upstream traffic, total downstream traffic) as the value; these key/value pairs are the output of the map stage and the input of the reduce stage.
2. Implementation
Step 1: Define FlowBean, the custom value object for the map output
import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
// If the bean is not used as K2 in MR, implementing the Writable interface is enough
public class FlowBean implements Writable{
private Integer upFlow; // upstream packet count
private Integer downFlow; // downstream packet count
private Integer upCountFlow; // total upstream traffic
private Integer downCountFlow; // total downstream traffic
public FlowBean() {
}
public FlowBean(Integer upFlow, Integer downFlow, Integer upCountFlow, Integer downCountFlow) {
this.upFlow = upFlow;
this.downFlow = downFlow;
this.upCountFlow = upCountFlow;
this.downCountFlow = downCountFlow;
}
public Integer getUpFlow() {
return upFlow;
}
@Override
public String toString() {
return upFlow + "\t" + downFlow + "\t" + upCountFlow + "\t" + downCountFlow;
}
public void setUpFlow(Integer upFlow) {
this.upFlow = upFlow;
}
public Integer getDownFlow() {
return downFlow;
}
public void setDownFlow(Integer downFlow) {
this.downFlow = downFlow;
}
public Integer getUpCountFlow() {
return upCountFlow;
}
public void setUpCountFlow(Integer upCountFlow) {
this.upCountFlow = upCountFlow;
}
public Integer getDownCountFlow() {
return downCountFlow;
}
public void setDownCountFlow(Integer downCountFlow) {
this.downCountFlow = downCountFlow;
}
// Serialization: write the fields to the output stream in a fixed order
@Override
public void write(DataOutput out) throws IOException {
out.writeInt(upFlow);
out.writeInt(downFlow);
out.writeInt(upCountFlow);
out.writeInt(downCountFlow);
}
// Deserialization: read the fields back in the same order they were written
@Override
public void readFields(DataInput in) throws IOException {
this.upFlow = in.readInt();
this.downFlow = in.readInt();
this.upCountFlow = in.readInt();
this.downCountFlow = in.readInt();
}
}
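As the comment at the top of the class notes, a bean that is used as the map output key (K2) must implement WritableComparable instead, so the framework can sort keys during the shuffle. A minimal sketch (the FlowKey class below is hypothetical, not part of this job):
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
public class FlowKey implements WritableComparable<FlowKey> {
private Integer upFlow;
@Override
public void write(DataOutput out) throws IOException {
out.writeInt(upFlow);
}
@Override
public void readFields(DataInput in) throws IOException {
this.upFlow = in.readInt();
}
// compareTo drives the shuffle sort; here, descending by upstream packet count
@Override
public int compareTo(FlowKey other) {
return other.upFlow.compareTo(this.upFlow);
}
}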
Step 2: Define the FlowCountMapper class
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class FlowCountMapper extends Mapper<LongWritable, Text, Text, FlowBean> {
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
// 1. Split the line of text (split V1)
String[] split = value.toString().split("\t");
// 2. Take the phone number from the split array; this is K2
String phoneNum = split[1];
// 3. Take the four traffic fields from the split array and wrap them in a FlowBean; this is V2
FlowBean flowBean = new FlowBean();
flowBean.setUpFlow(Integer.parseInt(split[6]));
flowBean.setDownFlow(Integer.parseInt(split[7]));
flowBean.setUpCountFlow(Integer.parseInt(split[8]));
flowBean.setDownCountFlow(Integer.parseInt(split[9]));
// 4. Write K2 and V2 to the context
context.write(new Text(phoneNum), flowBean);
}
}
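Real logs often contain blank or malformed lines, and an uncaught NumberFormatException would fail the whole task. A common defensive pattern, sketched below with made-up counter names, is to skip bad records and count them:
// Inside map(), wrap the parsing step:
try {
    flowBean.setUpFlow(Integer.parseInt(split[6]));
    flowBean.setDownFlow(Integer.parseInt(split[7]));
    flowBean.setUpCountFlow(Integer.parseInt(split[8]));
    flowBean.setDownCountFlow(Integer.parseInt(split[9]));
} catch (ArrayIndexOutOfBoundsException | NumberFormatException e) {
    // Hypothetical group/counter names; totals show up in the job's counter report
    context.getCounter("FlowCount", "MALFORMED_LINES").increment(1);
    return; // skip this record
}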
Step 3: Define the FlowCountReducer class
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class FlowCountReducer extends Reducer<Text, FlowBean, Text, FlowBean> {
@Override
protected void reduce(Text key, Iterable<FlowBean> values, Context context) throws IOException, InterruptedException {
// 1. Define four variables to accumulate upstream packets, downstream packets,
//    total upstream traffic, and total downstream traffic
int upFlow = 0;
int downFlow = 0;
int upCountFlow = 0;
int downCountFlow = 0;
// 2. Iterate over the values and add up the four traffic fields of every FlowBean
for (FlowBean flowBean : values) {
upFlow += flowBean.getUpFlow();
downFlow += flowBean.getDownFlow();
upCountFlow += flowBean.getUpCountFlow();
downCountFlow += flowBean.getDownCountFlow();
}
// 3. K3 is the original K2; V3 is a new FlowBean holding the four sums
FlowBean flowBean = new FlowBean(upFlow, downFlow, upCountFlow, downCountFlow);
// 4. Write K3 and V3 to the context
context.write(key, flowBean);
}
}
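One detail worth knowing: while iterating over values, Hadoop deserializes every record into the same reused FlowBean instance. Copying the primitive fields out, as the loop above does, is safe; caching references is not. An illustrative sketch of the pitfall:
import java.util.ArrayList;
import java.util.List;
// WRONG: every element of the list points at the same reused object,
// so after the loop they all hold the last record's values
List<FlowBean> cached = new ArrayList<>();
for (FlowBean flowBean : values) {
    cached.add(flowBean);
}
// If caching is really needed, copy each bean instead:
// cached.add(new FlowBean(flowBean.getUpFlow(), flowBean.getDownFlow(),
//         flowBean.getUpCountFlow(), flowBean.getDownCountFlow()));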
Step 4: The program entry point, FlowCountRunner
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import java.io.IOException;
public class FlowCountRunner {
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
// 1. Create a job object
Configuration configuration = new Configuration();
Job job = Job.getInstance(configuration, "flowcount_demo");
// 2. Specify the jar that contains this job
job.setJarByClass(FlowCountRunner.class);
// 3. Specify the input format class and the input path
job.setInputFormatClass(TextInputFormat.class); // read line by line
//TextInputFormat.addInputPath(job, new Path("hdfs://node1:8020/input/wordcount")); // only the directory holding the source files is needed
TextInputFormat.addInputPath(job, new Path("file:///E:\\input\\flowcount")); // only the directory holding the source files is needed
// 4. Specify the custom Mapper class and the K2/V2 types
job.setMapperClass(FlowCountMapper.class); // Mapper class
job.setMapOutputKeyClass(Text.class); // K2 type
job.setMapOutputValueClass(FlowBean.class); // V2 type
// 5. Specify a custom partitioner class (if any)
// 6. Specify a custom grouping comparator class (if any)
// 7. Specify a custom Combiner class (if any); see the note below
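// Because the reduce logic is a pure sum (associative and commutative), the reducer
// can double as the combiner to shrink shuffle traffic; optional and safe for this job:
// job.setCombinerClass(FlowCountReducer.class);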
// Set the number of ReduceTasks (if needed)
// 8. Specify the custom Reducer class and the K3/V3 types
job.setReducerClass(FlowCountReducer.class); // Reducer class
job.setOutputKeyClass(Text.class); // K3 type
job.setOutputValueClass(FlowBean.class); // V3 type
// 9. Specify the output format class and the output path
job.setOutputFormatClass(TextOutputFormat.class);
//TextOutputFormat.setOutputPath(job, new Path("hdfs://node1:8020/output/wordcount")); // the target directory must not exist yet, or the job fails
TextOutputFormat.setOutputPath(job, new Path("file:///E:\\output\\flowcount")); // the target directory must not exist yet, or the job fails
// 10. Submit the job and wait for it to finish (on a cluster this submits to YARN)
boolean bl = job.waitForCompletion(true); // true prints the task's progress
// 11. Exit with the job's status
System.exit(bl ? 0 : 1);
}
}
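The paths above are hard-coded for a local run. A common refinement, sketched here with a made-up class name FlowCountTool, is to implement Hadoop's Tool interface so the input and output paths (and -D options) come from the command line:
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class FlowCountTool extends Configured implements Tool {
    @Override
    public int run(String[] args) throws Exception {
        Job job = Job.getInstance(getConf(), "flowcount_demo");
        job.setJarByClass(FlowCountTool.class);
        job.setMapperClass(FlowCountMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(FlowBean.class);
        job.setReducerClass(FlowCountReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowBean.class);
        // Paths come from the command line instead of being hard-coded
        TextInputFormat.addInputPath(job, new Path(args[0]));
        TextOutputFormat.setOutputPath(job, new Path(args[1]));
        return job.waitForCompletion(true) ? 0 : 1;
    }
    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new FlowCountTool(), args));
    }
}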