Thanks to Mr. Duan Haitao (段海涛).
FlowBean.java (shared by all the jobs below)
```java
package club.drguo.mapreduce.flowcount;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

// Implements Hadoop's serialization interface (Writable + Comparable)
public class FlowBean implements WritableComparable<FlowBean> {
    // phone number
    private String phoneNum;
    // upload traffic
    private long up_flow;
    // download traffic
    private long down_flow;
    // total traffic
    private long sum_flow;

    public void set(String phoneNum, long up_flow, long down_flow) {
        this.phoneNum = phoneNum;
        this.up_flow = up_flow;
        this.down_flow = down_flow;
        this.sum_flow = up_flow + down_flow;
    }

    /**
     * Serialization: write each field out as a byte stream
     */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(phoneNum);
        out.writeLong(up_flow);
        out.writeLong(down_flow);
        out.writeLong(sum_flow);
    }

    /**
     * Deserialization: read each field back from the byte stream.
     * The read order and field types must match the write order exactly.
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        phoneNum = in.readUTF();
        up_flow = in.readLong();
        down_flow = in.readLong();
        sum_flow = in.readLong();
    }

    public String getPhoneNum() {
        return phoneNum;
    }

    public void setPhoneNum(String phoneNum) {
        this.phoneNum = phoneNum;
    }

    public long getUp_flow() {
        return up_flow;
    }

    public void setUp_flow(long up_flow) {
        this.up_flow = up_flow;
    }

    public long getDown_flow() {
        return down_flow;
    }

    public void setDown_flow(long down_flow) {
        this.down_flow = down_flow;
    }

    public long getSum_flow() {
        return sum_flow;
    }

    public void setSum_flow(long sum_flow) {
        this.sum_flow = sum_flow;
    }

    // Without this override, the output file would contain object hashes
    // instead of the traffic values
    @Override
    public String toString() {
        return up_flow + "\t" + down_flow + "\t" + sum_flow;
    }

    // Comparison for sorting (by total traffic, descending).
    // Break ties on the phone number: this honors the compareTo contract
    // (returning 0 only for true duplicates) without letting distinct phones
    // with equal totals collapse into one reduce group.
    @Override
    public int compareTo(FlowBean o) {
        int cmp = Long.compare(o.getSum_flow(), this.sum_flow);
        return cmp != 0 ? cmp : this.phoneNum.compareTo(o.getPhoneNum());
    }
}
```
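Since write() and readFields() must stay mirror images of each other, it is worth sanity-checking the round trip outside of Hadoop. Below is a minimal sketch (the phone number and traffic values are made-up samples) that serializes a FlowBean into a byte buffer and reads it back:

```java
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

import club.drguo.mapreduce.flowcount.FlowBean;

public class FlowBeanRoundTripTest {
    public static void main(String[] args) throws IOException {
        FlowBean original = new FlowBean();
        original.set("13726230503", 2481, 24681); // made-up sample values

        // Serialize the way Hadoop would: write() into a byte stream
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        original.write(new DataOutputStream(buffer));

        // Deserialize into a fresh bean; the fields should come back identical
        FlowBean copy = new FlowBean();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(buffer.toByteArray())));
        System.out.println(copy.getPhoneNum() + "\t" + copy); // 13726230503  2481  24681  27162
    }
}
```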
FlowCount.java (this time the Mapper and Reducer are both nested in a single class)

```java
package club.drguo.mapreduce.flowcount;

import java.io.IOException;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

//club.drguo.mapreduce.flowcount.FlowCount
public class FlowCount {
    public static class FlowCountMapper extends Mapper<LongWritable, Text, Text, FlowBean> {
        // Reuse one bean to cut memory use (a new object per map() call
        // would pile up garbage for the GC)
        private FlowBean flowBean = new FlowBean();

        @Override
        protected void map(LongWritable key, Text value,
                Mapper<LongWritable, Text, Text, FlowBean>.Context context)
                throws IOException, InterruptedException {
            try {
                // Take one line of input
                String line = value.toString();
                // Split it into fields
                String[] strings = StringUtils.split(line, "\t");
                // Pick out the fields we need
                String phoneNum = strings[1];
                long up_flow = Long.parseLong(strings[strings.length - 3]);
                long down_flow = Long.parseLong(strings[strings.length - 2]);
                // Pack them into a flowbean
                flowBean.set(phoneNum, up_flow, down_flow);
                // Emit the traffic data keyed by phone number
                context.write(new Text(phoneNum), flowBean);
            } catch (Exception e) {
                System.out.println("-----------------mapper hit a problem");
            }
        }
    }

    public static class FlowCountReducer extends Reducer<Text, FlowBean, Text, FlowBean> {
        // Reuse one bean to cut memory use
        private FlowBean flowBean = new FlowBean();

        @Override
        protected void reduce(Text key, Iterable<FlowBean> values,
                Reducer<Text, FlowBean, Text, FlowBean>.Context context)
                throws IOException, InterruptedException {
            long up_flow_sum = 0;
            long down_flow_sum = 0;
            for (FlowBean bean : values) {
                up_flow_sum += bean.getUp_flow();
                down_flow_sum += bean.getDown_flow();
            }
            flowBean.set(key.toString(), up_flow_sum, down_flow_sum);
            context.write(key, flowBean);
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration configuration = new Configuration();
        // configuration.set("mapreduce.job.jar", "flowcount.jar");
        Job job = Job.getInstance(configuration, "flowjob");
        job.setJarByClass(FlowCount.class);

        job.setMapperClass(FlowCountMapper.class);
        job.setReducerClass(FlowCountReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(FlowBean.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowBean.class);

        // Optional: these are already the defaults
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        // Paths are not hard-coded; pass them on the command line
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        boolean b = job.waitForCompletion(true);
        System.out.println(b ? "finished" : "not finished");
    }
}
```
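The mapper indexes fields from both ends of the line because only the phone number (column 2) and the two traffic columns near the end sit at fixed positions. A quick local sketch against a hypothetical log line (the exact column layout is an assumption based on the indices used above):

```java
import org.apache.commons.lang.StringUtils;

public class FieldIndexDemo {
    public static void main(String[] args) {
        // Hypothetical log line: id, phone, ..., up_flow, down_flow, status
        String line = "1363157985066\t13726230503\t120.196.100.82\t2481\t24681\t200";
        String[] strings = StringUtils.split(line, "\t");
        String phoneNum = strings[1];                                  // 13726230503
        long up_flow = Long.parseLong(strings[strings.length - 3]);    // 2481
        long down_flow = Long.parseLong(strings[strings.length - 2]);  // 24681
        System.out.println(phoneNum + "\t" + up_flow + "\t" + down_flow);
    }
}
```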
Custom sorting
FlowCountSort.java (note: this job takes the output of FlowCount above and sorts it in descending order of total traffic)
```java
package club.drguo.mapreduce.flowcount;

import java.io.IOException;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

//club.drguo.mapreduce.flowcount.FlowCountSort
public class FlowCountSort {
    public static class FlowCountSortMapper extends Mapper<LongWritable, Text, FlowBean, NullWritable> {
        private FlowBean bean = new FlowBean();

        @Override
        protected void map(LongWritable key, Text value,
                Mapper<LongWritable, Text, FlowBean, NullWritable>.Context context)
                throws IOException, InterruptedException {
            // Input lines are FlowCount's output: phone \t up \t down \t sum
            String line = value.toString();
            String[] strings = StringUtils.split(line, "\t");
            String phoneNum = strings[0];
            long up_flow = Long.parseLong(strings[1]);
            long down_flow = Long.parseLong(strings[2]);
            bean.set(phoneNum, up_flow, down_flow);
            // The bean itself is the key, so the shuffle sorts by FlowBean.compareTo
            context.write(bean, NullWritable.get());
        }
    }

    public static class FlowCountSortReducer extends Reducer<FlowBean, NullWritable, Text, FlowBean> {
        @Override
        protected void reduce(FlowBean bean, Iterable<NullWritable> values,
                Reducer<FlowBean, NullWritable, Text, FlowBean>.Context context)
                throws IOException, InterruptedException {
            context.write(new Text(bean.getPhoneNum()), bean);
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration, "sortjob");
        job.setJarByClass(FlowCountSort.class);

        job.setMapperClass(FlowCountSortMapper.class);
        job.setReducerClass(FlowCountSortReducer.class);

        job.setMapOutputKeyClass(FlowBean.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowBean.class);

        // Optional: these are already the defaults
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        // Paths are not hard-coded; pass them on the command line
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        boolean b = job.waitForCompletion(true);
        System.out.println(b ? "finished" : "not finished");
    }
}
```
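The sorting happens entirely in the shuffle: because FlowBean is the map output key, MapReduce orders records with FlowBean.compareTo before they reach the reducer. The same comparator can be exercised locally; a minimal sketch with made-up sample values:

```java
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import club.drguo.mapreduce.flowcount.FlowBean;

public class SortOrderDemo {
    public static void main(String[] args) {
        List<FlowBean> beans = new ArrayList<>();
        FlowBean a = new FlowBean(); a.set("13726230503", 100, 200); // sum 300
        FlowBean b = new FlowBean(); b.set("13560439658", 500, 400); // sum 900
        FlowBean c = new FlowBean(); c.set("13480253104", 50, 50);   // sum 100
        Collections.addAll(beans, a, b, c);

        // Uses FlowBean.compareTo: descending by sum_flow
        Collections.sort(beans);
        for (FlowBean bean : beans) {
            System.out.println(bean.getPhoneNum() + "\t" + bean); // sums 900, 300, 100
        }
    }
}
```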
Custom partitioning
FlowBean.java (the same shared bean as above)
AreaPartitioner.java
```java
package club.drguo.mapreduce.partition;

import java.util.HashMap;

import org.apache.hadoop.mapreduce.Partitioner;

public class AreaPartitioner<KEY, VALUE> extends Partitioner<KEY, VALUE> {
    // phone-number prefix -> area code
    private static HashMap<String, Integer> areaMap = new HashMap<>();

    // Static initializer: load the lookup table into memory up front
    static {
        areaMap.put("134", 0);
        areaMap.put("135", 1);
        areaMap.put("137", 2);
        areaMap.put("138", 3);
    }

    @Override
    public int getPartition(KEY key, VALUE value, int numPartitions) {
        // Route by the first three digits of the phone number;
        // unknown prefixes all fall into partition 4
        Integer provinceCode = areaMap.get(key.toString().substring(0, 3));
        return provinceCode == null ? 4 : provinceCode;
    }
}
```
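Every map output record is routed to a reduce task by getPartition, so partitions 0 through 3 collect the four known prefixes and partition 4 is the catch-all. A small local sketch of the routing (the phone numbers are made-up samples):

```java
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;

import club.drguo.mapreduce.partition.AreaPartitioner;

public class PartitionDemo {
    public static void main(String[] args) {
        AreaPartitioner<Text, NullWritable> partitioner = new AreaPartitioner<>();
        String[] phones = {"13426230503", "13560439658", "13726230503", "13826544101", "15013685858"};
        for (String phone : phones) {
            // 134 -> 0, 135 -> 1, 137 -> 2, 138 -> 3, anything else -> 4
            int partition = partitioner.getPartition(new Text(phone), NullWritable.get(), 5);
            System.out.println(phone + " -> partition " + partition);
        }
    }
}
```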
FlowCountPartition.java

```java
package club.drguo.mapreduce.partition;

import java.io.IOException;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import club.drguo.mapreduce.flowcount.FlowBean;

//club.drguo.mapreduce.partition.FlowCountPartition
public class FlowCountPartition {
    public static class FlowCountPartitionMapper extends Mapper<LongWritable, Text, Text, FlowBean> {
        // Reuse one bean to cut memory use
        private FlowBean flowBean = new FlowBean();

        @Override
        protected void map(LongWritable key, Text value,
                Mapper<LongWritable, Text, Text, FlowBean>.Context context)
                throws IOException, InterruptedException {
            try {
                // Take one line of input
                String line = value.toString();
                // Split it into fields
                String[] strings = StringUtils.split(line, "\t");
                // Pick out the fields we need
                String phoneNum = strings[1];
                long up_flow = Long.parseLong(strings[strings.length - 3]);
                long down_flow = Long.parseLong(strings[strings.length - 2]);
                // Pack them into a flowbean
                flowBean.set(phoneNum, up_flow, down_flow);
                // Emit the traffic data keyed by phone number
                context.write(new Text(phoneNum), flowBean);
            } catch (Exception e) {
                System.out.println("-----------------mapper hit a problem");
            }
        }
    }

    // Emits the phone number and the aggregated flowbean
    public static class FlowCountPartitionReducer extends Reducer<Text, FlowBean, Text, FlowBean> {
        // Reuse one bean to cut memory use
        private FlowBean flowBean = new FlowBean();

        @Override
        protected void reduce(Text key, Iterable<FlowBean> values,
                Reducer<Text, FlowBean, Text, FlowBean>.Context context)
                throws IOException, InterruptedException {
            long up_flow_sum = 0;
            long down_flow_sum = 0;
            for (FlowBean bean : values) {
                up_flow_sum += bean.getUp_flow();
                down_flow_sum += bean.getDown_flow();
            }
            flowBean.set(key.toString(), up_flow_sum, down_flow_sum);
            context.write(key, flowBean);
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration, "flowpartjob");
        job.setJarByClass(FlowCountPartition.class);

        job.setMapperClass(FlowCountPartitionMapper.class);
        job.setReducerClass(FlowCountPartitionReducer.class);

        /**
         * Plug in the custom partitioner: AreaPartitioner
         */
        job.setPartitionerClass(AreaPartitioner.class);

        /**
         * The number of reduce tasks must match the number of partitions
         * AreaPartitioner can return:
         * - more reduce tasks than partitions: the extra ones just produce empty output files
         * - fewer reduce tasks: an exception, because some keys have no reduce task to receive them
         * - exactly 1 reduce task: no exception, since every key goes to that single task
         * (reduce tasks and map tasks are the running instances of the Reducer
         * and Mapper on the cluster)
         */
        job.setNumReduceTasks(5);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(FlowBean.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowBean.class);

        // Optional: these are already the defaults
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        // Paths are not hard-coded; pass them on the command line
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        boolean b = job.waitForCompletion(true);
        System.out.println(b ? "finished" : "not finished");
    }
}
```

Source code: http://git.oschina.net/drguo/MapReduceDemo