[Pinned] The Second MapReduce Program -- flowcount (Traffic Statistics, Custom Sort, Custom Partition)

Thanks to teacher Duan Haitao.

FlowBean.java (shared by all the jobs below)

package club.drguo.mapreduce.flowcount;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
//Implements the WritableComparable interface so Hadoop can serialize and compare it
public class FlowBean implements WritableComparable<FlowBean>{
	//phone number
	private String phoneNum;
	//upstream traffic
	private long up_flow;
	//downstream traffic
	private long down_flow;
	//total traffic
	private long sum_flow;
	
	public void set(String phoneNum, long up_flow, long down_flow){
		this.phoneNum = phoneNum;
		this.up_flow = up_flow;
		this.down_flow = down_flow;
		this.sum_flow = up_flow + down_flow;
	}
	/**
	 * Serialization: write the data fields out as a byte stream
	 */
	@Override
	public void write(DataOutput out) throws IOException {
		out.writeUTF(phoneNum);
		out.writeLong(up_flow);
		out.writeLong(down_flow);
		out.writeLong(sum_flow);
	}
	/**
	 * Deserialization: read the data fields back from the byte stream
	 * The read order and data types must match the write order
	 */
	@Override
	public void readFields(DataInput in) throws IOException {
		phoneNum = in.readUTF();
		up_flow = in.readLong();
		down_flow = in.readLong();
		sum_flow = in.readLong();
	}
	
	public String getPhoneNum() {
		return phoneNum;
	}
	public void setPhoneNum(String phoneNum) {
		this.phoneNum = phoneNum;
	}
	public long getUp_flow() {
		return up_flow;
	}
	public void setUp_flow(long up_flow) {
		this.up_flow = up_flow;
	}
	public long getDown_flow() {
		return down_flow;
	}
	public void setDown_flow(long down_flow) {
		this.down_flow = down_flow;
	}
	public long getSum_flow() {
		return sum_flow;
	}
	public void setSum_flow(long sum_flow) {
		this.sum_flow = sum_flow;
	}
	//Must be overridden, otherwise the output file would contain the default Object.toString() instead of the flow values
	@Override
	public String toString() {
		return up_flow + "\t" + down_flow + "\t" + sum_flow;
	}
	//Compare by total traffic, descending (FlowCountSort uses this as the sort key)
	//Note: this never returns 0, so beans with equal totals are still treated as distinct keys
	@Override
	public int compareTo(FlowBean o) {
		return this.sum_flow > o.getSum_flow() ? -1 : 1;
	}
	
}
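
Because FlowBean implements WritableComparable, Hadoop calls write() to serialize the bean between the map and reduce phases and readFields() to rebuild it, so both methods must handle the fields in the same order and with the same types. Below is a minimal, purely illustrative round trip through plain Java streams; the class name, phone number, and byte counts are hypothetical, and it is assumed to sit in the same package as FlowBean.

package club.drguo.mapreduce.flowcount;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class FlowBeanRoundTrip {
	public static void main(String[] args) throws IOException {
		FlowBean original = new FlowBean();
		original.set("13800000000", 100, 200);

		// Serialize: write() pushes phoneNum, up_flow, down_flow, sum_flow out as bytes
		ByteArrayOutputStream buffer = new ByteArrayOutputStream();
		original.write(new DataOutputStream(buffer));

		// Deserialize: readFields() reads them back in exactly the same order
		FlowBean copy = new FlowBean();
		copy.readFields(new DataInputStream(new ByteArrayInputStream(buffer.toByteArray())));

		System.out.println(copy); // prints: 100	200	300
	}
}
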
FlowCount.java (this time the mapper and reducer are both written inside one class)

package club.drguo.mapreduce.flowcount;

import java.io.IOException;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
//club.drguo.mapreduce.flowcount.FlowCount
public class FlowCount {
	public static class FlowCountMapper extends Mapper<LongWritable, Text, Text, FlowBean> {
		// Reuse one bean to cut memory usage (creating a new bean inside map() would leave more and more objects for the GC)
		private FlowBean flowBean = new FlowBean();

		@Override
		protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, FlowBean>.Context context)
				throws IOException, InterruptedException {
			try {
				// Get one line of input
				String line = value.toString();
				// Split it into fields
				String[] strings = StringUtils.split(line, "\t");
				// Pick out the fields we need
				String phoneNum = strings[1];
				long up_flow = Long.parseLong(strings[strings.length - 3]);
				long down_flow = Long.parseLong(strings[strings.length - 2]);
				// Pack the data into a FlowBean
				flowBean.set(phoneNum, up_flow, down_flow);
				// Emit the flow data keyed by phone number
				context.write(new Text(phoneNum), flowBean);
			} catch (Exception e) {
				System.out.println("-----------------mapper出现问题");
			}
		}
	}

	public static class FlowCountReducer extends Reducer<Text, FlowBean, Text, FlowBean> {
		// Reuse one bean to cut memory usage (creating a new bean inside reduce() would leave more and more objects for the GC)
		private FlowBean flowBean = new FlowBean();

		@Override
		protected void reduce(Text key, Iterable<FlowBean> values,
				Reducer<Text, FlowBean, Text, FlowBean>.Context context) throws IOException, InterruptedException {
			long up_flow_sum = 0;
			long down_flow_sum = 0;
			for (FlowBean bean : values) {
				up_flow_sum += bean.getUp_flow();
				down_flow_sum += bean.getDown_flow();
			}
			flowBean.set(key.toString(), up_flow_sum, down_flow_sum);
			context.write(key, flowBean);
		}
	}

	public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
		Configuration configuration = new Configuration();
//		configuration.set("mapreduce.job.jar", "flowcount.jar");
		Job job = Job.getInstance(configuration, "flowjob");
		job.setJarByClass(FlowCount.class);

		job.setMapperClass(FlowCountMapper.class);
		job.setReducerClass(FlowCountReducer.class);

		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(FlowBean.class);

		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(FlowBean.class);
		// Optional: these are already the defaults
		job.setInputFormatClass(TextInputFormat.class);
		job.setOutputFormatClass(TextOutputFormat.class);
		// Paths are not hard-coded; pass them on the command line
		FileInputFormat.setInputPaths(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));

		boolean b = job.waitForCompletion(true);
		System.out.println(b ? "done" : "failed");
	}
}
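
The array indexing in map() assumes a tab-separated log line whose second column is the phone number and whose third- and second-to-last columns are the upstream and downstream byte counts; the columns in between are ignored. The sketch below shows that slicing on a hypothetical line (class name, column names, and values are all made up):

package club.drguo.mapreduce.flowcount;

import org.apache.commons.lang.StringUtils;

// Hypothetical input line; real logs may have more middle columns,
// which is why the flow fields are indexed from the end of the array.
public class InputLineDemo {
	public static void main(String[] args) {
		String line = "1363157985066\t13800000000\tmac\thost\turl\t2481\t24681\t200";
		String[] f = StringUtils.split(line, "\t");
		System.out.println(f[1]);                            // phone number: 13800000000
		System.out.println(Long.parseLong(f[f.length - 3])); // upstream bytes: 2481
		System.out.println(Long.parseLong(f[f.length - 2])); // downstream bytes: 24681
	}
}

The job itself is packaged into a jar and submitted with the input and output directories passed as args[0] and args[1].
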
Custom Sort

FlowCountSort.java (note: this job runs on the output of the job above and sorts it by total traffic in descending order)

package club.drguo.mapreduce.flowcount;

import java.io.IOException;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

//club.drguo.mapreduce.flowcount.FlowCountSort
public class FlowCountSort {
	public static class FlowCountSortMapper extends Mapper<LongWritable, Text, FlowBean, NullWritable>{
		private FlowBean bean = new FlowBean();
		@Override
		protected void map(LongWritable key, Text value,
				Mapper<LongWritable, Text, FlowBean, NullWritable>.Context context)
				throws IOException, InterruptedException {
			String line = value.toString();
			String[] strings = StringUtils.split(line, "\t");
			
			String phoneNum = strings[0];
			long up_flow  = Long.parseLong(strings[1]);
			long down_flow  = Long.parseLong(strings[2]);
			
			bean.set(phoneNum, up_flow, down_flow);
			context.write(bean, NullWritable.get());
		}
	}
	public static class FlowCountSortReducer extends Reducer<FlowBean, NullWritable, Text, FlowBean>{
		@Override
		protected void reduce(FlowBean bean, Iterable<NullWritable> values,
				Reducer<FlowBean, NullWritable, Text, FlowBean>.Context context) throws IOException, InterruptedException {
			context.write(new Text(bean.getPhoneNum()), bean);
		}
	}
	public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
		Configuration configuration = new Configuration();
		Job job = Job.getInstance(configuration, "sortjob");
		job.setJarByClass(FlowCountSort.class);

		job.setMapperClass(FlowCountSortMapper.class);
		job.setReducerClass(FlowCountSortReducer.class);

		job.setMapOutputKeyClass(FlowBean.class);
		job.setMapOutputValueClass(NullWritable.class);

		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(FlowBean.class);
		// Optional: these are already the defaults
		job.setInputFormatClass(TextInputFormat.class);
		job.setOutputFormatClass(TextOutputFormat.class);
		// Paths are not hard-coded; pass them on the command line
		FileInputFormat.setInputPaths(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));

		boolean b = job.waitForCompletion(true);
		System.out.println(b ? "done" : "failed");
	}
}
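
The sorting itself is done by the framework: during the shuffle, map output keys are ordered with the key class's compareTo(), and here the FlowBean is the key, so records reach the reducer already sorted by total traffic in descending order; the reducer only turns each bean back into a phone-number/bean pair. With the default single reduce task the output file is therefore globally sorted. A tiny stand-alone sketch of the same ordering (hypothetical class name and numbers, same package as FlowBean assumed):

package club.drguo.mapreduce.flowcount;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

// Illustrative only: Collections.sort() applies the same compareTo() that the
// shuffle phase uses, so beans end up in descending order of total traffic.
public class SortDemo {
	public static void main(String[] args) {
		FlowBean small = new FlowBean();
		small.set("13800000001", 100, 200);   // total 300
		FlowBean big = new FlowBean();
		big.set("13800000002", 500, 700);     // total 1200

		List<FlowBean> beans = new ArrayList<>();
		beans.add(small);
		beans.add(big);
		Collections.sort(beans);              // big (total 1200) comes first

		for (FlowBean bean : beans) {
			System.out.println(bean.getPhoneNum() + "\t" + bean);
		}
	}
}
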
Custom Partition

FlowBean.java (the same shared class as above)

AreaPartitioner.java

package club.drguo.mapreduce.partition;

import java.util.HashMap;

import org.apache.hadoop.mapreduce.Partitioner;

public class AreaPartitioner<KEY, VALUE> extends Partitioner<KEY, VALUE> {
	
	// phone number prefix -> area code (partition index)
	private static HashMap<String, Integer> areaMap = new HashMap<>();
	// Static block: load the mapping into memory up front
	static{
		areaMap.put("134", 0);
		areaMap.put("135", 1);
		areaMap.put("137", 2);
		areaMap.put("138", 3);
	}
	@Override
	public int getPartition(KEY key, VALUE value, int numPartitions) {
		Integer provinceCode = areaMap.get(key.toString().substring(0,3));
		
		return provinceCode==null?4:provinceCode;
	}

}
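
getPartition() takes the first three digits of the phone-number key, looks up the partition index registered for that prefix, and falls back to partition 4 for any prefix not in areaMap; the returned index must lie in the range [0, numReduceTasks). A small illustrative check (hypothetical class name and phone numbers):

package club.drguo.mapreduce.partition;

import org.apache.hadoop.io.Text;

import club.drguo.mapreduce.flowcount.FlowBean;

// Illustrative only: which of the 5 reduce tasks each key would be routed to.
public class AreaPartitionerDemo {
	public static void main(String[] args) {
		AreaPartitioner<Text, FlowBean> partitioner = new AreaPartitioner<>();
		System.out.println(partitioner.getPartition(new Text("13412345678"), null, 5)); // 0, prefix "134"
		System.out.println(partitioner.getPartition(new Text("13512345678"), null, 5)); // 1, prefix "135"
		System.out.println(partitioner.getPartition(new Text("15012345678"), null, 5)); // 4, prefix not in areaMap
	}
}
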
FlowCountPartition.java

package club.drguo.mapreduce.partition;

import java.io.IOException;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import club.drguo.mapreduce.flowcount.FlowBean;
//club.drguo.mapreduce.partition.FlowCountPartition
public class FlowCountPartition {
	public static class FlowCountPartitionMapper extends Mapper<LongWritable, Text, Text, FlowBean> {
		// Reuse one bean to cut memory usage (creating a new bean inside map() would leave more and more objects for the GC)
		private FlowBean flowBean = new FlowBean();

		@Override
		protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, FlowBean>.Context context)
				throws IOException, InterruptedException {
			try {
				// Get one line of input
				String line = value.toString();
				// Split it into fields
				String[] strings = StringUtils.split(line, "\t");
				// Pick out the fields we need
				String phoneNum = strings[1];
				long up_flow = Long.parseLong(strings[strings.length - 3]);
				long down_flow = Long.parseLong(strings[strings.length - 2]);
				// Pack the data into a FlowBean
				flowBean.set(phoneNum, up_flow, down_flow);
				// Emit the flow data keyed by phone number
				context.write(new Text(phoneNum), flowBean);
			} catch (Exception e) {
				System.out.println("-----------------mapper出现问题");
			}
		}
	}

	// The reducer emits the phone number and its aggregated FlowBean
	public static class FlowCountPartitionReducer extends Reducer<Text, FlowBean, Text, FlowBean> {
		// Reuse one bean to cut memory usage (creating a new bean inside reduce() would leave more and more objects for the GC)
		private FlowBean flowBean = new FlowBean();

		@Override
		protected void reduce(Text key, Iterable<FlowBean> values,
				Reducer<Text, FlowBean, Text, FlowBean>.Context context) throws IOException, InterruptedException {
			long up_flow_sum = 0;
			long down_flow_sum = 0;
			for (FlowBean bean : values) {
				up_flow_sum += bean.getUp_flow();
				down_flow_sum += bean.getDown_flow();
			}
			flowBean.set(key.toString(), up_flow_sum, down_flow_sum);
			context.write(key, flowBean);
		}
	}
	public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
		Configuration configuration = new Configuration();
		Job job = Job.getInstance(configuration, "flowpartjob");
		job.setJarByClass(FlowCountPartition.class);

		job.setMapperClass(FlowCountPartitionMapper.class);
		job.setReducerClass(FlowCountPartitionReducer.class);

		/**
		 * Plug in the custom partitioner: AreaPartitioner
		 */
		job.setPartitionerClass(AreaPartitioner.class);
		
		/**
		 * Set the number of reduce tasks; it has to match the number of partitions AreaPartitioner can return.
		 * If there are more reduce tasks than partitions, the extra ones just produce empty output files.
		 * If there are fewer, the job fails with an exception, because some keys have no reduce task to receive them.
		 * The exception does not occur when the number of reduce tasks is 1, because then every key goes to that single task.
		 * "Reduce task" and "map task" refer to the reducer and mapper instances running on the cluster.
		 */
		job.setNumReduceTasks(5);
		
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(FlowBean.class);

		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(FlowBean.class);
		// Optional: these are already the defaults
		job.setInputFormatClass(TextInputFormat.class);
		job.setOutputFormatClass(TextOutputFormat.class);
		// Paths are not hard-coded; pass them on the command line
		FileInputFormat.setInputPaths(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));

		boolean b = job.waitForCompletion(true);
		System.out.println(b ? "done" : "failed");
	}
}
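
With numReduceTasks set to 5, the output directory holds one file per partition under Hadoop's default naming, roughly like this:

part-r-00000   phone numbers starting with 134
part-r-00001   phone numbers starting with 135
part-r-00002   phone numbers starting with 137
part-r-00003   phone numbers starting with 138
part-r-00004   every other prefix (the fallback partition returned by AreaPartitioner)
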
Source code: http://git.oschina.net/drguo/MapReduceDemo
