// A simple implementation of CombineFileInputFormat (简单实现 CombineFileInputFormat)

import java.io.DataOutput;

import java.io.IOException;



import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.conf.Configured;

import org.apache.hadoop.fs.FileSystem;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.io.Writable;

import org.apache.hadoop.mapreduce.InputSplit;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.Mapper;

import org.apache.hadoop.mapreduce.RecordReader;

import org.apache.hadoop.mapreduce.Reducer;

import org.apache.hadoop.mapreduce.Reducer.Context;

import org.apache.hadoop.mapreduce.TaskAttemptContext;

import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat;

import org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader;

import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;

import org.apache.hadoop.mapreduce.lib.input.SequenceFileRecordReader;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import org.apache.hadoop.util.ReflectionUtils;

import org.apache.hadoop.util.Tool;

import org.apache.hadoop.util.ToolRunner;



/**
 * Demo job showing how to pack many small input files into combined splits
 * via {@link CombineFileInputFormat}, reading each underlying file with a
 * per-file {@link LineRecordReader} delegate.
 */
public class TestCombine extends Configured implements Tool {

	/** Mapper that emits each input line as both key and value. */
	private static class ProvinceMapper extends
			Mapper<Object, Text, Text, Text> {
		@Override
		protected void map(Object key, Text value, Context context)
				throws IOException, InterruptedException {
			System.out.println("value : " + value + " Context " + context);
			context.write(value, value);
		}
	}

	/** Reducer that emits one output pair per grouped value. */
	private static class ProvinceReducer extends
			Reducer<Text, Text, Text, Text> {
		@Override
		protected void reduce(Text key, Iterable<Text> values, Context context)
				throws IOException, InterruptedException {
			for (Text va : values) {
				System.out.println("reduce " + key);
				// Bug fix: emit the current value; the original wrote the key
				// twice and never used the loop variable.
				context.write(key, va);
			}
		}
	}

	/**
	 * InputFormat that groups many small files into {@link CombineFileSplit}s
	 * and reads each file in a split with {@link CombineLineRecordReader}.
	 */
	public static class CombineSequenceFileInputFormat<K, V> extends CombineFileInputFormat<K, V> {
		@SuppressWarnings({ "unchecked", "rawtypes" })
		@Override
		public RecordReader<K, V> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException {
			return new CombineFileRecordReader((CombineFileSplit) split, context, CombineLineRecordReader.class);
		}
	}

	/**
	 * Record reader for one file inside a {@link CombineFileSplit}.
	 * {@link CombineFileRecordReader} constructs one instance per file index
	 * and delegates all reading to a {@link LineRecordReader} positioned on
	 * the index-th file of the combined split.
	 */
	public static class CombineLineRecordReader<K, V> extends RecordReader<K, V> {
		private CombineFileSplit split;   // the combined split this reader belongs to
		private TaskAttemptContext context;
		private int index;                // which file of the combined split to read
		private RecordReader<K, V> rr;    // delegate line reader

		@SuppressWarnings("unchecked")
		public CombineLineRecordReader(CombineFileSplit split, TaskAttemptContext context, Integer index) throws IOException, InterruptedException {
			this.index = index;
			this.split = split;
			this.context = context;

			this.rr = (RecordReader<K, V>) ReflectionUtils.newInstance(LineRecordReader.class, context.getConfiguration());
		}

		@SuppressWarnings("unchecked")
		@Override
		public void initialize(InputSplit curSplit, TaskAttemptContext curContext) throws IOException, InterruptedException {
			this.split = (CombineFileSplit) curSplit;
			this.context = curContext;

			// Bug fix: the lazy fallback used SequenceFileRecordReader while
			// the constructor creates a LineRecordReader; use LineRecordReader
			// in both places so the delegate type is consistent.
			if (null == rr) {
				rr = (RecordReader<K, V>) ReflectionUtils.newInstance(LineRecordReader.class, context.getConfiguration());
			}

			// Narrow the combined split down to the single file this reader owns.
			FileSplit fileSplit = new FileSplit(this.split.getPath(index),
					this.split.getOffset(index), this.split.getLength(index),
					this.split.getLocations());

			this.rr.initialize(fileSplit, this.context);
		}

		@Override
		public float getProgress() throws IOException, InterruptedException {
			return rr.getProgress();
		}

		@Override
		public void close() throws IOException {
			if (null != rr) {
				rr.close();
				rr = null;
			}
		}

		@Override
		public K getCurrentKey()
		throws IOException, InterruptedException {
			return rr.getCurrentKey();
		}

		@Override
		public V getCurrentValue()
		throws IOException, InterruptedException {
			return rr.getCurrentValue();
		}

		@Override
		public boolean nextKeyValue() throws IOException, InterruptedException {
			return rr.nextKeyValue();
		}
	}

	/**
	 * Configures and submits the job.
	 *
	 * Optional args: {@code [inputPath [outputPath]]}; when absent the
	 * original hard-coded locations are used, so existing callers behave
	 * identically.
	 *
	 * @return 0 on success, 1 on failure
	 */
	public int run(String[] args) throws Exception {
		// Use the configuration ToolRunner parsed (-D options, -conf files)
		// instead of discarding it with a fresh Configuration.
		Configuration conf = getConf();
		if (conf == null) {
			conf = new Configuration();
		}

		Job job = Job.getInstance(conf); // non-deprecated replacement for new Job(conf)
		job.setJobName("TestCombine");
		job.setJarByClass(TestCombine.class);

		job.setMapperClass(ProvinceMapper.class);
		job.setReducerClass(ProvinceReducer.class);

		job.setInputFormatClass(CombineSequenceFileInputFormat.class);

		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);

		// Generalization: allow paths on the command line, keeping the old
		// defaults for backward compatibility.
		String inpath = args.length > 0 ? args[0] : "/home/hadoop/tmp/combine";
		String outpath = args.length > 1 ? args[1] : "/home/hadoop/tmp/combineout";
		Path p = new Path(outpath);

		FileSystem fs = FileSystem.get(conf);
		if (fs.exists(p)) {
			fs.delete(p, true); // recursive delete; non-deprecated overload
		}
		FileInputFormat.addInputPaths(job, inpath);
		FileOutputFormat.setOutputPath(job, p);

		return job.waitForCompletion(true) ? 0 : 1;
	}

	public static void main(String[] args) throws Exception {
		int ret = ToolRunner.run(new TestCombine(), args);
		System.exit(ret);
	}
}

 

// 你可能感兴趣的: (format) — blog footer, kept as a comment