The Third MapReduce Program ---- Inverted Index (inverseindex)

The job is split into two steps: step one counts how many times each word appears in each file (emitting keys like hello--a.txt with a count), and step two regroups those counts by word, so that each word ends up with the list of files it appears in and its count in each.

Step one: code implementation

package club.drguo.mapreduce.inverseindex;

import java.io.IOException;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Step one of the inverted index.
 * 
 * @author guo
 *
 */
// club.drguo.mapreduce.inverseindex.InverseIndexStepOne
public class InverseIndexStepOne {
	// Input: <LongWritable (byte offset of the line), Text (the whole line)>
	// Output: <Text (hello--a.txt), LongWritable (occurrence count)>
	public static class InverseIndexStepOneMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
		private Text k = new Text();
		private LongWritable v = new LongWritable();

		@Override
		protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
			// Get the current line of data
			String line = value.toString();
			// Split it into words
			String[] words = StringUtils.split(line, " ");
			/**
			 * First we need to know which file each word came from (word--x.txt)
			 */
			// To learn which file this record belongs to, get its input split
			FileSplit inputSplit = (FileSplit) context.getInputSplit();
			// Get the file name from the split's path
			String fileName = inputSplit.getPath().getName();

			// Emit key/value pairs like <hello--a.txt, 1>
			for (String word : words) {
				k.set(word + "--" + fileName);
				v.set(1);
				context.write(k, v);
				// context.write(new Text(word + "--" + fileName), new LongWritable(1));
			}

		}

	}

	public static class InverseIndexStepOneReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
		private LongWritable v = new LongWritable();

		// <hello--a.txt, {1,1,1...}>
		@Override
		protected void reduce(Text key, Iterable<LongWritable> values, Context context)
				throws IOException, InterruptedException {

			// Iterate over the values and sum them up
			long count = 0;
			for (LongWritable value : values) {
				count += value.get();
			}
			v.set(count);
			context.write(key, v);// the key stays the same (hello--a.txt)
		}
	}

	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();

		Job job_stepOne = Job.getInstance(conf);

		job_stepOne.setJarByClass(InverseIndexStepOne.class);

		job_stepOne.setMapperClass(InverseIndexStepOneMapper.class);
		job_stepOne.setReducerClass(InverseIndexStepOneReducer.class);

		job_stepOne.setOutputKeyClass(Text.class);
		job_stepOne.setOutputValueClass(LongWritable.class);

		FileInputFormat.setInputPaths(job_stepOne, new Path(args[0]));
		// If the output directory already exists, delete it first
		FileSystem fileSystem = FileSystem.get(conf);
		Path output = new Path(args[1]);
		if (fileSystem.exists(output)) {
			fileSystem.delete(output, true);// true = delete recursively
		}
		FileOutputFormat.setOutputPath(job_stepOne, output);

		System.exit(job_stepOne.waitForCompletion(true) ? 0 : 1);

	}

}
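Since the step-one reducer does nothing but sum the counts, it could also be reused as a combiner to pre-aggregate on the map side and shrink the shuffle. This is an optional addition to the driver above, not part of the original configuration:

// Optional: pre-aggregate counts on the map side before the shuffle;
// summing is associative, so the reducer class can double as the combiner.
job_stepOne.setCombinerClass(InverseIndexStepOneReducer.class);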

After exporting the jar, run it (upload the test data first):

guo@guo:~$ hdfs dfs -mkdir /data/inverseindex
guo@guo:~$ hdfs dfs -put /home/guo/a.txt b.txt c.txt /data/inverseindex
guo@guo:~$ hdfs dfs -ls /data/inverseindex
Found 3 items
-rw-r--r--   1 guo supergroup         35 2016-03-20 16:11 /data/inverseindex/a.txt
-rw-r--r--   1 guo supergroup         37 2016-03-20 16:11 /data/inverseindex/b.txt
-rw-r--r--   1 guo supergroup         38 2016-03-20 16:11 /data/inverseindex/c.txt
guo@guo:~$ hadoop jar /home/guo/inverseindex.jar club.drguo.mapreduce.inverseindex.InverseIndexStepOne /data/inverseindex /data/output/inverseindex
View the results:

guo@guo:~$ hdfs dfs -cat /data/output/inverseindex/*
hadoop--b.txt	1
hadoop--c.txt	3
hello--a.txt	3
hello--b.txt	2
hello--c.txt	1
map--a.txt	1
map--b.txt	1
map--c.txt	1
reduce--a.txt	1
reduce--b.txt	2
reduce--c.txt	1
world--a.txt	1

Step two: code implementation

package club.drguo.mapreduce.inverseindex;

import java.io.IOException;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
//club.drguo.mapreduce.inverseindex.InverseIndexStepTwo
public class InverseIndexStepTwo {
	// K: byte offset of the line  V: a step-one output line such as {hello--a.txt	3}
	public static class InverseIndexStepTwoMapper extends Mapper<LongWritable, Text, Text, Text> {
		private Text k = new Text();
		private Text v = new Text();
		@Override
		protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
			String line = value.toString();
			// Split the line into its fields
			String[] strings = StringUtils.split(line, "\t");
			
			String wordAndFile = strings[0];
			long count = Long.parseLong(strings[1]);
			String[] wordAndFileName = StringUtils.split(wordAndFile, "--");
			
			String word = wordAndFileName[0];
			String fileName = wordAndFileName[1];
			// Emit the word as the key and fileName--count as the value, e.g. <hello, a.txt--3>
			k.set(word);
			v.set(fileName+"--"+count);
			context.write(k, v);
		}
	}
	public static class InverseIndexStepTwoReducer extends Reducer<Text, Text, Text, Text>{
		@Override
		protected void reduce(Text key, Iterable<Text> values, Reducer<Text, Text, Text, Text>.Context context)
				throws IOException, InterruptedException {
			// Incoming data: <hello, {a.txt--3, b.txt--2, ...}>
			String result = "";
			for(Text value : values){
				result += value+" ";
			}
			context.write(key, new Text(result));// Output: K: hello  V: a.txt--3 b.txt--2 ...
		}
	}
	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();

		Job job_stepTwo = Job.getInstance(conf);

		job_stepTwo.setJarByClass(InverseIndexStepTwo.class);

		job_stepTwo.setMapperClass(InverseIndexStepTwoMapper.class);
		job_stepTwo.setReducerClass(InverseIndexStepTwoReducer.class);

		job_stepTwo.setOutputKeyClass(Text.class);
		job_stepTwo.setOutputValueClass(Text.class);

		FileInputFormat.setInputPaths(job_stepTwo, new Path(args[0]));
		// If the output directory already exists, delete it first
		FileSystem fileSystem = FileSystem.get(conf);
		Path output = new Path(args[1]);
		if (fileSystem.exists(output)) {
			fileSystem.delete(output, true);// true = delete recursively
		}
		FileOutputFormat.setOutputPath(job_stepTwo, output);

		System.exit(job_stepTwo.waitForCompletion(true) ? 0 : 1);

	}
}
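A small aside on the step-two reducer: it builds the file list with repeated String concatenation, which allocates a new String on every pass through the loop. A minimal alternative sketch using StringBuilder, producing exactly the same output value:

// Same reduce logic as above, but accumulating into a StringBuilder
// instead of concatenating Strings on each iteration.
StringBuilder result = new StringBuilder();
for (Text value : values) {
	result.append(value.toString()).append(" ");
}
context.write(key, new Text(result.toString()));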

After exporting the jar, run it (the input is the output produced by step one):

guo@guo:~$ hadoop jar /home/guo/inverseindex2.jar club.drguo.mapreduce.inverseindex.InverseIndexStepTwo /data/output/inverseindex /data/output/inverseindex2
View the results:

guo@guo:~$ hdfs dfs -cat /data/output/inverseindex2/*
hadoop	c.txt--3 b.txt--1 
hello	c.txt--1 b.txt--2 a.txt--3 
map	c.txt--1 b.txt--1 a.txt--1 
reduce	c.txt--1 b.txt--2 a.txt--1 
world	a.txt--1 
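The two steps above are submitted as separate jar runs. They could also be chained in a single driver that submits step one and, if it succeeds, step two. The sketch below is not from the original post; the class name InverseIndexDriver and the three-argument convention (input dir, intermediate dir, final output dir) are made up for illustration.

package club.drguo.mapreduce.inverseindex;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Hypothetical driver chaining the two steps:
// args[0] = input dir, args[1] = intermediate dir, args[2] = final output dir
public class InverseIndexDriver {
	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		Path input = new Path(args[0]);
		Path intermediate = new Path(args[1]); // written by step one, read by step two
		Path output = new Path(args[2]);

		// Step one: count occurrences of each word per file
		Job stepOne = Job.getInstance(conf);
		stepOne.setJarByClass(InverseIndexStepOne.class);
		stepOne.setMapperClass(InverseIndexStepOne.InverseIndexStepOneMapper.class);
		stepOne.setReducerClass(InverseIndexStepOne.InverseIndexStepOneReducer.class);
		stepOne.setOutputKeyClass(Text.class);
		stepOne.setOutputValueClass(LongWritable.class);
		FileInputFormat.setInputPaths(stepOne, input);
		FileOutputFormat.setOutputPath(stepOne, intermediate);
		if (!stepOne.waitForCompletion(true)) {
			System.exit(1); // stop if step one fails
		}

		// Step two: regroup the per-file counts by word
		Job stepTwo = Job.getInstance(conf);
		stepTwo.setJarByClass(InverseIndexStepTwo.class);
		stepTwo.setMapperClass(InverseIndexStepTwo.InverseIndexStepTwoMapper.class);
		stepTwo.setReducerClass(InverseIndexStepTwo.InverseIndexStepTwoReducer.class);
		stepTwo.setOutputKeyClass(Text.class);
		stepTwo.setOutputValueClass(Text.class);
		FileInputFormat.setInputPaths(stepTwo, intermediate);
		FileOutputFormat.setOutputPath(stepTwo, output);
		System.exit(stepTwo.waitForCompletion(true) ? 0 : 1);
	}
}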

