The inverted index is built in two steps: step one counts how many times each word occurs in each file, producing keys of the form word--filename; step two regroups those counts by word, so each word ends up with the list of files it appears in and its count in each.
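To make the data flow concrete, here is how the "hello" postings evolve through both steps (these pairs match the actual results shown below):

step-one map output:    <hello--a.txt, 1>   (one pair per occurrence)
step-one reduce output: <hello--a.txt, 3>
step-two map output:    <hello, a.txt--3>
step-two reduce output: <hello, c.txt--1 b.txt--2 a.txt--3>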
Step one: code implementation
package club.drguo.mapreduce.inverseindex;

import java.io.IOException;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Step one of building the inverted index.
 *
 * @author guo
 */
// club.drguo.mapreduce.inverseindex.InverseIndexStepOne
public class InverseIndexStepOne {
    // Input:  LongWritable (byte offset of the line), Text (the whole line)
    // Output: Text (e.g. hello--a.txt), LongWritable (occurrence count)
    public static class InverseIndexStepOneMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
        private Text k = new Text();
        private LongWritable v = new LongWritable();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Get the current line of input
            String line = value.toString();
            // Split it into words
            String[] words = StringUtils.split(line, " ");
            // We need to know which file each word came from (word--x.txt),
            // so first get the input split this record belongs to
            FileSplit inputSplit = (FileSplit) context.getInputSplit();
            // then read the file name from the split's path
            String fileName = inputSplit.getPath().getName();
            // Emit one KV pair per word: <hello--a.txt, 1>
            for (String word : words) {
                k.set(word + "--" + fileName);
                v.set(1);
                context.write(k, v);
                // Equivalent but allocates new objects on every call:
                // context.write(new Text(word + "--" + fileName), new LongWritable(1));
            }
        }
    }

    public static class InverseIndexStepOneReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
        private LongWritable v = new LongWritable();

        // Receives <hello--a.txt, {1,1,1...}>
        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context)
                throws IOException, InterruptedException {
            // Sum up the values
            long count = 0;
            for (LongWritable value : values) {
                count += value.get();
            }
            v.set(count);
            context.write(key, v); // the key is unchanged (hello--a.txt)
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job_stepOne = Job.getInstance(conf);
        job_stepOne.setJarByClass(InverseIndexStepOne.class);

        job_stepOne.setMapperClass(InverseIndexStepOneMapper.class);
        job_stepOne.setReducerClass(InverseIndexStepOneReducer.class);

        job_stepOne.setOutputKeyClass(Text.class);
        job_stepOne.setOutputValueClass(LongWritable.class);

        FileInputFormat.setInputPaths(job_stepOne, new Path(args[0]));

        // If the output directory already exists, delete it first
        FileSystem fileSystem = FileSystem.get(conf);
        Path output = new Path(args[1]);
        if (fileSystem.exists(output)) {
            fileSystem.delete(output, true); // true = recursive delete
        }
        FileOutputFormat.setOutputPath(job_stepOne, output);

        System.exit(job_stepOne.waitForCompletion(true) ? 0 : 1);
    }
}
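Because step one's reduce is a plain sum (associative and commutative), the same reducer class could also be registered as a combiner so partial sums are computed map-side before the shuffle. This call is not in the original driver, just an optional sketch:

// Optional (not in the original driver): reuse the summing reducer as a
// combiner to shrink the amount of data sent over the shuffle.
job_stepOne.setCombinerClass(InverseIndexStepOneReducer.class);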
guo@guo:~$ hdfs dfs -mkdir /data/inverseindex
guo@guo:~$ hdfs dfs -put /home/guo/a.txt b.txt c.txt /data/inverseindex
guo@guo:~$ hdfs dfs -ls /data/inverseindex
Found 3 items
-rw-r--r--   1 guo supergroup         35 2016-03-20 16:11 /data/inverseindex/a.txt
-rw-r--r--   1 guo supergroup         37 2016-03-20 16:11 /data/inverseindex/b.txt
-rw-r--r--   1 guo supergroup         38 2016-03-20 16:11 /data/inverseindex/c.txt
guo@guo:~$ hadoop jar /home/guo/inverseindex.jar club.drguo.mapreduce.inverseindex.InverseIndexStepOne /data/inverseindex /data/output/inverseindex

View the results:
guo@guo:~$ hdfs dfs -cat /data/output/inverseindex/*
hadoop--b.txt	1
hadoop--c.txt	3
hello--a.txt	3
hello--b.txt	2
hello--c.txt	1
map--a.txt	1
map--b.txt	1
map--c.txt	1
reduce--a.txt	1
reduce--b.txt	2
reduce--c.txt	1
world--a.txt	1
Step two: code implementation
package club.drguo.mapreduce.inverseindex;

import java.io.IOException;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// club.drguo.mapreduce.inverseindex.InverseIndexStepTwo
public class InverseIndexStepTwo {
    // K: byte offset  V: one line of step-one output, e.g. "hello--a.txt	3"
    public static class InverseIndexStepTwoMapper extends Mapper<LongWritable, Text, Text, Text> {
        private Text k = new Text();
        private Text v = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            // Split the line into its fields (key and count are tab-separated)
            String[] strings = StringUtils.split(line, "\t");
            String wordAndFile = strings[0];
            long count = Long.parseLong(strings[1]);
            String[] wordAndFileName = StringUtils.split(wordAndFile, "--");
            String word = wordAndFileName[0];
            String fileName = wordAndFileName[1];
            // Emit the word as the key and file--count as the value: <hello, a.txt--3>
            k.set(word);
            v.set(fileName + "--" + count);
            context.write(k, v);
        }
    }

    public static class InverseIndexStepTwoReducer extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // Receives <hello, {a.txt--3, b.txt--2, ...}>
            String result = "";
            for (Text value : values) {
                result += value + " ";
            }
            // Output K: hello  V: a.txt--3 b.txt--2 ...
            context.write(key, new Text(result));
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job_stepTwo = Job.getInstance(conf);
        job_stepTwo.setJarByClass(InverseIndexStepTwo.class);

        job_stepTwo.setMapperClass(InverseIndexStepTwoMapper.class);
        job_stepTwo.setReducerClass(InverseIndexStepTwoReducer.class);

        job_stepTwo.setOutputKeyClass(Text.class);
        job_stepTwo.setOutputValueClass(Text.class);

        FileInputFormat.setInputPaths(job_stepTwo, new Path(args[0]));

        // If the output directory already exists, delete it first
        FileSystem fileSystem = FileSystem.get(conf);
        Path output = new Path(args[1]);
        if (fileSystem.exists(output)) {
            fileSystem.delete(output, true); // true = recursive delete
        }
        FileOutputFormat.setOutputPath(job_stepTwo, output);

        System.exit(job_stepTwo.waitForCompletion(true) ? 0 : 1);
    }
}
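The two jobs are submitted separately below with hadoop jar, but they could also be chained in a single driver that runs step one and feeds its output directory into step two. A minimal sketch, assuming a hypothetical InverseIndexDriver class (the Mapper/Reducer classes are the ones defined above):

package club.drguo.mapreduce.inverseindex;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Hypothetical combined driver: args are <input> <intermediate> <output>
public class InverseIndexDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path input = new Path(args[0]);
        Path intermediate = new Path(args[1]); // step-one output, step-two input
        Path output = new Path(args[2]);

        // Delete pre-existing output directories, mirroring the original drivers
        FileSystem fs = FileSystem.get(conf);
        for (Path p : new Path[] { intermediate, output }) {
            if (fs.exists(p)) {
                fs.delete(p, true);
            }
        }

        Job stepOne = Job.getInstance(conf, "inverse-index-step-one");
        stepOne.setJarByClass(InverseIndexStepOne.class);
        stepOne.setMapperClass(InverseIndexStepOne.InverseIndexStepOneMapper.class);
        stepOne.setReducerClass(InverseIndexStepOne.InverseIndexStepOneReducer.class);
        stepOne.setOutputKeyClass(Text.class);
        stepOne.setOutputValueClass(LongWritable.class);
        FileInputFormat.setInputPaths(stepOne, input);
        FileOutputFormat.setOutputPath(stepOne, intermediate);

        // Only launch step two if step one succeeded
        if (!stepOne.waitForCompletion(true)) {
            System.exit(1);
        }

        Job stepTwo = Job.getInstance(conf, "inverse-index-step-two");
        stepTwo.setJarByClass(InverseIndexStepTwo.class);
        stepTwo.setMapperClass(InverseIndexStepTwo.InverseIndexStepTwoMapper.class);
        stepTwo.setReducerClass(InverseIndexStepTwo.InverseIndexStepTwoReducer.class);
        stepTwo.setOutputKeyClass(Text.class);
        stepTwo.setOutputValueClass(Text.class);
        FileInputFormat.setInputPaths(stepTwo, intermediate);
        FileOutputFormat.setOutputPath(stepTwo, output);

        System.exit(stepTwo.waitForCompletion(true) ? 0 : 1);
    }
}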
guo@guo:~$ hadoop jar /home/guo/inverseindex2.jar club.drguo.mapreduce.inverseindex.InverseIndexStepTwo /data/output/inverseindex /data/output/inverseindex2

View the results:
guo@guo:~$ hdfs dfs -cat /data/output/inverseindex2/*
hadoop	c.txt--3 b.txt--1
hello	c.txt--1 b.txt--2 a.txt--3
map	c.txt--1 b.txt--1 a.txt--1
reduce	c.txt--1 b.txt--2 a.txt--1
world	a.txt--1
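With the index in place, looking up a word is just a scan of the final output files on HDFS. A minimal sketch, assuming a hypothetical IndexLookup helper (not part of the original code) and the step-two output layout shown above ("<word>\t<file--count> ..."):

package club.drguo.mapreduce.inverseindex;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Hypothetical lookup helper: args are <word> <index directory>,
// e.g. hello /data/output/inverseindex2
public class IndexLookup {
    public static void main(String[] args) throws Exception {
        String word = args[0];
        Path indexDir = new Path(args[1]);
        FileSystem fs = FileSystem.get(new Configuration());
        for (FileStatus status : fs.listStatus(indexDir)) {
            // Skip _SUCCESS markers and anything that is not a plain file
            if (!status.isFile() || status.getPath().getName().startsWith("_")) {
                continue;
            }
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(fs.open(status.getPath())))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    // Key and value are separated by a tab, as in the output above
                    String[] parts = line.split("\t", 2);
                    if (parts.length == 2 && parts[0].equals(word)) {
                        System.out.println(word + " -> " + parts[1]);
                    }
                }
            }
        }
    }
}

For example, querying "hello" against the index above would print: hello -> c.txt--1 b.txt--2 a.txt--3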