Hadoop Streaming provides options for specifying the input and output formats:
-inputformat TextInputFormat(default)|SequenceFileAsTextInputFormat|JavaClassName  Optional.
-outputformat TextOutputFormat(default)|JavaClassName  Optional.
However, since version 0.14 Hadoop no longer supports passing multiple jar files, so if you want to use your own InputFormat or OutputFormat you have to add the corresponding class files into hadoop-streaming-1.0.1.jar, for example:
jar uf ../../contrib/streaming/hadoop-streaming-1.0.1.jar org/apache/hadoop/streaming/*.class
After that you can pass the class name directly to -inputformat.
The following example illustrates this: the Map input <key, value> pairs have the file name as the key and the entire content of the document as the value.
1. Define your own InputFormat:
ContentRecordReder.java
package org.apache.hadoop.streaming;

import java.io.IOException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.RecordReader;

public class ContentRecordReder implements RecordReader<Text, Text> {
    private static final Log LOG = LogFactory.getLog(ContentRecordReder.class.getName());
    private CompressionCodecFactory compressionCodecs = null;
    private long start;
    private long pos;
    private long end;
    private byte[] buffer;
    private String keyName;
    private FSDataInputStream fileIn;

    public ContentRecordReder(Configuration job, FileSplit split) throws IOException {
        start = split.getStart();           // each file ends up as its own split
        end = split.getLength() + start;
        final Path path = split.getPath();
        keyName = path.toString();          // the file name becomes the key
        LOG.info("filename in hdfs is : " + keyName);
        System.out.println("filename in hdfs is : " + keyName);
        final FileSystem fs = path.getFileSystem(job);
        fileIn = fs.open(path);
        fileIn.seek(start);
        buffer = new byte[(int) (end - start)];
        this.pos = start;
    }

    public Text createKey() {
        return new Text();
    }

    public Text createValue() {
        return new Text();
    }

    public long getPos() throws IOException {
        return pos;
    }

    public float getProgress() {
        if (start == end) {
            return 0.0f;
        } else {
            return Math.min(1.0f, (pos - start) / (float) (end - start));
        }
    }

    public boolean next(Text key, Text value) throws IOException {
        while (pos < end) {
            key.set(keyName);
            value.clear();
            fileIn.readFully(pos, buffer);  // read the whole file into the buffer
            value.set(buffer);
            LOG.info("--- content: " + value.toString());
            System.out.println("--- content: " + value.toString());
            pos += buffer.length;
            LOG.info("end is : " + end + " pos is : " + pos);
            return true;                    // emit exactly one record per file
        }
        return false;
    }

    public void close() throws IOException {
        if (fileIn != null) {
            fileIn.close();
        }
    }
}
ContentInputFormat.java
package org.apache.hadoop.streaming;

import java.io.IOException;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobConfigurable;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;

public class ContentInputFormat extends FileInputFormat<Text, Text> {
    private long mySplitSize = 1024 * 1024;
    private CompressionCodecFactory compressionCodecs = null;

    public void configure(JobConf conf) {
        compressionCodecs = new CompressionCodecFactory(conf);
    }

    /**
     * @brief isSplitable: do not split the file; it must be processed as a whole
     *
     * @param fs
     * @param file
     *
     * @return false
     */
    protected boolean isSplitable(FileSystem fs, Path file) {
        return false;
    }

    public RecordReader<Text, Text> getRecordReader(InputSplit genericSplit, JobConf job,
            Reporter reporter) throws IOException {
        reporter.setStatus(genericSplit.toString());
        ContentRecordReder contentRecordReder = new ContentRecordReder(job, (FileSplit) genericSplit);
        return contentRecordReder;
    }
}
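The example above only customizes the input side, but -outputformat works the same way: compile the class into the streaming jar and pass its name on the command line. As a rough sketch (not part of the original example; the class name ContentOutputFormat and the tab-separated record layout are my own assumptions), a minimal custom OutputFormat against the same old mapred API could look like this:

package org.apache.hadoop.streaming;

import java.io.DataOutputStream;
import java.io.IOException;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.Progressable;

// hypothetical example class, not from the original post
public class ContentOutputFormat extends FileOutputFormat<Text, Text> {
    public RecordWriter<Text, Text> getRecordWriter(FileSystem ignored, JobConf job,
            String name, Progressable progress) throws IOException {
        // one output file per reduce task, written as "key<TAB>value" lines
        Path file = FileOutputFormat.getTaskOutputPath(job, name);
        FileSystem fs = file.getFileSystem(job);
        final DataOutputStream out = fs.create(file, progress);
        return new RecordWriter<Text, Text>() {
            public void write(Text key, Text value) throws IOException {
                out.write(key.getBytes(), 0, key.getLength());
                out.writeByte('\t');
                out.write(value.getBytes(), 0, value.getLength());
                out.writeByte('\n');
            }
            public void close(Reporter reporter) throws IOException {
                out.close();
            }
        };
    }
}

Once such a class has been added to hadoop-streaming-1.0.1.jar with jar uf, it could be selected with -outputformat ContentOutputFormat.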
2. Compile
javac -Xlint:deprecation -classpath ~/hadoop-1.0.1/hadoop-core-1.0.1.jar:~/hadoop-1.0.1/lib/*:./content-record-reader.jar ./*.java
3. Add the compiled class files to the streaming jar:
jar uf ../../contrib/streaming/hadoop-streaming-1.0.1.jar org/apache/hadoop/streaming/*.class
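To make sure the classes really ended up inside the jar, listing its contents is a quick sanity check (this command is a verification step I added, not from the original post):
jar tf ../../contrib/streaming/hadoop-streaming-1.0.1.jar | grep streaming/Content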
4. Mapper.cpp
#include <iostream>
#include <string>
using namespace std;

int main() {
    string key, value;
    char ch;
    cin >> key;                          // the key is the file name supplied by ContentInputFormat
    value = "";
    while (cin >> ch && !cin.eof()) {    // read the rest of the document character by character
        value.append(1, ch);             // note: operator>> skips whitespace
    }
    cout << key << "\t" << value << endl;
    return 0;
}
5. Reducer.cpp
#include <iostream>
#include <map>
#include <string>
using namespace std;

int main() {
    map<string, string> wordMap;
    map<string, string>::iterator it;
    string key;
    string value;
    while (cin >> key >> value) {
        // the value, i.e. the document content, can be processed here ...
        wordMap[key] += value;
    }
    for (it = wordMap.begin(); it != wordMap.end(); it++) {  // emit the results
        cout << it->first << "\t" << it->second << endl;
    }
    return 0;
}
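Before submitting the job, the two binaries can be smoke-tested locally by faking the key<TAB>value lines that ContentInputFormat will produce; the file name doc1 and the sample text below are made up for illustration:
g++ -o Mapper Mapper.cpp
g++ -o Reducer Reducer.cpp
printf 'doc1\tsome sample text\n' | ./Mapper | sort | ./Reducer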
6. Run the streaming job:
bin/hadoop jar contrib/streaming/hadoop-streaming-1.0.1.jar \
    -mapper /home/guoguo/hadoop-1.0.1/cTest/C++/Mapper \
    -file /home/guoguo/hadoop-1.0.1/cTest/C++/Mapper \
    -inputformat ContentInputFormat \
    -reducer /home/guoguo/hadoop-1.0.1/cTest/C++/Reducer \
    -file /home/guoguo/hadoop-1.0.1/cTest/C++/Reducer \
    -input input \
    -output stream-output
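If the job finishes successfully, the merged result can be inspected directly from HDFS (the part file name below assumes the default single-reducer output):
bin/hadoop fs -cat stream-output/part-00000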
Incidentally, for XML-like records Hadoop Streaming also ships with a built-in StreamXmlRecordReader, selected through the -inputreader option, for example:
-inputreader "StreamXmlRecordReader,begin=<Store>,end=</Store>"
References:
http://hadoop.apache.org/common/docs/r1.0.1/streaming.html
http://dongxicheng.org/mapreduce/hadoop-streaming-advanced-programming/
http://blog.csdn.net/j3smile/article/details/7371209
http://blog.csdn.net/anbo724/article/details/6955175