A common task in website analytics is to compute, for each user, the number of comments they have posted along with the timestamps of their first and last comment. The code below solves exactly this problem over comments.xml:
package mrdp.ch2;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Map;

import mrdp.utils.MRDPUtils;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class MinMaxCountDriver {

    public static class SOMinMaxCountMapper extends
            Mapper<Object, Text, Text, MinMaxCountTuple> {

        // Our output key and value Writables
        private Text outUserId = new Text();
        private MinMaxCountTuple outTuple = new MinMaxCountTuple();

        // This object will parse the creation date string into a Date object
        private final static SimpleDateFormat frmt = new SimpleDateFormat(
                "yyyy-MM-dd'T'HH:mm:ss.SSS");

        @Override
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {

            // Parse the input string into a nice map
            Map<String, String> parsed = MRDPUtils.transformXmlToMap(value
                    .toString());

            // Grab the "CreationDate" field since it is what we are finding
            // the min and max value of
            String strDate = parsed.get("CreationDate");

            // Grab the "UserId" since it is what we are grouping by
            String userId = parsed.get("UserId");

            // .get will return null if the key is not there
            if (strDate == null || userId == null) {
                // skip this record
                return;
            }

            try {
                // Parse the string into a Date object
                Date creationDate = frmt.parse(strDate);

                // Set the minimum and maximum date values to the creationDate
                outTuple.setMin(creationDate);
                outTuple.setMax(creationDate);

                // Set the comment count to 1
                outTuple.setCount(1);

                // Set our user ID as the output key
                outUserId.set(userId);

                // Write out the user ID with min/max dates and count
                context.write(outUserId, outTuple);
            } catch (ParseException e) {
                // An error occurred parsing the creation date string;
                // skip this record
            }
        }
    }

    public static class SOMinMaxCountReducer extends
            Reducer<Text, MinMaxCountTuple, Text, MinMaxCountTuple> {

        private MinMaxCountTuple result = new MinMaxCountTuple();

        @Override
        public void reduce(Text key, Iterable<MinMaxCountTuple> values,
                Context context) throws IOException, InterruptedException {

            // Initialize our result
            result.setMin(null);
            result.setMax(null);
            int sum = 0;

            // Iterate through all input values for this key
            for (MinMaxCountTuple val : values) {

                // If the value's min is less than the result's min,
                // set the result's min to the value's
                if (result.getMin() == null
                        || val.getMin().compareTo(result.getMin()) < 0) {
                    result.setMin(val.getMin());
                }

                // If the value's max is greater than the result's max,
                // set the result's max to the value's
                if (result.getMax() == null
                        || val.getMax().compareTo(result.getMax()) > 0) {
                    result.setMax(val.getMax());
                }

                // Add the count for val to our sum
                sum += val.getCount();
            }

            // Set our count to the total number of input values
            result.setCount(sum);
            context.write(key, result);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args)
                .getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: MinMaxCountDriver <in> <out>");
            System.exit(2);
        }
        Job job = new Job(conf, "StackOverflow Comment Date Min Max Count");
        job.setJarByClass(MinMaxCountDriver.class);
        job.setMapperClass(SOMinMaxCountMapper.class);
        job.setCombinerClass(SOMinMaxCountReducer.class);
        job.setReducerClass(SOMinMaxCountReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(MinMaxCountTuple.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    public static class MinMaxCountTuple implements Writable {
        private Date min = new Date();
        private Date max = new Date();
        private long count = 0;

        private final static SimpleDateFormat frmt = new SimpleDateFormat(
                "yyyy-MM-dd'T'HH:mm:ss.SSS");

        public Date getMin() {
            return min;
        }

        public void setMin(Date min) {
            this.min = min;
        }

        public Date getMax() {
            return max;
        }

        public void setMax(Date max) {
            this.max = max;
        }

        public long getCount() {
            return count;
        }

        public void setCount(long count) {
            this.count = count;
        }

        @Override
        public void readFields(DataInput in) throws IOException {
            // Read the fields in the same order they were written
            min = new Date(in.readLong());
            max = new Date(in.readLong());
            count = in.readLong();
        }

        @Override
        public void write(DataOutput out) throws IOException {
            // Serialize min, max, and count as raw longs
            out.writeLong(min.getTime());
            out.writeLong(max.getTime());
            out.writeLong(count);
        }

        @Override
        public String toString() {
            return frmt.format(min) + "\t" + frmt.format(max) + "\t" + count;
        }
    }
}

The code for the mrdp.utils.MRDPUtils helper used above was given in the first post of this series.
The most important part here is implementing Hadoop's Writable interface ourselves to define a custom value type. When I find time, I will write a separate post about Writable. The key contract is that write() and readFields() must handle the fields in exactly the same order; the small round-trip check below illustrates this.
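This is a minimal, self-contained sketch (not part of the original post) that serializes a MinMaxCountTuple to bytes and reads it back, which is essentially what Hadoop does when shuttling values between map and reduce:

package mrdp.ch2;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.Date;

public class WritableRoundTrip {
    public static void main(String[] args) throws IOException {
        MinMaxCountDriver.MinMaxCountTuple in = new MinMaxCountDriver.MinMaxCountTuple();
        in.setMin(new Date(0L));
        in.setMax(new Date());
        in.setCount(42);

        // Serialize to a byte array via write(), as Hadoop would
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        in.write(new DataOutputStream(bytes));

        // Deserialize into a fresh tuple via readFields() and compare
        MinMaxCountDriver.MinMaxCountTuple out = new MinMaxCountDriver.MinMaxCountTuple();
        out.readFields(new DataInputStream(
                new ByteArrayInputStream(bytes.toByteArray())));
        System.out.println(in + "  ==  " + out);
    }
}

If the field order in write() and readFields() ever diverges, this round trip silently produces garbage, which is the most common bug in hand-written Writables.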
The map stage does no comparison or computation. It simply parses each line of comments.xml, extracts the comment's creation time, and sets the count to 1. For example, it parses rows like the following:
<row Id="1784" PostId="883" Text="Perfect distinction. I've made a note and agree entirely." CreationDate="2012-02-08T21:51:05.223" UserId="46" />
The combiner stage reuses the reduce function directly to do intermediate, map-side aggregation. This is safe because min, max, and sum are all associative and commutative, and the reducer emits the same (Text, MinMaxCountTuple) types it consumes; the small check below illustrates the associativity.
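A small, self-contained check (illustrative, not from the original post) that the combine step is grouping-insensitive, folding three partial (min, max, count) tuples in two different orders:

import java.util.Arrays;

public class CombinerCheck {
    // Merge b into a, mirroring the reducer's per-value logic
    static long[] merge(long[] a, long[] b) {
        return new long[] { Math.min(a[0], b[0]), Math.max(a[1], b[1]),
                a[2] + b[2] };
    }

    public static void main(String[] args) {
        // Three single-comment tuples as the mapper would emit them
        long[] x = { 10, 10, 1 }, y = { 5, 5, 1 }, z = { 20, 20, 1 };
        long[] left = merge(merge(x, y), z);  // combine x,y first
        long[] right = merge(x, merge(y, z)); // combine y,z first
        System.out.println(Arrays.equals(left, right)); // true
    }
}

Because any grouping of partial merges yields the same result, Hadoop is free to run the combiner zero or more times without changing the final output.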
The reducer stage computes the statistics we actually want: the minimum date, the maximum date, and the total count. The reducer logic is straightforward: for each user ID, loop over all of its values, compare them one by one, and accumulate the count.
[Figure: overall map → combiner → reducer flow of the job]
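To reproduce the results below, the job can be launched roughly as follows (a sketch: the jar name mrdp.jar is my assumption, not from the original post; the input and output paths match the session below):

hadoop jar mrdp.jar mrdp.ch2.MinMaxCountDriver comments.xml output2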
Part of the resulting output looks like this (columns: user ID, first comment time, last comment time, comment count):
jpan@jpan-Beijing:~/Mywork/mapreducepatterns/testdata$ hadoop fs -cat output2/part-r-00000
10      2011-02-14T18:04:38.763 2012-07-10T22:57:00.757 8
101     2011-04-01T03:02:45.083 2011-04-01T06:02:33.307 2
10119   2012-02-08T13:54:38.623 2012-04-12T23:43:14.810 8
1057    2011-06-17T19:59:33.013 2011-06-17T19:59:33.013 1
10691   2012-04-19T01:15:44.573 2012-05-11T05:47:36.517 2
10872   2012-06-14T15:36:26.527 2012-06-14T15:45:43.347 4
10921   2011-12-07T18:08:04.583 2011-12-07T18:08:04.583 1
11      2011-05-06T02:51:50.370 2011-05-06T14:46:31.483 3
110     2010-08-12T14:52:09.830 2010-08-12T14:52:09.830 1
1118    2011-02-17T10:27:48.623 2011-02-25T09:25:09.597 2
11498   2011-12-30T11:09:58.057 2011-12-30T11:09:58.057 1
11682   2012-01-04T21:48:39.267 2012-01-04T21:48:39.267 1