Hadoop Custom Data Type Programming Exercise
Code:
package zidongyi;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;

public class KpiApp {
    static final String INPUT_PATH = "hdfs://192.168.1.100:9000/input/HTTP_20130313143750.dat";
    static final String OUT_PATH = "hdfs://192.168.1.100:9000/output/out02";

    public static void main(String[] args) throws Exception {
        final Job job = new Job(new Configuration(), KpiApp.class.getSimpleName());

        // 1.1 Specify the input file path
        FileInputFormat.setInputPaths(job, INPUT_PATH);
        // Specify the class used to parse the input file
        job.setInputFormatClass(TextInputFormat.class);

        // 1.2 Specify the custom Mapper class
        job.setMapperClass(MyMapper.class);
        // Specify the types of the map output <k2, v2>
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(KpiWritable.class);

        // 1.3 Specify the partitioner class
        job.setPartitionerClass(HashPartitioner.class);
        job.setNumReduceTasks(1);

        // 1.4 TODO sorting, grouping
        // 1.5 TODO (optional) combine

        // 2.2 Specify the custom Reducer class
        job.setReducerClass(MyReducer.class);
        // Specify the types of the final output <k3, v3>
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(KpiWritable.class);

        // 2.3 Specify where the output is written
        FileOutputFormat.setOutputPath(job, new Path(OUT_PATH));
        // Specify the output format class
        job.setOutputFormatClass(TextOutputFormat.class);

        // Submit the job to the JobTracker and wait for completion
        job.waitForCompletion(true);
    }

    static class MyMapper extends Mapper<LongWritable, Text, Text, KpiWritable> {
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            final String[] splited = value.toString().split("\t");
            final String msisdn = splited[1];
            final Text k2 = new Text(msisdn);
            final KpiWritable v2 = new KpiWritable(splited[6], splited[7], splited[8], splited[9]);
            context.write(k2, v2);
        }
    }

    static class MyReducer extends Reducer<Text, KpiWritable, Text, KpiWritable> {
        /**
         * @param k2  each distinct phone number in the file
         * @param v2s the traffic records of that phone number in different time periods
         */
        protected void reduce(Text k2, Iterable<KpiWritable> v2s, Context context)
                throws IOException, InterruptedException {
            long upPackNum = 0L;
            long downPackNum = 0L;
            long upPayLoad = 0L;
            long downPayLoad = 0L;
            for (KpiWritable kpiWritable : v2s) {
                upPackNum += kpiWritable.upPackNum;
                downPackNum += kpiWritable.downPackNum;
                upPayLoad += kpiWritable.upPayLoad;
                downPayLoad += kpiWritable.downPayLoad;
            }
            final KpiWritable v3 = new KpiWritable(upPackNum + "", downPackNum + "", upPayLoad + "", downPayLoad + "");
            context.write(k2, v3);
        }
    }
}

class KpiWritable implements Writable {
    long upPackNum;
    long downPackNum;
    long upPayLoad;
    long downPayLoad;

    public KpiWritable() {}

    public KpiWritable(String upPackNum, String downPackNum, String upPayLoad, String downPayLoad) {
        this.upPackNum = Long.parseLong(upPackNum);
        this.downPackNum = Long.parseLong(downPackNum);
        this.upPayLoad = Long.parseLong(upPayLoad);
        this.downPayLoad = Long.parseLong(downPayLoad);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.upPackNum = in.readLong();
        this.downPackNum = in.readLong();
        this.upPayLoad = in.readLong();
        this.downPayLoad = in.readLong();
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(upPackNum);
        out.writeLong(downPackNum);
        out.writeLong(upPayLoad);
        out.writeLong(downPayLoad);
    }

    @Override
    public String toString() {
        return upPackNum + "\t" + downPackNum + "\t" + upPayLoad + "\t" + downPayLoad;
    }
}
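As a quick sanity check that the custom type round-trips through Hadoop's serialization contract, the sketch below writes a KpiWritable with write() and reads it back with readFields(). This is an extra test class (KpiWritableRoundTrip is an assumed name, not part of the original post), placed in the same zidongyi package so it can see the package-private KpiWritable:

package zidongyi;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;

public class KpiWritableRoundTrip {
    public static void main(String[] args) throws Exception {
        // Values taken from the first sample record (fields 6-9).
        KpiWritable original = new KpiWritable("24", "27", "2481", "24681");

        // Serialize the same way Hadoop does during the shuffle.
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        original.write(new DataOutputStream(bytes));

        // Deserialize into a fresh instance.
        KpiWritable copy = new KpiWritable();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));

        // toString() prints the four counters tab-separated; the two lines should match,
        // confirming that the field order in write() matches readFields().
        System.out.println(original);
        System.out.println(copy);
    }
}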
Upload the data to HDFS:
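The upload step itself is not shown. As a hedged sketch, one way to do it with the HDFS FileSystem API (the class name UploadData and the assumption that the .dat file sits in the local working directory are mine; the HDFS address matches INPUT_PATH above):

package zidongyi;

import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class UploadData {
    public static void main(String[] args) throws Exception {
        // Connect to the NameNode used by the job.
        FileSystem fs = FileSystem.get(new URI("hdfs://192.168.1.100:9000/"), new Configuration());
        // Copy the local data file into /input on HDFS.
        fs.copyFromLocalFile(new Path("HTTP_20130313143750.dat"),
                             new Path("/input/HTTP_20130313143750.dat"));
        fs.close();
    }
}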
Console output from the run:
15/02/22 00:04:22 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
15/02/22 00:04:22 WARN mapred.JobClient: Use GenericOptionsParser for parsing the arguments. Applications should implement Tool for the same.
15/02/22 00:04:22 WARN mapred.JobClient: No job jar file set. User classes may not be found. See JobConf(Class) or JobConf#setJar(String).
15/02/22 00:04:22 INFO input.FileInputFormat: Total input paths to process : 1
15/02/22 00:04:22 WARN snappy.LoadSnappy: Snappy native library not loaded
15/02/22 00:04:23 INFO mapred.JobClient: Running job: job_local1887351217_0001
15/02/22 00:04:23 INFO mapred.LocalJobRunner: Waiting for map tasks
15/02/22 00:04:23 INFO mapred.LocalJobRunner: Starting task: attempt_local1887351217_0001_m_000000_0
15/02/22 00:04:23 INFO mapred.Task: Using ResourceCalculatorPlugin : null
15/02/22 00:04:23 INFO mapred.MapTask: Processing split: hdfs://192.168.1.100:9000/input/HTTP_20130313143750.dat:0+2214
15/02/22 00:04:23 INFO mapred.MapTask: io.sort.mb = 100
15/02/22 00:04:23 INFO mapred.MapTask: data buffer = 79691776/99614720
15/02/22 00:04:23 INFO mapred.MapTask: record buffer = 262144/327680
15/02/22 00:04:23 INFO mapred.MapTask: Starting flush of map output
15/02/22 00:04:23 INFO mapred.MapTask: Finished spill 0
15/02/22 00:04:23 INFO mapred.Task: Task:attempt_local1887351217_0001_m_000000_0 is done. And is in the process of commiting
15/02/22 00:04:23 INFO mapred.LocalJobRunner:
15/02/22 00:04:23 INFO mapred.Task: Task 'attempt_local1887351217_0001_m_000000_0' done.
15/02/22 00:04:23 INFO mapred.LocalJobRunner: Finishing task: attempt_local1887351217_0001_m_000000_0
15/02/22 00:04:23 INFO mapred.LocalJobRunner: Map task executor complete.
15/02/22 00:04:23 INFO mapred.Task: Using ResourceCalculatorPlugin : null
15/02/22 00:04:23 INFO mapred.LocalJobRunner:
15/02/22 00:04:23 INFO mapred.Merger: Merging 1 sorted segments
15/02/22 00:04:23 INFO mapred.Merger: Down to the last merge-pass, with 1 segments left of total size: 1011 bytes
15/02/22 00:04:23 INFO mapred.LocalJobRunner:
15/02/22 00:04:24 INFO mapred.Task: Task:attempt_local1887351217_0001_r_000000_0 is done. And is in the process of commiting
15/02/22 00:04:24 INFO mapred.LocalJobRunner:
15/02/22 00:04:24 INFO mapred.Task: Task attempt_local1887351217_0001_r_000000_0 is allowed to commit now
15/02/22 00:04:24 INFO mapred.JobClient:  map 100% reduce 0%
15/02/22 00:04:24 INFO output.FileOutputCommitter: Saved output of task 'attempt_local1887351217_0001_r_000000_0' to hdfs://192.168.1.100:9000/output/out02
15/02/22 00:04:24 INFO mapred.LocalJobRunner: reduce > reduce
15/02/22 00:04:24 INFO mapred.Task: Task 'attempt_local1887351217_0001_r_000000_0' done.
15/02/22 00:04:25 INFO mapred.JobClient:  map 100% reduce 100%
15/02/22 00:04:25 INFO mapred.JobClient: Job complete: job_local1887351217_0001
15/02/22 00:04:25 INFO mapred.JobClient: Counters: 19
15/02/22 00:04:25 INFO mapred.JobClient:   File Output Format Counters
15/02/22 00:04:25 INFO mapred.JobClient:     Bytes Written=556
15/02/22 00:04:25 INFO mapred.JobClient:   File Input Format Counters
15/02/22 00:04:25 INFO mapred.JobClient:     Bytes Read=2214
15/02/22 00:04:25 INFO mapred.JobClient:   FileSystemCounters
15/02/22 00:04:25 INFO mapred.JobClient:     FILE_BYTES_READ=1365
15/02/22 00:04:25 INFO mapred.JobClient:     HDFS_BYTES_READ=4428
15/02/22 00:04:25 INFO mapred.JobClient:     FILE_BYTES_WRITTEN=141054
15/02/22 00:04:25 INFO mapred.JobClient:     HDFS_BYTES_WRITTEN=556
15/02/22 00:04:25 INFO mapred.JobClient:   Map-Reduce Framework
15/02/22 00:04:25 INFO mapred.JobClient:     Map output materialized bytes=1015
15/02/22 00:04:25 INFO mapred.JobClient:     Map input records=22
15/02/22 00:04:25 INFO mapred.JobClient:     Reduce shuffle bytes=0
15/02/22 00:04:25 INFO mapred.JobClient:     Spilled Records=44
15/02/22 00:04:25 INFO mapred.JobClient:     Map output bytes=965
15/02/22 00:04:25 INFO mapred.JobClient:     Total committed heap usage (bytes)=323878912
15/02/22 00:04:25 INFO mapred.JobClient:     Combine input records=0
15/02/22 00:04:25 INFO mapred.JobClient:     SPLIT_RAW_BYTES=120
15/02/22 00:04:25 INFO mapred.JobClient:     Reduce input records=22
15/02/22 00:04:25 INFO mapred.JobClient:     Reduce input groups=21
15/02/22 00:04:25 INFO mapred.JobClient:     Combine output records=0
15/02/22 00:04:25 INFO mapred.JobClient:     Reduce output records=21
15/02/22 00:04:25 INFO mapred.JobClient:     Map output records=22
The result as shown in HDFS:
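To dump the reducer output to the console instead of browsing HDFS, something like the following sketch could be used (PrintResult is an assumed class name; part-r-00000 is the default part-file name for a single reducer with the new MapReduce API):

package zidongyi;

import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

public class PrintResult {
    public static void main(String[] args) throws Exception {
        FileSystem fs = FileSystem.get(new URI("hdfs://192.168.1.100:9000/"), new Configuration());
        // Open the single reducer's output file under OUT_PATH.
        FSDataInputStream in = fs.open(new Path("/output/out02/part-r-00000"));
        // Copy the file contents to stdout; the final argument closes the stream when done.
        IOUtils.copyBytes(in, System.out, 4096, true);
        fs.close();
    }
}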
Appendix: the test data.
The format is: report timestamp, phone number, AP MAC, AC MAC, visited URL, URL category, number of upstream packets, number of downstream packets, total upstream traffic, total downstream traffic, and HTTP response status. The sketch after the data shows how these fields map to the split indices used in MyMapper.
1363157985066 13726230503 00-FD-07-A4-72-B8:CMCC 120.196.100.82 i02.c.aliimg.com 24 27 2481 24681 200
1363157995052 13826544101 5C-0E-8B-C7-F1-E0:CMCC 120.197.40.4 4 0 264 0 200
1363157991076 13926435656 20-10-7A-28-CC-0A:CMCC 120.196.100.99 2 4 132 1512 200
1363154400022 13926251106 5C-0E-8B-8B-B1-50:CMCC 120.197.40.4 4 0 240 0 200
1363157993044 18211575961 94-71-AC-CD-E6-18:CMCC-EASY 120.196.100.99 iface.qiyi.com 视频网站 15 12 1527 2106 200
1363157995074 84138413 5C-0E-8B-8C-E8-20:7DaysInn 120.197.40.4 122.72.52.12 20 16 4116 1432 200
1363157993055 13560439658 C4-17-FE-BA-DE-D9:CMCC 120.196.100.99 18 15 1116 954 200
1363157995033 15920133257 5C-0E-8B-C7-BA-20:CMCC 120.197.40.4 sug.so.360.cn 信息安全 20 20 3156 2936 200
1363157983019 13719199419 68-A1-B7-03-07-B1:CMCC-EASY 120.196.100.82 4 0 240 0 200
1363157984041 13660577991 5C-0E-8B-92-5C-20:CMCC-EASY 120.197.40.4 s19.cnzz.com 站点统计 24 9 6960 690 200
1363157973098 15013685858 5C-0E-8B-C7-F7-90:CMCC 120.197.40.4 rank.ie.sogou.com 搜索引擎 28 27 3659 3538 200
1363157986029 15989002119 E8-99-C4-4E-93-E0:CMCC-EASY 120.196.100.99 www.umeng.com 站点统计 3 3 1938 180 200
1363157992093 13560439658 C4-17-FE-BA-DE-D9:CMCC 120.196.100.99 15 9 918 4938 200
1363157986041 13480253104 5C-0E-8B-C7-FC-80:CMCC-EASY 120.197.40.4 3 3 180 180 200
1363157984040 13602846565 5C-0E-8B-8B-B6-00:CMCC 120.197.40.4 2052.flash2-http.qq.com 综合门户 15 12 1938 2910 200
1363157995093 13922314466 00-FD-07-A2-EC-BA:CMCC 120.196.100.82 img.qfc.cn 12 12 3008 3720 200
1363157982040 13502468823 5C-0A-5B-6A-0B-D4:CMCC-EASY 120.196.100.99 y0.ifengimg.com 综合门户 57 102 7335 110349 200
1363157986072 18320173382 84-25-DB-4F-10-1A:CMCC-EASY 120.196.100.99 input.shouji.sogou.com 搜索引擎 21 18 9531 2412 200
1363157990043 13925057413 00-1F-64-E1-E6-9A:CMCC 120.196.100.55 t3.baidu.com 搜索引擎 69 63 11058 48243 200
1363157988072 13760778710 00-FD-07-A4-7B-08:CMCC 120.196.100.82 2 2 120 120 200
1363157985079 13823070001 20-7C-8F-70-68-1F:CMCC 120.196.100.99 6 3 360 180 200
1363157985069 13600217502 00-1F-64-E2-E8-B1:CMCC 120.196.100.55 18 138 1080 186852 200
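As mentioned above, the following sketch splits one of the eleven-field records exactly the way MyMapper does, to show which indices feed KpiWritable (ParseDemo is an assumed class name; the record is the fifth one in the data, with the fields joined by tabs as in the real file):

public class ParseDemo {
    public static void main(String[] args) {
        // Fifth record from the data above: all eleven fields are present.
        String line = "1363157993044\t18211575961\t94-71-AC-CD-E6-18:CMCC-EASY\t120.196.100.99"
                + "\tiface.qiyi.com\t视频网站\t15\t12\t1527\t2106\t200";
        String[] splited = line.split("\t");
        System.out.println("msisdn      = " + splited[1]); // 18211575961  -> map output key
        System.out.println("upPackNum   = " + splited[6]); // 15
        System.out.println("downPackNum = " + splited[7]); // 12
        System.out.println("upPayLoad   = " + splited[8]); // 1527
        System.out.println("downPayLoad = " + splited[9]); // 2106
    }
}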