Scala Spark: Finding the Mode

1. Data format

The input has three tab-separated columns. For each key in column 1, the program finds the most frequent value in column 2 (the per-key mode); column 3 is not used.

1   2   3
1   4   5
4   5   6
4   7   8
7   8   9
10  11  12
10  13  14
10  1   2
1   100 100
10  11  2
10  11  2
1   2   5
4   7   6

2. Program

import org.apache.spark.{SparkConf, SparkContext}

val conf = new SparkConf().setAppName("Mode")
conf.setMaster("local[3]")
val sc = new SparkContext(conf)

val data = sc.textFile("/home/i.txt")   // read the test data
val dataMap = data.map(_.split("\t"))
  .map(_.map(_.toDouble))
  .map(f => ("k" + f(0), f(1)))         // key = "k" + column 1, value = column 2
// dataMap: RDD[(String, Double)]

val dc = dataMap.countByValue().map(x => (x._1._1, x._1._2, x._2)).toIndexedSeq
// countByValue returns Map[(key, value), count] on the driver;
// reshape ((key, value), count) => (key, value, count) and convert to an IndexedSeq
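
Note that countByValue brings every (key, value) count back to the driver before makeRDD redistributes it below. If the number of distinct (key, value) pairs were large, the counting could stay on the cluster with reduceByKey; a minimal sketch under that assumption (the counts name is mine, not part of the original program):

val counts = dataMap
  .map(kv => (kv, 1L))                       // one occurrence per (key, value) pair
  .reduceByKey(_ + _)                        // total occurrences per (key, value) pair
  .map { case ((k, v), c) => (k, (v, c)) }   // regroup as key -> (value, count)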

val dcm = sc.makeRDD(dc).map(x => (x._1, Array(x._2, x._3)))   // key -> Array(value, count); the Long count is widened to Double
dc.foreach(println(_))
//output
(k1.0,4.0,1)
(k10.0,1.0,1)
(k1.0,100.0,1)
(k10.0,13.0,1)
(k10.0,11.0,3)
(k4.0,7.0,2)
(k7.0,8.0,1)
(k1.0,2.0,2)
(k4.0,5.0,1)
// counts obtained for each (key, value) pair

// Of two Array(value, count) records, keep the one with the larger count
def max(x: Array[Double], y: Array[Double]) = {
  if (x(1) > y(1)) x
  else y
}

val dcr = dcm.reduceByKey((x, y) => max(x, y)).collect()
// for each key, keep the record that occurs most often
dcr.foreach(x => {
  print(x._1 + " ")
  x._2.foreach(println(_))
})
//output
k1.0 2.0
2.0
k7.0 8.0
1.0
k10.0 11.0
3.0
k4.0 7.0
2.0
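
Building on the counts sketch above, the final step can also stay on tuples instead of Array[Double], so the count remains an integer and no driver round trip is needed; again a sketch, not the original program:

val modes = counts
  .reduceByKey((a, b) => if (a._2 >= b._2) a else b)   // per key, keep the (value, count) with the larger count
modes.collect().foreach { case (k, (v, c)) => println(s"$k $v $c") }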
