Spark - Custom Partitioner and Accumulator

Custom partitioner:

A custom partitioner decides which partition each key of a pair RDD lands in: extend Partitioner, report the partition count in numPartitions, and map every key to a partition index in getPartition. Passing an instance to partitionBy then reshuffles the data accordingly:

import org.apache.spark.rdd.RDD
import org.apache.spark.{Partitioner, SparkConf, SparkContext}

object Spark_Partitioner {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setMaster("local[*]").setAppName("partitioner"))
    val rdd = sc.makeRDD(
      List(
        ("nba", "xxxx"),
        ("cba", "xxxx"),
        ("cba", "xxxx"),
        ("cba", "xxxx"),
        ("nba", "xxxx"),
        ("wnba", "xxxx")
      ), 3
    )
    // Repartition the pair RDD with the custom partitioner
    val rdd1: RDD[(String, String)] = rdd.partitionBy(new MyPartitioner())
    rdd1.saveAsTextFile("output")
    sc.stop()
  }

  class MyPartitioner extends Partitioner {
    override def numPartitions: Int = 3

    // Return the partition index (0-based) for a given key
    override def getPartition(key: Any): Int = {
      key match {
        case "nba"  => 0
        case "cba"  => 1
        case "wnba" => 2
        case _      => 0 // fall back to partition 0 instead of throwing a MatchError
      }
    }
  }
}
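
To verify the distribution without opening the output files, each element's partition index can be printed with mapPartitionsWithIndex before sc.stop() (a minimal sketch against the rdd1 defined above):

    val withIndex = rdd1.mapPartitionsWithIndex(
      (idx, iter) => iter.map(kv => (idx, kv))
    )
    // Expected: all "nba" pairs under index 0, "cba" under 1, "wnba" under 2
    withIndex.collect().foreach(println)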

Custom accumulator:

An accumulator collects values from the executors back to the driver without a shuffle. AccumulatorV2[IN, OUT] is parameterized by the input element type and the output type; add is called inside tasks, merge combines the per-task copies on the driver, and value exposes the result:

import org.apache.spark.util.AccumulatorV2
import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.mutable

object Spark_Accumulator {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setMaster("local[*]").setAppName("accumulator"))
    // TODO: operator - custom accumulator
    val rdd = sc.makeRDD(List(("Hello", 1), ("Hello", 2), ("Hello", 4)), 2)
    // Declare the accumulator
    val wordCount = new WordCountAcc()
    // Register it with the SparkContext
    sc.register(wordCount, "WordCountAcc")
    // Use the accumulator inside a task
    rdd.foreach(
      t => wordCount.add(t)
    )
    // Read the accumulated value back on the driver
    println(wordCount.value)
    sc.stop()
  }

  class WordCountAcc extends AccumulatorV2[(String, Int), mutable.Map[String, Int]] {
    // The map that holds the per-word counts
    private val wordCountMap = mutable.Map[String, Int]()

    // Whether the accumulator is still in its initial (empty) state;
    // Spark checks this on the copies it ships to the executors
    override def isZero: Boolean = wordCountMap.isEmpty

    // Create a fresh copy of this accumulator
    override def copy(): AccumulatorV2[(String, Int), mutable.Map[String, Int]] = {
      new WordCountAcc()
    }

    // Reset the accumulator to its initial state
    override def reset(): Unit = wordCountMap.clear()

    // Add one element to the accumulator:
    // if the word is already in the map, add to its existing count,
    // otherwise insert the word with its count
    override def add(v: (String, Int)): Unit = {
      val (word, cnt) = v
      val oldCnt: Int = wordCountMap.getOrElse(word, 0)
      wordCountMap.update(word, cnt + oldCnt)
    }

    // Merge another (per-task) accumulator into this one on the driver
    override def merge(other: AccumulatorV2[(String, Int), mutable.Map[String, Int]]): Unit = {
      val otherMap: mutable.Map[String, Int] = other.value
      otherMap.foreach {
        case (word, cnt) =>
          val oldCnt: Int = this.wordCountMap.getOrElse(word, 0)
          this.wordCountMap.update(word, cnt + oldCnt)
      }
    }

    // Return the accumulated result (the OUT type)
    override def value: mutable.Map[String, Int] = wordCountMap
  }
}
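
Running the example prints the map with Hello -> 7 (the exact rendering depends on the Scala version): each of the two tasks adds into its own copy of the accumulator, and Spark combines the copies on the driver via merge. For comparison, the same count can be computed with a shuffle-based operator (a minimal sketch; unlike the accumulator, reduceByKey redistributes data across the cluster):

    val byKey = rdd.reduceByKey(_ + _)
    byKey.collect().foreach(println) // prints (Hello,7)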
