Spark Custom Partitioner

A custom partitioner extends org.apache.spark.Partitioner, overrides numPartitions and getPartition, and is applied to a key-value RDD with partitionBy, as in the listing below.

package test.wyh.wordcount

import org.apache.spark.{Partitioner, SparkConf, SparkContext}

object TestPartition {
  def main(args: Array[String]): Unit = {

    //Establish the Spark connection
    val sparkConf = new SparkConf().setMaster("local").setAppName("TestWordCountApp")
    val sc = new SparkContext(sparkConf)
    val rdd = sc.makeRDD(List(("Zhejiang", "Bob"), ("Shanghai", "Tom"), ("Beijing", "Lily"), ("Shanghai", "Alice")), 3)
    //Apply the custom partitioner
    val partitionRDD = rdd.partitionBy(new MyPartitioner)
    partitionRDD.saveAsTextFile("output")
    //Close the connection
    sc.stop()

  }

  class MyPartitioner extends Partitioner{
    //Number of partitions
    override def numPartitions: Int = 3

    //The original listing is cut off here; a plausible completion maps each
    //province key in the sample data to its own partition (must return a value in [0, numPartitions)).
    override def getPartition(key: Any): Int = key match {
      case "Zhejiang" => 0
      case "Shanghai" => 1
      case "Beijing"  => 2
      case _          => 0  //fall back to partition 0 for any other key
    }
  }
}

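To check where each record lands, a small verification sketch (not part of the original listing) can be placed inside main after partitionBy and before sc.stop(); it prints the partition index next to each element instead of writing files:

//Hypothetical check: assumes it runs inside main, right after partitionBy
partitionRDD
  .mapPartitionsWithIndex((index, iter) => iter.map(kv => (index, kv)))
  .collect()
  .foreach(println)

With the partitioner above, saveAsTextFile("output") writes three part files (part-00000 through part-00002), one per partition, so all "Shanghai" records end up in the same file.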