Spark: reading Kafka data from specified offsets

1. Read the latest (maximum) offsets for the topic from ZooKeeper.
2. Manually specify the starting offsets. In real production these would be read from Redis; this is just a demo (a hedged sketch of the Redis lookup follows below).
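
For reference, here is a minimal sketch of what that Redis lookup might look like in production. It assumes the Jedis client and a hypothetical key layout (a hash named "kafka_offsets:<topic>" whose fields are partition numbers and whose values are offsets); neither appears in the original post.

import java.util

import redis.clients.jedis.Jedis

object RedisOffsetUtil {

  // Hypothetical helper: load per-partition starting offsets from a Redis hash.
  // Assumed layout: hash "kafka_offsets:<topic>", field = partition number, value = offset.
  def readStartOffsets(redisHost: String, redisPort: Int, topic: String): util.Map[Int, Long] = {
    val result: util.Map[Int, Long] = new util.HashMap[Int, Long]()
    val jedis = new Jedis(redisHost, redisPort)
    try {
      val it = jedis.hgetAll("kafka_offsets:" + topic).entrySet().iterator()
      while (it.hasNext) {
        val entry = it.next()
        result.put(entry.getKey.toInt, entry.getValue.toLong)
      }
    } finally {
      jedis.close()
    }
    result
  }
}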


import java.util

import com.alibaba.fastjson.{JSON, JSONObject}
import com.dianyou.util._
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.kafka010.{KafkaUtils, LocationStrategies, OffsetRange}




object SparkReadKafkaByOffsets {

  def main(args: Array[String]): Unit = {


    val sparkConf = new SparkConf()

    sparkConf.set("spark.app.name", "local_test")
    sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    sparkConf.set("spark.debug.maxToStringFields", "1000")
    sparkConf.set("spark.master", "local[*]")



    val spark = SparkSession
      .builder
      .config(sparkConf)
      .getOrCreate()


    val sc = spark.sparkContext


    val kafkaServer = "node4:9092,node5:9092,node6:9092,node7:9092"
    val kafkaGroup = "group-test2019"
//    val kafkaTopic = "dianyou_wxgz"
    val kafkaTopic = "dianyou_filter"

    // Kafka consumer parameters
    val kafkaParams: util.HashMap[String, Object] = new util.HashMap[String, Object]()
    kafkaParams.put("bootstrap.servers", kafkaServer)
    kafkaParams.put("key.deserializer", classOf[StringDeserializer])
    kafkaParams.put("value.deserializer", classOf[StringDeserializer])
    kafkaParams.put("group.id", kafkaGroup)
    kafkaParams.put("auto.offset.reset", "earliest")
    kafkaParams.put("enable.auto.commit", (false: java.lang.Boolean))

    // Latest (maximum) offsets per partition, fetched by the internal helper
    val latestOffsetMap: util.Map[Int, Long] = OffsetRangeUtils.getLatestTimeOffset(kafkaServer, kafkaTopic)
    // Manually specified starting offsets per partition (in production, read these from Redis)
    val startOffsetMap: util.Map[Int, Long] = new util.HashMap[Int, Long]()
    startOffsetMap.put(0, 520130L)
    startOffsetMap.put(1, 520120L)
    startOffsetMap.put(2, 520127L)
    startOffsetMap.put(3, 520114L)
    // Build the OffsetRanges from the specified starting offsets up to the latest offsets
    val offsetRanges: Array[OffsetRange] = OffsetRangeUtils
      .getAppointOffsetRange(
        kafkaServer,
        kafkaTopic,
        startOffsetMap,
        latestOffsetMap
      )

    for (or <- offsetRanges) {
      println("topic: " + or.topic + ", partition: " + or.partition + ", untilOffset: " + or.untilOffset)
      println("++++++++++++++++++++++++++++++")
    }

    // One-off batch read of exactly the records covered by offsetRanges;
    // PreferConsistent distributes Kafka partitions evenly across executors
    val rdd: RDD[ConsumerRecord[String, String]] = KafkaUtils
      .createRDD[String, String](
        sc,
        kafkaParams,
        offsetRanges,
        LocationStrategies.PreferConsistent)


    val rowRDD = rdd.map(line => {
      // Each record value is a JSON string
      val publicJson: JSONObject = JSON.parseObject(line.value())
      if (publicJson.containsKey("urlTime")) {
        // urlTime could be extracted here for further processing
        val urlTime = publicJson.getString("urlTime")
      }
      publicJson.toString
    })
    val count = rowRDD.count()
    val records = rowRDD.collect().toList
    println(records)
    println(count)
  }




}
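
OffsetRangeUtils comes from the internal com.dianyou.util package and its source is not shown here. Below is a minimal, hedged sketch of what getAppointOffsetRange might look like, inferred only from the call above: the signature mirrors that call, the kafkaServer parameter is kept purely for parity, and the clamping of the start offset to the latest offset is an assumption, not the original implementation.

import java.util

import scala.collection.JavaConverters._

import org.apache.spark.streaming.kafka010.OffsetRange

object OffsetRangeUtilsSketch {

  // Sketch only: build one OffsetRange per partition, reading from the supplied
  // start offset up to the latest offset. The start offset is clamped to the
  // latest offset so a range can never be negative.
  def getAppointOffsetRange(kafkaServer: String, // unused here, kept for signature parity
                            topic: String,
                            startOffsets: util.Map[Int, Long],
                            latestOffsets: util.Map[Int, Long]): Array[OffsetRange] = {
    val starts = startOffsets.asScala
    latestOffsets.asScala.map { case (partition, untilOffset) =>
      val fromOffset = math.min(starts.getOrElse(partition, 0L), untilOffset)
      OffsetRange(topic, partition, fromOffset, untilOffset)
    }.toArray
  }
}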
