Sending Spark Structured Streaming Data to Confluent Kafka with the Avro Serializer

import io.confluent.kafka.serializers.{AbstractKafkaAvroSerDeConfig, KafkaAvroSerializer}
import org.apache.avro.Schema
import org.apache.avro.generic.{GenericData, GenericRecord}
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerConfig, ProducerRecord}
import org.apache.kafka.common.serialization.StringSerializer
import org.apache.spark.sql.{DataFrame, ForeachWriter, Row}
import org.apache.spark.sql.functions.{col, from_json}
import org.apache.spark.sql.streaming.StreamingQuery
import org.apache.spark.sql.types.{IntegerType, StringType, StructType}

import scala.collection.JavaConverters._

 

// Define the Avro schema. The records sent here are assumed to contain a "name" field and an "age" field.
val avroSchema = new Schema.Parser().parse("""
{
  "type": "record",
  "namespace": "example.avro",
  "name": "User",
  "fields": [
    {"name": "name", "type": "string"},
    {"name": "age", "type": "int"}
  ]
}
""")

 

// Configuration for the Avro serializer (points at the Confluent Schema Registry)
val avroSerializerConfig = Map[String, String](
  AbstractKafkaAvroSerDeConfig.SCHEMA_REGISTRY_URL_CONFIG -> "http://localhost:8081"
)
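
As a quick sanity check (illustrative only, and assuming the registry is reachable at the URL above), the serializer can be exercised outside Spark before wiring up the stream:

val valueSerializer = new KafkaAvroSerializer()
valueSerializer.configure(avroSerializerConfig.asJava, false) // false = configure as a value serializer
val wireBytes: Array[Byte] = valueSerializer.serialize("output_topic", sampleUser) // registers the schema on first use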

 

// Configuration for the Kafka producer. The value serializer is Confluent's KafkaAvroSerializer,
// which registers the schema with the Schema Registry and writes the Confluent wire format.
val kafkaProducerConfig = Map[String, Object](
  ProducerConfig.BOOTSTRAP_SERVERS_CONFIG -> "localhost:9092",
  ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG -> classOf[StringSerializer],
  ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG -> classOf[KafkaAvroSerializer],
  AbstractKafkaAvroSerDeConfig.SCHEMA_REGISTRY_URL_CONFIG -> "http://localhost:8081"
)

 

// A custom ForeachWriter that converts each Row to a GenericRecord and sends it to Kafka.
// It consumes Row rather than GenericRecord because Spark has no built-in Encoder for
// GenericRecord, so the streaming DataFrame below stays a Dataset[Row]. The schema is
// passed as a String and parsed in open(), since org.apache.avro.Schema is not serializable.
class KafkaAvroForeachWriter(topic: String, schemaString: String) extends ForeachWriter[Row] {
  var producer: KafkaProducer[String, GenericRecord] = _
  var schema: Schema = _

  override def open(partitionId: Long, epochId: Long): Boolean = {
    // One producer per partition/epoch; created here so nothing non-serializable is captured
    producer = new KafkaProducer[String, GenericRecord](kafkaProducerConfig.asJava)
    schema = new Schema.Parser().parse(schemaString)
    true
  }

  override def process(row: Row): Unit = {
    // Copy the Row fields into a GenericRecord, matching fields by name
    val record = new GenericData.Record(schema)
    schema.getFields.asScala.foreach { field =>
      record.put(field.name(), row.getAs[AnyRef](field.name()))
    }
    producer.send(new ProducerRecord[String, GenericRecord](topic, record))
  }

  override def close(errorOrNull: Throwable): Unit = {
    if (producer != null) producer.close()
  }
}

 

// The Structured Streaming query: read JSON strings from Kafka, parse them, and forward them as Avro.
// Note that from_json takes a Spark StructType, not an Avro Schema, so define the matching one here.
val jsonSchema = new StructType()
  .add("name", StringType)
  .add("age", IntegerType)

val df: DataFrame = spark.readStream
  .format("kafka")
  .option("kafka.bootstrap.servers", "localhost:9092")
  .option("subscribe", "input_topic")
  .option("startingOffsets", "earliest")
  .load()
  .selectExpr("CAST(value AS STRING)") // the input values are assumed to be JSON strings
  .select(from_json(col("value"), jsonSchema).as("data"))
  .select("data.*")

val query: StreamingQuery = df.writeStream
  .foreach(new KafkaAvroForeachWriter("output_topic", avroSchema.toString))
  .start()

query.awaitTermination()
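
For completeness, this example assumes the Confluent serializer is on the classpath; in sbt that looks roughly like the following (the version strings are placeholders to align with your own Spark and Confluent versions):

// build.sbt (versions are illustrative placeholders)
resolvers += "Confluent" at "https://packages.confluent.io/maven/"
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-sql-kafka-0-10"  % "3.5.0" % Provided,
  "io.confluent"      % "kafka-avro-serializer" % "7.5.0"
)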

A second variant makes the writer reusable by passing the topic, the schema string, and the producer parameters in as constructor arguments:

import java.io.ByteArrayOutputStream

import io.confluent.kafka.serializers.{AbstractKafkaAvroSerDeConfig, KafkaAvroSerializer}
import org.apache.avro.Schema
import org.apache.avro.generic.{GenericData, GenericDatumWriter, GenericRecord}
import org.apache.avro.io.EncoderFactory
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerConfig, ProducerRecord}
import org.apache.spark.sql.{ForeachWriter, Row}

import scala.collection.JavaConverters._

class KafkaForeachWriter(topic: String, schemaString: String, kafkaParams: Map[String, Object]) extends ForeachWriter[Row] {

  var kafkaProducer: KafkaProducer[GenericRecord, GenericRecord] = _
  var schema: Schema = _

  override def open(partitionId: Long, version: Long): Boolean = {
    // Set up the Kafka producer (KafkaProducer takes a java.util.Map, hence asJava)
    kafkaProducer = new KafkaProducer[GenericRecord, GenericRecord](kafkaParams.asJava)

    // Parse the Avro schema from the string passed to the constructor
    schema = new Schema.Parser().parse(schemaString)

    true
  }

  override def process(row: Row): Unit = {
    // Convert the Row to a GenericRecord, matching fields by name
    val genericRecord = new GenericData.Record(schema)
    schema.getFields.asScala.foreach { field =>
      genericRecord.put(field.name(), row.getAs[AnyRef](field.name()))
    }

    // Create the Kafka record (value only; the key is left null)
    val record = new ProducerRecord[GenericRecord, GenericRecord](topic, genericRecord)

    // Send to Kafka
    kafkaProducer.send(record)
  }

  override def close(errorOrNull: Throwable): Unit = {
    // Close the Kafka producer
    if (kafkaProducer != null) kafkaProducer.close()
  }
}
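
For reference, a producer parameter map matching this writer's [GenericRecord, GenericRecord] types would route both key and value through Confluent's KafkaAvroSerializer, roughly like this (the endpoint values are assumptions):

val kafkaParams: Map[String, Object] = Map(
  ProducerConfig.BOOTSTRAP_SERVERS_CONFIG -> "localhost:9092",
  ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG -> classOf[KafkaAvroSerializer],
  ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG -> classOf[KafkaAvroSerializer],
  AbstractKafkaAvroSerDeConfig.SCHEMA_REGISTRY_URL_CONFIG -> "http://localhost:8081"
)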

 

A common pitfall with all of these writers: if the ForeachWriter (or a closure it captures) holds an already-parsed org.apache.avro.Schema when Spark serializes the writer to ship it to the executors, the query fails, because Schema is not serializable:

Caused by: java.io.NotSerializableException: org.apache.avro.Schema$RecordSchema
Serialization stack:
- object not serializable (class: org.apache.avro.Schema$RecordSchema, value: {"type":"record","name":"topLevelRecord","fields":[{"name":"avro_value","type":{"type":"record","name":"avro_value","namespace":"topLevelRecord","fields":[{"name":"tb_name","type":"string"},{"name":"customer_id","type":["string","null"]}]}}]})

The fix is to pass the schema around as a plain String and parse it inside open(), which runs on the executors. The writer below does exactly that. It also bypasses the Schema Registry serializer entirely and writes raw binary-encoded Avro with a byte-array producer, so kafkaParams should use ByteArraySerializer for key and value, and consumers must already know the schema, since the bytes carry no Confluent wire-format header:

class AvroRecordForeachWriter(schemaString: String, kafkaParams: Map[String, Object], topic: String)
    extends ForeachWriter[Row] with Serializable {

  private var producer: KafkaProducer[Array[Byte], Array[Byte]] = _
  private var avroSchema: Schema = _

  override def open(partitionId: Long, version: Long): Boolean = {
    producer = new KafkaProducer[Array[Byte], Array[Byte]](kafkaParams.asJava)
    // Parse here, on the executor, so the non-serializable Schema never crosses the wire
    avroSchema = new Schema.Parser().parse(schemaString)
    true
  }

  override def process(value: Row): Unit = {
    // Build a GenericRecord field by field; Row columns are assumed to be in schema order
    val genericRecord = new GenericData.Record(avroSchema)
    for (i <- 0 until value.length) {
      val field = avroSchema.getFields.get(i)
      val fieldName = field.name()
      val fieldType = field.schema().getType
      val fieldValue = value.get(i)
      if (fieldType == Schema.Type.ARRAY) {
        // Avro array fields need an explicit GenericData.Array wrapper
        val avroArray = new GenericData.Array[AnyRef](field.schema(), fieldValue.asInstanceOf[Seq[AnyRef]].asJava)
        genericRecord.put(fieldName, avroArray)
      } else {
        genericRecord.put(fieldName, fieldValue)
      }
    }
    // Binary-encode the record (plain Avro, no Schema Registry framing)
    val writer = new ByteArrayOutputStream()
    val encoder = EncoderFactory.get().binaryEncoder(writer, null)
    val datumWriter = new GenericDatumWriter[GenericRecord](avroSchema)
    datumWriter.write(genericRecord, encoder)
    encoder.flush()
    writer.close()
    val message = new ProducerRecord[Array[Byte], Array[Byte]](topic, writer.toByteArray)
    producer.send(message)
  }

  override def close(errorOrNull: Throwable): Unit = {
    if (producer != null) producer.close()
  }
}
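
A minimal sketch of wiring this writer into the stream, assuming df is the parsed DataFrame from the first example; since the writer emits raw bytes, the producer uses ByteArraySerializer, and the checkpoint path is a placeholder:

import org.apache.kafka.common.serialization.ByteArraySerializer

val byteParams: Map[String, Object] = Map(
  ProducerConfig.BOOTSTRAP_SERVERS_CONFIG -> "localhost:9092",
  ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG -> classOf[ByteArraySerializer],
  ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG -> classOf[ByteArraySerializer]
)

val rawQuery = df.writeStream
  .foreach(new AvroRecordForeachWriter(avroSchema.toString, byteParams, "output_topic"))
  .option("checkpointLocation", "/tmp/avro-writer-checkpoint") // placeholder path
  .start()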

 
