import io.confluent.kafka.serializers.{AbstractKafkaAvroSerDeConfig, KafkaAvroSerializer}
import org.apache.avro.Schema
import org.apache.avro.generic.{GenericData, GenericRecord}
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerConfig, ProducerRecord}
import org.apache.kafka.common.serialization.StringSerializer
import org.apache.spark.sql.{DataFrame, ForeachWriter, Row}
import org.apache.spark.sql.functions.from_json
import org.apache.spark.sql.streaming.StreamingQuery
import org.apache.spark.sql.types.{IntegerType, StringType, StructType}
import scala.collection.JavaConverters._
// Define the Avro schema; each outgoing record is assumed to have a "name" field and an "age" field
val avroSchema = new Schema.Parser().parse("""
  {
    "type": "record",
    "namespace": "example.avro",
    "name": "User",
    "fields": [
      {"name": "name", "type": "string"},
      {"name": "age", "type": "int"}
    ]
  }
""")
// Avro serializer configuration: the Schema Registry URL (also included in the producer config below)
val avroSerializerConfig = Map[String, String](
  AbstractKafkaAvroSerDeConfig.SCHEMA_REGISTRY_URL_CONFIG -> "http://localhost:8081"
)

// Kafka producer configuration
val kafkaProducerConfig = Map[String, Object](
  ProducerConfig.BOOTSTRAP_SERVERS_CONFIG -> "localhost:9092",
  ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG -> classOf[StringSerializer],
  ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG -> classOf[KafkaAvroSerializer],
  AbstractKafkaAvroSerDeConfig.SCHEMA_REGISTRY_URL_CONFIG -> "http://localhost:8081"
)
// A custom ForeachWriter that converts each Row into an Avro GenericRecord and sends it to Kafka.
// The schema is passed as a JSON string and parsed in open(), because org.apache.avro.Schema is not serializable.
class KafkaAvroForeachWriter(topic: String, schemaString: String) extends ForeachWriter[Row] {
  var producer: KafkaProducer[String, GenericRecord] = _
  var schema: Schema = _

  override def open(partitionId: Long, epochId: Long): Boolean = {
    producer = new KafkaProducer[String, GenericRecord](kafkaProducerConfig.asJava)
    schema = new Schema.Parser().parse(schemaString)
    true
  }

  override def process(row: Row): Unit = {
    // Copy the Row fields into a GenericRecord matching the Avro schema
    val record = new GenericData.Record(schema)
    record.put("name", row.getAs[String]("name"))
    record.put("age", row.getAs[Int]("age"))
    producer.send(new ProducerRecord[String, GenericRecord](topic, record))
  }

  override def close(errorOrNull: Throwable): Unit = {
    producer.close()
  }
}
// Spark SQL schema mirroring the Avro schema, used by from_json to parse the incoming JSON payload
val jsonSchema = new StructType()
  .add("name", StringType)
  .add("age", IntegerType)

import spark.implicits._

// Structured Streaming query: read JSON messages from Kafka and flatten them into columns
val df: DataFrame = spark.readStream
  .format("kafka")
  .option("kafka.bootstrap.servers", "localhost:9092")
  .option("subscribe", "input_topic")
  .option("startingOffsets", "earliest")
  .load()
  .selectExpr("CAST(value AS STRING)") // the incoming payload is assumed to be JSON
  .select(from_json($"value", jsonSchema).as("data"))
  .select("data.*")
val query: StreamingQuery = df.writeStream
  .foreach(new KafkaAvroForeachWriter("output_topic", avroSchema.toString))
  .start()

query.awaitTermination()
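A variation of the same approach keeps the Row-to-GenericRecord conversion entirely inside the writer, so the streaming query itself needs no Avro-specific code: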
import org.apache.avro.Schema
import org.apache.avro.generic.{GenericData, GenericRecord}
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}
import org.apache.spark.sql.{ForeachWriter, Row}
import scala.collection.JavaConverters._
class KafkaForeachWriter(topic: String, schemaString: String, kafkaParams: Map[String, Object]) extends ForeachWriter[Row] {
  var kafkaProducer: KafkaProducer[GenericRecord, GenericRecord] = _
  var schema: Schema = _

  def open(partitionId: Long, version: Long): Boolean = {
    // Set up the Kafka producer (kafkaParams must configure KafkaAvroSerializer for both key and value)
    kafkaProducer = new KafkaProducer[GenericRecord, GenericRecord](kafkaParams.asJava)
    // Parse the Avro schema on the executor, not the driver
    schema = new Schema.Parser().parse(schemaString)
    true
  }

  def process(row: Row): Unit = {
    // Convert the Row to a GenericRecord by copying fields by name
    // (assumes a flat schema whose field names match the DataFrame columns)
    val genericRecord = new GenericData.Record(schema)
    schema.getFields.asScala.foreach { field =>
      genericRecord.put(field.name(), row.getAs[AnyRef](field.name()))
    }
    // Create the Kafka record; the record is used as both the message key and the value
    val record = new ProducerRecord[GenericRecord, GenericRecord](topic, genericRecord, genericRecord)
    // Send to Kafka
    kafkaProducer.send(record)
  }

  def close(errorOrNull: Throwable): Unit = {
    // Close the Kafka producer
    kafkaProducer.close()
  }
}
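If an org.apache.avro.Schema is instead created on the driver and captured by the writer (or by any other closure Spark has to ship to the executors), starting the query fails at task serialization with an error like: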
Caused by: java.io.NotSerializableException: org.apache.avro.Schema$RecordSchema
Serialization stack:
  - object not serializable (class: org.apache.avro.Schema$RecordSchema, value:
    {"type":"record","name":"topLevelRecord","fields":[{"name":"avro_value","type":{"type":"record","name":"avro_value","namespace":"topLevelRecord","fields":[{"name":"tb_name","type":"string"},{"name":"customer_id","type":["string","null"]}]}}]})
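org.apache.avro.Schema does not implement java.io.Serializable, so it cannot be a field of, or be captured by, anything Spark serializes. The writer below avoids the problem by accepting only the schema JSON string and parsing it inside open(), which runs on the executor; it also encodes each record to Avro binary itself, so the producer only needs plain byte-array serializers: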
import java.io.ByteArrayOutputStream

import org.apache.avro.Schema
import org.apache.avro.generic.{GenericData, GenericDatumWriter, GenericRecord}
import org.apache.avro.io.EncoderFactory
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}
import org.apache.spark.sql.{ForeachWriter, Row}
import scala.collection.JavaConverters._

// Serializes each Row to raw Avro bytes and publishes them to Kafka.
// Only the schema JSON string is captured; the Schema itself is parsed in open().
class AvroRecordForeachWriter(schemaString: String, kafkaParams: Map[String, Object], topic: String)
  extends ForeachWriter[Row] with Serializable {

  private var producer: KafkaProducer[Array[Byte], Array[Byte]] = _
  private var avroSchema: Schema = _

  override def open(partitionId: Long, version: Long): Boolean = {
    producer = new KafkaProducer[Array[Byte], Array[Byte]](kafkaParams.asJava)
    avroSchema = new Schema.Parser().parse(schemaString)
    true
  }

  override def process(value: Row): Unit = {
    // Build the GenericRecord field by field; assumes the Row's column order matches the Avro field order
    val genericRecord = new GenericData.Record(avroSchema)
    for (i <- 0 until value.length) {
      val field = avroSchema.getFields.get(i)
      val fieldName = field.name()
      val fieldType = field.schema().getType
      val fieldValue = value.getAs[AnyRef](i)
      if (fieldType == Schema.Type.ARRAY) {
        // Wrap Scala sequences in a GenericData.Array so Avro can serialize them
        val avroArray = new GenericData.Array[AnyRef](field.schema(), fieldValue.asInstanceOf[Seq[AnyRef]].asJava)
        genericRecord.put(fieldName, avroArray)
      } else {
        genericRecord.put(fieldName, fieldValue)
      }
    }
    // Encode the record to Avro binary and send the raw bytes
    val out = new ByteArrayOutputStream()
    val encoder = EncoderFactory.get().binaryEncoder(out, null)
    val datumWriter = new GenericDatumWriter[GenericRecord](avroSchema)
    datumWriter.write(genericRecord, encoder)
    encoder.flush()
    out.close()
    val message = new ProducerRecord[Array[Byte], Array[Byte]](topic, out.toByteArray)
    producer.send(message)
  }

  override def close(errorOrNull: Throwable): Unit = {
    producer.close()
  }
}
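For completeness, a minimal sketch of how this writer could be wired into a streaming query. The bootstrap server, topic name, checkpoint path, and the reuse of the earlier df and avroSchema values are assumptions for illustration, not part of the original code:

// Minimal sketch (assumed values); the writer already produces Avro bytes, so plain byte-array serializers suffice
val byteArrayKafkaParams = Map[String, Object](
  "bootstrap.servers" -> "localhost:9092",
  "key.serializer" -> classOf[org.apache.kafka.common.serialization.ByteArraySerializer],
  "value.serializer" -> classOf[org.apache.kafka.common.serialization.ByteArraySerializer]
)

val avroQuery = df.writeStream
  .foreach(new AvroRecordForeachWriter(avroSchema.toString, byteArrayKafkaParams, "output_topic"))
  .option("checkpointLocation", "/tmp/avro-foreach-checkpoint") // assumed path
  .start()

avroQuery.awaitTermination()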