Flink读取Kafka数据写入Iceberg(HiveCatalog)

Readme

环境版本:Java 8、Flink 1.13、Kafka 3.x、Iceberg 0.13

链路:Kafka -> Flink -> Iceberg(HiveCatalog)

代码

import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;
import org.apache.flink.table.data.GenericRowData;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.data.StringData;
import org.apache.flink.table.data.TimestampData;
import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.CatalogProperties;
import org.apache.iceberg.catalog.TableIdentifier;
import org.apache.iceberg.flink.CatalogLoader;
import org.apache.iceberg.flink.TableLoader;
import org.apache.iceberg.flink.sink.FlinkSink;
import org.json.JSONObject;

import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.HashMap;
import java.util.Properties;

/**
 * Streaming job that reads JSON messages from a Kafka topic and appends them
 * as rows to an Iceberg table registered in a Hive catalog.
 *
 * <p>Pipeline: Kafka -> Flink map (JSON -> {@link RowData}) -> Iceberg (HiveCatalog).
 */
public class FlinkIcebergStreamingExample {

    // ---- Kafka connection settings (placeholder values) ----
    private static final String KFK_BOOTSTRAP_SERVERS = "例如121.0.0.1:9092";
    private static final String KFK_GROUP_ID = "例如groupid";
    private static final String KFK_AUTO_OFFSET_RESET = "例如earliest";
    // ---- Iceberg / Hive catalog settings (placeholder values) ----
    private static final String ICE_WAREHOUSE_LOCATION = "例如hdfs://cluster/user/hive/warehouse";
    private static final String ICE_URI = "例如thrift://metastore.xx:9083";
    private static final String ICE_CATALOG_IMPL = "例如org.apache.iceberg.hive.HiveCatalog";
    private static final String KAFKA_SOURCE = "来源TOPIC";
    // Number of columns written per row; must match the target table schema.
    private static final int TARGET_FIELD_NUM = 3;
    private static final String FIELD1 = "第一个字段名";
    private static final String FIELD2 = "第二个字段名";
    private static final String FIELD3 = "第三个字段名";
    private static final String ICE_DB = "目标数据库";
    private static final String ICE_TABLE = "目标表表名";

    // DateTimeFormatter is immutable and thread-safe, so one shared instance is fine.
    private static final DateTimeFormatter TIMESTAMP_FORMATTER =
            DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");

    public static void main(String[] args) throws Exception {
        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Typed as DataStream<String>; the original used a raw DataStream, which
        // discarded compile-time type checking for the downstream operators.
        DataStream<String> kafkaStream = env.addSource(new FlinkKafkaConsumer<>(
                KAFKA_SOURCE,
                new SimpleStringSchema(),
                configureKafka()
        )).name("Kafka Source");

        // Convert each JSON message into an Iceberg-compatible RowData.
        // .returns(...) restores the output type information that Java erases
        // from the method reference.
        DataStream<RowData> rowDataStream = kafkaStream
                .map(FlinkIcebergStreamingExample::toRowData)
                .returns(RowData.class);

        FlinkSink.forRowData(rowDataStream)
                .tableLoader(configureIceberg(
                        ICE_WAREHOUSE_LOCATION, ICE_URI, ICE_CATALOG_IMPL, ICE_DB, ICE_TABLE))
                .append();
        env.execute("Flink Kafka to Iceberg Streaming Job");
    }

    /**
     * Maps one JSON message to a {@link RowData} with the expected column layout:
     * [0] long, [1] string, [2] timestamp.
     *
     * @param jsonMessage raw JSON payload from Kafka
     * @return a populated {@link GenericRowData}
     */
    private static RowData toRowData(String jsonMessage) {
        JSONObject jsonObject = new JSONObject(jsonMessage);
        GenericRowData rowData = new GenericRowData(TARGET_FIELD_NUM);
        // Numeric column example.
        rowData.setField(0, jsonObject.optLong(FIELD1));
        // Text column example.
        rowData.setField(1, StringData.fromString(jsonObject.optString(FIELD2)));
        // Timestamp column example. Original comment: subtract 8 hours because the
        // Kafka time is UTC while the Iceberg time is UTC+8.
        // NOTE(review): subtracting 8 hours FROM a UTC value yields UTC-8, not UTC+8 —
        // confirm the intended offset direction against the actual data.
        // NOTE(review): substring(0, 19) throws if the field is shorter than
        // "yyyy-MM-dd HH:mm:ss"; upstream is assumed to always send a full timestamp.
        rowData.setField(2, TimestampData.fromLocalDateTime(
                LocalDateTime.parse(jsonObject.optString(FIELD3).substring(0, 19), TIMESTAMP_FORMATTER)
                        .minusHours(8)
        ));
        return rowData;
    }

    /**
     * Builds the Kafka consumer properties.
     *
     * <p>Replaces the original double-brace initialization (an anonymous
     * {@code Properties} subclass), which leaks an enclosing-class reference
     * and complicates serialization.
     */
    private static Properties configureKafka() {
        Properties props = new Properties();
        props.setProperty("bootstrap.servers", KFK_BOOTSTRAP_SERVERS);
        props.setProperty("group.id", KFK_GROUP_ID);
        props.setProperty("auto.offset.reset", KFK_AUTO_OFFSET_RESET);
        return props;
    }

    /**
     * Creates a {@link TableLoader} for the target Iceberg table behind a Hive catalog.
     *
     * @param warehouse   warehouse root location (e.g. an HDFS path)
     * @param uri         Hive metastore thrift URI
     * @param catalogImpl fully-qualified catalog implementation class name
     * @param db          target database name
     * @param table       target table name
     * @return a loader pointing at {@code db.table} in the Hive catalog
     */
    private static TableLoader configureIceberg(
            String warehouse, String uri, String catalogImpl, String db, String table) {
        // Plain typed map instead of the original raw double-brace HashMap.
        HashMap<String, String> catalogProps = new HashMap<>();
        catalogProps.put(CatalogProperties.WAREHOUSE_LOCATION, warehouse);
        catalogProps.put(CatalogProperties.URI, uri);
        // NOTE(review): CatalogLoader.hive(...) already selects HiveCatalog; the
        // explicit CATALOG_IMPL property is kept for parity with the original.
        catalogProps.put(CatalogProperties.CATALOG_IMPL, catalogImpl);
        return TableLoader.fromCatalog(
                CatalogLoader.hive("iceberg", new Configuration(), catalogProps),
                TableIdentifier.of(db, table));
    }

}

Maven依赖

    
        8
        8
        1.8
        ${java.version}
        ${java.version}
        1.13.0
        2.12
        3.1.3
        0.13.2
        2.13.3 
    
    
        
            org.apache.flink
            flink-streaming-java_${scala.version}
            ${flink.version}
        

        
            org.apache.flink
            flink-connector-kafka_${scala.version}
            ${flink.version}
        

        
            org.apache.flink
            flink-clients_${scala.version}
            ${flink.version}
        
        
            org.apache.flink
            flink-json
            ${flink.version}
        

        
        
            org.apache.hadoop
            hadoop-client
            ${hadoop.version}
        

        
            org.projectlombok
            lombok
            1.18.20
        
        
            org.apache.flink
            flink-table-api-java-bridge_${scala.version}
            ${flink.version}
        

        
            org.apache.flink
            flink-table-planner-blink_${scala.version}
            ${flink.version}
        


        
        
            com.google.guava
            guava
            29.0-jre
        

        
            org.apache.flink
            flink-java
            ${flink.version}
        

        
        
            org.apache.iceberg
            iceberg-flink-1.13
            ${iceberg.version}
        

        
            org.apache.iceberg
            iceberg-hive-metastore
            ${iceberg.version}
        

        
            org.apache.iceberg
            iceberg-core
            ${iceberg.version}
        
        
            org.apache.iceberg
            iceberg-parquet
            ${iceberg.version}
        
        
            org.json
            json
            20210307
        
        
        
            org.slf4j
            slf4j-api
            1.7.25
        

        
            org.slf4j
            slf4j-log4j12
            1.7.25
        

        
            org.apache.logging.log4j
            log4j-to-slf4j
            2.14.0
        

        
            org.apache.flink
            flink-connector-jdbc_${scala.version}
            ${flink.version}
        

        
            org.apache.thrift
            libthrift
            0.20.0
        
        
            org.apache.hive
            hive-metastore
            ${hadoop.version} (注意:此处借用了 hadoop.version 作为 hive-metastore 的版本号,恰好同为 3.1.3;建议单独定义 hive.version 属性,避免与 Hadoop 版本耦合)
        
    

你可能感兴趣的:(大数据,flink,iceberg)