Iceberg 1.4.2 Java Table Management (DDL and DML) Operations

The biggest gap in data lakes today is data governance: capabilities such as metadata management and table management are largely missing, and this kind of administration usually has to be done through a web application with a visual interface. The three mainstream open table formats are Iceberg, Hudi, and Delta Lake (delta.io), and of these only Iceberg provides a Java API for creating, modifying, and listing tables, as well as writing and querying data, from plain Java. Since it is an open table format, it should not depend on components such as Hadoop, Hive, Spark, or Flink; it should be usable directly from languages like Java, Go, and Python without heavy dependencies. Iceberg does this well: its goal is to be a standard, rather than turning into a database the way Hudi has, which loses the point of a data lake and an open table format. Delta Lake already supports the Iceberg standard, so it seems likely that data lake formats will eventually converge on a unified standard.

The following walks through Java operations against Iceberg 1.4.2: creating a table, managing it, writing data, and querying data. The components involved (Spark, Iceberg, MinIO, and others) have different dependency versions, and it took quite a while to debug a combination that works together; the versions below are the result. Only the Java API is used here, so you can copy the dependencies below directly and drop the Spark ones if you do not need them.

  
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.13</artifactId>
            <version>3.5.0</version>
        </dependency>
        <dependency>
            <groupId>io.delta</groupId>
            <artifactId>delta-spark_2.12</artifactId>
            <version>3.0.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.iceberg</groupId>
            <artifactId>iceberg-core</artifactId>
            <version>1.4.2</version>
        </dependency>
        <dependency>
            <groupId>io.minio</groupId>
            <artifactId>minio</artifactId>
            <version>8.5.7</version>
        </dependency>
        <dependency>
            <groupId>com.amazonaws</groupId>
            <artifactId>aws-java-sdk-s3</artifactId>
            <version>1.12.620</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-aws</artifactId>
            <version>3.2.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.parquet</groupId>
            <artifactId>parquet-hadoop</artifactId>
            <version>1.13.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.iceberg</groupId>
            <artifactId>iceberg-api</artifactId>
            <version>1.4.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.iceberg</groupId>
            <artifactId>iceberg-bundled-guava</artifactId>
            <version>1.4.2</version>
        </dependency>
        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-core</artifactId>
            <version>2.15.3</version>
        </dependency>
        <dependency>
            <groupId>org.apache.iceberg</groupId>
            <artifactId>iceberg-common</artifactId>
            <version>1.4.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.iceberg</groupId>
            <artifactId>iceberg-parquet</artifactId>
            <version>1.4.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.iceberg</groupId>
            <artifactId>iceberg-aws</artifactId>
            <version>1.4.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.iceberg</groupId>
            <artifactId>iceberg-arrow</artifactId>
            <version>1.4.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.iceberg</groupId>
            <artifactId>iceberg-data</artifactId>
            <version>1.4.2</version>
        </dependency>

Before any table operation we first need a catalog. This example stores data in MinIO rather than Hadoop (HDFS), so the S3A filesystem is configured to point at the local MinIO endpoint:

        Configuration conf = new Configuration();
        // Credential provider chain for S3A
        conf.set("fs.s3a.aws.credentials.provider",
                "com.amazonaws.auth.InstanceProfileCredentialsProvider,"
                + "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider,"
                + "com.amazonaws.auth.EnvironmentVariableCredentialsProvider");
        conf.set("fs.s3a.connection.ssl.enabled", "false");
        conf.set("fs.s3a.endpoint", "http://127.0.0.1:9000");
        conf.set("fs.s3a.access.key", "minioadmin");
        conf.set("fs.s3a.secret.key", "minioadmin");
        conf.set("fs.s3a.path.style.access", "true");
        conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem");
        conf.set("fs.s3a.fast.upload", "true");

        String warehousePath = "s3a://test/";   // MinIO bucket path used as the warehouse root
        System.out.println(warehousePath);
        HadoopCatalog catalog = new HadoopCatalog(conf, warehousePath);
        System.out.println(catalog.name());
        TableIdentifier name = TableIdentifier.of("iceberg_db", "table_ice");
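
Once the catalog is built, the same Catalog API also covers the "list and display" side of table management. A minimal sketch (the namespace and table only exist after the create step below has run at least once):

        // List all namespaces and the tables inside iceberg_db through the HadoopCatalog
        for (Namespace ns : catalog.listNamespaces()) {
            System.out.println("namespace: " + ns);
        }
        for (TableIdentifier id : catalog.listTables(Namespace.of("iceberg_db"))) {
            System.out.println("table: " + id);
        }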

Next, create the table:

        // Define the table schema
        Schema schema = new Schema(
                Types.NestedField.required(1, "id", Types.IntegerType.get()),
                Types.NestedField.required(2, "name", Types.StringType.get()),
                Types.NestedField.required(3, "birth", Types.StringType.get())
        );

        // Partition spec: identity partitioning on the id column
        PartitionSpec spec = PartitionSpec.builderFor(schema)
                .identity("id")
                .build();

        // Table properties could also be passed to createTable, e.g.
        //   Map<String, String> properties = new HashMap<>();
        //   properties.put("engine.hive.enabled", "true");
        //   Table table = catalog.createTable(name, schema, spec, properties);

        // Create the namespace (database) if it does not exist yet
        if (!catalog.namespaceExists(Namespace.of("iceberg_db"))) {
            catalog.createNamespace(Namespace.of("iceberg_db"));
        }

        Table table;
        // Create the table if it does not exist, otherwise load it
        if (!catalog.tableExists(name)) {
            table = catalog.createTable(name, schema, spec);
        } else {
            System.out.println("Table already exists");
            table = catalog.loadTable(name);
        }
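
The rest of the DDL surface (modifying and dropping tables) goes through the same Table and Catalog objects. The following is a sketch rather than part of the original example; it assumes the table created above and uses the standard Iceberg schema-evolution API:

        // Schema evolution: add an optional column to the existing table and commit
        table.updateSchema()
                .addColumn("email", Types.StringType.get())
                .commit();

        // Dropping the table goes through the catalog; purge = true also removes the data files
        // catalog.dropTable(name, true);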

Next, write data into the Iceberg table from Java:

    public void javaCatalogAppend(Schema schema, Table table) throws IOException {
        // 1. Build the records to insert
        GenericRecord record = GenericRecord.create(schema);
        ImmutableList.Builder<GenericRecord> builder = ImmutableList.builder();
        builder.add(record.copy(ImmutableMap.of("id", 1, "name", "liuyang", "birth", "2020-03-08")));
        builder.add(record.copy(ImmutableMap.of("id", 2, "name", "chengx", "birth", "2021-03-09")));
        ImmutableList<GenericRecord> records = builder.build();

        // 2. Write the records into a Parquet data file
        System.out.println("table.location(): " + table.location());
        String filepath = table.location() + "/" + UUID.randomUUID().toString();
        OutputFile file = table.io().newOutputFile(filepath);
        DataWriter<GenericRecord> dataWriter =
                Parquet.writeData(file)
                        .schema(schema)
                        .createWriterFunc(GenericParquetWriter::buildWriter)
                        .overwrite()
                        .withSpec(PartitionSpec.unpartitioned())
                        .build();
        try {
            dataWriter.write(records);
        } finally {
            dataWriter.close();
        }

        // 3. Append the data file to the table and commit
        DataFile dataFile = dataWriter.toDataFile();
        table.newAppend().appendFile(dataFile).commit();
    }
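
Note that the writer above is built with PartitionSpec.unpartitioned() even though the table was created with an identity(id) partition spec. A hedged sketch of what a partition-aware write could look like follows; it assumes the record and table variables from the method above, an extra import of org.apache.iceberg.PartitionKey, and the data/ prefix and .parquet suffix are purely illustrative choices:

        // Compute the partition tuple for a record and attach it to the data file being written
        PartitionKey partitionKey = new PartitionKey(table.spec(), table.schema());
        partitionKey.partition(record);   // record is a GenericRecord with the id field set

        DataWriter<GenericRecord> partitionedWriter =
                Parquet.writeData(table.io().newOutputFile(
                                table.location() + "/data/" + UUID.randomUUID() + ".parquet"))
                        .schema(table.schema())
                        .createWriterFunc(GenericParquetWriter::buildWriter)
                        .withSpec(table.spec())        // the table's identity(id) spec
                        .withPartition(partitionKey)   // partition value carried by this file
                        .overwrite()
                        .build();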

Querying data from Java with a table scan:

    private void javaCatalogScan(Table table) {
        IcebergGenerics.ScanBuilder scanBuilder = IcebergGenerics.read(table);

        CloseableIterable<Record> records = scanBuilder.build();

        for (Record r : records) {
            System.out.print(r.get(0));
            System.out.print("|");
            System.out.print(r.get(1));
            System.out.print("|");
            System.out.print(r.get(2));
            System.out.println();
        }
    }
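
The ScanBuilder also supports column projection and row filters. A minimal sketch, assuming org.apache.iceberg.expressions.Expressions is added to the imports:

        // Project only the name column and filter on id = 1
        CloseableIterable<Record> filtered = IcebergGenerics.read(table)
                .select("name")
                .where(Expressions.equal("id", 1))
                .build();
        for (Record r : filtered) {
            System.out.println(r.getField("name"));
        }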

The complete example:

import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.Table;
import org.apache.iceberg.catalog.Namespace;
import org.apache.iceberg.catalog.TableIdentifier;
import org.apache.iceberg.data.GenericRecord;
import org.apache.iceberg.data.IcebergGenerics;
import org.apache.iceberg.data.Record;
import org.apache.iceberg.data.parquet.GenericParquetWriter;
import org.apache.iceberg.hadoop.HadoopCatalog;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.io.DataWriter;
import org.apache.iceberg.io.OutputFile;
import org.apache.iceberg.parquet.Parquet;
import org.apache.iceberg.types.Types;

import java.io.IOException;
import java.util.UUID;

public class icebergapi {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // Credential provider chain for S3A
        conf.set("fs.s3a.aws.credentials.provider",
                "com.amazonaws.auth.InstanceProfileCredentialsProvider,"
                + "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider,"
                + "com.amazonaws.auth.EnvironmentVariableCredentialsProvider");
        conf.set("fs.s3a.connection.ssl.enabled", "false");
        conf.set("fs.s3a.endpoint", "http://127.0.0.1:9000");
        conf.set("fs.s3a.access.key", "minioadmin");
        conf.set("fs.s3a.secret.key", "minioadmin");
        conf.set("fs.s3a.path.style.access", "true");
        conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem");
        conf.set("fs.s3a.fast.upload", "true");

        String warehousePath = "s3a://test/";   // MinIO bucket path used as the warehouse root
        System.out.println(warehousePath);
        HadoopCatalog catalog = new HadoopCatalog(conf, warehousePath);
        System.out.println(catalog.name());
        TableIdentifier name = TableIdentifier.of("iceberg_db", "table_ice");

        // Define the table schema
        Schema schema = new Schema(
                Types.NestedField.required(1, "id", Types.IntegerType.get()),
                Types.NestedField.required(2, "name", Types.StringType.get()),
                Types.NestedField.required(3, "birth", Types.StringType.get())
        );

        // Partition spec: identity partitioning on the id column
        PartitionSpec spec = PartitionSpec.builderFor(schema)
                .identity("id")
                .build();

        // Table properties could also be passed to createTable, e.g.
        //   Map<String, String> properties = new HashMap<>();
        //   properties.put("engine.hive.enabled", "true");
        //   Table table = catalog.createTable(name, schema, spec, properties);

        // Create the namespace (database) if it does not exist yet
        if (!catalog.namespaceExists(Namespace.of("iceberg_db"))) {
            catalog.createNamespace(Namespace.of("iceberg_db"));
        }

        Table table;
        // Create the table if it does not exist, otherwise load it
        if (!catalog.tableExists(name)) {
            table = catalog.createTable(name, schema, spec);
        } else {
            System.out.println("Table already exists");
            table = catalog.loadTable(name);
        }
        icebergapi api = new icebergapi();
        // Write data
        api.javaCatalogAppend(schema, table);
        // Query data
        api.javaCatalogScan(table);
    }



    public void javaCatalogAppend(Schema schema, Table table) throws IOException {
        // 1. Build the records to insert
        GenericRecord record = GenericRecord.create(schema);
        ImmutableList.Builder<GenericRecord> builder = ImmutableList.builder();
        builder.add(record.copy(ImmutableMap.of("id", 1, "name", "liuyang", "birth", "2020-03-08")));
        builder.add(record.copy(ImmutableMap.of("id", 2, "name", "chengx", "birth", "2021-03-09")));
        ImmutableList<GenericRecord> records = builder.build();

        // 2. Write the records into a Parquet data file
        System.out.println("table.location(): " + table.location());
        String filepath = table.location() + "/" + UUID.randomUUID().toString();
        OutputFile file = table.io().newOutputFile(filepath);
        DataWriter<GenericRecord> dataWriter =
                Parquet.writeData(file)
                        .schema(schema)
                        .createWriterFunc(GenericParquetWriter::buildWriter)
                        .overwrite()
                        .withSpec(PartitionSpec.unpartitioned())
                        .build();
        try {
            dataWriter.write(records);
        } finally {
            dataWriter.close();
        }

        // 3. Append the data file to the table and commit
        DataFile dataFile = dataWriter.toDataFile();
        table.newAppend().appendFile(dataFile).commit();
    }
    private void javaCatalogScan(Table table) {
        IcebergGenerics.ScanBuilder scanBuilder = IcebergGenerics.read(table);

        CloseableIterable<Record> records = scanBuilder.build();

        for (Record r : records) {
            System.out.print(r.get(0));
            System.out.print("|");
            System.out.print(r.get(1));
            System.out.print("|");
            System.out.print(r.get(2));
            System.out.println();
        }
    }
}
