数据湖(十九):SQL API 读取Kafka数据实时写入Iceberg表

2022-07-24 02:13:14 浏览数 (1)

SQL API 读取Kafka数据实时写入Iceberg表

从Kafka中实时读取数据写入到Iceberg表中,操作步骤如下:

一、首先需要创建对应的Iceberg表

代码语言:java
// Set up the streaming execution environment and the table environment on top of it.
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
StreamTableEnvironment tblEnv = StreamTableEnvironment.create(env);
// Trigger a checkpoint every 1000 ms.
env.enableCheckpointing(1000);

// 1. Register a Hadoop-type Iceberg catalog whose warehouse lives on HDFS.
//    The literals are joined with '+'; adjacent string literals do not concatenate in Java.
tblEnv.executeSql("CREATE CATALOG hadoop_iceberg WITH (" +
        "'type'='iceberg'," +
        "'catalog-type'='hadoop'," +
        "'warehouse'='hdfs://mycluster/flink_iceberg')");

// 2. Create the Iceberg table flink_iceberg_tbl3, partitioned by loc.
tblEnv.executeSql("create table hadoop_iceberg.iceberg_db.flink_iceberg_tbl3(id int,name string,age int,loc string) partitioned by (loc)");

二、编写代码读取Kafka数据实时写入Iceberg

代码语言:java
public class ReadKafkaToIceberg {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        StreamTableEnvironment tblEnv = StreamTableEnvironment.create(env);
        env.enableCheckpointing(1000);

        /**
         * 1.需要预先创建 Catalog 及Iceberg表
         */
        //1.创建Catalog
        tblEnv.executeSql("CREATE CATALOG hadoop_iceberg WITH ("  
                "'type'='iceberg',"  
                "'catalog-type'='hadoop',"  
                "'warehouse'='hdfs://mycluster/flink_iceberg')");

        //2.创建iceberg表 flink_iceberg_tbl
//        tblEnv.executeSql("create table hadoop_iceberg.iceberg_db.flink_iceberg_tbl3(id int,name string,age int,loc string) partitioned by (loc)");

        //3.创建 Kafka Connector,连接消费Kafka中数据
        tblEnv.executeSql("create table kafka_input_table("  
                " id int,"  
                " name varchar,"  
                " age int,"  
                " loc varchar"  
                ") with ("  
                " 'connector' = 'kafka',"  
                " 'topic' = 'flink-iceberg-topic',"  
                " 'properties.bootstrap.servers'='node1:9092,node2:9092,node3:9092',"  
                " 'scan.startup.mode'='latest-offset',"  
                " 'properties.group.id' = 'my-group-id',"  
                " 'format' = 'csv'"  
                ")");

        //4.配置 table.dynamic-table-options.enabled
        Configuration configuration = tblEnv.getConfig().getConfiguration();
        // 支持SQL语法中的 OPTIONS 选项
        configuration.setBoolean("table.dynamic-table-options.enabled", true);

        //5.写入数据到表 flink_iceberg_tbl3
        tblEnv.executeSql("insert into hadoop_iceberg.iceberg_db.flink_iceberg_tbl3 select id,name,age,loc from kafka_input_table");

        //6.查询表数据
        TableResult tableResult = tblEnv.executeSql("select * from hadoop_iceberg.iceberg_db.flink_iceberg_tbl3 /*  OPTIONS('streaming'='true', 'monitor-interval'='1s')*/");
        tableResult.print();
    }
}

启动以上代码,向Kafka topic中生产如下数据:

代码语言:javascript复制
1,zs,18,beijing
2,ls,19,shanghai
3,ww,20,beijing
4,ml,21,shanghai

我们可以看到控制台上有对应的实时数据输出,查看对应的Iceberg HDFS目录,数据写入成功。

0 人点赞