Hudi 每次写入数据时都会生成一个时间戳(commit instant),用于标识该次写入的时间。基于该特性,查询时可以按该时间戳筛选 Hudi 中的数据。
使用 Flink 引擎查询时,可指定如下参数:
- 'read.start-commit' = '20220617160237493'
- 'read.end-commit' = '20220617160826396'
当进行数据查询时,可通过上述参数查询指定时间段写入的数据。
参考示例:
- 查询20220617160237493 – 20220617160826396时间段数据
-- Batch (non-streaming) read of an existing Hudi MERGE_ON_READ table, limited to
-- the data written between two commit instants.
-- The range is selected with 'read.start-commit' and 'read.end-commit'.
-- NOTE(review): the Hudi docs describe both bounds as inclusive; confirm for the
-- Hudi version in use.
CREATE TABLE orders_detail_hudi (
    id   INT,
    num  INT,
    name STRING,
    PRIMARY KEY (id) NOT ENFORCED
) WITH (
    'connector' = 'hudi',
    'table.type' = 'MERGE_ON_READ',
    'path' = 'hdfs://bigdata:9000/user/hive/warehouse/huditest_1/orders_detail_hudi',
    -- One-shot batch read rather than a continuous streaming read.
    'read.streaming.enabled' = 'false',
    -- Lower bound of the commit range.
    'read.start-commit' = '20220617160237493',
    -- Upper bound of the commit range. The option key is 'read.end-commit';
    -- 'read.end-start' is not a Hudi option and would leave the range unbounded.
    'read.end-commit' = '20220617160826396',
    'compaction.async.enabled' = 'false'
);
- 查询20220617160237493到当前时刻数据
-- Batch (non-streaming) read of an existing Hudi MERGE_ON_READ table, starting
-- at a given commit instant. No end commit is configured, so the read is not
-- capped by an upper bound.
CREATE TABLE orders_detail_hudi (
    id   INT,
    num  INT,
    name STRING,
    PRIMARY KEY (id) NOT ENFORCED
) WITH (
    'connector' = 'hudi',
    'path' = 'hdfs://bigdata:9000/user/hive/warehouse/huditest_1/orders_detail_hudi',
    'table.type' = 'MERGE_ON_READ',
    -- One-shot batch read rather than a continuous streaming read.
    'read.streaming.enabled' = 'false',
    -- Lower bound of the commit range to read.
    'read.start-commit' = '20220617160237493',
    'compaction.async.enabled' = 'false'
);
- 使用流式(增量)读取方式,持续查询从20220617160237493开始写入的数据
-- Streaming read of an existing Hudi MERGE_ON_READ table: consumption starts at
-- the given commit instant and streaming mode is switched on.
CREATE TABLE orders_detail_hudi (
    id   INT,
    num  INT,
    name STRING,
    PRIMARY KEY (id) NOT ENFORCED
) WITH (
    'connector' = 'hudi',
    'path' = 'hdfs://bigdata:9000/user/hive/warehouse/huditest_1/orders_detail_hudi',
    'table.type' = 'MERGE_ON_READ',
    -- Enables streaming read instead of a one-shot batch read.
    'read.streaming.enabled' = 'true',
    -- Commit instant from which consumption starts.
    'read.start-commit' = '20220617160237493',
    'compaction.async.enabled' = 'false'
);