二百七十三、Kettle——ClickHouse中增量导入数据准确性统计表数据(1天1次)
一、目的
在数据质量模块,需要对原始数据的准确性进行统计
二、Hive中原有代码
2.1 表结构
--42、数据准确性统计表 dwd_data_accuracy create table if not exists hurys_db.dwd_data_accuracy( data_type int comment '1:转向比,2:统计,3:评价,4:区域,5:过车,6:静态排队,7:动态排队,8:轨迹,9:事件数据,10:事件资源', device_no string comment '设备编号', field_name string comment '字段名', data_unreasonable_rate float comment '数据不合理率', data_null_rate float comment '数据空值率' ) comment '数据准确性统计表' partitioned by (day string) stored as orc ;
2.2 SQL代码
insert overwrite table hurys_db.dwd_data_accuracy partition(day) select t1.data_type, t1.device_no, t1.field_name, round((sum(case when t1.field_value is not null then 1 else 0 end)/t2.count_device_all),2) data_unreasonable_rate, round((sum(case when t1.field_value is null then 1 else 0 end)/t2.count_device_all),2) data_null_rate , t1.day from hurys_db.dwd_data_clean_record_queue as t1 left join (select device_no, day, count(device_no) count_device_all from hurys_db.ods_queue where day='2024-09-04' group by device_no, day ) as t2 on t2.device_no=t1.device_no and t2.day=t1.day where t2.count_device_all is not null group by t1.data_type, t1.device_no, t1.field_name, t2.count_device_all, t1.day
三、ClickHouse中现有代码
3.1 表结构
--42、八大类基础数据准确性统计表(长期存储) create table if not exists hurys_jw.dwd_data_accuracy( data_type Int32 comment '1:转向比,2:统计,3:评价,4:区域,5:过车,6:静态排队,7:动态排队,8:轨迹,9:事件数据,10:事件资源', device_no String comment '设备编号', field_name String comment '字段名', data_unreasonable_rate Decimal(10, 6) comment '数据不合理率', data_null_rate Decimal(10, 6) comment '数据空值率', day Date comment '日期' ) ENGINE = MergeTree PARTITION BY day PRIMARY KEY day ORDER BY day SETTINGS index_granularity = 8192;
3.2 SQL代码
--静态排队 select data_type, device_no, field_name, round(count_field_unreasonable / count_device_all,6) data_unreasonable_rate, round(count_field_null / count_device_all,6) data_null_rate, cast(day as String) day from (select t1.data_type, t1.device_no, t1.field_name, sum(case when field_name is not null then 1 else 0 end) count_field_unreasonable, sum(case when field_name is null then 1 else 0 end) count_field_null, t2.count_device_all, t1.day from hurys_jw.dwd_data_clean_record_queue as t1 left join (select device_no, DATE(create_time) day, count(device_no) count_device_all from hurys_jw.ods_queue where day='2024-10-22' group by device_no, day ) as t2 on t2.device_no=t1.device_no and t2.day=t1.day where t2.count_device_all > 0 group by t1.data_type, t1.device_no, t1.field_name, t2.count_device_all, t1.day) ;
3.3 Kettle任务
3.3.1 newtime
data:image/s3,"s3://crabby-images/65e47/65e47116054e2d8f843477816f0addd8f3ef535e" alt=""
3.3.2 替换NULL值
data:image/s3,"s3://crabby-images/07965/07965000f82679cb1b1aeaf3c949b6b07065f0f8" alt=""
3.3.3 静态排队
select
data_type,
device_no,
field_name,
round(count_field_unreasonable / count_device_all,6) data_unreasonable_rate,
round(count_field_null / count_device_all,6) data_null_rate,
cast(day as String) day
from (select
t1.data_type,
t1.device_no,
t1.field_name,
sum(case when field_name is not null then 1 else 0 end) count_field_unreasonable,
sum(case when field_name is null then 1 else 0 end) count_field_null,
t2.count_device_all,
t1.day
from hurys_jw.dwd_data_clean_record_queue as t1
left join (select
device_no,
DATE(create_time) day,
count(device_no) count_device_all
from hurys_jw.ods_queue
where day > ?
group by device_no, day
) as t2
on t2.device_no=t1.device_no and t2.day=t1.day
where t2.count_device_all > 0
group by t1.data_type, t1.device_no, t1.field_name, t2.count_device_all, t1.day)
;
其他clickhouse输入控件代码类似
3.3.4 字段选择
data:image/s3,"s3://crabby-images/57954/57954c2a2454eb330484854933dc3897250016f1" alt=""
3.3.5 clickhouse输出
data:image/s3,"s3://crabby-images/8d41b/8d41b0b7c582e59b9108723cd67a1efdd1cae655" alt=""
3.3.6 执行任务
data:image/s3,"s3://crabby-images/a4dbb/a4dbb9cfa684666a034997d055d0ae85a8a315ec" alt=""
3.3.7 海豚调度(1天1次)
搞定!就是Hive中原有SQL语句和ClickHouse现有SQL语句很大不同,改造起来有点烦,尤其碰上管卡!