当前位置: 首页 > article >正文

二百七十四、Kettle——ClickHouse中对错误数据表中进行数据修复(实时)

一、目的

在完成数据清洗、错误数据之后,需要根据修复规则对错误数据进行修复

二、Hive中原有代码

insert into table  hurys_db.dwd_queue  partition(day)
select
       a3.id,
       a3.device_no,
       a3.source_device_type,
       a3.sn,
       a3.model,
       a3.create_time,
       a3.lane_no,
       a3.lane_type,
       case when a3.queue_count between 0 and 100 then a3.queue_count else a2.avg_queue_count end as queue_count,
       case when a3.queue_len   between 0 and 500 then a3.queue_len  else a2.avg_queue_len  end as queue_len,
       case when a3.queue_head  between 0 and 500 then a3.queue_head else a2.avg_queue_head end as queue_head,
       case when a3.queue_tail  between 0 and 500 then a3.queue_tail else a2.avg_queue_tail end as queue_tail,
       a3.day
from hurys_db.dwd_queue_error as a3
right join (select
       a1.device_no,
       a1.create_time,
       a1.lane_no,
       round(avg(queue_count),0)     avg_queue_count,
       round(avg(queue_len),2)       avg_queue_len,
       round(avg(queue_head),2)      avg_queue_head,
       round(avg(queue_tail),2)      avg_queue_tail
from(select
t1.device_no, t1.create_time start_time, t2.create_time, t1.lane_no,
t1.queue_count, t1.queue_len, t1.queue_head, t1.queue_tail
from hurys_db.dwd_queue as t1
right join hurys_db.dwd_queue_error as t2
on t2.device_no=t1.device_no  and  t2.lane_no=t1.lane_no
    and  concat(date_sub(t2.create_time,7),substr(t2.create_time,11,10)) = t1.create_time
where t1.device_no is not null
union all
select
t1.device_no, t1.create_time start_time, t2.create_time, t1.lane_no,
t1.queue_count, t1.queue_len, t1.queue_head, t1.queue_tail
from hurys_db.dwd_queue as t1
right join hurys_db.dwd_queue_error as t2
on t2.device_no=t1.device_no  and  t2.lane_no=t1.lane_no
    and  concat(date_sub(t2.create_time,14),substr(t2.create_time,11,10)) = t1.create_time
where t1.device_no is not null
union all
select
t1.device_no, t1.create_time start_time, t2.create_time, t1.lane_no,
t1.queue_count, t1.queue_len, t1.queue_head, t1.queue_tail
from hurys_db.dwd_queue as t1
right join hurys_db.dwd_queue_error as t2
on t2.device_no=t1.device_no  and  t2.lane_no=t1.lane_no
    and  concat(date_sub(t2.create_time,21),substr(t2.create_time,11,10)) = t1.create_time
where t1.device_no is not null
) as a1
group by a1.device_no, a1.create_time, a1.lane_no
    ) as a2
on a3.device_no=a2.device_no and a3.create_time=a2.create_time and a3.lane_no=a2.lane_no
where a3.day='2024-09-04'
;

三、ClickHouse中现有代码

--43、静态排队字段数据修复
    --修复策略:使用前三周同期数据取平均进行修复
select
       a3.id,
       a3.device_no,
       a3.source_device_type,
       a3.sn,
       a3.model,
       a3.create_time,
       a3.lane_no,
       a3.lane_type,
       case when a3.queue_count between 0 and 100 then a3.queue_count else a2.avg_queue_count end as queue_count,
       case when a3.queue_len   between 0 and 500 then a3.queue_len  else cast(a2.avg_queue_len   as Decimal(10,2)) end as queue_len,
       case when a3.queue_head  between 0 and 500 then a3.queue_head else cast(a2.avg_queue_head  as Decimal(10,2)) end as queue_head,
       case when a3.queue_tail  between 0 and 500 then a3.queue_tail else cast(a2.avg_queue_tail  as Decimal(10,2)) end as queue_tail,
       cast(a3.day as String) day
from hurys_jw.dwd_queue_error as a3
right join (
select
       device_no,
       start_time,
       lane_no,
       round(avg(queue_count),0)     avg_queue_count,
       round(avg(queue_len) ,2)      avg_queue_len,
       round(avg(queue_head),2)      avg_queue_head,
       round(avg(queue_tail),2)      avg_queue_tail
from(
select
t1.device_no, t2.create_time start_time,t2.create_time_7 create_time, t1.lane_no,t1.queue_count, t1.queue_len, t1.queue_head, t1.queue_tail
from hurys_jw.dwd_queue as t1
inner join(select
       device_no,lane_no,create_time,
       (create_time - interval 7 day) create_time_7,
       (create_time - interval 14 day)create_time_14,
       (create_time - interval 21 day)create_time_21
from hurys_jw.dwd_queue_error) as t2
on t2.device_no=t1.device_no  and  t2.lane_no=t1.lane_no and t2.create_time_7=t1.create_time
where t1.device_no is not null
union all
select
t1.device_no, t2.create_time start_time,t2.create_time_14 create_time, t1.lane_no,t1.queue_count, t1.queue_len, t1.queue_head, t1.queue_tail
from hurys_jw.dwd_queue as t1
inner join(select
       device_no,lane_no,create_time,
       (create_time - interval 7 day) create_time_7,
       (create_time - interval 14 day)create_time_14,
       (create_time - interval 21 day)create_time_21
from hurys_jw.dwd_queue_error) as t2
on t2.device_no=t1.device_no  and  t2.lane_no=t1.lane_no and t2.create_time_14=t1.create_time
where t1.device_no is not null
union all
select
t1.device_no, t2.create_time start_time,t2.create_time_21 create_time, t1.lane_no,t1.queue_count, t1.queue_len, t1.queue_head, t1.queue_tail
from hurys_jw.dwd_queue as t1
inner join(select
       device_no,lane_no,create_time,
       (create_time - interval 7 day) create_time_7,
       (create_time - interval 14 day)create_time_14,
       (create_time - interval 21 day)create_time_21
from hurys_jw.dwd_queue_error) as t2
on t2.device_no=t1.device_no  and  t2.lane_no=t1.lane_no and t2.create_time_21=t1.create_time
where t1.device_no is not null)
where lane_no is not null
group by device_no, start_time, lane_no
    ) as a2
on a3.device_no=a2.device_no and a3.create_time=a2.start_time and a3.lane_no=a2.lane_no
where a3.day >= ?
;

注意:Hive中原有SQL语句和ClickHouse现有SQL语句很大不同

四、Kettle任务

方框标记是修复数据,高频率执行

下面其他几个是修复记录任务

4.1 newtime

4.2 替换NULL值

4.3 clickhouse输入

select
       a3.id,
       a3.device_no,
       a3.source_device_type,
       a3.sn,
       a3.model,
       a3.create_time,
       a3.lane_no,
       a3.lane_type,
       case when a3.queue_count between 0 and 100 then a3.queue_count else a2.avg_queue_count end as queue_count,
       case when a3.queue_len   between 0 and 500 then a3.queue_len  else cast(a2.avg_queue_len   as Decimal(10,2)) end as queue_len,
       case when a3.queue_head  between 0 and 500 then a3.queue_head else cast(a2.avg_queue_head  as Decimal(10,2)) end as queue_head,
       case when a3.queue_tail  between 0 and 500 then a3.queue_tail else cast(a2.avg_queue_tail  as Decimal(10,2)) end as queue_tail,
       cast(a3.day as String) day
from hurys_jw.dwd_queue_error as a3
right join (
select
       device_no,
       start_time,
       lane_no,
       round(avg(queue_count),0)     avg_queue_count,
       round(avg(queue_len) ,2)      avg_queue_len,
       round(avg(queue_head),2)      avg_queue_head,
       round(avg(queue_tail),2)      avg_queue_tail
from(
select
t1.device_no, t2.create_time start_time,t2.create_time_7 create_time, t1.lane_no,t1.queue_count, t1.queue_len, t1.queue_head, t1.queue_tail
from hurys_jw.dwd_queue as t1
inner join(select
       device_no,lane_no,create_time,
       (create_time - interval 7 day) create_time_7,
       (create_time - interval 14 day)create_time_14,
       (create_time - interval 21 day)create_time_21
from hurys_jw.dwd_queue_error) as t2
on t2.device_no=t1.device_no  and  t2.lane_no=t1.lane_no and t2.create_time_7=t1.create_time
where t1.device_no is not null
union all
select
t1.device_no, t2.create_time start_time,t2.create_time_14 create_time, t1.lane_no,t1.queue_count, t1.queue_len, t1.queue_head, t1.queue_tail
from hurys_jw.dwd_queue as t1
inner join(select
       device_no,lane_no,create_time,
       (create_time - interval 7 day) create_time_7,
       (create_time - interval 14 day)create_time_14,
       (create_time - interval 21 day)create_time_21
from hurys_jw.dwd_queue_error) as t2
on t2.device_no=t1.device_no  and  t2.lane_no=t1.lane_no and t2.create_time_14=t1.create_time
where t1.device_no is not null
union all
select
t1.device_no, t2.create_time start_time,t2.create_time_21 create_time, t1.lane_no,t1.queue_count, t1.queue_len, t1.queue_head, t1.queue_tail
from hurys_jw.dwd_queue as t1
inner join(select
       device_no,lane_no,create_time,
       (create_time - interval 7 day) create_time_7,
       (create_time - interval 14 day)create_time_14,
       (create_time - interval 21 day)create_time_21
from hurys_jw.dwd_queue_error) as t2
on t2.device_no=t1.device_no  and  t2.lane_no=t1.lane_no and t2.create_time_21=t1.create_time
where t1.device_no is not null)
where lane_no is not null
group by device_no, start_time, lane_no
    ) as a2
on a3.device_no=a2.device_no and a3.create_time=a2.start_time and a3.lane_no=a2.lane_no
where a3.day >= ?
;

4.4 字段选择

4.5 clickhouse输出

4.6 执行SQL脚本

由于是对每一天的错误进行修复,因此每次执行后需要先删除这个分区的错误数据。因为每次执行清洗后都会先执行错误数据任务!

4.7 执行任务

4.8 海豚调度

注意在DWD层静态排队数据清洗、DWD层静态排队错误数据之后


http://www.kler.cn/a/374146.html

相关文章:

  • api开发如何在代码中使用京东商品详情接口的参数?
  • C# 对象和类型(结构)
  • [Git] git cherry-pick
  • flink的EventTime和Watermark
  • Visio 画阀门 符号 : 电动阀的画法
  • 微信小程序map组件所有markers展示在视野范围内
  • Spark集群管理脚本详解
  • 【数据结构-邻项消除】力扣2211. 统计道路上的碰撞次数
  • UDP-鼠李糖合成酶基因的克隆与鉴定-文献精读76
  • 系统学习CFD,常见收敛问题、及如何与机器学习相结合
  • zynq PS端跑Linux响应中断
  • windows下安装python库wordCloud报错
  • 【PGCCC】Postgresql BgWriter 原理
  • Java实现数据去重的几种方案及其去重原理
  • 【skywalking】监控 Spring Cloud Gateway 数据
  • flask框架用法介绍(一)
  • 从零学习大模型(十)-----剪枝基本概念
  • 【SSE】前端vue3使用SSE,EventSource携带请求头
  • H2 Database IDEA 源码 DEBUG 环境搭建
  • VuePress文档初始化请求过多问题探讨
  • 设计模式07-结构型模式(装饰模式/外观模式/代理模式/Java)
  • HTB:Cicada[WriteUP]
  • 【Linux-进程间通信】匿名管道的应用-进程池实现+命名管道的应用ClientServer通信
  • 手机收银云进销存管理软件,商品档案Excel格式批量导入导出,一键导入Excel的商品档案
  • 跨可用区的集群k8s的基本操作和配置理解
  • 【开源免费】基于SpringBoot+Vue.JS网上订餐系统(JAVA毕业设计)