二百七十六、ClickHouse——Hive和ClickHouse非常不同的DWS指标数据SQL语句
一、目的
在完成数据之后对业务指标进行分析,Hive和ClickHouseSQL真不一样
二、部分业务指标表
2.1 统计数据流量表1天周期
2.1.1 Hive中原有代码
2.1.1.1 Hive中建表语句
--1、统计数据流量表——动态分区——1天周期 create table if not exists hurys_db.dws_statistics_volume_1day( device_no string comment '设备编号', scene_name string comment '场景名称', lane_no int comment '车道编号', lane_direction string comment '车道流向', section_no int comment '断面编号', device_direction string comment '雷达朝向', sum_volume_day int comment '每天总流量', week_day string comment '周几', month string comment '月份' ) comment '统计数据流量表——动态分区——1天周期' partitioned by (day string) stored as orc ;
2.1.1.2 Hive中SQL语句
--动态加载数据 insert overwrite table hurys_db.dws_statistics_volume_1day partition(day) select dwd_st.device_no, dwd_sc.scene_name, dwd_st.lane_no, dwd_rl.lane_direction, dwd_st.section_no, dwd_rc.device_direction, sum(volume_sum) sum_volume_day, case when pmod(datediff(create_time,'2023-11-27') + 1,7) = 1 then '周一' when pmod(datediff(create_time,'2023-11-27') + 1,7) = 2 then '周二' when pmod(datediff(create_time,'2023-11-27') + 1,7) = 3 then '周三' when pmod(datediff(create_time,'2023-11-27') + 1,7) = 4 then '周四' when pmod(datediff(create_time,'2023-11-27') + 1,7) = 5 then '周五' when pmod(datediff(create_time,'2023-11-27') + 1,7) = 6 then '周六' else '周日' end as week_day, substr(day,1,7) month, day from hurys_db.dwd_statistics as dwd_st right join hurys_db.dwd_radar_lane as dwd_rl on dwd_rl.device_no=dwd_st.device_no and dwd_rl.lane_no=dwd_st.lane_no right join hurys_db.dwd_device_scene as dwd_ds on dwd_ds.device_no=dwd_st.device_no right join hurys_db.dwd_scene as dwd_sc on dwd_sc.scene_id = dwd_ds.scene_id right join hurys_db.dwd_radar_config as dwd_rc on dwd_rc.device_no=dwd_st.device_no where dwd_st.create_time is not null and dwd_st.day='2024-09-05' group by dwd_st.device_no, dwd_sc.scene_name, dwd_st.lane_no, dwd_rl.lane_direction, dwd_st.section_no, dwd_rc.device_direction, case when pmod(datediff(create_time,'2023-11-27') + 1,7) = 1 then '周一' when pmod(datediff(create_time,'2023-11-27') + 1,7) = 2 then '周二' when pmod(datediff(create_time,'2023-11-27') + 1,7) = 3 then '周三' when pmod(datediff(create_time,'2023-11-27') + 1,7) = 4 then '周四' when pmod(datediff(create_time,'2023-11-27') + 1,7) = 5 then '周五' when pmod(datediff(create_time,'2023-11-27') + 1,7) = 6 then '周六' else '周日' end, day ;
2.1.2 ClickHouse中现有代码
2.1.2.1 ClickHouse中表结构
--1、统计数据流量表——动态分区——1天周期 create table if not exists hurys_jw.dws_statistics_volume_1day( device_no String comment '设备编号', scene_name String comment '场景名称', lane_no Nullable(Int32) comment '车道编号', lane_direction Nullable(String) comment '车道流向', section_no Nullable(Int32) comment '断面编号', device_direction Nullable(String) comment '雷达朝向', sum_volume_day Nullable(Int32) comment '每天总流量', week_day Nullable(String) comment '周几', month Nullable(String) comment '月份', day Date comment '日期' ) ENGINE = MergeTree PARTITION BY day PRIMARY KEY day ORDER BY day SETTINGS index_granularity = 8192;
2.1.2.2 ClickHouse中SQL语句
--动态加载数据 select dwd_st.device_no, dwd_sc.scene_name, dwd_st.lane_no, dwd_rl.lane_direction, dwd_st.section_no, dwd_rc.device_direction, sum(volume_sum) sum_volume_day, case when toDayOfWeek(create_time) = 1 then '周一' when toDayOfWeek(create_time) = 2 then '周二' when toDayOfWeek(create_time) = 3 then '周三' when toDayOfWeek(create_time) = 4 then '周四' when toDayOfWeek(create_time) = 5 then '周五' when toDayOfWeek(create_time) = 6 then '周六' when toDayOfWeek(create_time) = 7 then '周日' end as week_day, concat(toString(toYear(dwd_st.day)), '-', lpad(toString(toMonth(dwd_st.day)), 2, '0')) AS month, cast(dwd_st.day as String) day from hurys_jw.dwd_statistics as dwd_st right join hurys_jw.dwd_radar_lane as dwd_rl on dwd_rl.device_no=dwd_st.device_no and dwd_rl.lane_no=dwd_st.lane_no right join hurys_jw.dwd_device_scene as dwd_ds on dwd_ds.device_no=dwd_st.device_no right join hurys_jw.dwd_scene as dwd_sc on dwd_sc.scene_id = dwd_ds.scene_id right join hurys_jw.dwd_radar_config as dwd_rc on dwd_rc.device_no=dwd_st.device_no where dwd_st.create_time is not null and dwd_st.lane_no is not null and dwd_st.day >= ? group by dwd_st.device_no, dwd_sc.scene_name, dwd_st.lane_no, dwd_rl.lane_direction, dwd_st.section_no, dwd_rc.device_direction, case when toDayOfWeek(create_time) = 1 then '周一' when toDayOfWeek(create_time) = 2 then '周二' when toDayOfWeek(create_time) = 3 then '周三' when toDayOfWeek(create_time) = 4 then '周四' when toDayOfWeek(create_time) = 5 then '周五' when toDayOfWeek(create_time) = 6 then '周六' when toDayOfWeek(create_time) = 7 then '周日' end, dwd_st.day ;
2.2 统计数据流量表5分钟周期
2.2.1 Hive中原有代码
2.2.1.1 Hive中建表语句
--5、统计数据流量表——动态分区——5分钟周期 create table if not exists hurys_db.dws_statistics_volume_5min( device_no string comment '设备编号', scene_name string comment '场景名称', lane_no int comment '车道编号', lane_direction string comment '车道流向', section_no int comment '断面编号', device_direction string comment '雷达朝向', sum_volume_5min int comment '每5分钟总流量', start_time timestamp comment '开始时间' ) comment '统计数据流量表——动态分区——5分钟周期' partitioned by (day string) stored as orc ;
2.2.1.2 Hive中SQL语句
--动态加载数据 insert overwrite table hurys_db.dws_statistics_volume_5min partition(day) select dwd_st.device_no, dwd_sc.scene_name, dwd_st.lane_no, dwd_rl.lane_direction, dwd_st.section_no, dwd_rc.device_direction, sum(volume_sum) sum_volume_5min, case when minute(create_time ) < 5 then concat(substr(create_time, 1, 14), '00:00') when minute(create_time) >=5 and minute(create_time) <10 then concat(substr(create_time, 1, 14), '05:00') when minute(create_time) >=10 and minute(create_time) <15 then concat(substr(create_time, 1, 14), '10:00') when minute(create_time) >=15 and minute(create_time) <20 then concat(substr(create_time, 1, 14), '15:00') when minute(create_time) >=20 and minute(create_time) <25 then concat(substr(create_time, 1, 14), '20:00') when minute(create_time) >=25 and minute(create_time) <30 then concat(substr(create_time, 1, 14), '25:00') when minute(create_time) >=30 and minute(create_time) <35 then concat(substr(create_time, 1, 14), '30:00') when minute(create_time) >=35 and minute(create_time) <40 then concat(substr(create_time, 1, 14), '35:00') when minute(create_time) >=40 and minute(create_time) <45 then concat(substr(create_time, 1, 14), '40:00') when minute(create_time) >=45 and minute(create_time) <50 then concat(substr(create_time, 1, 14), '45:00') when minute(create_time) >=50 and minute(create_time) <55 then concat(substr(create_time, 1, 14), '50:00') else concat(substr(create_time, 1, 14), '55:00') end as start_time, day from hurys_db.dwd_statistics as dwd_st right join hurys_db.dwd_radar_lane as dwd_rl on dwd_rl.device_no=dwd_st.device_no and dwd_rl.lane_no=dwd_st.lane_no right join hurys_db.dwd_device_scene as dwd_ds on dwd_ds.device_no=dwd_st.device_no right join hurys_db.dwd_scene as dwd_sc on dwd_sc.scene_id = dwd_ds.scene_id right join hurys_db.dwd_radar_config as dwd_rc on dwd_rc.device_no=dwd_st.device_no where dwd_st.create_time is not null and dwd_st.day='2024-09-05' group by dwd_st.device_no, dwd_sc.scene_name, dwd_st.lane_no, dwd_rl.lane_direction, dwd_st.section_no, dwd_rc.device_direction, case when minute(create_time ) < 5 then concat(substr(create_time, 1, 14), '00:00') when minute(create_time) >=5 and minute(create_time) <10 then concat(substr(create_time, 1, 14), '05:00') when minute(create_time) >=10 and minute(create_time) <15 then concat(substr(create_time, 1, 14), '10:00') when minute(create_time) >=15 and minute(create_time) <20 then concat(substr(create_time, 1, 14), '15:00') when minute(create_time) >=20 and minute(create_time) <25 then concat(substr(create_time, 1, 14), '20:00') when minute(create_time) >=25 and minute(create_time) <30 then concat(substr(create_time, 1, 14), '25:00') when minute(create_time) >=30 and minute(create_time) <35 then concat(substr(create_time, 1, 14), '30:00') when minute(create_time) >=35 and minute(create_time) <40 then concat(substr(create_time, 1, 14), '35:00') when minute(create_time) >=40 and minute(create_time) <45 then concat(substr(create_time, 1, 14), '40:00') when minute(create_time) >=45 and minute(create_time) <50 then concat(substr(create_time, 1, 14), '45:00') when minute(create_time) >=50 and minute(create_time) <55 then concat(substr(create_time, 1, 14), '50:00') else concat(substr(create_time, 1, 14), '55:00') end, day ;
2.2.2 ClickHouse中现有代码
2.2.2.1 ClickHouse中表结构
--5、统计数据流量表——动态分区——5分钟周期 create table if not exists hurys_jw.dws_statistics_volume_5min( device_no String comment '设备编号', scene_name String comment '场景名称', lane_no Nullable(Int32) comment '车道编号', lane_direction Nullable(String) comment '车道流向', section_no Nullable(Int32) comment '断面编号', device_direction Nullable(String) comment '雷达朝向', sum_volume_5min Nullable(Int32) comment '每5分钟总流量', start_time DateTime comment '开始时间', day Date comment '日期' ) ENGINE = MergeTree PARTITION BY day PRIMARY KEY day ORDER BY day SETTINGS index_granularity = 8192;
2.2.2.2 ClickHouse中SQL语句
--动态加载数据
select
dwd_st.device_no,
dwd_sc.scene_name,
dwd_st.lane_no,
dwd_rl.lane_direction,
dwd_st.section_no,
dwd_rc.device_direction,
sum(volume_sum) sum_volume_5min,
toDateTime(concat(toString(toDate(create_time)),' ',lpad(toString(extract(hour FROM create_time)), 2, '0'),':',
CASE
WHEN extract(minute FROM create_time) < 5 THEN '00'
WHEN extract(minute FROM create_time) >= 5 AND extract(minute FROM create_time) < 10 THEN '05'
WHEN extract(minute FROM create_time) >= 10 AND extract(minute FROM create_time) < 15 THEN '10'
WHEN extract(minute FROM create_time) >= 15 AND extract(minute FROM create_time) < 20 THEN '15'
WHEN extract(minute FROM create_time) >= 20 AND extract(minute FROM create_time) < 25 THEN '20'
WHEN extract(minute FROM create_time) >= 25 AND extract(minute FROM create_time) < 30 THEN '25'
WHEN extract(minute FROM create_time) >= 30 AND extract(minute FROM create_time) < 35 THEN '30'
WHEN extract(minute FROM create_time) >= 35 AND extract(minute FROM create_time) < 40 THEN '35'
WHEN extract(minute FROM create_time) >= 40 AND extract(minute FROM create_time) < 45 THEN '40'
WHEN extract(minute FROM create_time) >= 45 AND extract(minute FROM create_time) < 50 THEN '45'
WHEN extract(minute FROM create_time) >= 50 AND extract(minute FROM create_time) < 55 THEN '50'
ELSE '55'
END,
':00'
)) as start_time,
cast(dwd_st.day as String) day
from hurys_jw.dwd_statistics as dwd_st
right join hurys_jw.dwd_radar_lane as dwd_rl
on dwd_rl.device_no=dwd_st.device_no and dwd_rl.lane_no=dwd_st.lane_no
right join hurys_jw.dwd_device_scene as dwd_ds
on dwd_ds.device_no=dwd_st.device_no
right join hurys_jw.dwd_scene as dwd_sc
on dwd_sc.scene_id = dwd_ds.scene_id
right join hurys_jw.dwd_radar_config as dwd_rc
on dwd_rc.device_no=dwd_st.device_no
where dwd_st.create_time is not null and dwd_st.lane_no is not null and dwd_st.day >= ?
group by dwd_st.device_no, dwd_sc.scene_name, dwd_st.lane_no, dwd_rl.lane_direction, dwd_st.section_no, dwd_rc.device_direction, toDateTime(concat(toString(toDate(create_time)),' ',lpad(toString(extract(hour FROM create_time)), 2, '0'),':',
CASE
WHEN extract(minute FROM create_time) < 5 THEN '00'
WHEN extract(minute FROM create_time) >= 5 AND extract(minute FROM create_time) < 10 THEN '05'
WHEN extract(minute FROM create_time) >= 10 AND extract(minute FROM create_time) < 15 THEN '10'
WHEN extract(minute FROM create_time) >= 15 AND extract(minute FROM create_time) < 20 THEN '15'
WHEN extract(minute FROM create_time) >= 20 AND extract(minute FROM create_time) < 25 THEN '20'
WHEN extract(minute FROM create_time) >= 25 AND extract(minute FROM create_time) < 30 THEN '25'
WHEN extract(minute FROM create_time) >= 30 AND extract(minute FROM create_time) < 35 THEN '30'
WHEN extract(minute FROM create_time) >= 35 AND extract(minute FROM create_time) < 40 THEN '35'
WHEN extract(minute FROM create_time) >= 40 AND extract(minute FROM create_time) < 45 THEN '40'
WHEN extract(minute FROM create_time) >= 45 AND extract(minute FROM create_time) < 50 THEN '45'
WHEN extract(minute FROM create_time) >= 50 AND extract(minute FROM create_time) < 55 THEN '50'
ELSE '55'
END,
':00'
)), cast(dwd_st.day as String)
;
就先这样,反正ClickHouse和Hive的SQL语句非常非常不一样!!!