ClickHouse 中利用Map类型存储多key数组并进行高效查询
如何使用 Map
类型来存储复杂数据(如嵌套数组)并进行高效的查询。设计一个包含 30 个字段的复杂表,其中使用 Map
类型存储包含数组的 JSON 数据,帮助实现对数据的高效存储与查询。
表结构 SQL
CREATE TABLE user_activity
(
id UInt64 COMMENT '主键,自增',
user_id UInt64 COMMENT '用户ID',
session_id UInt64 COMMENT '会话ID',
activity_date Date COMMENT '行为发生日期',
activity_time DateTime COMMENT '行为发生时间',
event_type String COMMENT '事件类型,例如 login、click、view',
device_type String COMMENT '设备类型,例如 mobile、desktop',
os String COMMENT '操作系统,例如 iOS、Android',
browser String COMMENT '浏览器类型,例如 Chrome、Firefox',
app_version String COMMENT 'App版本号',
device_id String COMMENT '设备唯一标识符',
country String COMMENT '国家',
city String COMMENT '城市',
timezone String COMMENT '时区',
activity_details Map(String, Array(String)) COMMENT '行为详情,键是属性名称,值是属性值数组',
referer String COMMENT '访问来源URL',
ip_address String COMMENT 'IP地址',
session_duration UInt32 COMMENT '会话时长(秒)',
is_new_user UInt8 COMMENT '是否为新用户',
conversion_rate Decimal(5, 2) COMMENT '转化率',
tags Array(String) COMMENT '行为相关的标签数组',
product_ids Array(String) COMMENT '涉及的商品ID数组',
metadata Map(String, String) COMMENT '额外的元数据,例如设备信息',
error_logs Array(String) COMMENT '错误日志数组',
search_keywords Array(String) COMMENT '搜索关键词数组',
recommendation_ids Array(String) COMMENT '推荐的资源ID数组',
request_headers Map(String, String) COMMENT '请求头信息',
user_attributes Map(String, String) COMMENT '用户属性,例如年龄段、性别',
geo_coordinates Array(Float64) COMMENT '地理坐标,经度和纬度',
custom_flags Array(String) COMMENT '用户行为标志数组'
)
ENGINE = MergeTree
PARTITION BY toYYYYMM(activity_date) -- 按月分区
ORDER BY (activity_date, user_id, session_id) -- 按日期、用户和会话排序
SETTINGS index_granularity = 8192;
插入 5000 万条模拟数据
以下是插入模拟数据的 SQL 脚本,包含一个复杂的 Map
字段,模拟用户行为数据的存储。
INSERT INTO user_activity
SELECT
number AS id,
rand() % 1000000 AS user_id,
rand() % 100000000 AS session_id,
toDate('2024-01-01') + rand() % 365 AS activity_date,
toDateTime('2024-01-01 00:00:00') + rand() % 31536000 AS activity_time,
arrayElement(['login', 'click', 'view', 'purchase'], rand() % 4 + 1) AS event_type,
arrayElement(['mobile', 'desktop', 'tablet'], rand() % 3 + 1) AS device_type,
arrayElement(['iOS', 'Android', 'Windows', 'MacOS'], rand() % 4 + 1) AS os,
arrayElement(['Chrome', 'Firefox', 'Safari', 'Edge'], rand() % 4 + 1) AS browser,
concat('v', toString(rand() % 100)) AS app_version,
concat('device_', toString(rand() % 100000)) AS device_id,
arrayElement(['US', 'CN', 'IN', 'DE'], rand() % 4 + 1) AS country,
arrayElement(['New York', 'Shanghai', 'Delhi', 'Berlin'], rand() % 4 + 1) AS city,
arrayElement(['UTC-5', 'UTC+8', 'UTC+5:30', 'UTC+1'], rand() % 4 + 1) AS timezone,
map(
'media_names', array(concat('media_', toString(rand() % 100))),
'platform_ids', array(toString(rand() % 10)),
'tags', array('tag1', 'tag2', 'tag3')
) AS activity_details,
concat('https://referer.com/', toString(rand() % 100)) AS referer,
concat('192.168.', toString(rand() % 256), '.', toString(rand() % 256)) AS ip_address,
rand() % 3600 AS session_duration,
rand() % 2 AS is_new_user,
toDecimal32(rand() % 100 / 100.0, 2) AS conversion_rate,
array('tag1', 'tag2', 'tag3') AS tags,
array(toString(rand() % 1000), toString(rand() % 1000)) AS product_ids,
map('resolution', '1920x1080', 'network', 'wifi') AS metadata,
array('error1', 'error2') AS error_logs,
array('search1', 'search2') AS search_keywords,
array(toString(rand() % 100), toString(rand() % 100)) AS recommendation_ids,
map('User-Agent', 'Mozilla/5.0', 'Accept', 'text/html') AS request_headers,
map('age_group', '18-24', 'gender', 'male') AS user_attributes,
array(rand() % 180 - 90, rand() % 360 - 180) AS geo_coordinates,
array('flag1', 'flag2') AS custom_flags
FROM numbers(50000000);
查询示例
1. 按行为类型分组统计
统计每种行为类型的数量
SELECT
activity_details['tags'] AS tags,
COUNT(*) AS count
FROM user_activity
ARRAY JOIN activity_details['tags'] AS tag
GROUP BY tag
ORDER BY count DESC;
2. 按媒体名称统计用户行为
统计每个媒体名称的使用次数:
SELECT
arrayJoin(activity_details['media_names']) AS media_name,
COUNT(*) AS count
FROM user_activity
GROUP BY media_name
ORDER BY count DESC
LIMIT 10;
实验证明,2秒查询出数组中的group by数据还是蛮快的