Hive 流量分析(含维度和不含维度计算)
流量分析:
指标:PV,UV,访问次数,平均访问时长,人均访问次数、人均访问深度,人均访问时长,回头客占比等...
维度:时间维度,地域维度,设备维度等...
pageview:页面浏览事件
session会话:一个session包含多个pv
UV:一个UV包含一个或多个session
源表DWD_APP_TFC_DTL_DEMO表结构:
hive>create table DWD_APP_TFC_DTL_DEMO(
guid bigint,
eventid String,
event Map<String, String>,
uid String,
imei String,
mac String,
imsi String,
osName String,
osVer String,
androidId String,
resolution String,
deviceType String,
deviceId String,
uuid String,
appid String,
appVer String,
release_ch String,
promotion_ch String,
areacode String,
longtitude Double,
latitude Double,
carrier String,
netType String,
cid_sn String,
ip String,
sessionId String,
`timestamp` bigint,
province String,
city String,
district String,
year string,
month string,
day string,
datestr string
)
partitioned by (dt string);
+--------------------------+-----------------------+-----------------------+--+
| col_name | data_type | comment |
+--------------------------+-----------------------+-----------------------+--+
| guid | bigint | |
| eventid | string | |
| event | map<string,string> | |
| uid | string | |
| imei | string | |
| mac | string | |
| imsi | string | |
| osname | string | |
| osver | string | |
| androidid | string | |
| resolution | string | |
| devicetype | string | |
| deviceid | string | |
| uuid | string | |
| appid | string | |
| appver | string | |
| release_ch | string | |
| promotion_ch | string | |
| areacode | string | |
| longtitude | double | |
| latitude | double | |
| carrier | string | |
| nettype | string | |
| cid_sn | string | |
| ip | string | |
| sessionid | string | |
| timestamp | bigint | |
| province | string | |
| city | string | |
| district | string | |
| year | string | |
| month | string | |
| day | string | |
| datestr | string | |
| dt | string | |
| | NULL | NULL |
| # Partition Information | NULL | NULL |
| # col_name | data_type | comment |
| | NULL | NULL |
| dt | string | |
+--------------------------+-----------------------+-----------------------+--+
DWD_APP_TFC_DTL_DEMO表数据:
链接:https://pan.baidu.com/s/1PTMOg6WCEgFbeow8cvNLHg
提取码:gc2y
数据示例:
9334571961830607macos10.0百度手机助手MI_MIX2FXnvkTfIDK8r51hCML6l3qcn.kgc.mall2.2.8百度手机助手0765282510285.7073300343655138.47361498737002ISP06WIFI44345302633837.51.202.77sid-388b01cb-f158-4ef7-8d9d-91869f7468c31575562642000unkownunkownunkown20191262019-12-062023-03-01-479747698pgviewEventutm_sourceurlhttp://www.kgcedu.cn/abi/pg416referrer_hosthttp://www.kgcedu.cn/aba/pg758utm_contentpgid416utm_campaignreferrertitleutm_mediumutm_term0598972660742431a0-f8-e2-07-46-50-c9
9334571961830607macos10.0百度手机助手MI_MIX2FXnvkTfIDK8r51hCML6l3qcn.kgc.mall2.2.8百度手机助手0765282510285.7073300343655138.47361498737002ISP06WIFI44345302633837.51.202.77sid-388b01cb-f158-4ef7-8d9d-91869f7468c31575563314000unkownunkownunkown20191262019-12-062023-03-01-479747698pgviewEventutm_sourceurlhttp://www.kgcedu.cn/aai/pg406referrer_hosthttp://www.kgcedu.cn/abi/pg416utm_contentpgid406utm_campaignreferrertitleutm_mediumutm_term0598972660742431a0-f8-e2-07-46-50-c9
9334571961830607macos10.0百度手机助手MI_MIX2FXnvkTfIDK8r51hCML6l3qcn.kgc.mall2.2.8百度手机助手0765282510285.7073300343655138.47361498737002ISP06WIFI44345302633837.51.202.77sid-388b01cb-f158-4ef7-8d9d-91869f7468c31575571978000unkownunkownunkown20191262019-12-062023-03-01
流量分析不包含维度:
--创建流量会话聚合表(DWS_APP_TFC_AGS_DEMO)
hive>create table DWS_APP_TFC_AGS_DEMO(
guid bigint,
sessionId string,
start_time bigint, --会话开始时间
end_time bigint, --会话结束时间
pv_counts int --pv数
)
partitioned by (dt string)
stored as orc;
--计算
--源表DWD_APP_TFC_DTL_DEMO
hive>insert into table DWS_APP_TFC_AGS_DEMO partition(dt='2023-03-01')
select
guid,
sessionId,
min(`timestamp`) as start_time,
max(`timestamp`) as end_time, --统计的口径
count(1) as pv_counts
from DWD_APP_TFC_DTL_DEMO where dt='2023-03-01'
group by guid,sessionId;
--创建流量人员聚合表(DWS_APP_TFC_AGU_DEMO)
hive>create table DWS_APP_TFC_AGU_DEMO(
guid bigint,
sesion_counts int,
access_time bigint,
pv_counts bigint
)
partitioned by (dt string)
stored as orc;
--计算
--源表 流量会话聚合表(DWS_APP_TFC_AGS_DEMO)
hive>insert into table DWS_APP_TFC_AGU_DEMO partition(dt='2023-03-01')
select
guid, --guid
count(1) as sesion_counts, --一个人的会话总数
sum(end_time-start_time) as access_time, --一个人的会话总时长
sum(pv_counts) as pv_counts ---个人的pv总数
from DWS_APP_TFC_AGS_DEMO where dt='2023-03-01'
group by guid;
--创建表 流量分析概览表(ADS_APP_TFC_OVM_DEMO)
hive>create table ADS_APP_TFC_OVM_DEMO(
dt string,
total_pv int, --总pv数
total_uv int, --总访客数
total_session int, --总访问次数
avg_times_session double, --平均每次访问时长
avg_num_user double, --人均访问次数
avg_pv_user double, --人均访问深度
avg_time_user double, --人均访问时长
returncustomer_ratio double --回头客占比 浏览大于一次的人,叫回头客
)
stored as parquet;
--计算
--源表 流量人员聚合表(DWS_APP_TFC_AGU_DEMO)
hive>insert into ADS_APP_TFC_OVM_DEMO
select
'2023-03-01' as dt,
sum(pv_counts) as total_pv,
count(1) as total_uv,
sum(sesion_counts) as total_session,
sum(access_time)/sum(sesion_counts) as avg_times_session,
sum(sesion_counts)/count(1) as avg_num_user,
sum(pv_counts)/count(1) as avg_pv_user,
sum(access_time)/count(1) as avg_time_user,
count(if(sesion_counts>1,1,null))/count(1) as returncustomer_ratio
from DWS_APP_TFC_AGU_DEMO where dt='2023-03-01';
=========================================================================
流量分析包含维度:
--创建流量会话聚合表 --度量值 --维度值(DWS_APP_TFC_AGS_DIM_DEMO)
hive>create table DWS_APP_TFC_AGS_DIM_DEMO(
guid bigint,
sessionId string,
start_time bigint,
end_time bigint,
pv_counts int,
province string,
city string,
district string,
devicetype string,
osname string,
osVer string,
release_ch string,
promotion_ch string
)
partitioned by (dt string)
stored as orc;
---统计口径 :
--计算
--源表 DWD_APP_TFC_DTL_DEMO
hive>insert into table DWS_APP_TFC_AGS_DIM_DEMO partition(dt='2023-03-01')
select
guid,
sessionId,
min(`timestamp`) as start_time,
max(`timestamp`) as end_time,
count(1) as pv_counts,
province,
city,
district,
devicetype,
osname,
osVer,
release_ch,
promotion_ch
from DWD_APP_TFC_DTL_DEMO where dt='2023-03-01'
group by guid,sessionId,province,city,district,devicetype,osname,osVer,release_ch,promotion_ch;
--创建流量人员聚合表( DWS_APP_TFC_AGU_DIM_DEMO)
hivecreate table DWS_APP_TFC_AGU_DIM_DEMO(
guid bigint,
sesion_counts int,
access_time bigint,
pv_counts bigint,
province string,
city string,
district string,
devicetype string,
osname string,
osVer string,
release_ch string,
promotion_ch string
)
partitioned by (dt string)
stored as orc;
--计算
--源表 流量会话聚合表( DWS_APP_TFC_AGS_DIM_DEMO )
hive>insert into table DWS_APP_TFC_AGU_DIM_DEMO partition(dt='2023-03-01')
select
guid,
count(1) as sesion_counts,
sum(end_time-start_time) as access_time,
sum(pv_counts) as pv_counts,
province,
city,
district,
devicetype,
osname,
osVer,
release_ch,
promotion_ch
from DWS_APP_TFC_AGS_DIM_DEMO where dt='2023-03-01'
group by guid, province,city,district,devicetype,osname,osVer,release_ch,promotion_ch;
--创建表 流量分析概览表(ADS_APP_TFC_OVM_DIM_DEMO)
-- 8个维度,组合问题
--把所有的维度组合的结果做出来,要什么给什么
hive>create table ADS_APP_TFC_OVM_DIM_DEMO(
dt string,
total_pv int, --总pv数
total_uv int, --总访客数
total_session int, --总访问次数
avg_times_session double, --平均每次访问时长
avg_num_user double, --人均访问次数
avg_pv_user double, --人均访问深度
avg_time_user double, --人均访问时长
returncustomer_ratio double, --回头客占比
province string,
city string,
district string,
devicetype string,
osname string,
osVer string,
release_ch string,
promotion_ch string
)
stored as orc;
--计算
--源表 流量人员聚合表( DWS_APP_TFC_AGU_DIM_DEMO)
--省市区 一个表
--设备类型 一个表
--256个 256张 select
放入一张表
--省市区 一个表
hive>insert into table ADS_APP_TFC_OVM_DIM_DEMO
select
'2023-03-01' as dt,
sum(pv_counts) as total_pv,
count(distinct guid) as total_uv,
sum(sesion_counts) as total_session,
sum(access_time)/sum(sesion_counts) as avg_times_session,
sum(sesion_counts)/count(distinct guid) as avg_num_user,
sum(pv_counts)/count(distinct guid) as avg_pv_user,
sum(access_time)/count(distinct guid) as avg_time_user,
count(if(sesion_counts>1,1,null))/count(distinct guid) as returncustomer_ratio,
province,
city,
district,
null,
null,
null,
null,
null
from DWS_APP_TFC_AGU_DIM_DEMO where dt='2023-03-01'
group by province,city,district;
---设备
hive>insert into table ADS_APP_TFC_OVM_DIM_DEMO
select
'2023-03-01' as dt,
sum(pv_counts) as total_pv,
count(distinct guid) as total_uv,
sum(sesion_counts) as total_session,
sum(access_time)/sum(sesion_counts) as avg_times_session,
sum(sesion_counts)/count(distinct guid) as avg_num_user,
sum(pv_counts)/count(distinct guid) as avg_pv_user,
sum(access_time)/count(distinct guid) as avg_time_user,
count(if(sesion_counts>1,1,null))/count(distinct guid) as returncustomer_ratio,
null,
null,
null,
devicetype,
null,
null,
null,
null
from DWS_APP_TFC_AGU_DIM_DEMO where dt='2023-03-01'
group by devicetype;
---rollup 案例(维度:省、市、区)
hive>create table ADS_APP_TFC_OVM_DIM_ROLLUP_DEMO(
dt string,
total_pv int, --总pv数
total_uv int, --总访客数
total_session int, --总访问次数
avg_times_session double, --平均每次访问时长
avg_num_user double, --人均访问次数
avg_pv_user double, --人均访问深度
avg_time_user double, --人均访问时长
returncustomer_ratio double, --回头客占比
province string,
city string,
district string
)
stored as orc;
hive>insert into table ADS_APP_TFC_OVM_DIM_ROLLUP_DEMO
select
'2023-03-01' as dt,
sum(pv_counts) as total_pv,
count(distinct guid) as total_uv,
sum(sesion_counts) as total_session,
sum(access_time)/sum(sesion_counts) as avg_times_session,
sum(sesion_counts)/count(distinct guid) as avg_num_user,
sum(pv_counts)/count(distinct guid) as avg_pv_user,
sum(access_time)/count(distinct guid) as avg_time_user,
count(if(sesion_counts>1,1,null))/count(distinct guid) as returncustomer_ratio,
province,
city,
district
from DWS_APP_TFC_AGU_DIM_DEMO where dt='2023-03-01'
group by province,city,district with rollup;
--grouping sets 案例(维度:指定组合)
hive>insert overwrite table ADS_APP_TFC_OVM_DIM_ROLLUP_DEMO
select
'2023-03-01' as dt,
sum(pv_counts) as total_pv,
count(distinct guid) as total_uv,
sum(sesion_counts) as total_session,
sum(access_time)/sum(sesion_counts) as avg_times_session,
sum(sesion_counts)/count(distinct guid) as avg_num_user,
sum(pv_counts)/count(distinct guid) as avg_pv_user,
sum(access_time)/count(distinct guid) as avg_time_user,
count(if(sesion_counts>1,1,null))/count(distinct guid) as returncustomer_ratio,
province,
city,
district
from DWS_APP_TFC_AGU_DIM_DEMO where dt='2023-03-01'
group by province,city,district grouping sets((),(province),(province,city),(province,city,district));
--只有省维度的
hive>select
'2023-03-01' as dt,
sum(pv_counts) as total_pv,
count(distinct guid) as total_uv,
sum(sesion_counts) as total_session,
sum(access_time)/sum(sesion_counts) as avg_times_session,
sum(sesion_counts)/count(distinct guid) as avg_num_user,
sum(pv_counts)/count(distinct guid) as avg_pv_user,
sum(access_time)/count(distinct guid) as avg_time_user,
count(if(sesion_counts>1,1,null))/count(distinct guid) as returncustomer_ratio,
province,
null,
null,
null,
null,
null,
null,
null
from DWS_APP_TFC_AGU_DIM_DEMO where dt='2023-03-01'
group by province;
---with cube 案例(穷尽组合)
hive>insert into table ADS_APP_TFC_OVM_DIM_DEMO
select
'2023-03-01' as dt,
sum(pv_counts) as total_pv,
count(distinct guid) as total_uv,
sum(sesion_counts) as total_session,
sum(access_time)/sum(sesion_counts) as avg_times_session,
sum(sesion_counts)/count(distinct guid) as avg_num_user,
sum(pv_counts)/count(distinct guid) as avg_pv_user,
sum(access_time)/count(distinct guid) as avg_time_user,
count(if(sesion_counts>1,1,null))/count(distinct guid) as returncustomer_ratio,
province,
city,
district,
devicetype,
osname,
osVer,
release_ch,
promotion_ch
from DWS_APP_TFC_AGU_DIM_DEMO where dt='2023-03-01'
group by province,city,district,devicetype,osname,osVer,release_ch,promotion_ch
with cube;
报错:Error: Error while compiling statement: FAILED: SemanticException [Error 10226]: An additional MR job is introduced since the cardinality of grouping sets is more than hive.new.job.grouping.set.cardinalit
y. This functionality is not supported with distincts. Either set hive.new.job.grouping.set.cardinality to a high number (higher than the number of rows per input row due to grouping sets in the query), or rewrite the query to not use distincts. The number of rows per input row due to grouping sets is 256 (state=42000,code=10226)
如果grouping sets大于等于5个维度,将会报如上的错误;解决办法:
1.在的hql语句前面加上 set hive.new.job.grouping.set.cardinality=xx;(例如这里是5个维度,一共32个grouping sets,xx我写的64 )
2.可以通过在子查询中用group by去重,避免在聚合中用到distinct
解决:
hive>set hive.new.job.grouping.set.cardinality=256;
--按照所需要的维度进行提取
hive>select * from ADS_APP_TFC_OVM_DIM_DEMO a where province is not null and city is not null and district is not null
and coalesce(devicetype,osname,osVer,release_ch,promotion_ch) is null;
---计算一次,烟筒式开发
hive>with ags as(
select
guid,
sessionId,
min(`timestamp`) as start_time,
max(`timestamp`) as end_time,
count(1) as pv_counts
from DWD_APP_TFC_DTL_DEMO where dt='2023-03-01'
group by guid,sessionId
),
agu as(
select
guid,
count(1) as sesion_counts,
sum(end_time-start_time) as access_time,
sum(pv_counts) as pv_counts
from ags
group by guid
)
select
'2023-03-01' as dt,
sum(pv_counts) as total_pv,
count(1) as total_uv,
sum(sesion_counts) as total_session,
sum(access_time)/sum(sesion_counts) as avg_times_session,
sum(sesion_counts)/count(1) as avg_num_user,
sum(pv_counts)/count(1) as avg_pv_user,
sum(access_time)/count(1) as avg_time_user,
count(if(sesion_counts>1,1,null))/count(1) as returncustomer_ratio
from agu;
hive>insert into table ADS_APP_TFC_OVM_DIM_DEMO
select
'2023-03-01' as dt,
sum(pv_counts) as total_pv,
count(distinct guid) as total_uv,
sum(sesion_counts) as total_session,
sum(access_time)/sum(sesion_counts) as avg_times_session,
sum(sesion_counts)/count(distinct guid) as avg_num_user,
sum(pv_counts)/count(distinct guid) as avg_pv_user,
sum(access_time)/count(distinct guid) as avg_time_user,
count(if(sesion_counts>1,1,null))/count(distinct guid) as returncustomer_ratio,
province,
city,
district,
devicetype,
osname,
osVer,
release_ch,
promotion_ch
from DWS_APP_TFC_AGU_DIM_DEMO where dt='2023-03-01'
group by province,city,district,devicetype,osname,osVer,release_ch,promotion_ch
with cube;