维度指标:
指标:新增注册会员数、累计注册会员数、新增消费会员数、累计消费会员数、新增复购会员数、累计复购会员数、活跃会员数、沉睡会员数、会员消费金额 维度: 时间维度(天、周、月) ? 涉及表: 门店会员分类天表 ? 表字段的组成: 维度字段 + 指标结果字段
CREATE TABLE IF NOT EXISTS dws.dws_mem_store_member_classify_day_i(
? trade_date ? ? ? ? ? ? ? ? ? STRING COMMENT '统计时间',
? week_trade_date ? ? ? ? ? ? STRING COMMENT '周一日期',
? month_trade_date ? ? ? ? ? STRING COMMENT '月一日期',
?
? store_no ? ? ? ? ? ? ? ? ? STRING COMMENT '店铺编码',
? store_name ? ? ? ? ? ? ? ? STRING COMMENT '店铺名称',
? store_sale_type ? ? ? ? ? ? BIGINT COMMENT '店铺销售类型',
? store_type_code ? ? ? ? ? ? BIGINT COMMENT '分店类型',
? city_id ? ? ? ? ? ? ? ? ? ? BIGINT COMMENT '城市ID',
? city_name ? ? ? ? ? ? ? ? ? STRING COMMENT '城市名称',
? region_code ? ? ? ? ? ? ? ? STRING COMMENT '区域编码',
? region_name ? ? ? ? ? ? ? ? STRING COMMENT '区域名称',
? is_day_clear ? ? ? ? ? ? ? ?BIGINT COMMENT '是否日清:0否,1是',
?
? reg_num_add ? ? ? ? ? ? ? ? BIGINT COMMENT '新增注册会员数',
? reg_num_sum ? ? ? ? ? ? ? ? BIGINT COMMENT '累计注册会员数',
? consume_num_add ? ? ? ? ? ? BIGINT COMMENT '新增消费会员数',
? consume_num_sum ? ? ? ? ? ? BIGINT COMMENT '累计消费会员数',
? repurchase_num_add ? ? ? ? ?BIGINT COMMENT '新增复购会员数',
? repurchase_num_sum ? ? ? ? ?BIGINT COMMENT '累计复购会员数',
? active_member_num ? ? ? ? ? BIGINT COMMENT '活跃会员数',
? sleep_member_num ? ? ? ? ? ?BIGINT COMMENT '沉睡会员数',
? sale_amount_bind ? ? ? ? ? ?DECIMAL(27, 2) COMMENT '会员消费金额'
)
comment '门店会员分类天表'
partitioned by (dt STRING COMMENT '统计日期')
row format delimited fields terminated by ','
stored as orc
tblproperties ('orc.compress'='SNAPPY');
活跃会员:30天内有消费
沉睡会员:90天内有消费,30天内没有消费
这个需求的难点在于计算累计值。思路是 使用天进行聚合,得到每天的值,然后使用sum () over 窗口,得到累加值,对于每天的累积情况,这里需要使用拉链表的思想,即构造一个生效日期,这里使用lead() over 窗口函数,取到下一个日期,然后再用这个当日时间去卡,即可得到当日对应的累计值。
-- DWS层: 门店会员分类天
-- 注意: 以下内容仅仅以2023-11-14为例,实际需要把14-20日的所有数据都要导入对应表中
with t1 as (
select
trade_date as start_date,
store_no,
reg_num_add, -- 新增注册会员数
sum(reg_num_add) over(partition by store_no order by trade_date) as reg_num_sum, -- 累计注册会员数
lead(trade_date,1,'9999-99-99') over (partition by store_no order by trade_date) as end_date
from
( -- 先统计每天注册会员数
select
trade_date,
reg_md as store_no,
count(1) as reg_num_add
from dwm.dwm_mem_member_behavior_day_i
where is_register = 1
group by
trade_date, reg_md
) temp1
),
t2 as (
select
trade_date as start_date,
store_no,
consume_num_add, -- 新增消费会员数
sum(consume_num_add) over(partition by store_no order by trade_date) as consume_num_sum, -- 累计消费会员数
lead(trade_date,1,'9999-99-99') over (partition by store_no order by trade_date) as end_date
from (
select
trade_date,
store_no,
count(1) as consume_num_add
from dwm.dwm_mem_first_buy_i
group by trade_date, store_no
) temp2
),
t3 as (
select
trade_date as start_date,
store_no,
repurchase_num_add, -- 新增充值会员数
sum(repurchase_num_add) over(partition by store_no order by trade_date) as repurchase_num_sum, -- 累计充值会员数
lead(trade_date,1,'9999-99-99') over (partition by store_no order by trade_date) as end_date
from (
select
trade_date,
store_no,
count(1) as repurchase_num_add
from dwm.dwm_mem_second_buy_i
group by trade_date, store_no
) temp2
),
t4 as (
-- 活跃会员数(最近30天有消费) 2023-11-14
select
'2023-11-14' as trade_date,
bind_md as store_no,
count(distinct zt_id) as active_member_num
from dwm.dwm_mem_member_behavior_day_i
where trade_date <= '2023-11-14' and trade_date >= date_sub('2023-11-14',30) and is_consume = 1
group by bind_md
),
t5 as (
-- 沉睡会员数: 最近90天有消费 , 但是最近30天无消费
select
'2023-11-14' as trade_date,
temp3.bind_md as store_no,
count(temp3.zt_id) as sleep_member_num
from
(
select
bind_md,
zt_id
from dwm.dwm_mem_member_behavior_day_i
where trade_date <= '2023-11-14' and trade_date >= date_sub('2023-11-14',90) and is_consume = 1
group by bind_md,zt_id
) temp3
LEFT JOIN
(
select
bind_md,
zt_id
from dwm.dwm_mem_member_behavior_day_i
where trade_date <= '2023-11-14' and trade_date >= date_sub('2023-11-14',30) and is_consume = 1
group by bind_md,zt_id
) temp4 on temp3.bind_md = temp4.bind_md and temp3.zt_id = temp4.zt_id
where temp4.zt_id is null
group by temp3.bind_md
),
t6 as (
select
trade_date,
store_no,
sum(real_paid_amount) as sale_amount_bind
from dwm.dwm_mem_sell_order_i
where trade_date = '2023-11-14'
group by trade_date,store_no
),
t7 as (
select
'2023-11-14' as trade_date,
store_no,
if(start_date = '2023-11-14',reg_num_add,0) as reg_num_add,
reg_num_sum,
0 as consume_num_add,
0 as consume_num_sum,
0 as repurchase_num_add,
0 as repurchase_num_sum,
0 as active_member_num,
0 as sleep_member_num,
0 as sale_amount_bind
from t1
where start_date <= '2023-11-14' and end_date >= '2023-11-14'
union all
select
'2023-11-14' as trade_date,
store_no,
0 reg_num_add,
0 as reg_num_sum,
if( start_date = '2023-11-14',consume_num_add,0) as consume_num_add,
consume_num_sum,
0 as repurchase_num_add,
0 as repurchase_num_sum,
0 as active_member_num,
0 as sleep_member_num,
0 as sale_amount_bind
from t2
where start_date <= '2023-11-14' and end_date >= '2023-11-14'
union all
select
'2023-11-14' as trade_date,
store_no,
0 reg_num_add,
0 as reg_num_sum,
0 as consume_num_add,
0 as consume_num_sum,
if(start_date = '2023-11-14',repurchase_num_add,0) as repurchase_num_add,
repurchase_num_sum,
0 as active_member_num,
0 as sleep_member_num,
0 as sale_amount_bind
from t3
where start_date <= '2023-11-14' and end_date >= '2023-11-14'
union all
select
trade_date,
store_no,
0 reg_num_add,
0 as reg_num_sum,
0 as consume_num_add,
0 as consume_num_sum,
0 as repurchase_num_add,
0 as repurchase_num_sum,
active_member_num,
0 as sleep_member_num,
0 as sale_amount_bind
from t4
union all
select
trade_date,
store_no,
0 reg_num_add,
0 as reg_num_sum,
0 as consume_num_add,
0 as consume_num_sum,
0 as repurchase_num_add,
0 as repurchase_num_sum,
0 as active_member_num,
sleep_member_num,
0 as sale_amount_bind
from t5
union all
select
trade_date,
store_no,
0 reg_num_add,
0 as reg_num_sum,
0 as consume_num_add,
0 as consume_num_sum,
0 as repurchase_num_add,
0 as repurchase_num_sum,
0 as active_member_num,
0 as sleep_member_num,
sale_amount_bind
from t6
)
-- insert overwrite table dws.dws_mem_store_member_classify_day_i partition (dt)
select
t7.trade_date,
t8.week_trade_date,
t8.month_trade_date,
t7.store_no,
t9.store_name,
t9.store_sale_type,
t9.store_type_code,
t9.city_id,
t9.city_name,
t9.region_code,
t9.region_name,
t9.is_day_clear,
sum(t7.reg_num_add) as reg_num_add,
sum(t7.reg_num_sum) as reg_num_sum,
sum(t7.consume_num_add) as consume_num_add,
sum(t7.consume_num_sum) as consume_num_sum,
sum(t7.repurchase_num_add) as repurchase_num_add,
sum(t7.repurchase_num_sum) as repurchase_num_sum,
sum(t7.active_member_num) as active_member_num,
sum(t7.sleep_member_num) as sleep_member_num,
sum(t7.sale_amount_bind) as sale_amount_bind,
t7.trade_date as dt
from t7
left join dim.dwd_dim_date_f t8 on t7.trade_date = t8.trade_date
-- 注意: 一定要检查自己的dwd_dim_store_i分区目录,此处填写自己的分区目录时间
left join dim.dwd_dim_store_i t9 on t7.store_no = t9.store_no and t9.dt ='2023-11-23'
group by
t7.trade_date,
t8.week_trade_date,
t8.month_trade_date,
t7.store_no,
t9.store_name,
t9.store_sale_type,
t9.store_type_code,
t9.city_id,
t9.city_name,
t9.region_code,
t9.region_name,
t9.is_day_clear;
指标: 门店销售额、门店总订单量、当日注册人数、累计注册会员数、当日注册且充值会员数、当日注册且充值且消费会员数、当日注册且消费会员数、充值会员数、充值金额、累计会员充值金额、当日有余额的会员人数、当日会员余额、余额消费人数/单量、余额支付金额、余额消费金额、会员消费人数/单量、会员消费金额、会员首单人数/订单量/销售额、会员非首单人数/订单量/销售额 维度: 时间维度(天、周、月) 涉及表: 门店会员统计天表 表字段的组成: 维度字段 + 指标结果字段
CREATE TABLE IF NOT EXISTS dws.dws_mem_store_member_statistics_day_i(
trade_date STRING COMMENT '统计时间',
week_trade_date STRING COMMENT '周一日期',
month_trade_date STRING COMMENT '月一日期',
store_no STRING COMMENT '店铺编码',
store_name STRING COMMENT '店铺名称',
store_sale_type BIGINT COMMENT '店铺销售类型',
store_type_code BIGINT COMMENT '分店类型',
city_id BIGINT COMMENT '城市ID',
city_name STRING COMMENT '城市名称',
region_code STRING COMMENT '区域编码',
region_name STRING COMMENT '区域名称',
is_day_clear BIGINT COMMENT '是否日清:0否,1是',
store_sale_amount DECIMAL(27, 2) COMMENT '门店销售金额',
store_orders_number BIGINT COMMENT '门店总订单量',
register_member_num BIGINT COMMENT '当日注册人数',
register_member_num_all BIGINT COMMENT '累计注册会员数',
register_recharge_num BIGINT COMMENT '当日注册且充值会员数',
rg_rc_td_num BIGINT COMMENT '当日注册且充值且消费会员数',
register_trade_num BIGINT COMMENT '当日注册且消费会员数',
recharge_member_num BIGINT COMMENT '充值会员数',
recharge_amount DECIMAL(27, 2) COMMENT '充值金额',
recharge_amount_all DECIMAL(27, 2) COMMENT '累计会员充值金额',
remain_member_num BIGINT COMMENT '当日有余额的会员人数',
remain_member_amount DECIMAL(27, 2) COMMENT '当日会员余额',
balance_member_num BIGINT COMMENT '余额消费人数',
balance_member_order_num BIGINT COMMENT '余额消费单量',
balance_pay_amount DECIMAL(27, 2) COMMENT '余额支付金额',
balance_member_amount DECIMAL(27, 2) COMMENT '余额消费金额',
member_num BIGINT COMMENT '会员消费人数',
member_order_num BIGINT COMMENT '会员消费单量',
member_amount DECIMAL(27, 2) COMMENT '会员消费金额',
member_first_num BIGINT COMMENT '会员首单人数',
member_first_order_num BIGINT COMMENT '会员首单订单量',
member_first_amount DECIMAL(27, 2) COMMENT '会员首单销售额',
member_nofirst_num BIGINT COMMENT '会员非首单人数',
member_nofirst_order_num BIGINT COMMENT '会员非首单订单量',
member_nofirst_amount DECIMAL(27, 2) COMMENT '会员非首单销售额'
)
comment '门店会员统计日表'
partitioned by (dt STRING COMMENT '统计日期')
row format delimited fields terminated by ','
stored as orc
tblproperties ('orc.compress'='SNAPPY');
门店的消费情况可以从dwm_sell_o2o_order_i表中出,
注册、充值、消费这些数据可以从dwm_mem_member_behavior_day_i中出,
余额数据可以从dwd_mem_balance_online_i中出。
需要注意的是,这里有新增的指标还有累计的指标,为了方便计算,可以分开求解。
新增指标可以大部分从dwm_mem_member_behavior_day_i中出,因为 dwm_mem_member_behavior_day_i是会员粒度的表,记录了会员的各种行为。在计算会员指标的时候,很多需要count()来计算的指标,可以转化成sum(1),根据条件进行判断即可。
-- DWS 门店会员统计宽表
-- 注意: 以下内容仅仅以2023-11-14为例,实际需要把14-20日的所有数据都要导入对应表中
with t1 as (
select
trade_date,
store_no,
sum(real_paid_amount) as store_sale_amount,
count(if(trade_type = 0,parent_order_no,NULL)) - count(if(trade_type = 5,parent_order_no,NULL)) as store_orders_number,
0 as register_member_num,
0 as register_member_num_all,
0 as register_recharge_num,
0 as rg_rc_td_num,
0 as register_trade_num,
0 as recharge_member_num,
0 as recharge_amount,
0 as recharge_amount_all,
0 as remain_member_num,
0 as remain_member_amount,
0 as balance_member_num,
0 as balance_member_order_num,
0 as balance_pay_amount,
0 as balance_member_amount,
0 as member_num,
0 as member_order_num,
0 as member_amount,
0 as member_first_num,
0 as member_first_order_num,
0 as member_first_amount,
0 as member_nofirst_num,
0 as member_nofirst_order_num,
0 as member_nofirst_amount
from dwm.dwm_sell_o2o_order_i where dt = '2023-11-14'
group by trade_date,store_no
union all
select
trade_date,
bind_md as store_no,
0 as store_sale_amount, -- 门店销售额
0 as store_orders_number, -- 门店总订单量
sum(is_register) as register_member_num, -- 当日注册人数
0 as register_member_num_all, -- 累计注册人数
sum(
if(
is_register = 1 and is_recharge = 1, 1,0
)
) as register_recharge_num, -- 当日注册且充值会员数
sum(
if(
is_register = 1 and is_recharge = 1 and is_consume = 1, 1,0
)
) as rg_rc_td_num, -- 当日注册 且充值且消费会员数
sum(
if(
is_register = 1 and is_consume = 1, 1,0
)
) as register_trade_num, -- 当日注册且消费会员数
sum(is_recharge) as recharge_member_num, -- 充值会员数
sum(if( is_recharge = 1,recharge_amount,0) ) as recharge_amount, -- 充值金额
0 as recharge_amount_all, -- 累计会员充值金额
0 as remain_member_num, -- 当日有余额的会员人数
0 as remain_member_amount, -- 当日会员余额
sum(is_balance_consume) as balance_member_num, --余额消费人数
sum(if(is_balance_consume = 1, balance_consume_times, 0)) as balance_member_order_num, --余额消费单量
sum(if(is_balance_consume = 1, balance_pay_amount, 0)) as balance_pay_amount, -- 余额支付金额
sum(if(is_balance_consume = 1, balance_consume_amount, 0)) as balance_member_amount, -- 余额消费金额
sum(is_consume) as member_num, -- 会员消费人数
sum(if(is_consume = 1, consume_times, 0)) as member_order_num, -- 会员消费单量
sum(if(is_consume = 1, consume_amount, 0)) as member_amount, -- 会员消费金额
sum(is_first_consume) as member_first_num, -- 会员首单人数
sum(is_first_consume) as member_first_order_num, -- 会员首单订单量
sum(if(is_first_consume = 1, first_consume_amount,0)) as member_first_amount, -- 会员首单销售额
sum(is_consume) - sum(is_first_consume) as member_nofirst_num, -- 会员非首单人数
sum(if(is_consume = 1, consume_times, 0)) - sum(is_first_consume) as member_nofirst_order_num, -- 会员非首单订单量
sum(if(is_consume = 1, consume_amount, 0)) - sum(if(is_first_consume = 1, first_consume_amount,0)) as member_nofirst_amount -- 会员非首单销售额
from dwm.dwm_mem_member_behavior_day_i where dt = '2023-11-14'
group by trade_date,bind_md
union all
select
trade_date,
store_no,
0 as store_sale_amount,
0 as store_orders_number,
0 as register_member_num,
0 as register_member_num_all,
0 as register_recharge_num,
0 as rg_rc_td_num,
0 as register_trade_num,
0 as recharge_member_num,
0 as recharge_amount,
0 as recharge_amount_all,
count(1) as remain_member_num,
sum(balance_amount) as remain_member_amount,
0 as balance_member_num,
0 as balance_member_order_num,
0 as balance_pay_amount,
0 as balance_member_amount,
0 as member_num,
0 as member_order_num,
0 as member_amount,
0 as member_first_num,
0 as member_first_order_num,
0 as member_first_amount,
0 as member_nofirst_num,
0 as member_nofirst_order_num,
0 as member_nofirst_amount
from dwd.dwd_mem_balance_online_i where dt = '2023-11-14'
group by trade_date,store_no
union all
select
start_date as trade_date,
store_no,
0 as store_sale_amount,
0 as store_orders_number,
0 as register_member_num,
register_member_num_all,
0 as register_recharge_num,
0 as rg_rc_td_num,
0 as register_trade_num,
0 as recharge_member_num,
0 as recharge_amount,
recharge_amount_all,
0 as remain_member_num,
0 as remain_member_amount,
0 as balance_member_num,
0 as balance_member_order_num,
0 as balance_pay_amount,
0 as balance_member_amount,
0 as member_num,
0 as member_order_num,
0 as member_amount,
0 as member_first_num,
0 as member_first_order_num,
0 as member_first_amount,
0 as member_nofirst_num,
0 as member_nofirst_order_num,
0 as member_nofirst_amount
from(
select
trade_date as start_date,
store_no,
sum(reg_num_add) over(partition by store_no order by trade_date) as register_member_num_all, -- 累计注册会员数
sum(recharge_amount) over(partition by store_no order by trade_date) as recharge_amount_all, -- 累计充值金额
lead(trade_date,1,'9999-99-99') over (partition by store_no order by trade_date) as end_date
from
( -- 先统计每天注册会员数
select
trade_date,
bind_md as store_no,
sum(is_register) as reg_num_add,
sum(if(is_recharge = 1,recharge_amount,0)) as recharge_amount
from dwm.dwm_mem_member_behavior_day_i
group by
trade_date, bind_md
) temp1
) t
where start_date <= '2023-11-14' and end_date >= '2023-11-14'
)
insert overwrite table dws.dws_mem_store_member_statistics_day_i partition(dt)
select
t1.trade_date,
t2.week_trade_date,
t2.month_trade_date,
t1.store_no,
t3.store_name,
t3.store_sale_type,
t3.store_type_code,
t3.city_id,
t3.city_name,
t3.region_code,
t3.region_name,
t3.is_day_clear,
sum(t1.store_sale_amount) as store_sale_amount,
sum(t1.store_orders_number) as store_orders_number,
sum(t1.register_member_num) as register_member_num,
sum(t1.register_member_num_all) as register_member_num_all,
sum(t1.register_recharge_num) as register_recharge_num,
sum(t1.rg_rc_td_num) as rg_rc_td_num,
sum(t1.register_trade_num) as register_trade_num,
sum(t1.recharge_member_num) as recharge_member_num,
sum(t1.recharge_amount) as recharge_amount,
sum(t1.recharge_amount_all) as recharge_amount_all,
sum(t1.remain_member_num) as remain_member_num,
sum(t1.remain_member_amount) as remain_member_amount,
sum(t1.balance_member_num) as balance_member_num,
sum(t1.balance_member_order_num) as balance_member_order_num,
sum(t1.balance_pay_amount) as balance_pay_amount,
sum(t1.balance_member_amount) as balance_member_amount,
sum(t1.member_num) as member_num,
sum(t1.member_order_num) as member_order_num,
sum(t1.member_amount) as member_amount,
sum(t1.member_first_num) as member_first_num,
sum(t1.member_first_order_num) as member_first_order_num,
sum(t1.member_first_amount) as member_first_amount,
sum(t1.member_nofirst_num) as member_nofirst_num,
sum(t1.member_nofirst_order_num) as member_nofirst_order_num,
sum(t1.member_nofirst_amount) as member_nofirst_amount,
t1.trade_date as dt
from t1
left join dim.dwd_dim_date_f t2 on t1.trade_date = t2.trade_date
-- 注意: 一定要检查自己的dwd_dim_store_i分区目录,此处填写自己的分区目录时间
left join dim.dwd_dim_store_i t3 on t1.store_no = t3.store_no and t3.dt ='2023-11-23'
group by
t1.trade_date,
t2.week_trade_date,
t2.month_trade_date,
t1.store_no,
t3.store_name,
t3.store_sale_type,
t3.store_type_code,
t3.city_id,
t3.city_name,
t3.region_code,
t3.region_name,
t3.is_day_clear;
-- dayofweek
-- 注意: dayofweek是老外从周日算,所以返回的结果和咱们中国人思路差1天
select dayofweek('2023-12-7');
-- 需求: 获取到2023-12-7所在的周中的周一日期
select date_sub('2023-12-7',if(dayofweek('2023-12-7')=1,6,dayofweek('2023-12-7')-2));
-- 需求: 获取到2023-12-7所在的周中的周日日期
select date_sub('2023-12-8',if(dayofweek('2023-12-8')=1,0,dayofweek('2023-12-8')-8));
-- day0fmonth
select dayofmonth('2023-12-07');
-- 需求: 获取2023-12-7所在月的第一天的日期
select date_sub('2023-12-07',dayofmonth('2023-12-07')-1);
-- 需求: 获取2023-12-7所在月的最后一天的日期
select last_day('2023-12-07');
指标:新增注册会员数、累计注册会员数、新增消费会员数、累计消费会员数、新增复购会员数、累计复购会员数、活跃会员数、沉睡会员数、会员消费金额 维度: 时间维度(天、周、月) 涉及ADS表: 门店会员分类月表 和 门店会员分类周表 表字段的组成: 维度字段 + 指标结果字段
CREATE TABLE IF NOT EXISTS ads.ads_mem_store_member_classify_week_i(
trade_date STRING COMMENT '周一日期',
store_no STRING COMMENT '店铺编码',
store_name STRING COMMENT '店铺名称',
store_sale_type BIGINT COMMENT '店铺销售类型',
store_type_code BIGINT COMMENT '分店类型',
city_id BIGINT COMMENT '城市ID',
city_name STRING COMMENT '城市名称',
region_code STRING COMMENT '区域编码',
region_name STRING COMMENT '区域名称',
is_day_clear BIGINT COMMENT '是否日清:0否,1是',
reg_num_add BIGINT COMMENT '新增注册会员数',
reg_num_sum BIGINT COMMENT '累计注册会员数',
consume_num_add BIGINT COMMENT '新增消费会员数',
consume_num_sum BIGINT COMMENT '累计消费会员数',
repurchase_num_add BIGINT COMMENT '新增复购会员数',
repurchase_num_sum BIGINT COMMENT '累计复购会员数',
active_member_num BIGINT COMMENT '活跃会员数',
sleep_member_num BIGINT COMMENT '沉睡会员数',
sale_amount_bind DECIMAL(27, 2) COMMENT '会员消费金额'
)
comment '门店会员分类周表'
partitioned by (dt STRING COMMENT '统计日期')
row format delimited fields terminated by ','
stored as orc
tblproperties ('orc.compress'='SNAPPY');
指标分为累计值和新增值,累计值可以取当周最后一天的数值、新增值可以进行聚合得到。 ? 需要注意的是,这里计算的是一张周表,所以当考虑到数据的场景时,需要取到当周所有的数据进行聚合,以及取到当周最后一天进行取累加值。
思考: 当计算某一天对应这一周的指标, 如果获取这一周相关的数据呢?
where t.dt in (
select
max(dt)
from dws.dws_mem_store_member_classify_day_i
where dt>=date_sub('${inputdate}', if (dayofweek('${inputdate}') = 1, 6, dayofweek('${inputdate}') - 2)) and dt<=date_sub('${inputdate}', if (dayofweek('${inputdate}') = 1, 0, dayofweek('${inputdate}') - 8))
)
代码实现:
with t1 as (
-- 计算非累加值
select
week_trade_date,
store_no,
sum(reg_num_add) as reg_num_add,
sum(consume_num_add) as consume_num_add,
sum(repurchase_num_add) as repurchase_num_add,
sum(sale_amount_bind) as sale_amount_bind
from dws.dws_mem_store_member_classify_day_i
where dt >= date_sub('2023-11-14',if(dayofweek('2023-11-14') = 1,6, dayofweek('2023-11-14')-2 ))
and dt <= date_add('2023-11-14',if(dayofweek('2023-11-14') = 1,0,-dayofweek('2023-11-14')+8))
group by week_trade_date,store_no
),
t2 as (
-- 计算 累计值
-- 如果获取这一周的最后一天呢?
select
week_trade_date as trade_date,
store_no,
store_name,
store_sale_type,
store_type_code,
city_id,
city_name,
region_code,
region_name,
is_day_clear,
reg_num_sum,
consume_num_sum,
repurchase_num_sum,
active_member_num,
sleep_member_num
from dws.dws_mem_store_member_classify_day_i where dt in (
select
max(dt) as c1
from dws.dws_mem_store_member_classify_day_i as t
where dt >= date_sub('2023-11-14',if(dayofweek('2023-11-14') = 1,6, dayofweek('2023-11-14')-2 ))
and dt <= date_add('2023-11-14',if(dayofweek('2023-11-14') = 1,0,-dayofweek('2023-11-14')+8))
)
)
insert overwrite table ads.ads_mem_store_member_classify_week_i partition (dt)
select
t2.trade_date,
t2.store_no,
t2.store_name,
t2.store_sale_type,
t2.store_type_code,
t2.city_id,
t2.city_name,
t2.region_code,
t2.region_name,
t2.is_day_clear,
t1.reg_num_add,
t2.reg_num_sum,
t1.consume_num_add,
t2.consume_num_sum,
t1.repurchase_num_add,
t2.repurchase_num_sum,
t2.active_member_num,
t2.sleep_member_num,
t1.sale_amount_bind,
t2.trade_date as dt
from t2 left join t1 on t2.trade_date = t1.week_trade_date and t2.store_no = t1.store_no;
CREATE TABLE IF NOT EXISTS ads.ads_mem_store_member_classify_month_i(
trade_date STRING COMMENT '月一日期',
store_no STRING COMMENT '店铺编码',
store_name STRING COMMENT '店铺名称',
store_sale_type BIGINT COMMENT '店铺销售类型',
store_type_code BIGINT COMMENT '分店类型',
city_id BIGINT COMMENT '城市ID',
city_name STRING COMMENT '城市名称',
region_code STRING COMMENT '区域编码',
region_name STRING COMMENT '区域名称',
is_day_clear BIGINT COMMENT '是否日清:0否,1是',
reg_num_add BIGINT COMMENT '新增注册会员数',
reg_num_sum BIGINT COMMENT '累计注册会员数',
consume_num_add BIGINT COMMENT '新增消费会员数',
consume_num_sum BIGINT COMMENT '累计消费会员数',
repurchase_num_add BIGINT COMMENT '新增复购会员数',
repurchase_num_sum BIGINT COMMENT '累计复购会员数',
active_member_num BIGINT COMMENT '活跃会员数',
sleep_member_num BIGINT COMMENT '沉睡会员数',
sale_amount_bind DECIMAL(27, 2) COMMENT '会员消费金额'
)
comment '门店会员分类月表'
partitioned by (dt STRING COMMENT '统计日期')
row format delimited fields terminated by ','
stored as orc
tblproperties ('orc.compress'='SNAPPY');
处理思路: 同周表ads_mem_store_member_classify_week_i,改变下范围即可
思考: 如果获取一个月范围的数据呢?
select date_sub('2023-09-30',dayofmonth('2023-09-30')-1), last_day('2023-09-30')
指标: 门店销售额、门店总订单量、当日注册人数、累计注册会员数、当日注册且充值会员数、当日注册且充值且消费会员数、当日注册且消费会员数、充值会员数、充值金额、累计会员充值金额、当日有余额的会员人数、当日会员余额、余额消费人数/单量、余额支付金额、余额消费金额、会员消费人数/单量、会员消费金额、会员首单人数/订单量/销售额、会员非首单人数/订单量/销售额 维度: 时间维度(天、周、月) 涉及表: 门店会员统计周表 和 门店会员统计月表 涉及表字段: 维度字段 + 指标结果字段
建表语句:
CREATE TABLE IF NOT EXISTS ads.ads_mem_store_member_statistics_week_i(
trade_date STRING COMMENT '周一日期',
store_no STRING COMMENT '店铺编码',
store_name STRING COMMENT '店铺名称',
store_sale_type BIGINT COMMENT '店铺销售类型',
store_type_code BIGINT COMMENT '分店类型',
city_id BIGINT COMMENT '城市ID',
city_name STRING COMMENT '城市名称',
region_code STRING COMMENT '区域编码',
region_name STRING COMMENT '区域名称',
is_day_clear BIGINT COMMENT '是否日清:0否,1是',
store_sale_amount DECIMAL(27, 2) COMMENT '门店销售金额',
store_orders_number BIGINT COMMENT '门店总订单量',
register_member_num BIGINT COMMENT '当日注册人数',
register_member_num_all BIGINT COMMENT '累计注册会员数',
register_recharge_num BIGINT COMMENT '当日注册且充值会员数',
rg_rc_td_num BIGINT COMMENT '当日注册且充值且消费会员数',
register_trade_num BIGINT COMMENT '当日注册且消费会员数',
recharge_member_num BIGINT COMMENT '充值会员数',
recharge_amount DECIMAL(27, 2) COMMENT '充值金额',
recharge_amount_all DECIMAL(27, 2) COMMENT '累计会员充值金额',
remain_member_num BIGINT COMMENT '当周最后一天有余额的会员人数',
remain_member_amount DECIMAL(27, 2) COMMENT '当周最后一天会员余额',
balance_member_num BIGINT COMMENT '余额消费人数',
balance_member_order_num BIGINT COMMENT '余额消费单量',
balance_pay_amount DECIMAL(27, 2) COMMENT '余额支付金额',
balance_member_amount DECIMAL(27, 2) COMMENT '余额消费金额',
member_num BIGINT COMMENT '会员消费人数',
member_order_num BIGINT COMMENT '会员消费单量',
member_amount DECIMAL(27, 2) COMMENT '会员消费金额',
member_first_num BIGINT COMMENT '会员首单人数',
member_first_order_num BIGINT COMMENT '会员首单订单量',
member_first_amount DECIMAL(27, 2) COMMENT '会员首单销售额',
member_nofirst_num BIGINT COMMENT '会员非首单人数',
member_nofirst_order_num BIGINT COMMENT '会员非首单订单量',
member_nofirst_amount DECIMAL(27, 2) COMMENT '会员非首单销售额'
)
comment '门店会员统计周表'
partitioned by (dt STRING COMMENT '统计日期')
row format delimited fields terminated by ','
stored as orc
tblproperties ('orc.compress'='SNAPPY');
指标分为三种情况: 一种是状态值,比如说累计指标,register_member_num_all等,还有状态指标,remain_member_num等; 另一种情况是可累加的指标,比如金额和单量等; 还有一种情况是不可累积指标,比如人数。 状态值可以从最新的天表中dws_mem_store_member_statistics_day_i获取。 然后以这张表作为主表,关联其他表。 可累加的指标直接从dws_mem_store_member_statistics_day_i中进行聚合得到。 不可累加的指标从dwm_mem_member_behavior_day_i中进行计算得到。
代码实现:
-- ads 门店会员统计周表
with t1 as (
-- 第一部分: 基于DWS层门店会员统计天表 获取指定天的对应这一周的数据, 对这一周进行聚合统计
select
week_trade_date as trade_date,
store_no,
sum(store_sale_amount) as store_sale_amount,
sum(store_orders_number) as store_orders_number,
sum(register_member_num) as register_member_num,
sum(register_recharge_num) as register_recharge_num,
sum(rg_rc_td_num) as rg_rc_td_num,
sum(register_trade_num) as register_trade_num,
sum(recharge_amount) as recharge_amount,
sum(balance_member_order_num) as balance_member_order_num,
sum(balance_pay_amount) as balance_pay_amount,
sum(balance_member_amount) as balance_member_amount,
sum(member_order_num) as member_order_num,
sum(member_amount) as member_amount,
sum(member_first_num) as member_first_num,
sum(member_first_order_num) as member_first_order_num,
sum(member_first_amount) as member_first_amount,
sum(member_nofirst_order_num) as member_nofirst_order_num,
sum(member_nofirst_amount) as member_nofirst_amount
from dws.dws_mem_store_member_statistics_day_i
where dt >= date_sub('2023-11-14',if(dayofweek('2023-11-14') = 1,6, dayofweek('2023-11-14')-2 ))
and dt <= date_add('2023-11-14',if(dayofweek('2023-11-14') = 1,0,-dayofweek('2023-11-14')+8))
group by week_trade_date,store_no
),
t2 as (
select
week_trade_date as trade_date,
store_no,
store_name,
store_sale_type,
store_type_code,
city_id,
city_name,
region_code,
region_name,
is_day_clear,
register_member_num_all,
recharge_amount_all,
remain_member_num,
remain_member_amount
from dws.dws_mem_store_member_statistics_day_i where dt in (
select
max(dt)
from dws.dws_mem_store_member_statistics_day_i t
where dt >= date_sub('2023-11-14',if(dayofweek('2023-11-14') = 1,6, dayofweek('2023-11-14')-2 ))
and dt <= date_add('2023-11-14',if(dayofweek('2023-11-14') = 1,0,-dayofweek('2023-11-14')+8))
)
),
t3 as (
select
week_trade_date as trade_date,
bind_md as store_no,
count( DISTINCT if(is_recharge = 1,zt_id,NULL) ) AS recharge_member_num,
count( DISTINCT if(is_balance_consume = 1,zt_id,NULL) ) AS balance_member_num,
count( DISTINCT if(is_consume = 1,zt_id,NULL) ) AS member_num,
count( DISTINCT if(is_first_consume = 0 and consume_times > 0,zt_id,NULL) ) AS member_nofirst_num
from dwm.dwm_mem_member_behavior_day_i
where dt >= date_sub('2023-11-14',if(dayofweek('2023-11-14') = 1,6, dayofweek('2023-11-14')-2 ))
and dt <= date_add('2023-11-14',if(dayofweek('2023-11-14') = 1,0,-dayofweek('2023-11-14')+8))
group by week_trade_date,bind_md
)
insert overwrite table ads.ads_mem_store_member_statistics_week_i partition (dt)
select
t2.trade_date,
t2.store_no,
t2.store_name,
t2.store_sale_type,
t2.store_type_code,
t2.city_id,
t2.city_name,
t2.region_code,
t2.region_name,
t2.is_day_clear,
t1.store_sale_amount,
t1.store_orders_number,
t1.register_member_num,
t2.register_member_num_all,
t1.register_recharge_num,
t1.rg_rc_td_num,
t1.register_trade_num,
t3.recharge_member_num,
t1.recharge_amount,
t2.recharge_amount_all,
t2.remain_member_num,
t2.remain_member_amount,
t3.balance_member_num,
t1.balance_member_order_num,
t1.balance_pay_amount,
t1.balance_member_amount,
t3.member_num,
t1.member_order_num,
t1.member_amount,
t1.member_first_num,
t1.member_first_order_num,
t1.member_first_amount,
t3.member_nofirst_num,
t1.member_nofirst_order_num,
t1.member_nofirst_amount,
t2.trade_date as dt
from t2 left join t1 on t2.trade_date = t1.trade_date and t2.store_no = t1.store_no
left join t3 on t2.trade_date = t3.trade_date and t2.store_no = t3.store_no;
CREATE TABLE IF NOT EXISTS ads.ads_mem_store_member_statistics_month_i(
trade_date STRING COMMENT '月一日期',
store_no STRING COMMENT '店铺编码',
store_name STRING COMMENT '店铺名称',
store_sale_type BIGINT COMMENT '店铺销售类型',
store_type_code BIGINT COMMENT '分店类型',
city_id BIGINT COMMENT '城市ID',
city_name STRING COMMENT '城市名称',
region_code STRING COMMENT '区域编码',
region_name STRING COMMENT '区域名称',
is_day_clear BIGINT COMMENT '是否日清:0否,1是',
store_sale_amount DECIMAL(27, 2) COMMENT '门店销售金额',
store_orders_number BIGINT COMMENT '门店总订单量',
register_member_num BIGINT COMMENT '当日注册人数',
register_member_num_all BIGINT COMMENT '累计注册会员数',
register_recharge_num BIGINT COMMENT '当日注册且充值会员数',
rg_rc_td_num BIGINT COMMENT '当日注册且充值且消费会员数',
register_trade_num BIGINT COMMENT '当日注册且消费会员数',
recharge_member_num BIGINT COMMENT '充值会员数',
recharge_amount DECIMAL(27, 2) COMMENT '充值金额',
recharge_amount_all DECIMAL(27, 2) COMMENT '累计会员充值金额',
remain_member_num BIGINT COMMENT '当月最后一天有余额的会员人数',
remain_member_amount DECIMAL(27, 2) COMMENT '当月最后一天会员余额',
balance_member_num BIGINT COMMENT '余额消费人数',
balance_member_order_num BIGINT COMMENT '余额消费单量',
balance_pay_amount DECIMAL(27, 2) COMMENT '余额支付金额',
balance_member_amount DECIMAL(27, 2) COMMENT '余额消费金额',
member_num BIGINT COMMENT '会员消费人数',
member_order_num BIGINT COMMENT '会员消费单量',
member_amount DECIMAL(27, 2) COMMENT '会员消费金额',
member_first_num BIGINT COMMENT '会员首单人数',
member_first_order_num BIGINT COMMENT '会员首单订单量',
member_first_amount DECIMAL(27, 2) COMMENT '会员首单销售额',
member_nofirst_num BIGINT COMMENT '会员非首单人数(非去重)',
member_nofirst_order_num BIGINT COMMENT '会员非首单订单量',
member_nofirst_amount DECIMAL(27, 2) COMMENT '会员非首单销售额'
)
comment '门店会员统计月表'
partitioned by (dt STRING COMMENT '统计日期')
row format delimited fields terminated by ','
stored as orc
tblproperties ('orc.compress'='SNAPPY');
同 ads_mem_store_member_statistics_week_i,改变下范围即可
背景
大数据分析类软件发展历程。
==Apache Hadoop-MapReduce==
优点:统一、通用、简单的编程模型,分而治之思想处理海量数据。
缺点:java学习成本高、MR执行慢、内部过程繁琐
==Apache Hive==
优点:SQL on Hadoop。sql语言上手方便。学习成本低。
缺点:底层默认还是MapReduce引擎、慢、延迟高
各种SQL类计算引擎开始出现,主要追求的就是一个问题:==计算如何更快,延迟如何降低==。
==Presto/trino==
Spark On Hive、Spark SQL
Flink
.......
FaceBook维护的原始版本: presto, 也叫prestoDB Presto创始人团队离职后研发并维护的: PrestoSQL 因为版权更名为Trino 已经给大家整理好了对应网址如下: FaceBook维护的, Presto的官网: https://prestodb.io/ 创始人团队维护的, Trino的官网: https://trino.io/ Presto创始人团队维护的, 因为版权更名为Trino: http://github.com/trinodb/trino 相关文章如下: Presto在有赞的实践之路: https://cloud.tencent.com/developer/news/606849 Presto更名为-Trino: https://www.sohu.com/a/441836081_106784
介绍
Presto是一个开源的==分布式SQL查询引擎==,适用于==交互式查询==,数据量支持GB到PB字节。
Presto的设计和编写完全是为了解决==Facebook==这样规模的商业数据仓库交互式分析和处理速度的问题。
presto简介: 一条Presto查询可以将多个数据源进行合并,可以跨越整个组织进行分析; presto特点: Presto以分析师的需求作为目标,他们期望响应速度小于1秒到几分钟;
优缺点
# 优点 1)Presto与Hive对比,都能够处理PB级别的海量数据分析,但Presto是基于内存运算,减少没必要的硬盘IO,所以更快。 2)能够连接多个数据源,跨数据源连表查,如从Hive查询大量网站访问记录,然后从Mysql中匹配出设备信息。 3)部署也比Hive简单,因为Hive是基于HDFS的,需要先部署HDFS。 # 缺点 1)虽然能够处理PB级别的海量数据分析,但不是代表Presto把PB级别都放在内存中计算的。而是根据场景,如count,avg等聚合运算,是边读数据边计算,再清内存,再读数据再计算,这种耗的内存并不高。但是连表查,就可能产生大量的临时数据,因此速度会变慢,反而Hive此时会更擅长。 2)为了达到实时查询,可能会想到用它直连MySql来操作查询,这效率并不会提升,瓶颈依然在MySql,此时还引入网络瓶颈,所以会比原本直接操作数据库要慢。
架构图
Presto是一个运行在多台服务器上的分布式系统。 完整安装包括==一个coordinator和多个worker==。 由客户端提交查询,从Presto命令行CLI提交到coordinator; coordinator进行解析,分析并执行查询计划,然后分发处理队列到worker。
整个presto是一个 M-S架构 (主从架构): coordinator: 主节点 作用: 负责接收客户端发送的SQL, 对SQL进行编译, 形成执行计划, 根据执行计划, 分发给各个从节点进行执行操作 discovery service: 附属节点 作用: 一般内嵌在主节点中, 主要负责维护从节点列表, 当从节点启动后, 都需要到 discovery 节点进行注册操作 worker节点: 从节点 作用: 负责接收coordinator传递过来任务, 对任务进行具体处理工作(读取数据, 处理数据, 将处理后结果数据返回给coordinator)
==Connector== 连接器
1、Presto通过Connector连接器来连接访问不同数据源,例如Hive或mysql。连接器功能类似于数据库的驱动程序。允许Presto使用标准API与资源进行交互。 2、Presto包含几个内置连接器:JMX连接器,可访问内置系统表的System连接器,Hive连接器和旨在提供TPC-H基准数据的TPCH连接器。许多第三方开发人员都贡献了连接器,因此Presto可以访问各种数据源中的数据,比如:ES、Kafka、MongoDB、Redis、Postgre、Druid、Cassandra等。
==Catalog== 连接目录: hive或者mysql等数据源
1、Presto Catalog是数据源schema的上一级,并通过连接器访问数据源。 2、例如,可以配置Hive Catalog以通过Hive Connector连接器提供对Hive信息的访问。 3、在Presto中使用表时,标准表名始终是被支持的。 例如,hive.test_data.test的标准表名将引用hive catalog中test_data schema中的test table。 Catalog需要在Presto的配置文件中进行配置。
==schema== 库
Schema是组织表的一种方式。Catalog和Schema共同定义了一组可以查询的表。 当使用Presto访问Hive或关系数据库(例如MySQL)时,Schema会转换为目标数据库中的对应Schema(database)。 = schema通俗理解就是我们所讲的database. = 想一下在hive中,下面这两个sql是否相等。 show databases; -- presto不支持 show schemas;
==table== 表
...
[root@hadoop01 ~]# /export/server/presto/bin/launcher start Started as 89560 # 可以使用jps 配合kill -9命令 关闭进程
web UI页面
JDBC 驱动:==presto-jdbc-0.245.1.jar==
JDBC 地址:==jdbc:presto://192.168.88.80:8090/hive==
step1:创建连接
由于驱动比较大,好多人经常下载失败,可以按照下图关联资料中提供的包: presto-jdbc-0.245.1.jar
==date_format==(timestamp, format) ==> varchar
作用: 将指定的日期对象转换为字符串操作
==date_parse==(string, format) → timestamp
作用: 用于将字符串的日期数据转换为日期对象
select date_format( timestamp '2020-10-10 12:50:50' , '%Y/%m/%d %H:%i:%s'); select date_format( date_parse('2020:10:10 12-50-50','%Y:%m:%d %H-%i-%s') ,'%Y/%m/%d %H:%i:%s'); ---- 注意: 参数一必须是日期对象 所以如果传递的是字符串, 必须将先转换为日期对象: 方式一: 标识为日期对象, 但是格式必须为标准日期格式 timestamp '2020-10-10 12:50:50' date '2020-10-10' 方式二: 如果不标准,先用date_parse解析成为标准 date_parse('2020-10-10 12:50:50','%Y-%m-%d %H:%i:%s') 扩展说明: 日期format格式说明 年:%Y 月:%m 日:%d 时:%H 分:%i 秒:%s 周几:%w(0..6)
==date_add==(unit, value, timestamp) → [same as input]
作用: 用于对日期数据进行 加 减 操作
==date_diff==(unit, timestamp1, timestamp2) → bigint
作用: 用于比对两个日期之间差值
select date_add('hour',3,timestamp '2021-09-02 15:59:50'); select date_add('day',-1,timestamp '2021-09-02 15:59:50'); select date_add('month',-1,timestamp '2021-09-02 15:59:50'); select date_diff('year',timestamp '2020-09-02 06:30:30',timestamp '2021-09-02 15:59:50') select date_diff('month',timestamp '2021-06-02 06:30:30',timestamp '2021-09-02 15:59:50') select date_diff('day',timestamp '2021-08-02 06:30:30',timestamp '2021-09-02 15:59:50')
数据存储优化
--1)合理设置分区 与Hive类似,Presto会根据元信息读取分区数据,合理的分区能减少Presto数据读取量,提升查询性能。 --2)使用列式存储 Presto对ORC文件读取做了特定优化,因此在Hive中创建Presto使用的表时,建议采用ORC格式存储。相对于Parquet,Presto对ORC支持更好。 Parquet和ORC一样都支持列式存储,但是Presto对ORC支持更好,而Impala对Parquet支持更好。在数仓设计时,要根据后续可能的查询引擎合理设置数据存储格式。 --3)使用压缩 数据压缩可以减少节点间数据传输对IO带宽压力,对于需要快速解压的,建议采用Snappy压缩。 --4)预先排序 对于已经排序的数据,在查询的数据过滤阶段,ORC格式支持跳过读取不必要的数据。比如对于经常需要过滤的字段可以预先排序。
SQL优化
列裁剪
分区裁剪
group by优化
按照数据量大小降序排列
order by使用limit
==join时候大表放置在左边==
...
替换非ORC格式的Hive表
内存管理机制--内存分类
Presto管理的内存分为两大类:==user memory==和==system memory==
user memory用户内存
跟用户数据相关的,比如读取用户输入数据会占据相应的内存,这种内存的占用量跟用户底层数据量大小是强相关的
system memory系统内存
执行过程中衍生出的副产品,比如tablescan表扫描,write buffers写入缓冲区,跟查询输入的数据本身不强相关的内存。
内存管理机制--内存池
==内存池中来实现分配user memory和system memory==。
内存池为常规内存池GENERAL_POOL、预留内存池RESERVED_POOL。
1、GENERAL_POOL:在一般情况下,一个查询执行所需要的user/system内存都是从general pool中分配的,reserved pool在一般情况下是空闲不用的。 2、RESERVED_POOL:大部分时间里是不参与计算的,但是当集群中某个Worker节点的general pool消耗殆尽之后,coordinator会选择集群中内存占用最多的查询,把这个查询分配到reserved pool,这样这个大查询自己可以继续执行,而腾出来的内存也使得其它的查询可以继续执行,从而避免整个系统阻塞。 注意: reserved pool到底多大呢?这个是没有直接的配置可以设置的,他的大小上限就是集群允许的最大的查询的大小(query.total-max-memory-per-node)。 reserved pool也有缺点,一个是在普通模式下这块内存会被浪费掉了,二是大查询可以用Hive来替代。因此也可以禁用掉reserved pool(experimental.reserved-pool-enabled设置为false),那系统内存耗尽的时候没有reserved pool怎么办呢?它有一个OOM Killer的机制,对于超出内存限制的大查询SQL将会被系统Kill掉,从而避免影响整个presto。
内存相关参数
1、user memory用户内存参数 query.max-memory-per-node:单个query操作在单个worker上user memory能用的最大值 query.max-memory:单个query在整个集群中允许占用的最大user memory 2、user+system总内存参数 query.max-total-memory-per-node:单个query操作可在单个worker上使用的最大(user + system)内存 query.max-total-memory:单个query在整个集群中允许占用的最大(user + system) memory 当这些阈值被突破的时候,query会以insufficient memory(内存不足)的错误被终结。 3、协助阻止机制 在高内存压力下保持系统稳定。当general pool常规内存池已满时,操作会被置为blocked阻塞状态,直到通用池中的内存可用为止。此机制可防止激进的查询填满JVM堆并引起可靠性问题。 4、其他参数 memory.heap-headroom-per-node:这个内存是JVM堆中预留给第三方库的内存分配,presto无法跟踪统计,默认值是-Xmx * 0.3 5、结论 GeneralPool = 服务器总内存 - ReservedPool - memory.heap-headroom-per-node - Linux系统内存 常规内存池内存大小=服务器物理总内存-服务器linux操作系统内存-预留内存池大小-预留给第三方库内存
内存优化建议
常见的报错解决
1、Query exceeded per-node total memory limit of xx 适当增加query.max-total-memory-per-node。 2、Query exceeded distributed user memory limit of xx 适当增加query.max-memory。 3、Could not communicate with the remote task. The node may have crashed or be under too much load 内存不够,导致节点crash,可以查看/var/log/message。
建议参数设置
1、query.max-memory-per-node和query.max-total-memory-per-node是query操作使用的主要内存配置,因此这两个配置可以适当加大。 memory.heap-headroom-per-node是三方库的内存,默认值是JVM-Xmx * 0.3,可以手动改小一些。 1) 各节点JVM内存推荐大小: 当前节点剩余内存*80% 2) 对于heap-headroom-pre-node第三方库的内存配置: 建议jvm内存的%15左右 3) 在配置的时候, 不要正正好好, 建议预留一点点, 以免出现问题 数据量在35TB , presto节点数量大约在30台左右 (128GB内存 + 8核CPU) 注意: 1、query.max-memory-per-node小于query.max-total-memory-per-node。 2、query.max-memory小于query.max-total-memory。 3、query.max-total-memory-per-node 与memory.heap-headroom-per-node 之和必须小于 jvm max memory,也就是jvm.config 中配置的-Xmx。
需求一: 会员首次充值(统计每个会员首次充值的时间, 交易单ID以及对应门店和充值金额) 需求二: 门店新老会员消费(统计每个门店每个月新会员、老会员、全部会员、非会员的数量、消费金额、消费单量(新会员指的首次消费后30天内, 老会员指的首次消费后大于30天)) 需求三: 会员复购统计(留存)(统计的指标为统计日期用户量、一日后用户量、二日后用户量、三日后用户量、四日后用户量、五日后用户量、六日后用户量) 需求四: 会员贡献(统计各个会员每天在各个门店消费单量、消费金额、消费成本、线上订单量、线上消费金额、线上消费成本、线下订单量、线下消费金额、线下消费成本) 涉及表: 会员首次充值表 和 门店新老会员消费月表 和 会员复购统计天表 以及 会员贡献天表
CREATE TABLE IF NOT EXISTS ads.ads_mem_member_first_recharge_i(
trade_date_time STRING COMMENT '交易时间',
trade_date STRING COMMENT '日期',
trade_order_id STRING COMMENT '对应的交易单id',
zt_id BIGINT COMMENT '中台 会员id',
store_no STRING COMMENT '门店编号',
city_id BIGINT COMMENT '城市ID',
recharge_amount DECIMAL(27, 2) COMMENT '充值金额'
)
comment '会员首次充值表'
partitioned by (dt STRING COMMENT '统计日期')
row format delimited fields terminated by ','
stored as orc
tblproperties ('orc.compress'='SNAPPY');
说明: 同dwm_mem_first_buy_i
insert into hive.ads.ads_mem_member_first_recharge_i
with t1 as (
select
date_format(trade_date,'%Y-%m-%d %H:%i:%s') as trade_date_time,
date_format(trade_date,'%Y-%m-%d') as trade_date,
trade_order_id,
zt_id,
store_no,
city_id,
amount as recharge_amount,
row_number() over(partition by zt_id order by trade_date) as rn
from hive.ods.ods_mem_store_amount_record_i where record_type = 2 and date_format(trade_date,'%Y-%m-%d') = '2023-11-20'
)
select
t1.trade_date_time,
t1.trade_date,
t1.trade_order_id,
t1.zt_id,
t1.store_no,
t1.city_id,
t1.recharge_amount,
'2023-11-20' as dt
from t1
left join hive.ads.ads_mem_member_first_recharge_i t
on t1.zt_id = t.zt_id and t1.store_no = t.store_no and t.dt < '2023-11-20'
where rn = 1 and t.zt_id is null;
CREATE TABLE IF NOT EXISTS ads.ads_mem_store_new_old_member_month_i(
trade_date STRING COMMENT '月一时间',
store_no STRING COMMENT '店铺编码',
store_name STRING COMMENT '店铺名称',
store_sale_type BIGINT COMMENT '店铺销售类型',
store_type_code BIGINT COMMENT '分店类型',
city_id BIGINT COMMENT '城市ID',
city_name STRING COMMENT '城市名称',
region_code STRING COMMENT '区域编码',
region_name STRING COMMENT '区域名称',
is_day_clear BIGINT COMMENT '是否日清:0否,1是',
member_type BIGINT COMMENT '会员类型:1新会员,2老会员,3会员,4非会员',
member_num BIGINT COMMENT '消费会员数',
sale_amount DECIMAL(27, 2) COMMENT '消费金额',
order_num BIGINT COMMENT '消费单量'
)
comment '门店新老会员消费月表'
partitioned by (dt STRING COMMENT '消费日期')
row format delimited fields terminated by ','
stored as orc
tblproperties ('orc.compress'='SNAPPY');
新会员:首次消费后30天内的; 老会员:首次消费后大于30天; 需要统计每个门店每个月新会员、老会员、全部会员、非会员的数量、消费金额、消费单量。 注意:这里是一个月表,在判断新老会员的时候,按照当月最后一天为标准,往前推30天,30天内的为新会员。比如今天是5月28日,在计算5月份的数据时,4月29日——5月28日这30天的都是新会员。而在计算4月份数据时,因为4月份已经过去了,所以以4月30日为最后一天,4月1日——4月30日为4月份的新会员。
1)在计算月表时,需要取到当月最后一天,然后以最后一天为标准,取到前30天
select
a.trade_date
,date_sub(a.trade_date, 30) as day30 -- 前30天
,a.month_trade_date -- 对应的月一时间
from dim.dwd_dim_date_f a
inner join
(select max(dt) mdt from dws.dws_mem_store_member_statistics_day_i -- 取到最大的分区
where dt>=date_sub('${inputdate}', dayofmonth('${inputdate}') - 1)
and dt<=last_day('${inputdate}') ) b
on a.trade_date = b.mdt
2)新会员:30天内首次消费的会员
取新会员,可以使用首次消费表,取前30天到当月最大一天的会员即可。
因为在hive的where语句中不能使用子查询,所以这里使用join的方式解决
with dtt as (
select
a.trade_date
,date_sub(a.trade_date, 30) as day30 -- 前30天
,a.month_trade_date -- 对应的月一时间
from dim.dwd_dim_date_f a
inner join
(select max(dt) mdt from dws.dws_mem_store_member_statistics_day_i -- 取到最大的分区
where dt>=date_sub('${inputdate}', dayofmonth('${inputdate}') - 1)
and dt<=last_day('${inputdate}') ) b
on a.trade_date = b.mdt
),
zt as (select s.zt_id from dwm.dwm_mem_first_buy_i s -- 取到最大分区与其前30天的数据
cross join dtt
where s.dt >= dtt.day30 and s.dt <= dtt.trade_date)
代码实现:
insert into hive.ads.ads_mem_store_new_old_member_month_i
with t1 as (
select
trade_date,
date_format(date_add('day',-30,date '2023-11-20'),'%Y-%m-%d') as day30,
month_trade_date,
month_end_date
from hive.dim.dwd_dim_date_f where trade_date = (
select
max(dt)
from hive.dws.dws_mem_store_member_statistics_day_i
where dt >= date_format(date_add('day', -day(date '2023-11-20') + 1 ,date '2023-11-20'),'%Y-%m-%d')
and dt < date_format(date_add('month',1,date_add('day', -day(date '2023-11-20') + 1 ,date '2023-11-20')),'%Y-%m-%d')
)
),
t2 as (
-- 获取最近30天有过消费的新用户
select
temp1.trade_date,
temp1.zt_id,
temp1.store_no
from hive.dwm.dwm_mem_first_buy_i temp1
cross join t1
where temp1.dt >= t1.day30 and temp1.dt <= t1.trade_date
),
t3 as (
-- 获取 最近30天新用户的消费
select
1 as member_type,
temp2.bind_md as store_no,
count(distinct temp2.zt_id) as member_num,
sum(consume_amount) as sale_amount,
sum(consume_times) as order_num
from hive.dwm.dwm_mem_member_behavior_day_i temp2
join t2 on temp2.zt_id = t2.zt_id and temp2.bind_md = t2.store_no
where dt >= date_format(date_add('day', -day(date '2023-11-20') + 1 ,date '2023-11-20'),'%Y-%m-%d')
and dt < date_format(date_add('month',1,date_add('day', -day(date '2023-11-20') + 1 ,date '2023-11-20')),'%Y-%m-%d')
and consume_times > 0
group by
temp2.bind_md
union all
-- 获取 老会员
select
2 as member_type,
temp2.bind_md as store_no,
count(distinct temp2.zt_id) as member_num,
sum(consume_amount) as sale_amount,
sum(consume_times) as order_num
from hive.dwm.dwm_mem_member_behavior_day_i temp2
left join t2 on temp2.zt_id = t2.zt_id and temp2.bind_md = t2.store_no
where dt >= date_format(date_add('day', -day(date '2023-11-20') + 1 ,date '2023-11-20'),'%Y-%m-%d')
and dt < date_format(date_add('month',1,date_add('day', -day(date '2023-11-20') + 1 ,date '2023-11-20')),'%Y-%m-%d')
and consume_times > 0 and t2.zt_id is null
group by
temp2.bind_md
union all
-- 获取 全部会员
select
3 as member_type,
temp2.bind_md as store_no,
count(distinct temp2.zt_id) as member_num,
sum(consume_amount) as sale_amount,
sum(consume_times) as order_num
from hive.dwm.dwm_mem_member_behavior_day_i temp2
where dt >= date_format(date_add('day', -day(date '2023-11-20') + 1 ,date '2023-11-20'),'%Y-%m-%d')
and dt < date_format(date_add('month',1,date_add('day', -day(date '2023-11-20') + 1 ,date '2023-11-20')),'%Y-%m-%d')
and consume_times > 0
group by
temp2.bind_md
union all
-- 非会员数据
select
4 as member_type,
store_no,
0 as member_num,
sum(real_paid_amount) as sale_amount,
count(if(trade_type = 0,parent_order_no,NULL)) - count(if(trade_type = 5,parent_order_no,NULL)) as order_num
from hive.dwm.dwm_sell_o2o_order_i
where dt >= date_format(date_add('day', -day(date '2023-11-20') + 1 ,date '2023-11-20'),'%Y-%m-%d')
and dt < date_format(date_add('month',1,date_add('day', -day(date '2023-11-20') + 1 ,date '2023-11-20')),'%Y-%m-%d')
and member_type = 0
group by store_no
)
select
t1.month_trade_date as trade_date,
t3.store_no,
t4.store_name,
t4.store_sale_type,
t4.store_type_code,
t4.city_id,
t4.city_name,
t4.region_code,
t4.region_name,
t4.is_day_clear,
t3.member_type,
t3.member_num,
cast(t3.sale_amount as decimal(27,2)),
t3.order_num,
t1.month_trade_date as dt
from t3 cross join t1
-- 注意: 一定要检查自己的dwd_dim_store_i分区目录,此处填写自己的分区目录时间
left join hive.dim.dwd_dim_store_i t4 on t3.store_no = t4.store_no and t4.dt = '2023-11-23';
CREATE TABLE IF NOT EXISTS ads.ads_mem_repurchase_day_i(
trade_date STRING COMMENT '统计时间',
store_no STRING COMMENT '店铺编码',
store_name STRING COMMENT '店铺名称',
store_sale_type BIGINT COMMENT '店铺销售类型',
store_type_code BIGINT COMMENT '分店类型',
city_id BIGINT COMMENT '城市ID',
city_name STRING COMMENT '城市名称',
region_code STRING COMMENT '区域编码',
region_name STRING COMMENT '区域名称',
is_day_clear BIGINT COMMENT '是否日清:0否,1是',
member_count BIGINT COMMENT '统计日期用户量',
next_member_count_1 BIGINT COMMENT '一日后用户量',
next_member_count_2 BIGINT COMMENT '二日后用户量',
next_member_count_3 BIGINT COMMENT '三日后用户量',
next_member_count_4 BIGINT COMMENT '四日后用户量',
next_member_count_5 BIGINT COMMENT '五日后用户量',
next_member_count_6 BIGINT COMMENT '六日后用户量'
)
comment '会员复购统计天表'
partitioned by (dt STRING COMMENT '消费日期')
row format delimited fields terminated by ','
stored as orc
tblproperties ('orc.compress'='SNAPPY');
复购是一个非常重要的指标,用来衡量客户的粘性。这个需求需要统计当天下单的用户,一日、二日到六日的复购情况,为了方便后续使用,这里不直接统计复购率,而是统计人数。所以,这个需求需要统计的指标为统计日期用户量、一日后用户量、二日后用户量、三日后用户量、四日后用户量、五日后用户量、六日后用户量。本需求类似于计算留存,也就是统计当天的用户,在1日、2日、3日。。。之后是否再次购买。
使用dwm_mem_member_behavior_day_i表进行计算。因为需求中最多需要计算六日后的用户量,所以当天中的这些消费用户,需要6天之后,才能拿到所有的数据(1日后,2日后...6日后)。换个角度看,只有6天前的数据才会稳定,6天内的分区对应的数据每天都要进行更新,所以,每天要更新6个分区的数据。 如果计算n天后的复购人数,其实就是用第一天的会员与第n天的会员进行关联,这里使用左关联,关联条件为会员id以及日期 能关联上的,即是复购的用户,然后再count()则可得到相应数值。
代码实现:
-- 六天前消费用户和往后每一天的复购情况
--会员主题: ADS层 会员复购天表
-- 需求: 计算某一天及相对于第一天往后六天每天的复购的人数
-- 思路: 首先知道6天前的那一天的所有的消费用户 基于这个结果 left join 往后1天的所有消费用户 left join 往后2天的所有消费用户 ...往后6天的所有消费用户
select
date_format(date_add('day',-6,date '2023-09-20'),'%Y-%m-%d') as trade_date,
s.store_no,
s.store_name,
s.store_sale_type,
s.store_type_code,
s.city_id,
s.city_name,
s.region_code,
s.region_name,
s.is_day_clear,
count(day0.zt_id) as member_count,
count(day1.zt_id) as next_member_count_1,
count(day2.zt_id) as next_member_count_2,
count(day3.zt_id) as next_member_count_3,
count(day4.zt_id) as next_member_count_4,
count(day5.zt_id) as next_member_count_5,
count(day6.zt_id) as next_member_count_6,
date_format(date_add('day',-6,date '2023-09-20'),'%Y-%m-%d') as dt
from (
select
t.zt_id,
t.bind_md,
t.dt as after,
tt.days_after1,
tt.days_after2,
tt.days_after3,
tt.days_after4,
tt.days_after5,
tt.days_after6
from hive.dwm.dwm_mem_member_behavior_day_i t left join hive.dim.dwd_dim_date_f tt on t.trade_date = tt.trade_date
where dt = date_format(date_add('day',-6,date '2023-09-20'),'%Y-%m-%d') and consume_times>0
) as day0
left join (
select
zt_id,
bind_md,
dt as after1
from hive.dwm.dwm_mem_member_behavior_day_i
where dt = date_format(date_add('day',-5,date '2023-09-20'),'%Y-%m-%d') and consume_times>0
) day1 on day0.days_after1 = day1.after1 and day0.zt_id = day1.zt_id
left join (
select
zt_id,
bind_md,
dt as after2
from hive.dwm.dwm_mem_member_behavior_day_i
where dt = date_format(date_add('day',-4,date '2023-09-20'),'%Y-%m-%d') and consume_times>0
) day2 on day0.days_after2 = day2.after2 and day0.zt_id = day2.zt_id
left join (
select
zt_id,
bind_md,
dt as after3
from hive.dwm.dwm_mem_member_behavior_day_i
where dt = date_format(date_add('day',-3,date '2023-09-20'),'%Y-%m-%d') and consume_times>0
) day3 on day0.days_after3 = day3.after3 and day0.zt_id = day3.zt_id
left join (
select
zt_id,
bind_md,
dt as after4
from hive.dwm.dwm_mem_member_behavior_day_i
where dt = date_format(date_add('day',-2,date '2023-09-20'),'%Y-%m-%d') and consume_times>0
) day4 on day0.days_after4 = day4.after4 and day0.zt_id = day4.zt_id
left join (
select
zt_id,
bind_md,
dt as after5
from hive.dwm.dwm_mem_member_behavior_day_i
where dt = date_format(date_add('day',-1,date '2023-09-20'),'%Y-%m-%d') and consume_times>0
) day5 on day0.days_after5 = day5.after5 and day0.zt_id = day5.zt_id
left join (
select
zt_id,
bind_md,
dt as after6
from hive.dwm.dwm_mem_member_behavior_day_i
where dt = '2023-09-20' and consume_times>0
) day6 on day0.days_after6 = day6.after6 and day0.zt_id = day6.zt_id
join hive.dim.dwd_dim_store_i s on day0.bind_md = s.store_no and s.dt = '2023-09-24'
group by
s.store_no,
s.store_name,
s.store_sale_type,
s.store_type_code,
s.city_id,
s.city_name,
s.region_code,
s.region_name,
s.is_day_clear;
另一种写法: 直接计算出 6天 及其每一天和后面六天的数据
--会员主题: ADS层 会员复购天表
-- 需求: 计算某一天及相对于第一天往后六天每天的复购的人数
-- 思路: 首先知道6天前的那一天的所有的消费用户 基于这个结果 left join 往后1天的所有消费用户 left join 往后2天的所有消费用户 ...往后6天的所有消费用户
select
day0.after as trade_date,
s.store_no,
s.store_name,
s.store_sale_type,
s.store_type_code,
s.city_id,
s.city_name,
s.region_code,
s.region_name,
s.is_day_clear,
count(day0.zt_id) as member_count,
count(day1.zt_id) as next_member_count_1,
count(day2.zt_id) as next_member_count_2,
count(day3.zt_id) as next_member_count_3,
count(day4.zt_id) as next_member_count_4,
count(day5.zt_id) as next_member_count_5,
count(day6.zt_id) as next_member_count_6,
date_format(date_add('day',-6,date '2023-09-20'),'%Y-%m-%d') as dt
from (
-- 获取 统计日期前6天的所有的消费数据
select
t.zt_id,
t.bind_md,
t.dt as after,
tt.days_after1,
tt.days_after2,
tt.days_after3,
tt.days_after4,
tt.days_after5,
tt.days_after6
from hive.dwm.dwm_mem_member_behavior_day_i t left join hive.dim.dwd_dim_date_f tt on t.trade_date = tt.trade_date
where dt >= date_format(date_add('day',-6,date '2023-09-20'),'%Y-%m-%d')
and dt <= '2023-09-20'
and consume_times>0
) as day0
left join (
-- 获取 统计日期前5天和 后1天的的所有的消费数据
select
zt_id,
bind_md,
dt as after1
from hive.dwm.dwm_mem_member_behavior_day_i
where dt >= date_format(date_add('day',-5,date '2023-09-20'),'%Y-%m-%d')
and dt <= date_format(date_add('day',1,date '2023-09-20'),'%Y-%m-%d')
and consume_times>0
) day1 on day0.days_after1 = day1.after1 and day0.zt_id = day1.zt_id
left join (
-- 获取 统计日期前4天和 后2天的的所有的消费数据
select
zt_id,
bind_md,
dt as after2
from hive.dwm.dwm_mem_member_behavior_day_i
where dt >= date_format(date_add('day',-4,date '2023-09-20'),'%Y-%m-%d')
and dt <= date_format(date_add('day',2,date '2023-09-20'),'%Y-%m-%d')
and consume_times>0
) day2 on day0.days_after2 = day2.after2 and day0.zt_id = day2.zt_id
left join (
-- 获取 统计日期前3天和 后3天的的所有的消费数据
select
zt_id,
bind_md,
dt as after3
from hive.dwm.dwm_mem_member_behavior_day_i
where dt >= date_format(date_add('day',-3,date '2023-09-20'),'%Y-%m-%d')
and dt <= date_format(date_add('day',3,date '2023-09-20'),'%Y-%m-%d')
and consume_times>0
) day3 on day0.days_after3 = day3.after3 and day0.zt_id = day3.zt_id
left join (
-- 获取 统计日期前2天和 后4天的的所有的消费数据
select
zt_id,
bind_md,
dt as after4
from hive.dwm.dwm_mem_member_behavior_day_i
where dt >= date_format(date_add('day',-2,date '2023-09-20'),'%Y-%m-%d')
and dt <= date_format(date_add('day',4,date '2023-09-20'),'%Y-%m-%d')
and consume_times>0
) day4 on day0.days_after4 = day4.after4 and day0.zt_id = day4.zt_id
left join (
-- 获取 统计日期前1天和 后5天的的所有的消费数据
select
zt_id,
bind_md,
dt as after5
from hive.dwm.dwm_mem_member_behavior_day_i
where dt >= date_format(date_add('day',-1,date '2023-09-20'),'%Y-%m-%d')
and dt <= date_format(date_add('day',5,date '2023-09-20'),'%Y-%m-%d')
and consume_times>0
) day5 on day0.days_after5 = day5.after5 and day0.zt_id = day5.zt_id
left join (
-- 获取 统计日期后6天的的所有的消费数据
select
zt_id,
bind_md,
dt as after6
from hive.dwm.dwm_mem_member_behavior_day_i
where dt >= '2023-09-20'
and dt <= date_format(date_add('day',6,date '2023-09-20'),'%Y-%m-%d')
and consume_times>0
) day6 on day0.days_after6 = day6.after6 and day0.zt_id = day6.zt_id
join hive.dim.dwd_dim_store_i s on day0.bind_md = s.store_no
group by
day0.after,
s.store_no,
s.store_name,
s.store_sale_type,
s.store_type_code,
s.city_id,
s.city_name,
s.region_code,
s.region_name,
s.is_day_clear;