每日的用户更新表获取的三种方式:
一是监听mysql库数据的变化,比如用canal合并每日的变化,获取到最后的一个状态
二是每天获得一份切片数据,可以通过取两天切片数据的不同来作为每日更新表;比较时可以对所有字段先进行concat,再取md5,用md5值判断记录是否发生变化
三是流水表,有每日的变更流水表。通过etl工具对操作型数据库按照时间字段增量抽取到ods或者数据仓库(每天抽取前一天的数据),形成每天的增量数据(实际中使用最多的情形)
拉链表
用于记录每条信息的生命周期,有开始日期和结束日期字段;
适合缓慢变化维度:数据会变化,但频率不高,做每日全量效率低
优点:既保留了数据的历史状态,又规避了数据的重复存储
缺点:开发/使用 成本略高,大量记录频繁变更会导致存储压缩效果降低
案例
原始表数据2023-03-04凌晨获取
创建外部表orders
-- Import the raw data: external table over tab-delimited order files
-- stored under /tmp/lalian/orders.
create external table orders (
    orderid      int,
    createdate   string,
    modifiedtime string,
    status       string
)
row format delimited
    fields terminated by '\t'
location '/tmp/lalian/orders';
增量分区表
-- Incremental table: the same four order columns as the orders table,
-- partitioned by a string column named day, stored as tab-delimited text.
create table ods_orders_inc (
    orderid      int,
    createdate   string,
    modifiedtime string,
    status       string
)
partitioned by (day string)
row format delimited
    fields terminated by '\t';
-- Initial load of the 2023-03-03 partition.
-- The statement previously had no query and no terminator, so it could not run.
-- NOTE(review): selecting every row of orders assumes the table holds only the
-- first snapshot at this point - confirm against the real source data.
insert overwrite table ods_orders_inc partition (day = '2023-03-03')
select
    orderid,
    createdate,
    modifiedtime,
    status
from orders;
历史记录表
-- History table: the four order columns plus start_time and end_time,
-- which bound the period during which each row version is valid.
-- All columns except orderid are strings, stored as tab-delimited text.
create table dws_orders_his (
    orderid      int,
    createdate   string,
    modifiedtime string,
    status       string,
    start_time   string,
    end_time     string
)
row format delimited
    fields terminated by '\t';
-- Seed the history table from the 2023-03-03 partition.
-- Each row starts at its own modifiedtime and is left open with the
-- end_time sentinel 9999-12-31.
insert overwrite table dws_orders_his
select
    orderid,
    createdate,
    modifiedtime,
    status,
    modifiedtime as start_time,
    '9999-12-31' as end_time
from ods_orders_inc
where day = '2023-03-03';
2023-03-05凌晨获取新的
数据表新增更改
把新增的数据导入到2023-03-04分区中
-- Load the rows of orders whose modifiedtime equals 2023-3-4 into the
-- 2023-03-04 partition.
-- NOTE(review): the filter literal is not zero-padded while the partition
-- value is - confirm which format modifiedtime actually stores.
insert overwrite table ods_orders_inc partition (day = '2023-03-04')
select
    orderid,
    createdate,
    modifiedtime,
    status
from orders
where modifiedtime = '2023-3-4';
把2023-03-03与2023-03-04的数据合并
-- Merge the 2023-03-04 increment with the existing history rows.
--   1. Every row of the 2023-03-04 partition becomes a new open version
--      (start_time = modifiedtime, end_time = 9999-12-31).
--   2. A history row that is still open and whose orderid appears in the
--      increment is closed with end_time 2023-3-4. Every other history row
--      passes through unchanged.
-- Open rows are recognised by the 9999-12-31 sentinel rather than by a
-- lexicographic end_time > date test, which is unreliable for date strings
-- that are not zero-padded.
-- NOTE(review): this statement only returns the merged rows, it does not
-- write them back to dws_orders_his.
select
    tb.orderid,
    tb.createdate,
    tb.modifiedtime,
    tb.status,
    tb.start_time,
    tb.end_time
from (
    select
        orderid,
        createdate,
        modifiedtime,
        status,
        modifiedtime as start_time,
        '9999-12-31' as end_time
    from ods_orders_inc
    where day = '2023-03-04'

    union all

    select
        his.orderid,
        his.createdate,
        his.modifiedtime,
        his.status,
        his.start_time,
        case
            when chg.orderid is not null and his.end_time = '9999-12-31' then '2023-3-4'
            else his.end_time
        end as end_time
    from dws_orders_his as his
    left join (
        select orderid
        from ods_orders_inc
        where day = '2023-03-04'
    ) as chg
        on his.orderid = chg.orderid
) tb
order by tb.orderid, tb.start_time;
2023-03-06凌晨获取新的
数据表新增更改
把新增的数据导入到2023-03-05分区中
-- Load the rows of orders whose modifiedtime equals 2023-3-5 into the
-- 2023-03-05 partition.
-- The former extra branch (createdate = 2023-3-5 and modifiedtime = 2023-3-5)
-- was already implied by the modifiedtime test, so dropping it selects
-- exactly the same rows.
-- NOTE(review): the filter literal is not zero-padded while the partition
-- value is - confirm which format modifiedtime actually stores.
insert overwrite table ods_orders_inc partition (day = '2023-03-05')
select
    orderid,
    createdate,
    modifiedtime,
    status
from orders
where modifiedtime = '2023-3-5';
把2023-03-04与2023-03-05的数据合并
-- Merge the 2023-03-05 increment with the existing history rows.
--   1. Every row of the 2023-03-05 partition becomes a new open version
--      (start_time = modifiedtime, end_time = 9999-12-31).
--   2. A history row that is still open and whose orderid appears in the
--      increment is closed with end_time 2023-03-05. Every other history row
--      passes through unchanged.
-- Open rows are recognised by the 9999-12-31 sentinel. The previous
-- end_time > 2023-03-05 string test also matched rows already closed with an
-- unpadded date such as 2023-3-4, because that string sorts after 2023-03-05.
-- NOTE(review): the close date written here is zero-padded - confirm it
-- matches the format used for the other end_time values in dws_orders_his.
-- NOTE(review): this statement only returns the merged rows, it does not
-- write them back to dws_orders_his.
select
    tb1.orderid,
    tb1.createdate,
    tb1.modifiedtime,
    tb1.status,
    tb1.start_time,
    tb1.end_time
from (
    select
        orderid,
        createdate,
        modifiedtime,
        status,
        modifiedtime as start_time,
        '9999-12-31' as end_time
    from ods_orders_inc
    where day = '2023-03-05'

    union all

    select
        his.orderid,
        his.createdate,
        his.modifiedtime,
        his.status,
        his.start_time,
        case
            when chg.orderid is not null and his.end_time = '9999-12-31' then '2023-03-05'
            else his.end_time
        end as end_time
    from dws_orders_his his
    left join (
        select
            orderid,
            modifiedtime
        from ods_orders_inc
        where day = '2023-03-05'
    ) chg
        on his.orderid = chg.orderid
) tb1
order by tb1.orderid, tb1.modifiedtime;