进行数据管理时,无效数据可能会对生产力和决策质量造成严重的影响。如何发现和处理无效数据变得愈发重要。一起来唠唠你会如何处理无效数据吧~
方向一:介绍无效数据的概念
最近遇到了pg数据库表中的大量数据重复了,需要删除其中的一条。一条条删除显然不切合实际,还是需要通过计算来删除。
方向二:无效数据的处理方法
实施步骤:
1.对原表进行备份
2.使用一个表结构完全一样的临时表对原始表进行接收。
create table tmp_0524_1 as select * from public.m_user_bak20230524;
3.找出重复的数据:
drop table tmp_0524_2;
create table tmp_0524_2 as
select * from (
select phone ,count(1) cn from tmp_0524_1 where phone is not null group by phone ) t where t.cn > 1 ;4.用第二个临时表接收数据:
drop table tmp_0524_3_1;
create table tmp_0524_3_1 as
select t.* from tmp_0524_1 t where phone in (select phone from tmp_0524_2 ) and opt_user is null;5.建一个表结构一样的空表,多增加一个iid字段
create table tmp_0524_3 as select '' iid , t.* from tmp_0524_3_1 where 1 = 2;
6.给空表增加一个iid字段,建立一个字段自增
CREATE SEQUENCE tmp_0524_3_id_seq
START WITH 1
INCREMENT BY 1
NO MINVALUE
NO MAXVALUE
CACHE 1;
alter table tmp_0524_3 alter column iid set default nextval('tmp_0524_3_id_seq');7.插入数据:
INSERT INTO public.tmp_0524_3
(id, user_id, phone, user_name, id_card, wx_account, state, app_id, app_name, head_icon, sex, sex_name, reg_time, user_state, user_state_name, open_id, opt_user, birthday_type, birthday, org_name, last_trading_time, org_code, union_id, orig_phone, update_time, belong_code, belong_name, data_type, client_num, client_name, country_code, inviter, channel_id, channel_name, company)
select id, user_id, phone, user_name, id_card, wx_account, state, app_id, app_name, head_icon, sex, sex_name, reg_time, user_state, user_state_name, open_id, opt_user, birthday_type, birthday, org_name, last_trading_time, org_code, union_id, orig_phone, update_time, belong_code, belong_name, data_type, client_num, client_name, country_code, inviter, channel_id, channel_name, company
from tmp_0524_3_1;8.删除相同数据中的一条:
DELETE FROM tmp_0524_3
WHERE iid
NOT IN (
SELECT max(iid)
FROM tmp_0524_3
GROUP BY phone
);9.删除原始表中的有相同数据的数据
delete from m_user where phone in (select phone from tmp_0524_3);
10.将处理好的数据插回原始表:
INSERT INTO public.m_user
(id, user_id, phone, user_name, id_card, wx_account, state, app_id, app_name, head_icon, sex, sex_name, reg_time, user_state, user_state_name, open_id, opt_user, birthday_type, birthday, org_name, last_trading_time, org_code, union_id, orig_phone, update_time, belong_code, belong_name, data_type, client_num, client_name, country_code, inviter, channel_id, channel_name, company)
select id, user_id, phone, user_name, id_card, wx_account, state, app_id, app_name, head_icon, sex, sex_name, reg_time, user_state, user_state_name, open_id, opt_user, birthday_type, birthday, org_name, last_trading_time, org_code, union_id, orig_phone, update_time, belong_code, belong_name, data_type, client_num, client_name, country_code, inviter, channel_id, channel_name, company
from tmp_0524_3;
方向三:如何减少无效数据
在并发系统中应该增加锁对数据进行插入,避免重复的插入数据。
方向四:实际案例举例
案例:
最近遇到了pg数据库表中的大量数据重复了,需要删除其中的一条。一条条删除显然不切合实际,还是需要通过计算来删除。
实施步骤:
1.对原表进行备份
2.使用一个表结构完全一样的临时表对原始表进行接收。
create table tmp_0524_1 as select * from public.m_user_bak20230524;
3.找出重复的数据:
drop table tmp_0524_2;
create table tmp_0524_2 as
select * from (
select phone ,count(1) cn from tmp_0524_1 where phone is not null group by phone ) t where t.cn > 1 ;4.用第二个临时表接收数据:
drop table tmp_0524_3_1;
create table tmp_0524_3_1 as
select t.* from tmp_0524_1 t where phone in (select phone from tmp_0524_2 ) and opt_user is null;5.建一个表结构一样的空表,多增加一个iid字段
create table tmp_0524_3 as select '' iid , t.* from tmp_0524_3_1 where 1 = 2;
6.给空表增加一个iid字段,建立一个字段自增
CREATE SEQUENCE tmp_0524_3_id_seq
START WITH 1
INCREMENT BY 1
NO MINVALUE
NO MAXVALUE
CACHE 1;
alter table tmp_0524_3 alter column iid set default nextval('tmp_0524_3_id_seq');7.插入数据:
INSERT INTO public.tmp_0524_3
(id, user_id, phone, user_name, id_card, wx_account, state, app_id, app_name, head_icon, sex, sex_name, reg_time, user_state, user_state_name, open_id, opt_user, birthday_type, birthday, org_name, last_trading_time, org_code, union_id, orig_phone, update_time, belong_code, belong_name, data_type, client_num, client_name, country_code, inviter, channel_id, channel_name, company)
select id, user_id, phone, user_name, id_card, wx_account, state, app_id, app_name, head_icon, sex, sex_name, reg_time, user_state, user_state_name, open_id, opt_user, birthday_type, birthday, org_name, last_trading_time, org_code, union_id, orig_phone, update_time, belong_code, belong_name, data_type, client_num, client_name, country_code, inviter, channel_id, channel_name, company
from tmp_0524_3_1;8.删除相同数据中的一条:
DELETE FROM tmp_0524_3
WHERE iid
NOT IN (
SELECT max(iid)
FROM tmp_0524_3
GROUP BY phone
);9.删除原始表中的有相同数据的数据
delete from m_user where phone in (select phone from tmp_0524_3);
10.将处理好的数据插回原始表:
INSERT INTO public.m_user
(id, user_id, phone, user_name, id_card, wx_account, state, app_id, app_name, head_icon, sex, sex_name, reg_time, user_state, user_state_name, open_id, opt_user, birthday_type, birthday, org_name, last_trading_time, org_code, union_id, orig_phone, update_time, belong_code, belong_name, data_type, client_num, client_name, country_code, inviter, channel_id, channel_name, company)
select id, user_id, phone, user_name, id_card, wx_account, state, app_id, app_name, head_icon, sex, sex_name, reg_time, user_state, user_state_name, open_id, opt_user, birthday_type, birthday, org_name, last_trading_time, org_code, union_id, orig_phone, update_time, belong_code, belong_name, data_type, client_num, client_name, country_code, inviter, channel_id, channel_name, company
from tmp_0524_3;