view scripts/migration/scrub_duplicate_emails.sql @ 12:c2e2d794847f

new: add config.properties file for gazetteer
author Zoe Hong <zhong@mpiwg-berlin.mpg.de>
date Wed, 30 Sep 2015 13:43:54 +0200
parents a50cf11e5178
children
line wrap: on
line source

--------------------
--REFERENCE QUERIES
--------------------
/*
-- Query to list all user acocunts with duplicate e-mails
select id, username, email from vdcuser
where email in (
select email from vdcuser
group by email
having count(*) > 1
)
order by email

-- Query to list all e-mails that have are duplicated (total = # of actual users, without duplicates)
select email, count(*) from vdcuser
group by email
having count(*) > 1
order by count(*) desc

-- Query to list all e-mails that have are duplicated and reference to original account (account with lowest id)
select u1.id, u1.username, u1.active,u1.email, u2.id, u2.username, u2.active
from vdcuser u1, vdcuser u2
where 1=1
and u1.id != u2.id
and u1.email = u2.email
and u1.email in (
select email from vdcuser
group by email
having count(*) > 1
)
and u2.id in (
select min(id) from vdcuser
group by email
having count(*) > 1
)
order by u1.email

-- Delete query, to be run after all the updates
delete from vdcuser where id in (
select u1.id
from vdcuser u1, vdcuser u2
where 1=1
and u1.id != u2.id
and u1.email = u2.email
and u1.email in (
select email from vdcuser
group by email
having count(*) > 1
)
and u2.id in (
select min(id) from vdcuser
group by email
having count(*) > 1
)
)

*/
--------------------
--UPDATE QUERIES
--------------------
-- these queries will update the foreign key references in all (relevant) tables to the orignal account
--
-- Generated by
/*
SELECT tc.table_schema, tc.constraint_name, tc.table_name, kcu.column_name, ccu.table_name AS foreign_table_name,ccu.column_name AS foreign_column_name,
'update ' || tc.table_name || ' ref set ' || kcu.column_name || ' = u2.id
from vdcuser u1, vdcuser u2
where  ref.' || kcu.column_name || ' = u1.id
and u1.id != u2.id
and u1.email = u2.email
and u1.email in (
select email from vdcuser
group by email
having count(*) > 1
)
and u2.id in (
select min(id) from vdcuser
group by email
having count(*) > 1
);' as query
FROM information_schema.table_constraints tc
JOIN information_schema.key_column_usage kcu ON tc.constraint_name = kcu.constraint_name
JOIN information_schema.constraint_column_usage ccu ON ccu.constraint_name = tc.constraint_name
WHERE constraint_type = 'FOREIGN KEY' AND ccu.table_name='vdcuser';
*/
--
--
-- if any of the below fail because of duplicate constraints, you will need to first delete the duplicates
-- here is a sample query for deleting the duplicate entries from studyfile_vdcuser (the most likey to fail))
/*
delete from studyfile_vdcuser
where allowedusers_id || '_' || studyfiles_id  in (
select u1.id || '_' || fu1.studyfiles_id 
from vdcuser u1, vdcuser u2, studyfile_vdcuser fu1, studyfile_vdcuser fu2
where 1=1
and fu1.studyfiles_id = fu2.studyfiles_id
and fu1.allowedusers_id = u1.id
and fu2.allowedusers_id = u2.id
and u1.id != u2.id
and u1.email = u2.email
and u1.email in (
select email from vdcuser
group by email
having count(*) > 1
)
and u2.id in (
select min(id) from vdcuser  -- also may need to run with max(id) or some other combinations!
group by email
having count(*) > 1
)
)
*/


update flagged_study_comments ref set user_id = u2.id
from vdcuser u1, vdcuser u2
where  ref.user_id = u1.id
and u1.id != u2.id
and u1.email = u2.email
and u1.email in (
select email from vdcuser
group by email
having count(*) > 1
)
and u2.id in (
select min(id) from vdcuser
group by email
having count(*) > 1
);

update guestbookresponse ref set vdcuser_id = u2.id
from vdcuser u1, vdcuser u2
where  ref.vdcuser_id = u1.id
and u1.id != u2.id
and u1.email = u2.email
and u1.email in (
select email from vdcuser
group by email
having count(*) > 1
)
and u2.id in (
select min(id) from vdcuser
group by email
having count(*) > 1
);

update harvestingdataverse_vdcuser ref set allowedfileusers_id = u2.id
from vdcuser u1, vdcuser u2
where  ref.allowedfileusers_id = u1.id
and u1.id != u2.id
and u1.email = u2.email
and u1.email in (
select email from vdcuser
group by email
having count(*) > 1
)
and u2.id in (
select min(id) from vdcuser
group by email
having count(*) > 1
);

update networkrolerequest ref set vdcuser_id = u2.id
from vdcuser u1, vdcuser u2
where  ref.vdcuser_id = u1.id
and u1.id != u2.id
and u1.email = u2.email
and u1.email in (
select email from vdcuser
group by email
having count(*) > 1
)
and u2.id in (
select min(id) from vdcuser
group by email
having count(*) > 1
);

update rolerequest ref set vdcuser_id = u2.id
from vdcuser u1, vdcuser u2
where  ref.vdcuser_id = u1.id
and u1.id != u2.id
and u1.email = u2.email
and u1.email in (
select email from vdcuser
group by email
having count(*) > 1
)
and u2.id in (
select min(id) from vdcuser
group by email
having count(*) > 1
);

update studyaccessrequest ref set vdcuser_id = u2.id
from vdcuser u1, vdcuser u2
where  ref.vdcuser_id = u1.id
and u1.id != u2.id
and u1.email = u2.email
and u1.email in (
select email from vdcuser
group by email
having count(*) > 1
)
and u2.id in (
select min(id) from vdcuser
group by email
having count(*) > 1
);

update studycomment ref set commentcreator_id = u2.id
from vdcuser u1, vdcuser u2
where  ref.commentcreator_id = u1.id
and u1.id != u2.id
and u1.email = u2.email
and u1.email in (
select email from vdcuser
group by email
having count(*) > 1
)
and u2.id in (
select min(id) from vdcuser
group by email
having count(*) > 1
);

update study ref set creator_id = u2.id
from vdcuser u1, vdcuser u2
where  ref.creator_id = u1.id
and u1.id != u2.id
and u1.email = u2.email
and u1.email in (
select email from vdcuser
group by email
having count(*) > 1
)
and u2.id in (
select min(id) from vdcuser
group by email
having count(*) > 1
);

update studyfile_vdcuser ref set allowedusers_id = u2.id
from vdcuser u1, vdcuser u2
where  ref.allowedusers_id = u1.id
and u1.id != u2.id
and u1.email = u2.email
and u1.email in (
select email from vdcuser
group by email
having count(*) > 1
)
and u2.id in (
select min(id) from vdcuser
group by email
having count(*) > 1
);

update study ref set lastupdater_id = u2.id
from vdcuser u1, vdcuser u2
where  ref.lastupdater_id = u1.id
and u1.id != u2.id
and u1.email = u2.email
and u1.email in (
select email from vdcuser
group by email
having count(*) > 1
)
and u2.id in (
select min(id) from vdcuser
group by email
having count(*) > 1
);

update studylock ref set user_id = u2.id
from vdcuser u1, vdcuser u2
where  ref.user_id = u1.id
and u1.id != u2.id
and u1.email = u2.email
and u1.email in (
select email from vdcuser
group by email
having count(*) > 1
)
and u2.id in (
select min(id) from vdcuser
group by email
having count(*) > 1
);

update studyrequest ref set vdcuser_id = u2.id
from vdcuser u1, vdcuser u2
where  ref.vdcuser_id = u1.id
and u1.id != u2.id
and u1.email = u2.email
and u1.email in (
select email from vdcuser
group by email
having count(*) > 1
)
and u2.id in (
select min(id) from vdcuser
group by email
having count(*) > 1
);

update study ref set reviewer_id = u2.id
from vdcuser u1, vdcuser u2
where  ref.reviewer_id = u1.id
and u1.id != u2.id
and u1.email = u2.email
and u1.email in (
select email from vdcuser
group by email
having count(*) > 1
)
and u2.id in (
select min(id) from vdcuser
group by email
having count(*) > 1
);

update study_vdcuser ref set allowedusers_id = u2.id
from vdcuser u1, vdcuser u2
where  ref.allowedusers_id = u1.id
and u1.id != u2.id
and u1.email = u2.email
and u1.email in (
select email from vdcuser
group by email
having count(*) > 1
)
and u2.id in (
select min(id) from vdcuser
group by email
having count(*) > 1
);

update vdc ref set creator_id = u2.id
from vdcuser u1, vdcuser u2
where  ref.creator_id = u1.id
and u1.id != u2.id
and u1.email = u2.email
and u1.email in (
select email from vdcuser
group by email
having count(*) > 1
)
and u2.id in (
select min(id) from vdcuser
group by email
having count(*) > 1
);

update vdc_fileuser ref set allowedfileusers_id = u2.id
from vdcuser u1, vdcuser u2
where  ref.allowedfileusers_id = u1.id
and u1.id != u2.id
and u1.email = u2.email
and u1.email in (
select email from vdcuser
group by email
having count(*) > 1
)
and u2.id in (
select min(id) from vdcuser
group by email
having count(*) > 1
);

update vdcnetwork ref set creator_id = u2.id
from vdcuser u1, vdcuser u2
where  ref.creator_id = u1.id
and u1.id != u2.id
and u1.email = u2.email
and u1.email in (
select email from vdcuser
group by email
having count(*) > 1
)
and u2.id in (
select min(id) from vdcuser
group by email
having count(*) > 1
);

update vdcnetwork ref set defaultnetworkadmin_id = u2.id
from vdcuser u1, vdcuser u2
where  ref.defaultnetworkadmin_id = u1.id
and u1.id != u2.id
and u1.email = u2.email
and u1.email in (
select email from vdcuser
group by email
having count(*) > 1
)
and u2.id in (
select min(id) from vdcuser
group by email
having count(*) > 1
);

update vdc ref set reviewer_id = u2.id
from vdcuser u1, vdcuser u2
where  ref.reviewer_id = u1.id
and u1.id != u2.id
and u1.email = u2.email
and u1.email in (
select email from vdcuser
group by email
having count(*) > 1
)
and u2.id in (
select min(id) from vdcuser
group by email
having count(*) > 1
);

update vdcrole ref set vdcuser_id = u2.id
from vdcuser u1, vdcuser u2
where  ref.vdcuser_id = u1.id
and u1.id != u2.id
and u1.email = u2.email
and u1.email in (
select email from vdcuser
group by email
having count(*) > 1
)
and u2.id in (
select min(id) from vdcuser
group by email
having count(*) > 1
);

update vdcuser_studycomment ref set vdcuser_id = u2.id
from vdcuser u1, vdcuser u2
where  ref.vdcuser_id = u1.id
and u1.id != u2.id
and u1.email = u2.email
and u1.email in (
select email from vdcuser
group by email
having count(*) > 1
)
and u2.id in (
select min(id) from vdcuser
group by email
having count(*) > 1
);

update vdcuser_usergroup ref set users_id = u2.id
from vdcuser u1, vdcuser u2
where  ref.users_id = u1.id
and u1.id != u2.id
and u1.email = u2.email
and u1.email in (
select email from vdcuser
group by email
having count(*) > 1
)
and u2.id in (
select min(id) from vdcuser
group by email
having count(*) > 1
);

update vdc_fileuser ref set allowedfileusers_id = u2.id
from vdcuser u1, vdcuser u2
where  ref.allowedfileusers_id = u1.id
and u1.id != u2.id
and u1.email = u2.email
and u1.email in (
select email from vdcuser
group by email
having count(*) > 1
)
and u2.id in (
select min(id) from vdcuser
group by email
having count(*) > 1
);

update versioncontributor ref set contributor_id = u2.id
from vdcuser u1, vdcuser u2
where  ref.contributor_id = u1.id
and u1.id != u2.id
and u1.email = u2.email
and u1.email in (
select email from vdcuser
group by email
having count(*) > 1
)
and u2.id in (
select min(id) from vdcuser
group by email
having count(*) > 1
);