Mercurial > hg > LGDataverses
diff scripts/migration/scrub_duplicate_emails.sql @ 10:a50cf11e5178
Rewrite LGDataverse completely upgrading to dataverse4.0
| author | Zoe Hong <zhong@mpiwg-berlin.mpg.de> |
|---|---|
| date | Tue, 08 Sep 2015 17:00:21 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/migration/scrub_duplicate_emails.sql Tue Sep 08 17:00:21 2015 +0200 @@ -0,0 +1,497 @@ +-------------------- +--REFERENCE QUERIES +-------------------- +/* +-- Query to list all user acocunts with duplicate e-mails +select id, username, email from vdcuser +where email in ( +select email from vdcuser +group by email +having count(*) > 1 +) +order by email + +-- Query to list all e-mails that have are duplicated (total = # of actual users, without duplicates) +select email, count(*) from vdcuser +group by email +having count(*) > 1 +order by count(*) desc + +-- Query to list all e-mails that have are duplicated and reference to original account (account with lowest id) +select u1.id, u1.username, u1.active,u1.email, u2.id, u2.username, u2.active +from vdcuser u1, vdcuser u2 +where 1=1 +and u1.id != u2.id +and u1.email = u2.email +and u1.email in ( +select email from vdcuser +group by email +having count(*) > 1 +) +and u2.id in ( +select min(id) from vdcuser +group by email +having count(*) > 1 +) +order by u1.email + +-- Delete query, to be run after all the updates +delete from vdcuser where id in ( +select u1.id +from vdcuser u1, vdcuser u2 +where 1=1 +and u1.id != u2.id +and u1.email = u2.email +and u1.email in ( +select email from vdcuser +group by email +having count(*) > 1 +) +and u2.id in ( +select min(id) from vdcuser +group by email +having count(*) > 1 +) +) + +*/ +-------------------- +--UPDATE QUERIES +-------------------- +-- these queries will update the foreign key references in all (relevant) tables to the orignal account +-- +-- Generated by +/* +SELECT tc.table_schema, tc.constraint_name, tc.table_name, kcu.column_name, ccu.table_name AS foreign_table_name,ccu.column_name AS foreign_column_name, +'update ' || tc.table_name || ' ref set ' || kcu.column_name || ' = u2.id +from vdcuser u1, vdcuser u2 +where ref.' || kcu.column_name || ' = u1.id +and u1.id != u2.id +and u1.email = u2.email +and u1.email in ( +select email from vdcuser +group by email +having count(*) > 1 +) +and u2.id in ( +select min(id) from vdcuser +group by email +having count(*) > 1 +);' as query +FROM information_schema.table_constraints tc +JOIN information_schema.key_column_usage kcu ON tc.constraint_name = kcu.constraint_name +JOIN information_schema.constraint_column_usage ccu ON ccu.constraint_name = tc.constraint_name +WHERE constraint_type = 'FOREIGN KEY' AND ccu.table_name='vdcuser'; +*/ +-- +-- +-- if any of the below fail because of duplicate constraints, you will need to first delete the duplicates +-- here is a sample query for deleting the duplicate entries from studyfile_vdcuser (the most likey to fail)) +/* +delete from studyfile_vdcuser +where allowedusers_id || '_' || studyfiles_id in ( +select u1.id || '_' || fu1.studyfiles_id +from vdcuser u1, vdcuser u2, studyfile_vdcuser fu1, studyfile_vdcuser fu2 +where 1=1 +and fu1.studyfiles_id = fu2.studyfiles_id +and fu1.allowedusers_id = u1.id +and fu2.allowedusers_id = u2.id +and u1.id != u2.id +and u1.email = u2.email +and u1.email in ( +select email from vdcuser +group by email +having count(*) > 1 +) +and u2.id in ( +select min(id) from vdcuser -- also may need to run with max(id) or some other combinations! +group by email +having count(*) > 1 +) +) +*/ + + +update flagged_study_comments ref set user_id = u2.id +from vdcuser u1, vdcuser u2 +where ref.user_id = u1.id +and u1.id != u2.id +and u1.email = u2.email +and u1.email in ( +select email from vdcuser +group by email +having count(*) > 1 +) +and u2.id in ( +select min(id) from vdcuser +group by email +having count(*) > 1 +); + +update guestbookresponse ref set vdcuser_id = u2.id +from vdcuser u1, vdcuser u2 +where ref.vdcuser_id = u1.id +and u1.id != u2.id +and u1.email = u2.email +and u1.email in ( +select email from vdcuser +group by email +having count(*) > 1 +) +and u2.id in ( +select min(id) from vdcuser +group by email +having count(*) > 1 +); + +update harvestingdataverse_vdcuser ref set allowedfileusers_id = u2.id +from vdcuser u1, vdcuser u2 +where ref.allowedfileusers_id = u1.id +and u1.id != u2.id +and u1.email = u2.email +and u1.email in ( +select email from vdcuser +group by email +having count(*) > 1 +) +and u2.id in ( +select min(id) from vdcuser +group by email +having count(*) > 1 +); + +update networkrolerequest ref set vdcuser_id = u2.id +from vdcuser u1, vdcuser u2 +where ref.vdcuser_id = u1.id +and u1.id != u2.id +and u1.email = u2.email +and u1.email in ( +select email from vdcuser +group by email +having count(*) > 1 +) +and u2.id in ( +select min(id) from vdcuser +group by email +having count(*) > 1 +); + +update rolerequest ref set vdcuser_id = u2.id +from vdcuser u1, vdcuser u2 +where ref.vdcuser_id = u1.id +and u1.id != u2.id +and u1.email = u2.email +and u1.email in ( +select email from vdcuser +group by email +having count(*) > 1 +) +and u2.id in ( +select min(id) from vdcuser +group by email +having count(*) > 1 +); + +update studyaccessrequest ref set vdcuser_id = u2.id +from vdcuser u1, vdcuser u2 +where ref.vdcuser_id = u1.id +and u1.id != u2.id +and u1.email = u2.email +and u1.email in ( +select email from vdcuser +group by email +having count(*) > 1 +) +and u2.id in ( +select min(id) from vdcuser +group by email +having count(*) > 1 +); + +update studycomment ref set commentcreator_id = u2.id +from vdcuser u1, vdcuser u2 +where ref.commentcreator_id = u1.id +and u1.id != u2.id +and u1.email = u2.email +and u1.email in ( +select email from vdcuser +group by email +having count(*) > 1 +) +and u2.id in ( +select min(id) from vdcuser +group by email +having count(*) > 1 +); + +update study ref set creator_id = u2.id +from vdcuser u1, vdcuser u2 +where ref.creator_id = u1.id +and u1.id != u2.id +and u1.email = u2.email +and u1.email in ( +select email from vdcuser +group by email +having count(*) > 1 +) +and u2.id in ( +select min(id) from vdcuser +group by email +having count(*) > 1 +); + +update studyfile_vdcuser ref set allowedusers_id = u2.id +from vdcuser u1, vdcuser u2 +where ref.allowedusers_id = u1.id +and u1.id != u2.id +and u1.email = u2.email +and u1.email in ( +select email from vdcuser +group by email +having count(*) > 1 +) +and u2.id in ( +select min(id) from vdcuser +group by email +having count(*) > 1 +); + +update study ref set lastupdater_id = u2.id +from vdcuser u1, vdcuser u2 +where ref.lastupdater_id = u1.id +and u1.id != u2.id +and u1.email = u2.email +and u1.email in ( +select email from vdcuser +group by email +having count(*) > 1 +) +and u2.id in ( +select min(id) from vdcuser +group by email +having count(*) > 1 +); + +update studylock ref set user_id = u2.id +from vdcuser u1, vdcuser u2 +where ref.user_id = u1.id +and u1.id != u2.id +and u1.email = u2.email +and u1.email in ( +select email from vdcuser +group by email +having count(*) > 1 +) +and u2.id in ( +select min(id) from vdcuser +group by email +having count(*) > 1 +); + +update studyrequest ref set vdcuser_id = u2.id +from vdcuser u1, vdcuser u2 +where ref.vdcuser_id = u1.id +and u1.id != u2.id +and u1.email = u2.email +and u1.email in ( +select email from vdcuser +group by email +having count(*) > 1 +) +and u2.id in ( +select min(id) from vdcuser +group by email +having count(*) > 1 +); + +update study ref set reviewer_id = u2.id +from vdcuser u1, vdcuser u2 +where ref.reviewer_id = u1.id +and u1.id != u2.id +and u1.email = u2.email +and u1.email in ( +select email from vdcuser +group by email +having count(*) > 1 +) +and u2.id in ( +select min(id) from vdcuser +group by email +having count(*) > 1 +); + +update study_vdcuser ref set allowedusers_id = u2.id +from vdcuser u1, vdcuser u2 +where ref.allowedusers_id = u1.id +and u1.id != u2.id +and u1.email = u2.email +and u1.email in ( +select email from vdcuser +group by email +having count(*) > 1 +) +and u2.id in ( +select min(id) from vdcuser +group by email +having count(*) > 1 +); + +update vdc ref set creator_id = u2.id +from vdcuser u1, vdcuser u2 +where ref.creator_id = u1.id +and u1.id != u2.id +and u1.email = u2.email +and u1.email in ( +select email from vdcuser +group by email +having count(*) > 1 +) +and u2.id in ( +select min(id) from vdcuser +group by email +having count(*) > 1 +); + +update vdc_fileuser ref set allowedfileusers_id = u2.id +from vdcuser u1, vdcuser u2 +where ref.allowedfileusers_id = u1.id +and u1.id != u2.id +and u1.email = u2.email +and u1.email in ( +select email from vdcuser +group by email +having count(*) > 1 +) +and u2.id in ( +select min(id) from vdcuser +group by email +having count(*) > 1 +); + +update vdcnetwork ref set creator_id = u2.id +from vdcuser u1, vdcuser u2 +where ref.creator_id = u1.id +and u1.id != u2.id +and u1.email = u2.email +and u1.email in ( +select email from vdcuser +group by email +having count(*) > 1 +) +and u2.id in ( +select min(id) from vdcuser +group by email +having count(*) > 1 +); + +update vdcnetwork ref set defaultnetworkadmin_id = u2.id +from vdcuser u1, vdcuser u2 +where ref.defaultnetworkadmin_id = u1.id +and u1.id != u2.id +and u1.email = u2.email +and u1.email in ( +select email from vdcuser +group by email +having count(*) > 1 +) +and u2.id in ( +select min(id) from vdcuser +group by email +having count(*) > 1 +); + +update vdc ref set reviewer_id = u2.id +from vdcuser u1, vdcuser u2 +where ref.reviewer_id = u1.id +and u1.id != u2.id +and u1.email = u2.email +and u1.email in ( +select email from vdcuser +group by email +having count(*) > 1 +) +and u2.id in ( +select min(id) from vdcuser +group by email +having count(*) > 1 +); + +update vdcrole ref set vdcuser_id = u2.id +from vdcuser u1, vdcuser u2 +where ref.vdcuser_id = u1.id +and u1.id != u2.id +and u1.email = u2.email +and u1.email in ( +select email from vdcuser +group by email +having count(*) > 1 +) +and u2.id in ( +select min(id) from vdcuser +group by email +having count(*) > 1 +); + +update vdcuser_studycomment ref set vdcuser_id = u2.id +from vdcuser u1, vdcuser u2 +where ref.vdcuser_id = u1.id +and u1.id != u2.id +and u1.email = u2.email +and u1.email in ( +select email from vdcuser +group by email +having count(*) > 1 +) +and u2.id in ( +select min(id) from vdcuser +group by email +having count(*) > 1 +); + +update vdcuser_usergroup ref set users_id = u2.id +from vdcuser u1, vdcuser u2 +where ref.users_id = u1.id +and u1.id != u2.id +and u1.email = u2.email +and u1.email in ( +select email from vdcuser +group by email +having count(*) > 1 +) +and u2.id in ( +select min(id) from vdcuser +group by email +having count(*) > 1 +); + +update vdc_fileuser ref set allowedfileusers_id = u2.id +from vdcuser u1, vdcuser u2 +where ref.allowedfileusers_id = u1.id +and u1.id != u2.id +and u1.email = u2.email +and u1.email in ( +select email from vdcuser +group by email +having count(*) > 1 +) +and u2.id in ( +select min(id) from vdcuser +group by email +having count(*) > 1 +); + +update versioncontributor ref set contributor_id = u2.id +from vdcuser u1, vdcuser u2 +where ref.contributor_id = u1.id +and u1.id != u2.id +and u1.email = u2.email +and u1.email in ( +select email from vdcuser +group by email +having count(*) > 1 +) +and u2.id in ( +select min(id) from vdcuser +group by email +having count(*) > 1 +);
