diff scripts/migration/scrub_duplicate_emails.sql @ 10:a50cf11e5178

Rewrite LGDataverse completely upgrading to dataverse4.0
author Zoe Hong <zhong@mpiwg-berlin.mpg.de>
date Tue, 08 Sep 2015 17:00:21 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/migration/scrub_duplicate_emails.sql	Tue Sep 08 17:00:21 2015 +0200
@@ -0,0 +1,497 @@
+--------------------
+--REFERENCE QUERIES
+--------------------
+/*
+-- Query to list all user acocunts with duplicate e-mails
+select id, username, email from vdcuser
+where email in (
+select email from vdcuser
+group by email
+having count(*) > 1
+)
+order by email
+
+-- Query to list all e-mails that have are duplicated (total = # of actual users, without duplicates)
+select email, count(*) from vdcuser
+group by email
+having count(*) > 1
+order by count(*) desc
+
+-- Query to list all e-mails that have are duplicated and reference to original account (account with lowest id)
+select u1.id, u1.username, u1.active,u1.email, u2.id, u2.username, u2.active
+from vdcuser u1, vdcuser u2
+where 1=1
+and u1.id != u2.id
+and u1.email = u2.email
+and u1.email in (
+select email from vdcuser
+group by email
+having count(*) > 1
+)
+and u2.id in (
+select min(id) from vdcuser
+group by email
+having count(*) > 1
+)
+order by u1.email
+
+-- Delete query, to be run after all the updates
+delete from vdcuser where id in (
+select u1.id
+from vdcuser u1, vdcuser u2
+where 1=1
+and u1.id != u2.id
+and u1.email = u2.email
+and u1.email in (
+select email from vdcuser
+group by email
+having count(*) > 1
+)
+and u2.id in (
+select min(id) from vdcuser
+group by email
+having count(*) > 1
+)
+)
+
+*/
+--------------------
+--UPDATE QUERIES
+--------------------
+-- these queries will update the foreign key references in all (relevant) tables to the orignal account
+--
+-- Generated by
+/*
+SELECT tc.table_schema, tc.constraint_name, tc.table_name, kcu.column_name, ccu.table_name AS foreign_table_name,ccu.column_name AS foreign_column_name,
+'update ' || tc.table_name || ' ref set ' || kcu.column_name || ' = u2.id
+from vdcuser u1, vdcuser u2
+where  ref.' || kcu.column_name || ' = u1.id
+and u1.id != u2.id
+and u1.email = u2.email
+and u1.email in (
+select email from vdcuser
+group by email
+having count(*) > 1
+)
+and u2.id in (
+select min(id) from vdcuser
+group by email
+having count(*) > 1
+);' as query
+FROM information_schema.table_constraints tc
+JOIN information_schema.key_column_usage kcu ON tc.constraint_name = kcu.constraint_name
+JOIN information_schema.constraint_column_usage ccu ON ccu.constraint_name = tc.constraint_name
+WHERE constraint_type = 'FOREIGN KEY' AND ccu.table_name='vdcuser';
+*/
+--
+--
+-- if any of the below fail because of duplicate constraints, you will need to first delete the duplicates
+-- here is a sample query for deleting the duplicate entries from studyfile_vdcuser (the most likey to fail))
+/*
+delete from studyfile_vdcuser
+where allowedusers_id || '_' || studyfiles_id  in (
+select u1.id || '_' || fu1.studyfiles_id 
+from vdcuser u1, vdcuser u2, studyfile_vdcuser fu1, studyfile_vdcuser fu2
+where 1=1
+and fu1.studyfiles_id = fu2.studyfiles_id
+and fu1.allowedusers_id = u1.id
+and fu2.allowedusers_id = u2.id
+and u1.id != u2.id
+and u1.email = u2.email
+and u1.email in (
+select email from vdcuser
+group by email
+having count(*) > 1
+)
+and u2.id in (
+select min(id) from vdcuser  -- also may need to run with max(id) or some other combinations!
+group by email
+having count(*) > 1
+)
+)
+*/
+
+
+update flagged_study_comments ref set user_id = u2.id
+from vdcuser u1, vdcuser u2
+where  ref.user_id = u1.id
+and u1.id != u2.id
+and u1.email = u2.email
+and u1.email in (
+select email from vdcuser
+group by email
+having count(*) > 1
+)
+and u2.id in (
+select min(id) from vdcuser
+group by email
+having count(*) > 1
+);
+
+update guestbookresponse ref set vdcuser_id = u2.id
+from vdcuser u1, vdcuser u2
+where  ref.vdcuser_id = u1.id
+and u1.id != u2.id
+and u1.email = u2.email
+and u1.email in (
+select email from vdcuser
+group by email
+having count(*) > 1
+)
+and u2.id in (
+select min(id) from vdcuser
+group by email
+having count(*) > 1
+);
+
+update harvestingdataverse_vdcuser ref set allowedfileusers_id = u2.id
+from vdcuser u1, vdcuser u2
+where  ref.allowedfileusers_id = u1.id
+and u1.id != u2.id
+and u1.email = u2.email
+and u1.email in (
+select email from vdcuser
+group by email
+having count(*) > 1
+)
+and u2.id in (
+select min(id) from vdcuser
+group by email
+having count(*) > 1
+);
+
+update networkrolerequest ref set vdcuser_id = u2.id
+from vdcuser u1, vdcuser u2
+where  ref.vdcuser_id = u1.id
+and u1.id != u2.id
+and u1.email = u2.email
+and u1.email in (
+select email from vdcuser
+group by email
+having count(*) > 1
+)
+and u2.id in (
+select min(id) from vdcuser
+group by email
+having count(*) > 1
+);
+
+update rolerequest ref set vdcuser_id = u2.id
+from vdcuser u1, vdcuser u2
+where  ref.vdcuser_id = u1.id
+and u1.id != u2.id
+and u1.email = u2.email
+and u1.email in (
+select email from vdcuser
+group by email
+having count(*) > 1
+)
+and u2.id in (
+select min(id) from vdcuser
+group by email
+having count(*) > 1
+);
+
+update studyaccessrequest ref set vdcuser_id = u2.id
+from vdcuser u1, vdcuser u2
+where  ref.vdcuser_id = u1.id
+and u1.id != u2.id
+and u1.email = u2.email
+and u1.email in (
+select email from vdcuser
+group by email
+having count(*) > 1
+)
+and u2.id in (
+select min(id) from vdcuser
+group by email
+having count(*) > 1
+);
+
+update studycomment ref set commentcreator_id = u2.id
+from vdcuser u1, vdcuser u2
+where  ref.commentcreator_id = u1.id
+and u1.id != u2.id
+and u1.email = u2.email
+and u1.email in (
+select email from vdcuser
+group by email
+having count(*) > 1
+)
+and u2.id in (
+select min(id) from vdcuser
+group by email
+having count(*) > 1
+);
+
+update study ref set creator_id = u2.id
+from vdcuser u1, vdcuser u2
+where  ref.creator_id = u1.id
+and u1.id != u2.id
+and u1.email = u2.email
+and u1.email in (
+select email from vdcuser
+group by email
+having count(*) > 1
+)
+and u2.id in (
+select min(id) from vdcuser
+group by email
+having count(*) > 1
+);
+
+update studyfile_vdcuser ref set allowedusers_id = u2.id
+from vdcuser u1, vdcuser u2
+where  ref.allowedusers_id = u1.id
+and u1.id != u2.id
+and u1.email = u2.email
+and u1.email in (
+select email from vdcuser
+group by email
+having count(*) > 1
+)
+and u2.id in (
+select min(id) from vdcuser
+group by email
+having count(*) > 1
+);
+
+update study ref set lastupdater_id = u2.id
+from vdcuser u1, vdcuser u2
+where  ref.lastupdater_id = u1.id
+and u1.id != u2.id
+and u1.email = u2.email
+and u1.email in (
+select email from vdcuser
+group by email
+having count(*) > 1
+)
+and u2.id in (
+select min(id) from vdcuser
+group by email
+having count(*) > 1
+);
+
+update studylock ref set user_id = u2.id
+from vdcuser u1, vdcuser u2
+where  ref.user_id = u1.id
+and u1.id != u2.id
+and u1.email = u2.email
+and u1.email in (
+select email from vdcuser
+group by email
+having count(*) > 1
+)
+and u2.id in (
+select min(id) from vdcuser
+group by email
+having count(*) > 1
+);
+
+update studyrequest ref set vdcuser_id = u2.id
+from vdcuser u1, vdcuser u2
+where  ref.vdcuser_id = u1.id
+and u1.id != u2.id
+and u1.email = u2.email
+and u1.email in (
+select email from vdcuser
+group by email
+having count(*) > 1
+)
+and u2.id in (
+select min(id) from vdcuser
+group by email
+having count(*) > 1
+);
+
+update study ref set reviewer_id = u2.id
+from vdcuser u1, vdcuser u2
+where  ref.reviewer_id = u1.id
+and u1.id != u2.id
+and u1.email = u2.email
+and u1.email in (
+select email from vdcuser
+group by email
+having count(*) > 1
+)
+and u2.id in (
+select min(id) from vdcuser
+group by email
+having count(*) > 1
+);
+
+update study_vdcuser ref set allowedusers_id = u2.id
+from vdcuser u1, vdcuser u2
+where  ref.allowedusers_id = u1.id
+and u1.id != u2.id
+and u1.email = u2.email
+and u1.email in (
+select email from vdcuser
+group by email
+having count(*) > 1
+)
+and u2.id in (
+select min(id) from vdcuser
+group by email
+having count(*) > 1
+);
+
+update vdc ref set creator_id = u2.id
+from vdcuser u1, vdcuser u2
+where  ref.creator_id = u1.id
+and u1.id != u2.id
+and u1.email = u2.email
+and u1.email in (
+select email from vdcuser
+group by email
+having count(*) > 1
+)
+and u2.id in (
+select min(id) from vdcuser
+group by email
+having count(*) > 1
+);
+
+update vdc_fileuser ref set allowedfileusers_id = u2.id
+from vdcuser u1, vdcuser u2
+where  ref.allowedfileusers_id = u1.id
+and u1.id != u2.id
+and u1.email = u2.email
+and u1.email in (
+select email from vdcuser
+group by email
+having count(*) > 1
+)
+and u2.id in (
+select min(id) from vdcuser
+group by email
+having count(*) > 1
+);
+
+update vdcnetwork ref set creator_id = u2.id
+from vdcuser u1, vdcuser u2
+where  ref.creator_id = u1.id
+and u1.id != u2.id
+and u1.email = u2.email
+and u1.email in (
+select email from vdcuser
+group by email
+having count(*) > 1
+)
+and u2.id in (
+select min(id) from vdcuser
+group by email
+having count(*) > 1
+);
+
+update vdcnetwork ref set defaultnetworkadmin_id = u2.id
+from vdcuser u1, vdcuser u2
+where  ref.defaultnetworkadmin_id = u1.id
+and u1.id != u2.id
+and u1.email = u2.email
+and u1.email in (
+select email from vdcuser
+group by email
+having count(*) > 1
+)
+and u2.id in (
+select min(id) from vdcuser
+group by email
+having count(*) > 1
+);
+
+update vdc ref set reviewer_id = u2.id
+from vdcuser u1, vdcuser u2
+where  ref.reviewer_id = u1.id
+and u1.id != u2.id
+and u1.email = u2.email
+and u1.email in (
+select email from vdcuser
+group by email
+having count(*) > 1
+)
+and u2.id in (
+select min(id) from vdcuser
+group by email
+having count(*) > 1
+);
+
+update vdcrole ref set vdcuser_id = u2.id
+from vdcuser u1, vdcuser u2
+where  ref.vdcuser_id = u1.id
+and u1.id != u2.id
+and u1.email = u2.email
+and u1.email in (
+select email from vdcuser
+group by email
+having count(*) > 1
+)
+and u2.id in (
+select min(id) from vdcuser
+group by email
+having count(*) > 1
+);
+
+update vdcuser_studycomment ref set vdcuser_id = u2.id
+from vdcuser u1, vdcuser u2
+where  ref.vdcuser_id = u1.id
+and u1.id != u2.id
+and u1.email = u2.email
+and u1.email in (
+select email from vdcuser
+group by email
+having count(*) > 1
+)
+and u2.id in (
+select min(id) from vdcuser
+group by email
+having count(*) > 1
+);
+
+update vdcuser_usergroup ref set users_id = u2.id
+from vdcuser u1, vdcuser u2
+where  ref.users_id = u1.id
+and u1.id != u2.id
+and u1.email = u2.email
+and u1.email in (
+select email from vdcuser
+group by email
+having count(*) > 1
+)
+and u2.id in (
+select min(id) from vdcuser
+group by email
+having count(*) > 1
+);
+
+update vdc_fileuser ref set allowedfileusers_id = u2.id
+from vdcuser u1, vdcuser u2
+where  ref.allowedfileusers_id = u1.id
+and u1.id != u2.id
+and u1.email = u2.email
+and u1.email in (
+select email from vdcuser
+group by email
+having count(*) > 1
+)
+and u2.id in (
+select min(id) from vdcuser
+group by email
+having count(*) > 1
+);
+
+update versioncontributor ref set contributor_id = u2.id
+from vdcuser u1, vdcuser u2
+where  ref.contributor_id = u1.id
+and u1.id != u2.id
+and u1.email = u2.email
+and u1.email in (
+select email from vdcuser
+group by email
+having count(*) > 1
+)
+and u2.id in (
+select min(id) from vdcuser
+group by email
+having count(*) > 1
+);