diff scripts/migration/files_source_ @ 10:a50cf11e5178

Rewrite LGDataverse completely upgrading to dataverse4.0
author Zoe Hong <zhong@mpiwg-berlin.mpg.de>
date Tue, 08 Sep 2015 17:00:21 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/migration/files_source_	Tue Sep 08 17:00:21 2015 +0200
@@ -0,0 +1,473 @@
+#!/usr/bin/perl
+
+my $dvobjectoffset = shift @ARGV;
+
+unless ($dvobjectoffset > 0)
+{
+    print STDERR "Usage: ./files_source_ <DVOBJECT DB ID OFFSET>\n";
+    exit 1;
+}
+
+my $filecatid = 0; # file categories (this is a new object in 4.0, so there are no 3.6 IDs to reuse)
+# (offset this if adding content content to an already populated db)
+
+use DBI;
+
+my $host = "localhost"; 
+my $username = "xxxxx";
+my $database = "xxxxx";
+my $password = "xxxxx";
+
+my $dbh = DBI->connect("DBI:Pg:dbname=$database;host=$host",$username,$password); 
+
+open PL, ">packlist.txt";
+
+%STUDYMAP = {};
+%STUDYFILEMAP = {};
+
+while ( <> )
+{
+    chop; 
+    my ($globalid, $dsid, $dsvid, $dsvnum) = split("\t", $_); 
+
+    %FILECATEGORIES = {}; # file categories for this dataset.
+
+    if ($globalid =~/^([a-z]*):(.*)\/([^\/]*)$/)
+    {
+	$protocol = $1;
+	$authority = $2; 
+	$identifier = $3;
+
+#	print $protocol . " " . $authority . " " . $identifier . "\n";
+    }
+    else 
+    {
+	print STDERR "WARNING! illegal global id: " . $globalid . "\n";
+	next; 
+    }
+
+    my $sth; 
+
+    #$sth = $dbh->prepare(qq {SELECT s.id, v.id FROM study s, studyversion v WHERE v.study_id = s.id AND s.protocol = '$protocol' AND s.authority='$authority' AND s.studyid = '$identifier' AND v.versionstate = 'RELEASED'}); 
+    $sth = $dbh->prepare(qq {SELECT s.id, v.id FROM study s, studyversion v WHERE v.study_id = s.id AND s.protocol = '$protocol' AND s.authority='$authority' AND s.studyid = '$identifier' AND v.versionnumber = $dsvnum}); 
+    $sth->execute();
+
+    my $vercount = 0; 
+
+    my $sid; 
+    my $svid; 
+
+    while ( @foo = $sth->fetchrow() )
+    {
+	$sid = $foo[0];
+	$svid = $foo[1];
+
+#	print $sid . "\t" . $svid . "\n";
+	$vercount++;
+    }
+
+    $sth->finish; 
+
+    unless ($vercount == 1) 
+    {
+	print STDERR "WARNING: invalid number of versions for study " . $globalid . ", with version number " . $dsvnum . " (" . $vercount . ")!\n";
+	next; 
+    }
+
+    $sth = $dbh->prepare(qq {SELECT fm.label, fm.category, fm.description, sf.filetype, sf.filesystemlocation, sf.md5, sf.restricted, sf.subsettable, sf.originalfiletype, sf.unf, sf.id, sf.fileclass, fm.id FROM filemetadata fm, studyfile sf WHERE fm.studyfile_id = sf.id AND fm.studyversion_id = $svid});
+
+#    print qq {SELECT fm.label, fm.category, fm.description, sf.filetype, sf.filesystemlocation, sf.md5, sf.restricted, sf.subsettable, sf.originalfiletype, sf.unf FROM filemetadata fm, studyfile sf WHERE fm.studyfile_id = sf.id AND fm.studyversion_id = $svid} . "\n";
+
+    $sth->execute();
+
+    my $newfile = 0; 
+
+    while ( @foo = $sth->fetchrow() )
+    {
+	# new filemetadata fields: 
+	$label = $foo[0];
+	$description = $foo[2];
+	$description =~s/\n/ /g;
+	$description = $dbh->quote($description);
+	# category: 
+	$category = $foo[1];
+	# new datafile fields: 
+	$type = $foo[3];
+	unless ($type =~m:/:)
+	{
+	    $type = "application/octet-stream";
+	}
+	$md5 = $foo[5];
+	$restricted = $foo[6];
+	# location of the file, on the old filesystem: 
+	$fslocation = $foo[4];
+	# additional info for subsettable files: 
+	# (will go into the new datatable)
+	$subsettable = $foo[7]; 
+	$originalfiletype = $foo[8];
+	$unf = $foo[9]; 
+	# id of the existing studyfile: 
+	$sfid = $foo[10];
+	# "class" of the existing studyfile: 
+	# (tabular, "other", etc.)
+	$fileclass = $foo[11];
+	$fmid = $foo[12];
+	#print join ("\t", @_) . "\n";
+
+	if ($label =~/[\\\/:\*\?\"\<\>\|;\#]/)
+	{
+	    $preservedlabel = $label; 
+	    $label=~s/[\\\/:\*\?\"\<\>\|;\#]//g;
+	    
+	    print STDERR "LABEL REPLACED: (FILEMETA: " . $fmid . ", FILE: " . $sfid . ", STUDY: " . $sid . ", VERSION: " . $svid . ", GLOBALID: " . $globalid . ") OLD: \"" . $preservedlabel . "\", NEW: \"" . $label . "\"\n";
+	}
+
+	if ($label eq '')
+	{
+	    $label = "UNKNOWN";
+	}
+
+	$label = $dbh->quote($label);
+
+
+	unless ($STUDYFILEMAP{$sfid})
+	{
+	    $newfile = 1; 
+	    # Certain things only need to be done once per file -
+	    # namely, each file needs one dvobject and datafile each;
+	    # same for the datatables and variables. 
+	    # Other things, like filemetadatas, need to be created one 
+	    # per version. 
+	    
+	    $newdatafileid = ($dvobjectoffset+$sfid);
+	    $STUDYFILEMAP{$sfid} = $newdatafileid; 
+	    ##$dvobjectoffset++; 
+
+	    $fsname = $fslocation; 
+
+	    if ($fslocation =~/^http/ )
+	    {
+		$fsize = 0;
+		$fmtime = &formatTimeStamp(time);
+	    }
+	    else 
+	    {
+		if ( -f $fslocation ) 
+		{
+		    @fstats = stat($fslocation); 
+		    $fsize = $fstats[7]; 
+		    $mtime = $fstats[9]; 
+
+		    $fmtime = &formatTimeStamp($mtime);
+		    $packlistentry = $fslocation; 
+		    $packlistentry =~s/.*\/DVN\/data\///; 
+		    print PL $packlistentry . "\n";
+		}
+		else 
+		{
+		    print STDERR "WARNING: file " . $fslocation . " not found!\n";
+		    $fsize = 0;
+		    $fmtime = &formatTimeStamp(time);
+		}
+		
+		$fsname =~s/^.*\///g; 
+	    }
+
+	    # dvobject: 
+
+	    print qq {INSERT INTO dvobject (id, dtype, owner_id, createdate, modificationtime) VALUES ($newdatafileid, 'DataFile', $dsid, '$fmtime', '$fmtime');} . "\n";
+
+	    # datafile object:
+
+	    print qq {INSERT INTO datafile (id, contenttype, filesystemname, filesize, md5, restricted) VALUES ($newdatafileid, '$type', '$fsname', $fsize, '$md5', TRUE);} . "\n";
+	}
+	else 
+	{
+	    $newdatafileid = $STUDYFILEMAP{$sfid};
+	    $newfile = 0; 
+	}
+	    
+	# file metadata object:
+	print qq {INSERT INTO filemetadata (id, description, label, restricted, version, datasetversion_id, datafile_id) VALUES ($fmid, $description, $label, TRUE, 1, $dsvid, $newdatafileid);} . "\n";
+	##print qq {INSERT INTO filemetadata (id, description, label, restricted, version, datasetversion_id, datafile_id) VALUES ($fmid, $description, $label, TRUE, 1, $dsvid, $dvobjectoffset);} . "\n";
+
+	# and the category, if exists:
+
+	if ($category && $category ne "") 
+	{
+	    $category = $dbh->quote($category); 
+	    unless ($FILECATEGORIES{$category})
+	    {
+		# this is a new category (for this dataset), 
+		# so it needs to be created: 
+
+		$filecatid++;
+
+		print qq{INSERT INTO datafilecategory (id, name, dataset_id) VALUES ($filecatid, $category, $newdatafileid);} . "\n";
+		#print qq{INSERT INTO datafilecategory (id, name, dataset_id) VALUES ($filecatid, $category, $dvobjectoffset);} . "\n";
+
+		$FILECATEGORIES{$category} = $filecatid; 
+	    }
+
+	    my $fcid = $FILECATEGORIES{$category};
+	    print qq{INSERT INTO filemetadata_datafilecategory (filecategories_id, filemetadatas_id) VALUES ($fcid, $fmid);} . "\n";
+
+	}
+
+
+	# subsettable files: 
+	# (again, this only needs to be done once per file!)
+
+
+	if ($newfile && ($fileclass eq "TabularDataFile"))
+	{
+	    #print STDERR "this is a subsettable file.\n";	
+
+	    # NOTE: 
+	    # there's only one datatable per file - make sure to only run this once!
+	    # (i.e., not for every version!)
+	    
+	    $sth1 = $dbh->prepare(qq {SELECT id, varquantity, casequantity, unf, recordspercase FROM datatable WHERE studyfile_id = $sfid});
+
+	    $sth1->execute();
+
+	    $count = 0; 
+
+	    while ( @dt = $sth1->fetchrow() )
+	    {
+		$dtid = $dt[0];
+		$varquantity = $dt[1];
+		$casequantity = $dt[2];
+		$dtunf = $dt[3];
+		$recordspercase = $dt[4];
+
+		$count++;
+
+		unless ($unf eq $dtunf) 
+		{
+		    print STDERR "WARNING: unf mismatch, between studyfile and datatable: " + $unf + ":" + $dtunf + "\n";
+		}
+
+		# datatable object:
+
+
+		if ($recordspercase) 
+		{
+		    print qq {INSERT INTO datatable (id, varquantity, casequantity, unf, originalfileformat, recordspercase, datafile_id) VALUES ($dtid, $varquantity, $casequantity, '$unf', '$originalfiletype', $recordspercase, $newdatafileid);} . "\n";
+		    #print qq {INSERT INTO datatable (id, varquantity, casequantity, unf, originalfileformat, recordspercase, datafile_id) VALUES ($dtid, $varquantity, $casequantity, '$unf', '$originalfiletype', $recordspercase, $dvobjectoffset);} . "\n";
+		}
+		else
+		{
+		    print qq {INSERT INTO datatable (id, varquantity, casequantity, unf, originalfileformat, datafile_id) VALUES ($dtid, $varquantity, $casequantity, '$unf', '$originalfiletype', $newdatafileid);} . "\n";
+		    #print qq {INSERT INTO datatable (id, varquantity, casequantity, unf, originalfileformat, datafile_id) VALUES ($dtid, $varquantity, $casequantity, '$unf', '$originalfiletype', $dvobjectoffset);} . "\n";
+		}
+	    }
+
+	    $sth1->finish;
+	    
+	    unless ($count == 1) 
+	    {
+		print STDERR "WARNING: invalid numbe of datatables: " + $count +".\n";
+	    }
+	    else 
+	    {
+		# variables:
+		$sth1 = $dbh->prepare(qq {SELECT name, label, variableformattype_id, variableintervaltype_id, formatcategory, formatschema, formatschemaname, unf, fileorder, weighted, orderedfactor, numberofdecimalpoints, universe, filestartposition, fileendposition, recordsegmentnumber, id FROM datavariable WHERE datatable_id = $dtid});
+
+
+		$sth1->execute();
+		
+		while ( @dv = $sth1->fetchrow() )
+		{
+		    $varname = $dv[0];
+		    $varname = $dbh->quote($varname);
+		    $varlabel = $dv[1];
+		    $varlabel = $dbh->quote($varlabel);
+		    $variableformattype_id = $dv[2];
+		    # the old school formattype_id and 
+		    # intervaltype_id need to be adjusted by 1, 
+		    # to match the new enum values used in the
+		    # 4.0 datavariables:
+		    $variableformattype_id--;
+		    $variableintervaltype_id = $dv[3];
+		    $variableintervaltype_id--;
+		    $varformatcategory = $dv[4];
+		    $varformatschema = $dv[5];
+		    $varformatschemaname = $dv[6];
+		    $varunf = $dv[7];
+		    $varfileorder = $dv[8];
+		    $varweighted = $dv[9];
+		    if ($varweighted)
+		    {
+			$varweighted = "TRUE";
+		    }
+		    else 
+		    {
+			$varweighted = "FALSE";
+		    }
+		    $varorderedfactor  = $dv[10];
+		    if ($varorderedfactor)
+		    {
+			$varorderedfactor = "TRUE";
+		    }
+		    else 
+		    {
+			$varorderedfactor = "FALSE";
+		    }
+
+		    $varnumberofdecimalpoints = $dv[11];
+		    $varuniverse = $dv[12];
+		    $varfilestartposition = $dv[13];
+		    $varfileendposition = $dv[14];
+		    $varrecordsegmentnumber = $dv[15];
+		    $varid = $dv[16];
+
+		    
+
+		    # new datavariable object: 
+
+		    $newdvfields = "id, name, label, interval, type, unf, fileorder, orderedfactor, weighted, datatable_id";
+		    $newdvvalues = qq {$varid, $varname, $varlabel, $variableintervaltype_id, $variableformattype_id, '$varunf', $varfileorder, $varorderedfactor, $varweighted, $dtid};
+
+		    if ($varformatschemaname)
+		    {
+			# becomes "format":
+			$newdvfields = $newdvfields . ", format";
+			$newdvvalues = qq{$newdvvalues, '$varformatschemaname'};
+		    }
+
+		    if ($varformatcategory)
+		    {
+			$newdvfields = $newdvfields . ", formatcategory";
+			$newdvvalues = qq{$newdvvalues, '$varformatcategory'};
+		    }
+
+		    if ($varfilestartposition)
+		    {
+			$newdvfields = $newdvfields . ", filestartposition";
+			$newdvvalues = qq{$newdvvalues, $varfilestartposition};
+		    }
+
+		    if ($varfileendposition)
+		    {
+			$newdvfields = $newdvfields . ", fileendposition";
+			$newdvvalues = qq{$newdvvalues, $varfileendposition};
+		    }
+
+		    if ($varrecordsegmentnumber)
+		    {
+			$newdvfields = $newdvfields . ", recordsegmentnumber";
+			$newdvvalues = qq{$newdvvalues, $varrecordsegmentnumber};
+		    }
+
+		    if ($varuniverse)
+		    {
+			$newdvfields = $newdvfields . ", universe";
+			$newdvvalues = qq{$newdvvalues, '$varuniverse'};
+		    }
+
+		    if ($varnumberofdecimalpoints)
+		    {
+			$newdvfields = $newdvfields . ", numberofdecimalpoints";
+			$newdvvalues = qq{$newdvvalues, $numberofdecimalpoints};
+		    }
+
+
+		    print qq {INSERT INTO datavariable ($newdvfields) VALUES ($newdvvalues);} . "\n";
+
+		    # variable categories: 
+		    $sth2 = $dbh->prepare(qq {SELECT id, label, value, missing, catorder, frequency FROM variablecategory WHERE datavariable_id = $varid});
+		    $sth2->execute();
+		
+		    while ( @vc = $sth2->fetchrow() )
+		    {
+			$varcatid = $vc[0];
+			$varcatlabel = $vc[1];
+			$varcatvalue = $vc[2];
+			$varcatmissing = $vc[3];
+			if ($varcatmissing)
+			{
+			    $varcatmissing = "true";
+			}
+			else 
+			{
+			    $varcatmissing = "false";
+			}
+			$varcatorder = $vc[4];
+			unless ($varcatorder) 
+			{
+			    if ($varcatorder eq "" || $varcatorder != 0)
+			    {
+				$varcatorder = "null";
+			    }
+			}
+			$varcatfreq = $vc[5];
+			unless ($varcatfreq) 
+			{
+			    if ($varcatfreq eq "" || $varcatfreq != 0)
+			    {
+				$varcatfreq = "null";
+			    }
+			}
+
+
+			# only migrate the *real* categories: 
+			if ($varcatlabel) 
+			{
+			    $varcatlabel = $dbh->quote($varcatlabel);
+			    unless ($varcatvalue || ($varcatvalue eq "") || ($varcatvalue == 0))
+			    {
+				#print STDERR "WARNING: empty var cat value (" . $varcatlabel . ")\n";
+				print STDERR qq {INSERT INTO variablecategory (id, label, value, missing, catorder, frequency, datavariable_id) VALUES ($varcatid, $varcatlabel, $varcatvalue, $varcatmissing, $varcatorder, $varcatfreq, $varid);} . "\n";
+			    }
+			    else 
+			    {
+				$varcatvalue = $dbh->quote($varcatvalue);
+				print qq {INSERT INTO variablecategory (id, label, value, missing, catorder, frequency, datavariable_id) VALUES ($varcatid, $varcatlabel, $varcatvalue, $varcatmissing, $varcatorder, $varcatfreq, $varid);} . "\n";
+			    }
+			}
+			else 
+			{
+			    #print STDERR "empty var cat label.\n";
+			}
+		    }
+
+		    $sth2->finish;
+		}
+
+		$sth1->finish;
+	    }
+
+
+		
+	}
+    }
+
+    $sth->finish; 
+
+}
+
+$dbh->disconnect; 
+
+close PL; 
+
+exit 0; 
+
+sub formatTimeStamp () {
+    my ($mtime) = (@_);
+    my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime($mtime);
+
+    $year+=1900;
+    $mon++;
+
+    $fmt = $year . "-" . sprintf("%02d",$mon) . "-" . sprintf("%02d",$mday) . " " . 
+	sprintf("%02d", $hour) . ":" . sprintf("%02d",$min) . ":" . sprintf("%02d",$sec); 
+
+    return $fmt;
+}
+
+
+
+
+
+