Mercurial > hg > LGDataverses
diff scripts/migration/files_source_ @ 10:a50cf11e5178
Rewrite LGDataverse completely upgrading to dataverse4.0
| author | Zoe Hong <zhong@mpiwg-berlin.mpg.de> |
|---|---|
| date | Tue, 08 Sep 2015 17:00:21 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/migration/files_source_ Tue Sep 08 17:00:21 2015 +0200 @@ -0,0 +1,473 @@ +#!/usr/bin/perl + +my $dvobjectoffset = shift @ARGV; + +unless ($dvobjectoffset > 0) +{ + print STDERR "Usage: ./files_source_ <DVOBJECT DB ID OFFSET>\n"; + exit 1; +} + +my $filecatid = 0; # file categories (this is a new object in 4.0, so there are no 3.6 IDs to reuse) +# (offset this if adding content content to an already populated db) + +use DBI; + +my $host = "localhost"; +my $username = "xxxxx"; +my $database = "xxxxx"; +my $password = "xxxxx"; + +my $dbh = DBI->connect("DBI:Pg:dbname=$database;host=$host",$username,$password); + +open PL, ">packlist.txt"; + +%STUDYMAP = {}; +%STUDYFILEMAP = {}; + +while ( <> ) +{ + chop; + my ($globalid, $dsid, $dsvid, $dsvnum) = split("\t", $_); + + %FILECATEGORIES = {}; # file categories for this dataset. + + if ($globalid =~/^([a-z]*):(.*)\/([^\/]*)$/) + { + $protocol = $1; + $authority = $2; + $identifier = $3; + +# print $protocol . " " . $authority . " " . $identifier . "\n"; + } + else + { + print STDERR "WARNING! illegal global id: " . $globalid . "\n"; + next; + } + + my $sth; + + #$sth = $dbh->prepare(qq {SELECT s.id, v.id FROM study s, studyversion v WHERE v.study_id = s.id AND s.protocol = '$protocol' AND s.authority='$authority' AND s.studyid = '$identifier' AND v.versionstate = 'RELEASED'}); + $sth = $dbh->prepare(qq {SELECT s.id, v.id FROM study s, studyversion v WHERE v.study_id = s.id AND s.protocol = '$protocol' AND s.authority='$authority' AND s.studyid = '$identifier' AND v.versionnumber = $dsvnum}); + $sth->execute(); + + my $vercount = 0; + + my $sid; + my $svid; + + while ( @foo = $sth->fetchrow() ) + { + $sid = $foo[0]; + $svid = $foo[1]; + +# print $sid . "\t" . $svid . "\n"; + $vercount++; + } + + $sth->finish; + + unless ($vercount == 1) + { + print STDERR "WARNING: invalid number of versions for study " . $globalid . ", with version number " . $dsvnum . " (" . $vercount . ")!\n"; + next; + } + + $sth = $dbh->prepare(qq {SELECT fm.label, fm.category, fm.description, sf.filetype, sf.filesystemlocation, sf.md5, sf.restricted, sf.subsettable, sf.originalfiletype, sf.unf, sf.id, sf.fileclass, fm.id FROM filemetadata fm, studyfile sf WHERE fm.studyfile_id = sf.id AND fm.studyversion_id = $svid}); + +# print qq {SELECT fm.label, fm.category, fm.description, sf.filetype, sf.filesystemlocation, sf.md5, sf.restricted, sf.subsettable, sf.originalfiletype, sf.unf FROM filemetadata fm, studyfile sf WHERE fm.studyfile_id = sf.id AND fm.studyversion_id = $svid} . "\n"; + + $sth->execute(); + + my $newfile = 0; + + while ( @foo = $sth->fetchrow() ) + { + # new filemetadata fields: + $label = $foo[0]; + $description = $foo[2]; + $description =~s/\n/ /g; + $description = $dbh->quote($description); + # category: + $category = $foo[1]; + # new datafile fields: + $type = $foo[3]; + unless ($type =~m:/:) + { + $type = "application/octet-stream"; + } + $md5 = $foo[5]; + $restricted = $foo[6]; + # location of the file, on the old filesystem: + $fslocation = $foo[4]; + # additional info for subsettable files: + # (will go into the new datatable) + $subsettable = $foo[7]; + $originalfiletype = $foo[8]; + $unf = $foo[9]; + # id of the existing studyfile: + $sfid = $foo[10]; + # "class" of the existing studyfile: + # (tabular, "other", etc.) + $fileclass = $foo[11]; + $fmid = $foo[12]; + #print join ("\t", @_) . "\n"; + + if ($label =~/[\\\/:\*\?\"\<\>\|;\#]/) + { + $preservedlabel = $label; + $label=~s/[\\\/:\*\?\"\<\>\|;\#]//g; + + print STDERR "LABEL REPLACED: (FILEMETA: " . $fmid . ", FILE: " . $sfid . ", STUDY: " . $sid . ", VERSION: " . $svid . ", GLOBALID: " . $globalid . ") OLD: \"" . $preservedlabel . "\", NEW: \"" . $label . "\"\n"; + } + + if ($label eq '') + { + $label = "UNKNOWN"; + } + + $label = $dbh->quote($label); + + + unless ($STUDYFILEMAP{$sfid}) + { + $newfile = 1; + # Certain things only need to be done once per file - + # namely, each file needs one dvobject and datafile each; + # same for the datatables and variables. + # Other things, like filemetadatas, need to be created one + # per version. + + $newdatafileid = ($dvobjectoffset+$sfid); + $STUDYFILEMAP{$sfid} = $newdatafileid; + ##$dvobjectoffset++; + + $fsname = $fslocation; + + if ($fslocation =~/^http/ ) + { + $fsize = 0; + $fmtime = &formatTimeStamp(time); + } + else + { + if ( -f $fslocation ) + { + @fstats = stat($fslocation); + $fsize = $fstats[7]; + $mtime = $fstats[9]; + + $fmtime = &formatTimeStamp($mtime); + $packlistentry = $fslocation; + $packlistentry =~s/.*\/DVN\/data\///; + print PL $packlistentry . "\n"; + } + else + { + print STDERR "WARNING: file " . $fslocation . " not found!\n"; + $fsize = 0; + $fmtime = &formatTimeStamp(time); + } + + $fsname =~s/^.*\///g; + } + + # dvobject: + + print qq {INSERT INTO dvobject (id, dtype, owner_id, createdate, modificationtime) VALUES ($newdatafileid, 'DataFile', $dsid, '$fmtime', '$fmtime');} . "\n"; + + # datafile object: + + print qq {INSERT INTO datafile (id, contenttype, filesystemname, filesize, md5, restricted) VALUES ($newdatafileid, '$type', '$fsname', $fsize, '$md5', TRUE);} . "\n"; + } + else + { + $newdatafileid = $STUDYFILEMAP{$sfid}; + $newfile = 0; + } + + # file metadata object: + print qq {INSERT INTO filemetadata (id, description, label, restricted, version, datasetversion_id, datafile_id) VALUES ($fmid, $description, $label, TRUE, 1, $dsvid, $newdatafileid);} . "\n"; + ##print qq {INSERT INTO filemetadata (id, description, label, restricted, version, datasetversion_id, datafile_id) VALUES ($fmid, $description, $label, TRUE, 1, $dsvid, $dvobjectoffset);} . "\n"; + + # and the category, if exists: + + if ($category && $category ne "") + { + $category = $dbh->quote($category); + unless ($FILECATEGORIES{$category}) + { + # this is a new category (for this dataset), + # so it needs to be created: + + $filecatid++; + + print qq{INSERT INTO datafilecategory (id, name, dataset_id) VALUES ($filecatid, $category, $newdatafileid);} . "\n"; + #print qq{INSERT INTO datafilecategory (id, name, dataset_id) VALUES ($filecatid, $category, $dvobjectoffset);} . "\n"; + + $FILECATEGORIES{$category} = $filecatid; + } + + my $fcid = $FILECATEGORIES{$category}; + print qq{INSERT INTO filemetadata_datafilecategory (filecategories_id, filemetadatas_id) VALUES ($fcid, $fmid);} . "\n"; + + } + + + # subsettable files: + # (again, this only needs to be done once per file!) + + + if ($newfile && ($fileclass eq "TabularDataFile")) + { + #print STDERR "this is a subsettable file.\n"; + + # NOTE: + # there's only one datatable per file - make sure to only run this once! + # (i.e., not for every version!) + + $sth1 = $dbh->prepare(qq {SELECT id, varquantity, casequantity, unf, recordspercase FROM datatable WHERE studyfile_id = $sfid}); + + $sth1->execute(); + + $count = 0; + + while ( @dt = $sth1->fetchrow() ) + { + $dtid = $dt[0]; + $varquantity = $dt[1]; + $casequantity = $dt[2]; + $dtunf = $dt[3]; + $recordspercase = $dt[4]; + + $count++; + + unless ($unf eq $dtunf) + { + print STDERR "WARNING: unf mismatch, between studyfile and datatable: " + $unf + ":" + $dtunf + "\n"; + } + + # datatable object: + + + if ($recordspercase) + { + print qq {INSERT INTO datatable (id, varquantity, casequantity, unf, originalfileformat, recordspercase, datafile_id) VALUES ($dtid, $varquantity, $casequantity, '$unf', '$originalfiletype', $recordspercase, $newdatafileid);} . "\n"; + #print qq {INSERT INTO datatable (id, varquantity, casequantity, unf, originalfileformat, recordspercase, datafile_id) VALUES ($dtid, $varquantity, $casequantity, '$unf', '$originalfiletype', $recordspercase, $dvobjectoffset);} . "\n"; + } + else + { + print qq {INSERT INTO datatable (id, varquantity, casequantity, unf, originalfileformat, datafile_id) VALUES ($dtid, $varquantity, $casequantity, '$unf', '$originalfiletype', $newdatafileid);} . "\n"; + #print qq {INSERT INTO datatable (id, varquantity, casequantity, unf, originalfileformat, datafile_id) VALUES ($dtid, $varquantity, $casequantity, '$unf', '$originalfiletype', $dvobjectoffset);} . "\n"; + } + } + + $sth1->finish; + + unless ($count == 1) + { + print STDERR "WARNING: invalid numbe of datatables: " + $count +".\n"; + } + else + { + # variables: + $sth1 = $dbh->prepare(qq {SELECT name, label, variableformattype_id, variableintervaltype_id, formatcategory, formatschema, formatschemaname, unf, fileorder, weighted, orderedfactor, numberofdecimalpoints, universe, filestartposition, fileendposition, recordsegmentnumber, id FROM datavariable WHERE datatable_id = $dtid}); + + + $sth1->execute(); + + while ( @dv = $sth1->fetchrow() ) + { + $varname = $dv[0]; + $varname = $dbh->quote($varname); + $varlabel = $dv[1]; + $varlabel = $dbh->quote($varlabel); + $variableformattype_id = $dv[2]; + # the old school formattype_id and + # intervaltype_id need to be adjusted by 1, + # to match the new enum values used in the + # 4.0 datavariables: + $variableformattype_id--; + $variableintervaltype_id = $dv[3]; + $variableintervaltype_id--; + $varformatcategory = $dv[4]; + $varformatschema = $dv[5]; + $varformatschemaname = $dv[6]; + $varunf = $dv[7]; + $varfileorder = $dv[8]; + $varweighted = $dv[9]; + if ($varweighted) + { + $varweighted = "TRUE"; + } + else + { + $varweighted = "FALSE"; + } + $varorderedfactor = $dv[10]; + if ($varorderedfactor) + { + $varorderedfactor = "TRUE"; + } + else + { + $varorderedfactor = "FALSE"; + } + + $varnumberofdecimalpoints = $dv[11]; + $varuniverse = $dv[12]; + $varfilestartposition = $dv[13]; + $varfileendposition = $dv[14]; + $varrecordsegmentnumber = $dv[15]; + $varid = $dv[16]; + + + + # new datavariable object: + + $newdvfields = "id, name, label, interval, type, unf, fileorder, orderedfactor, weighted, datatable_id"; + $newdvvalues = qq {$varid, $varname, $varlabel, $variableintervaltype_id, $variableformattype_id, '$varunf', $varfileorder, $varorderedfactor, $varweighted, $dtid}; + + if ($varformatschemaname) + { + # becomes "format": + $newdvfields = $newdvfields . ", format"; + $newdvvalues = qq{$newdvvalues, '$varformatschemaname'}; + } + + if ($varformatcategory) + { + $newdvfields = $newdvfields . ", formatcategory"; + $newdvvalues = qq{$newdvvalues, '$varformatcategory'}; + } + + if ($varfilestartposition) + { + $newdvfields = $newdvfields . ", filestartposition"; + $newdvvalues = qq{$newdvvalues, $varfilestartposition}; + } + + if ($varfileendposition) + { + $newdvfields = $newdvfields . ", fileendposition"; + $newdvvalues = qq{$newdvvalues, $varfileendposition}; + } + + if ($varrecordsegmentnumber) + { + $newdvfields = $newdvfields . ", recordsegmentnumber"; + $newdvvalues = qq{$newdvvalues, $varrecordsegmentnumber}; + } + + if ($varuniverse) + { + $newdvfields = $newdvfields . ", universe"; + $newdvvalues = qq{$newdvvalues, '$varuniverse'}; + } + + if ($varnumberofdecimalpoints) + { + $newdvfields = $newdvfields . ", numberofdecimalpoints"; + $newdvvalues = qq{$newdvvalues, $numberofdecimalpoints}; + } + + + print qq {INSERT INTO datavariable ($newdvfields) VALUES ($newdvvalues);} . "\n"; + + # variable categories: + $sth2 = $dbh->prepare(qq {SELECT id, label, value, missing, catorder, frequency FROM variablecategory WHERE datavariable_id = $varid}); + $sth2->execute(); + + while ( @vc = $sth2->fetchrow() ) + { + $varcatid = $vc[0]; + $varcatlabel = $vc[1]; + $varcatvalue = $vc[2]; + $varcatmissing = $vc[3]; + if ($varcatmissing) + { + $varcatmissing = "true"; + } + else + { + $varcatmissing = "false"; + } + $varcatorder = $vc[4]; + unless ($varcatorder) + { + if ($varcatorder eq "" || $varcatorder != 0) + { + $varcatorder = "null"; + } + } + $varcatfreq = $vc[5]; + unless ($varcatfreq) + { + if ($varcatfreq eq "" || $varcatfreq != 0) + { + $varcatfreq = "null"; + } + } + + + # only migrate the *real* categories: + if ($varcatlabel) + { + $varcatlabel = $dbh->quote($varcatlabel); + unless ($varcatvalue || ($varcatvalue eq "") || ($varcatvalue == 0)) + { + #print STDERR "WARNING: empty var cat value (" . $varcatlabel . ")\n"; + print STDERR qq {INSERT INTO variablecategory (id, label, value, missing, catorder, frequency, datavariable_id) VALUES ($varcatid, $varcatlabel, $varcatvalue, $varcatmissing, $varcatorder, $varcatfreq, $varid);} . "\n"; + } + else + { + $varcatvalue = $dbh->quote($varcatvalue); + print qq {INSERT INTO variablecategory (id, label, value, missing, catorder, frequency, datavariable_id) VALUES ($varcatid, $varcatlabel, $varcatvalue, $varcatmissing, $varcatorder, $varcatfreq, $varid);} . "\n"; + } + } + else + { + #print STDERR "empty var cat label.\n"; + } + } + + $sth2->finish; + } + + $sth1->finish; + } + + + + } + } + + $sth->finish; + +} + +$dbh->disconnect; + +close PL; + +exit 0; + +sub formatTimeStamp () { + my ($mtime) = (@_); + my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime($mtime); + + $year+=1900; + $mon++; + + $fmt = $year . "-" . sprintf("%02d",$mon) . "-" . sprintf("%02d",$mday) . " " . + sprintf("%02d", $hour) . ":" . sprintf("%02d",$min) . ":" . sprintf("%02d",$sec); + + return $fmt; +} + + + + + +
