Mercurial > hg > LGDataverses
view scripts/migration/files_source_ @ 14:be7787c36e58 default tip
new: nofity LGSercies for deleted files
author | Zoe Hong <zhong@mpiwg-berlin.mpg.de> |
---|---|
date | Mon, 02 Nov 2015 16:41:23 +0100 |
parents | a50cf11e5178 |
children |
line wrap: on
line source
#!/usr/bin/perl my $dvobjectoffset = shift @ARGV; unless ($dvobjectoffset > 0) { print STDERR "Usage: ./files_source_ <DVOBJECT DB ID OFFSET>\n"; exit 1; } my $filecatid = 0; # file categories (this is a new object in 4.0, so there are no 3.6 IDs to reuse) # (offset this if adding content content to an already populated db) use DBI; my $host = "localhost"; my $username = "xxxxx"; my $database = "xxxxx"; my $password = "xxxxx"; my $dbh = DBI->connect("DBI:Pg:dbname=$database;host=$host",$username,$password); open PL, ">packlist.txt"; %STUDYMAP = {}; %STUDYFILEMAP = {}; while ( <> ) { chop; my ($globalid, $dsid, $dsvid, $dsvnum) = split("\t", $_); %FILECATEGORIES = {}; # file categories for this dataset. if ($globalid =~/^([a-z]*):(.*)\/([^\/]*)$/) { $protocol = $1; $authority = $2; $identifier = $3; # print $protocol . " " . $authority . " " . $identifier . "\n"; } else { print STDERR "WARNING! illegal global id: " . $globalid . "\n"; next; } my $sth; #$sth = $dbh->prepare(qq {SELECT s.id, v.id FROM study s, studyversion v WHERE v.study_id = s.id AND s.protocol = '$protocol' AND s.authority='$authority' AND s.studyid = '$identifier' AND v.versionstate = 'RELEASED'}); $sth = $dbh->prepare(qq {SELECT s.id, v.id FROM study s, studyversion v WHERE v.study_id = s.id AND s.protocol = '$protocol' AND s.authority='$authority' AND s.studyid = '$identifier' AND v.versionnumber = $dsvnum}); $sth->execute(); my $vercount = 0; my $sid; my $svid; while ( @foo = $sth->fetchrow() ) { $sid = $foo[0]; $svid = $foo[1]; # print $sid . "\t" . $svid . "\n"; $vercount++; } $sth->finish; unless ($vercount == 1) { print STDERR "WARNING: invalid number of versions for study " . $globalid . ", with version number " . $dsvnum . " (" . $vercount . ")!\n"; next; } $sth = $dbh->prepare(qq {SELECT fm.label, fm.category, fm.description, sf.filetype, sf.filesystemlocation, sf.md5, sf.restricted, sf.subsettable, sf.originalfiletype, sf.unf, sf.id, sf.fileclass, fm.id FROM filemetadata fm, studyfile sf WHERE fm.studyfile_id = sf.id AND fm.studyversion_id = $svid}); # print qq {SELECT fm.label, fm.category, fm.description, sf.filetype, sf.filesystemlocation, sf.md5, sf.restricted, sf.subsettable, sf.originalfiletype, sf.unf FROM filemetadata fm, studyfile sf WHERE fm.studyfile_id = sf.id AND fm.studyversion_id = $svid} . "\n"; $sth->execute(); my $newfile = 0; while ( @foo = $sth->fetchrow() ) { # new filemetadata fields: $label = $foo[0]; $description = $foo[2]; $description =~s/\n/ /g; $description = $dbh->quote($description); # category: $category = $foo[1]; # new datafile fields: $type = $foo[3]; unless ($type =~m:/:) { $type = "application/octet-stream"; } $md5 = $foo[5]; $restricted = $foo[6]; # location of the file, on the old filesystem: $fslocation = $foo[4]; # additional info for subsettable files: # (will go into the new datatable) $subsettable = $foo[7]; $originalfiletype = $foo[8]; $unf = $foo[9]; # id of the existing studyfile: $sfid = $foo[10]; # "class" of the existing studyfile: # (tabular, "other", etc.) $fileclass = $foo[11]; $fmid = $foo[12]; #print join ("\t", @_) . "\n"; if ($label =~/[\\\/:\*\?\"\<\>\|;\#]/) { $preservedlabel = $label; $label=~s/[\\\/:\*\?\"\<\>\|;\#]//g; print STDERR "LABEL REPLACED: (FILEMETA: " . $fmid . ", FILE: " . $sfid . ", STUDY: " . $sid . ", VERSION: " . $svid . ", GLOBALID: " . $globalid . ") OLD: \"" . $preservedlabel . "\", NEW: \"" . $label . "\"\n"; } if ($label eq '') { $label = "UNKNOWN"; } $label = $dbh->quote($label); unless ($STUDYFILEMAP{$sfid}) { $newfile = 1; # Certain things only need to be done once per file - # namely, each file needs one dvobject and datafile each; # same for the datatables and variables. # Other things, like filemetadatas, need to be created one # per version. $newdatafileid = ($dvobjectoffset+$sfid); $STUDYFILEMAP{$sfid} = $newdatafileid; ##$dvobjectoffset++; $fsname = $fslocation; if ($fslocation =~/^http/ ) { $fsize = 0; $fmtime = &formatTimeStamp(time); } else { if ( -f $fslocation ) { @fstats = stat($fslocation); $fsize = $fstats[7]; $mtime = $fstats[9]; $fmtime = &formatTimeStamp($mtime); $packlistentry = $fslocation; $packlistentry =~s/.*\/DVN\/data\///; print PL $packlistentry . "\n"; } else { print STDERR "WARNING: file " . $fslocation . " not found!\n"; $fsize = 0; $fmtime = &formatTimeStamp(time); } $fsname =~s/^.*\///g; } # dvobject: print qq {INSERT INTO dvobject (id, dtype, owner_id, createdate, modificationtime) VALUES ($newdatafileid, 'DataFile', $dsid, '$fmtime', '$fmtime');} . "\n"; # datafile object: print qq {INSERT INTO datafile (id, contenttype, filesystemname, filesize, md5, restricted) VALUES ($newdatafileid, '$type', '$fsname', $fsize, '$md5', TRUE);} . "\n"; } else { $newdatafileid = $STUDYFILEMAP{$sfid}; $newfile = 0; } # file metadata object: print qq {INSERT INTO filemetadata (id, description, label, restricted, version, datasetversion_id, datafile_id) VALUES ($fmid, $description, $label, TRUE, 1, $dsvid, $newdatafileid);} . "\n"; ##print qq {INSERT INTO filemetadata (id, description, label, restricted, version, datasetversion_id, datafile_id) VALUES ($fmid, $description, $label, TRUE, 1, $dsvid, $dvobjectoffset);} . "\n"; # and the category, if exists: if ($category && $category ne "") { $category = $dbh->quote($category); unless ($FILECATEGORIES{$category}) { # this is a new category (for this dataset), # so it needs to be created: $filecatid++; print qq{INSERT INTO datafilecategory (id, name, dataset_id) VALUES ($filecatid, $category, $newdatafileid);} . "\n"; #print qq{INSERT INTO datafilecategory (id, name, dataset_id) VALUES ($filecatid, $category, $dvobjectoffset);} . "\n"; $FILECATEGORIES{$category} = $filecatid; } my $fcid = $FILECATEGORIES{$category}; print qq{INSERT INTO filemetadata_datafilecategory (filecategories_id, filemetadatas_id) VALUES ($fcid, $fmid);} . "\n"; } # subsettable files: # (again, this only needs to be done once per file!) if ($newfile && ($fileclass eq "TabularDataFile")) { #print STDERR "this is a subsettable file.\n"; # NOTE: # there's only one datatable per file - make sure to only run this once! # (i.e., not for every version!) $sth1 = $dbh->prepare(qq {SELECT id, varquantity, casequantity, unf, recordspercase FROM datatable WHERE studyfile_id = $sfid}); $sth1->execute(); $count = 0; while ( @dt = $sth1->fetchrow() ) { $dtid = $dt[0]; $varquantity = $dt[1]; $casequantity = $dt[2]; $dtunf = $dt[3]; $recordspercase = $dt[4]; $count++; unless ($unf eq $dtunf) { print STDERR "WARNING: unf mismatch, between studyfile and datatable: " + $unf + ":" + $dtunf + "\n"; } # datatable object: if ($recordspercase) { print qq {INSERT INTO datatable (id, varquantity, casequantity, unf, originalfileformat, recordspercase, datafile_id) VALUES ($dtid, $varquantity, $casequantity, '$unf', '$originalfiletype', $recordspercase, $newdatafileid);} . "\n"; #print qq {INSERT INTO datatable (id, varquantity, casequantity, unf, originalfileformat, recordspercase, datafile_id) VALUES ($dtid, $varquantity, $casequantity, '$unf', '$originalfiletype', $recordspercase, $dvobjectoffset);} . "\n"; } else { print qq {INSERT INTO datatable (id, varquantity, casequantity, unf, originalfileformat, datafile_id) VALUES ($dtid, $varquantity, $casequantity, '$unf', '$originalfiletype', $newdatafileid);} . "\n"; #print qq {INSERT INTO datatable (id, varquantity, casequantity, unf, originalfileformat, datafile_id) VALUES ($dtid, $varquantity, $casequantity, '$unf', '$originalfiletype', $dvobjectoffset);} . "\n"; } } $sth1->finish; unless ($count == 1) { print STDERR "WARNING: invalid numbe of datatables: " + $count +".\n"; } else { # variables: $sth1 = $dbh->prepare(qq {SELECT name, label, variableformattype_id, variableintervaltype_id, formatcategory, formatschema, formatschemaname, unf, fileorder, weighted, orderedfactor, numberofdecimalpoints, universe, filestartposition, fileendposition, recordsegmentnumber, id FROM datavariable WHERE datatable_id = $dtid}); $sth1->execute(); while ( @dv = $sth1->fetchrow() ) { $varname = $dv[0]; $varname = $dbh->quote($varname); $varlabel = $dv[1]; $varlabel = $dbh->quote($varlabel); $variableformattype_id = $dv[2]; # the old school formattype_id and # intervaltype_id need to be adjusted by 1, # to match the new enum values used in the # 4.0 datavariables: $variableformattype_id--; $variableintervaltype_id = $dv[3]; $variableintervaltype_id--; $varformatcategory = $dv[4]; $varformatschema = $dv[5]; $varformatschemaname = $dv[6]; $varunf = $dv[7]; $varfileorder = $dv[8]; $varweighted = $dv[9]; if ($varweighted) { $varweighted = "TRUE"; } else { $varweighted = "FALSE"; } $varorderedfactor = $dv[10]; if ($varorderedfactor) { $varorderedfactor = "TRUE"; } else { $varorderedfactor = "FALSE"; } $varnumberofdecimalpoints = $dv[11]; $varuniverse = $dv[12]; $varfilestartposition = $dv[13]; $varfileendposition = $dv[14]; $varrecordsegmentnumber = $dv[15]; $varid = $dv[16]; # new datavariable object: $newdvfields = "id, name, label, interval, type, unf, fileorder, orderedfactor, weighted, datatable_id"; $newdvvalues = qq {$varid, $varname, $varlabel, $variableintervaltype_id, $variableformattype_id, '$varunf', $varfileorder, $varorderedfactor, $varweighted, $dtid}; if ($varformatschemaname) { # becomes "format": $newdvfields = $newdvfields . ", format"; $newdvvalues = qq{$newdvvalues, '$varformatschemaname'}; } if ($varformatcategory) { $newdvfields = $newdvfields . ", formatcategory"; $newdvvalues = qq{$newdvvalues, '$varformatcategory'}; } if ($varfilestartposition) { $newdvfields = $newdvfields . ", filestartposition"; $newdvvalues = qq{$newdvvalues, $varfilestartposition}; } if ($varfileendposition) { $newdvfields = $newdvfields . ", fileendposition"; $newdvvalues = qq{$newdvvalues, $varfileendposition}; } if ($varrecordsegmentnumber) { $newdvfields = $newdvfields . ", recordsegmentnumber"; $newdvvalues = qq{$newdvvalues, $varrecordsegmentnumber}; } if ($varuniverse) { $newdvfields = $newdvfields . ", universe"; $newdvvalues = qq{$newdvvalues, '$varuniverse'}; } if ($varnumberofdecimalpoints) { $newdvfields = $newdvfields . ", numberofdecimalpoints"; $newdvvalues = qq{$newdvvalues, $numberofdecimalpoints}; } print qq {INSERT INTO datavariable ($newdvfields) VALUES ($newdvvalues);} . "\n"; # variable categories: $sth2 = $dbh->prepare(qq {SELECT id, label, value, missing, catorder, frequency FROM variablecategory WHERE datavariable_id = $varid}); $sth2->execute(); while ( @vc = $sth2->fetchrow() ) { $varcatid = $vc[0]; $varcatlabel = $vc[1]; $varcatvalue = $vc[2]; $varcatmissing = $vc[3]; if ($varcatmissing) { $varcatmissing = "true"; } else { $varcatmissing = "false"; } $varcatorder = $vc[4]; unless ($varcatorder) { if ($varcatorder eq "" || $varcatorder != 0) { $varcatorder = "null"; } } $varcatfreq = $vc[5]; unless ($varcatfreq) { if ($varcatfreq eq "" || $varcatfreq != 0) { $varcatfreq = "null"; } } # only migrate the *real* categories: if ($varcatlabel) { $varcatlabel = $dbh->quote($varcatlabel); unless ($varcatvalue || ($varcatvalue eq "") || ($varcatvalue == 0)) { #print STDERR "WARNING: empty var cat value (" . $varcatlabel . ")\n"; print STDERR qq {INSERT INTO variablecategory (id, label, value, missing, catorder, frequency, datavariable_id) VALUES ($varcatid, $varcatlabel, $varcatvalue, $varcatmissing, $varcatorder, $varcatfreq, $varid);} . "\n"; } else { $varcatvalue = $dbh->quote($varcatvalue); print qq {INSERT INTO variablecategory (id, label, value, missing, catorder, frequency, datavariable_id) VALUES ($varcatid, $varcatlabel, $varcatvalue, $varcatmissing, $varcatorder, $varcatfreq, $varid);} . "\n"; } } else { #print STDERR "empty var cat label.\n"; } } $sth2->finish; } $sth1->finish; } } } $sth->finish; } $dbh->disconnect; close PL; exit 0; sub formatTimeStamp () { my ($mtime) = (@_); my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime($mtime); $year+=1900; $mon++; $fmt = $year . "-" . sprintf("%02d",$mon) . "-" . sprintf("%02d",$mday) . " " . sprintf("%02d", $hour) . ":" . sprintf("%02d",$min) . ":" . sprintf("%02d",$sec); return $fmt; }