view scripts/migration/files_source_ @ 14:be7787c36e58 default tip

new: nofity LGSercies for deleted files
author Zoe Hong <zhong@mpiwg-berlin.mpg.de>
date Mon, 02 Nov 2015 16:41:23 +0100
parents a50cf11e5178
children
line wrap: on
line source

#!/usr/bin/perl

my $dvobjectoffset = shift @ARGV;

unless ($dvobjectoffset > 0)
{
    print STDERR "Usage: ./files_source_ <DVOBJECT DB ID OFFSET>\n";
    exit 1;
}

my $filecatid = 0; # file categories (this is a new object in 4.0, so there are no 3.6 IDs to reuse)
# (offset this if adding content content to an already populated db)

use DBI;

my $host = "localhost"; 
my $username = "xxxxx";
my $database = "xxxxx";
my $password = "xxxxx";

my $dbh = DBI->connect("DBI:Pg:dbname=$database;host=$host",$username,$password); 

open PL, ">packlist.txt";

%STUDYMAP = {};
%STUDYFILEMAP = {};

while ( <> )
{
    chop; 
    my ($globalid, $dsid, $dsvid, $dsvnum) = split("\t", $_); 

    %FILECATEGORIES = {}; # file categories for this dataset.

    if ($globalid =~/^([a-z]*):(.*)\/([^\/]*)$/)
    {
	$protocol = $1;
	$authority = $2; 
	$identifier = $3;

#	print $protocol . " " . $authority . " " . $identifier . "\n";
    }
    else 
    {
	print STDERR "WARNING! illegal global id: " . $globalid . "\n";
	next; 
    }

    my $sth; 

    #$sth = $dbh->prepare(qq {SELECT s.id, v.id FROM study s, studyversion v WHERE v.study_id = s.id AND s.protocol = '$protocol' AND s.authority='$authority' AND s.studyid = '$identifier' AND v.versionstate = 'RELEASED'}); 
    $sth = $dbh->prepare(qq {SELECT s.id, v.id FROM study s, studyversion v WHERE v.study_id = s.id AND s.protocol = '$protocol' AND s.authority='$authority' AND s.studyid = '$identifier' AND v.versionnumber = $dsvnum}); 
    $sth->execute();

    my $vercount = 0; 

    my $sid; 
    my $svid; 

    while ( @foo = $sth->fetchrow() )
    {
	$sid = $foo[0];
	$svid = $foo[1];

#	print $sid . "\t" . $svid . "\n";
	$vercount++;
    }

    $sth->finish; 

    unless ($vercount == 1) 
    {
	print STDERR "WARNING: invalid number of versions for study " . $globalid . ", with version number " . $dsvnum . " (" . $vercount . ")!\n";
	next; 
    }

    $sth = $dbh->prepare(qq {SELECT fm.label, fm.category, fm.description, sf.filetype, sf.filesystemlocation, sf.md5, sf.restricted, sf.subsettable, sf.originalfiletype, sf.unf, sf.id, sf.fileclass, fm.id FROM filemetadata fm, studyfile sf WHERE fm.studyfile_id = sf.id AND fm.studyversion_id = $svid});

#    print qq {SELECT fm.label, fm.category, fm.description, sf.filetype, sf.filesystemlocation, sf.md5, sf.restricted, sf.subsettable, sf.originalfiletype, sf.unf FROM filemetadata fm, studyfile sf WHERE fm.studyfile_id = sf.id AND fm.studyversion_id = $svid} . "\n";

    $sth->execute();

    my $newfile = 0; 

    while ( @foo = $sth->fetchrow() )
    {
	# new filemetadata fields: 
	$label = $foo[0];
	$description = $foo[2];
	$description =~s/\n/ /g;
	$description = $dbh->quote($description);
	# category: 
	$category = $foo[1];
	# new datafile fields: 
	$type = $foo[3];
	unless ($type =~m:/:)
	{
	    $type = "application/octet-stream";
	}
	$md5 = $foo[5];
	$restricted = $foo[6];
	# location of the file, on the old filesystem: 
	$fslocation = $foo[4];
	# additional info for subsettable files: 
	# (will go into the new datatable)
	$subsettable = $foo[7]; 
	$originalfiletype = $foo[8];
	$unf = $foo[9]; 
	# id of the existing studyfile: 
	$sfid = $foo[10];
	# "class" of the existing studyfile: 
	# (tabular, "other", etc.)
	$fileclass = $foo[11];
	$fmid = $foo[12];
	#print join ("\t", @_) . "\n";

	if ($label =~/[\\\/:\*\?\"\<\>\|;\#]/)
	{
	    $preservedlabel = $label; 
	    $label=~s/[\\\/:\*\?\"\<\>\|;\#]//g;
	    
	    print STDERR "LABEL REPLACED: (FILEMETA: " . $fmid . ", FILE: " . $sfid . ", STUDY: " . $sid . ", VERSION: " . $svid . ", GLOBALID: " . $globalid . ") OLD: \"" . $preservedlabel . "\", NEW: \"" . $label . "\"\n";
	}

	if ($label eq '')
	{
	    $label = "UNKNOWN";
	}

	$label = $dbh->quote($label);


	unless ($STUDYFILEMAP{$sfid})
	{
	    $newfile = 1; 
	    # Certain things only need to be done once per file -
	    # namely, each file needs one dvobject and datafile each;
	    # same for the datatables and variables. 
	    # Other things, like filemetadatas, need to be created one 
	    # per version. 
	    
	    $newdatafileid = ($dvobjectoffset+$sfid);
	    $STUDYFILEMAP{$sfid} = $newdatafileid; 
	    ##$dvobjectoffset++; 

	    $fsname = $fslocation; 

	    if ($fslocation =~/^http/ )
	    {
		$fsize = 0;
		$fmtime = &formatTimeStamp(time);
	    }
	    else 
	    {
		if ( -f $fslocation ) 
		{
		    @fstats = stat($fslocation); 
		    $fsize = $fstats[7]; 
		    $mtime = $fstats[9]; 

		    $fmtime = &formatTimeStamp($mtime);
		    $packlistentry = $fslocation; 
		    $packlistentry =~s/.*\/DVN\/data\///; 
		    print PL $packlistentry . "\n";
		}
		else 
		{
		    print STDERR "WARNING: file " . $fslocation . " not found!\n";
		    $fsize = 0;
		    $fmtime = &formatTimeStamp(time);
		}
		
		$fsname =~s/^.*\///g; 
	    }

	    # dvobject: 

	    print qq {INSERT INTO dvobject (id, dtype, owner_id, createdate, modificationtime) VALUES ($newdatafileid, 'DataFile', $dsid, '$fmtime', '$fmtime');} . "\n";

	    # datafile object:

	    print qq {INSERT INTO datafile (id, contenttype, filesystemname, filesize, md5, restricted) VALUES ($newdatafileid, '$type', '$fsname', $fsize, '$md5', TRUE);} . "\n";
	}
	else 
	{
	    $newdatafileid = $STUDYFILEMAP{$sfid};
	    $newfile = 0; 
	}
	    
	# file metadata object:
	print qq {INSERT INTO filemetadata (id, description, label, restricted, version, datasetversion_id, datafile_id) VALUES ($fmid, $description, $label, TRUE, 1, $dsvid, $newdatafileid);} . "\n";
	##print qq {INSERT INTO filemetadata (id, description, label, restricted, version, datasetversion_id, datafile_id) VALUES ($fmid, $description, $label, TRUE, 1, $dsvid, $dvobjectoffset);} . "\n";

	# and the category, if exists:

	if ($category && $category ne "") 
	{
	    $category = $dbh->quote($category); 
	    unless ($FILECATEGORIES{$category})
	    {
		# this is a new category (for this dataset), 
		# so it needs to be created: 

		$filecatid++;

		print qq{INSERT INTO datafilecategory (id, name, dataset_id) VALUES ($filecatid, $category, $newdatafileid);} . "\n";
		#print qq{INSERT INTO datafilecategory (id, name, dataset_id) VALUES ($filecatid, $category, $dvobjectoffset);} . "\n";

		$FILECATEGORIES{$category} = $filecatid; 
	    }

	    my $fcid = $FILECATEGORIES{$category};
	    print qq{INSERT INTO filemetadata_datafilecategory (filecategories_id, filemetadatas_id) VALUES ($fcid, $fmid);} . "\n";

	}


	# subsettable files: 
	# (again, this only needs to be done once per file!)


	if ($newfile && ($fileclass eq "TabularDataFile"))
	{
	    #print STDERR "this is a subsettable file.\n";	

	    # NOTE: 
	    # there's only one datatable per file - make sure to only run this once!
	    # (i.e., not for every version!)
	    
	    $sth1 = $dbh->prepare(qq {SELECT id, varquantity, casequantity, unf, recordspercase FROM datatable WHERE studyfile_id = $sfid});

	    $sth1->execute();

	    $count = 0; 

	    while ( @dt = $sth1->fetchrow() )
	    {
		$dtid = $dt[0];
		$varquantity = $dt[1];
		$casequantity = $dt[2];
		$dtunf = $dt[3];
		$recordspercase = $dt[4];

		$count++;

		unless ($unf eq $dtunf) 
		{
		    print STDERR "WARNING: unf mismatch, between studyfile and datatable: " + $unf + ":" + $dtunf + "\n";
		}

		# datatable object:


		if ($recordspercase) 
		{
		    print qq {INSERT INTO datatable (id, varquantity, casequantity, unf, originalfileformat, recordspercase, datafile_id) VALUES ($dtid, $varquantity, $casequantity, '$unf', '$originalfiletype', $recordspercase, $newdatafileid);} . "\n";
		    #print qq {INSERT INTO datatable (id, varquantity, casequantity, unf, originalfileformat, recordspercase, datafile_id) VALUES ($dtid, $varquantity, $casequantity, '$unf', '$originalfiletype', $recordspercase, $dvobjectoffset);} . "\n";
		}
		else
		{
		    print qq {INSERT INTO datatable (id, varquantity, casequantity, unf, originalfileformat, datafile_id) VALUES ($dtid, $varquantity, $casequantity, '$unf', '$originalfiletype', $newdatafileid);} . "\n";
		    #print qq {INSERT INTO datatable (id, varquantity, casequantity, unf, originalfileformat, datafile_id) VALUES ($dtid, $varquantity, $casequantity, '$unf', '$originalfiletype', $dvobjectoffset);} . "\n";
		}
	    }

	    $sth1->finish;
	    
	    unless ($count == 1) 
	    {
		print STDERR "WARNING: invalid numbe of datatables: " + $count +".\n";
	    }
	    else 
	    {
		# variables:
		$sth1 = $dbh->prepare(qq {SELECT name, label, variableformattype_id, variableintervaltype_id, formatcategory, formatschema, formatschemaname, unf, fileorder, weighted, orderedfactor, numberofdecimalpoints, universe, filestartposition, fileendposition, recordsegmentnumber, id FROM datavariable WHERE datatable_id = $dtid});


		$sth1->execute();
		
		while ( @dv = $sth1->fetchrow() )
		{
		    $varname = $dv[0];
		    $varname = $dbh->quote($varname);
		    $varlabel = $dv[1];
		    $varlabel = $dbh->quote($varlabel);
		    $variableformattype_id = $dv[2];
		    # the old school formattype_id and 
		    # intervaltype_id need to be adjusted by 1, 
		    # to match the new enum values used in the
		    # 4.0 datavariables:
		    $variableformattype_id--;
		    $variableintervaltype_id = $dv[3];
		    $variableintervaltype_id--;
		    $varformatcategory = $dv[4];
		    $varformatschema = $dv[5];
		    $varformatschemaname = $dv[6];
		    $varunf = $dv[7];
		    $varfileorder = $dv[8];
		    $varweighted = $dv[9];
		    if ($varweighted)
		    {
			$varweighted = "TRUE";
		    }
		    else 
		    {
			$varweighted = "FALSE";
		    }
		    $varorderedfactor  = $dv[10];
		    if ($varorderedfactor)
		    {
			$varorderedfactor = "TRUE";
		    }
		    else 
		    {
			$varorderedfactor = "FALSE";
		    }

		    $varnumberofdecimalpoints = $dv[11];
		    $varuniverse = $dv[12];
		    $varfilestartposition = $dv[13];
		    $varfileendposition = $dv[14];
		    $varrecordsegmentnumber = $dv[15];
		    $varid = $dv[16];

		    

		    # new datavariable object: 

		    $newdvfields = "id, name, label, interval, type, unf, fileorder, orderedfactor, weighted, datatable_id";
		    $newdvvalues = qq {$varid, $varname, $varlabel, $variableintervaltype_id, $variableformattype_id, '$varunf', $varfileorder, $varorderedfactor, $varweighted, $dtid};

		    if ($varformatschemaname)
		    {
			# becomes "format":
			$newdvfields = $newdvfields . ", format";
			$newdvvalues = qq{$newdvvalues, '$varformatschemaname'};
		    }

		    if ($varformatcategory)
		    {
			$newdvfields = $newdvfields . ", formatcategory";
			$newdvvalues = qq{$newdvvalues, '$varformatcategory'};
		    }

		    if ($varfilestartposition)
		    {
			$newdvfields = $newdvfields . ", filestartposition";
			$newdvvalues = qq{$newdvvalues, $varfilestartposition};
		    }

		    if ($varfileendposition)
		    {
			$newdvfields = $newdvfields . ", fileendposition";
			$newdvvalues = qq{$newdvvalues, $varfileendposition};
		    }

		    if ($varrecordsegmentnumber)
		    {
			$newdvfields = $newdvfields . ", recordsegmentnumber";
			$newdvvalues = qq{$newdvvalues, $varrecordsegmentnumber};
		    }

		    if ($varuniverse)
		    {
			$newdvfields = $newdvfields . ", universe";
			$newdvvalues = qq{$newdvvalues, '$varuniverse'};
		    }

		    if ($varnumberofdecimalpoints)
		    {
			$newdvfields = $newdvfields . ", numberofdecimalpoints";
			$newdvvalues = qq{$newdvvalues, $numberofdecimalpoints};
		    }


		    print qq {INSERT INTO datavariable ($newdvfields) VALUES ($newdvvalues);} . "\n";

		    # variable categories: 
		    $sth2 = $dbh->prepare(qq {SELECT id, label, value, missing, catorder, frequency FROM variablecategory WHERE datavariable_id = $varid});
		    $sth2->execute();
		
		    while ( @vc = $sth2->fetchrow() )
		    {
			$varcatid = $vc[0];
			$varcatlabel = $vc[1];
			$varcatvalue = $vc[2];
			$varcatmissing = $vc[3];
			if ($varcatmissing)
			{
			    $varcatmissing = "true";
			}
			else 
			{
			    $varcatmissing = "false";
			}
			$varcatorder = $vc[4];
			unless ($varcatorder) 
			{
			    if ($varcatorder eq "" || $varcatorder != 0)
			    {
				$varcatorder = "null";
			    }
			}
			$varcatfreq = $vc[5];
			unless ($varcatfreq) 
			{
			    if ($varcatfreq eq "" || $varcatfreq != 0)
			    {
				$varcatfreq = "null";
			    }
			}


			# only migrate the *real* categories: 
			if ($varcatlabel) 
			{
			    $varcatlabel = $dbh->quote($varcatlabel);
			    unless ($varcatvalue || ($varcatvalue eq "") || ($varcatvalue == 0))
			    {
				#print STDERR "WARNING: empty var cat value (" . $varcatlabel . ")\n";
				print STDERR qq {INSERT INTO variablecategory (id, label, value, missing, catorder, frequency, datavariable_id) VALUES ($varcatid, $varcatlabel, $varcatvalue, $varcatmissing, $varcatorder, $varcatfreq, $varid);} . "\n";
			    }
			    else 
			    {
				$varcatvalue = $dbh->quote($varcatvalue);
				print qq {INSERT INTO variablecategory (id, label, value, missing, catorder, frequency, datavariable_id) VALUES ($varcatid, $varcatlabel, $varcatvalue, $varcatmissing, $varcatorder, $varcatfreq, $varid);} . "\n";
			    }
			}
			else 
			{
			    #print STDERR "empty var cat label.\n";
			}
		    }

		    $sth2->finish;
		}

		$sth1->finish;
	    }


		
	}
    }

    $sth->finish; 

}

$dbh->disconnect; 

close PL; 

exit 0; 

sub formatTimeStamp () {
    my ($mtime) = (@_);
    my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime($mtime);

    $year+=1900;
    $mon++;

    $fmt = $year . "-" . sprintf("%02d",$mon) . "-" . sprintf("%02d",$mday) . " " . 
	sprintf("%02d", $hour) . ":" . sprintf("%02d",$min) . ":" . sprintf("%02d",$sec); 

    return $fmt;
}