comparison scripts/migration/files_source_ @ 10:a50cf11e5178

Rewrite LGDataverse completely upgrading to dataverse4.0
author Zoe Hong <zhong@mpiwg-berlin.mpg.de>
date Tue, 08 Sep 2015 17:00:21 +0200
parents
children
comparison
equal deleted inserted replaced
9:5926d6419569 10:a50cf11e5178
1 #!/usr/bin/perl
2
3 my $dvobjectoffset = shift @ARGV;
4
5 unless ($dvobjectoffset > 0)
6 {
7 print STDERR "Usage: ./files_source_ <DVOBJECT DB ID OFFSET>\n";
8 exit 1;
9 }
10
11 my $filecatid = 0; # file categories (this is a new object in 4.0, so there are no 3.6 IDs to reuse)
12 # (offset this if adding content content to an already populated db)
13
14 use DBI;
15
16 my $host = "localhost";
17 my $username = "xxxxx";
18 my $database = "xxxxx";
19 my $password = "xxxxx";
20
21 my $dbh = DBI->connect("DBI:Pg:dbname=$database;host=$host",$username,$password);
22
23 open PL, ">packlist.txt";
24
25 %STUDYMAP = {};
26 %STUDYFILEMAP = {};
27
28 while ( <> )
29 {
30 chop;
31 my ($globalid, $dsid, $dsvid, $dsvnum) = split("\t", $_);
32
33 %FILECATEGORIES = {}; # file categories for this dataset.
34
35 if ($globalid =~/^([a-z]*):(.*)\/([^\/]*)$/)
36 {
37 $protocol = $1;
38 $authority = $2;
39 $identifier = $3;
40
41 # print $protocol . " " . $authority . " " . $identifier . "\n";
42 }
43 else
44 {
45 print STDERR "WARNING! illegal global id: " . $globalid . "\n";
46 next;
47 }
48
49 my $sth;
50
51 #$sth = $dbh->prepare(qq {SELECT s.id, v.id FROM study s, studyversion v WHERE v.study_id = s.id AND s.protocol = '$protocol' AND s.authority='$authority' AND s.studyid = '$identifier' AND v.versionstate = 'RELEASED'});
52 $sth = $dbh->prepare(qq {SELECT s.id, v.id FROM study s, studyversion v WHERE v.study_id = s.id AND s.protocol = '$protocol' AND s.authority='$authority' AND s.studyid = '$identifier' AND v.versionnumber = $dsvnum});
53 $sth->execute();
54
55 my $vercount = 0;
56
57 my $sid;
58 my $svid;
59
60 while ( @foo = $sth->fetchrow() )
61 {
62 $sid = $foo[0];
63 $svid = $foo[1];
64
65 # print $sid . "\t" . $svid . "\n";
66 $vercount++;
67 }
68
69 $sth->finish;
70
71 unless ($vercount == 1)
72 {
73 print STDERR "WARNING: invalid number of versions for study " . $globalid . ", with version number " . $dsvnum . " (" . $vercount . ")!\n";
74 next;
75 }
76
77 $sth = $dbh->prepare(qq {SELECT fm.label, fm.category, fm.description, sf.filetype, sf.filesystemlocation, sf.md5, sf.restricted, sf.subsettable, sf.originalfiletype, sf.unf, sf.id, sf.fileclass, fm.id FROM filemetadata fm, studyfile sf WHERE fm.studyfile_id = sf.id AND fm.studyversion_id = $svid});
78
79 # print qq {SELECT fm.label, fm.category, fm.description, sf.filetype, sf.filesystemlocation, sf.md5, sf.restricted, sf.subsettable, sf.originalfiletype, sf.unf FROM filemetadata fm, studyfile sf WHERE fm.studyfile_id = sf.id AND fm.studyversion_id = $svid} . "\n";
80
81 $sth->execute();
82
83 my $newfile = 0;
84
85 while ( @foo = $sth->fetchrow() )
86 {
87 # new filemetadata fields:
88 $label = $foo[0];
89 $description = $foo[2];
90 $description =~s/\n/ /g;
91 $description = $dbh->quote($description);
92 # category:
93 $category = $foo[1];
94 # new datafile fields:
95 $type = $foo[3];
96 unless ($type =~m:/:)
97 {
98 $type = "application/octet-stream";
99 }
100 $md5 = $foo[5];
101 $restricted = $foo[6];
102 # location of the file, on the old filesystem:
103 $fslocation = $foo[4];
104 # additional info for subsettable files:
105 # (will go into the new datatable)
106 $subsettable = $foo[7];
107 $originalfiletype = $foo[8];
108 $unf = $foo[9];
109 # id of the existing studyfile:
110 $sfid = $foo[10];
111 # "class" of the existing studyfile:
112 # (tabular, "other", etc.)
113 $fileclass = $foo[11];
114 $fmid = $foo[12];
115 #print join ("\t", @_) . "\n";
116
117 if ($label =~/[\\\/:\*\?\"\<\>\|;\#]/)
118 {
119 $preservedlabel = $label;
120 $label=~s/[\\\/:\*\?\"\<\>\|;\#]//g;
121
122 print STDERR "LABEL REPLACED: (FILEMETA: " . $fmid . ", FILE: " . $sfid . ", STUDY: " . $sid . ", VERSION: " . $svid . ", GLOBALID: " . $globalid . ") OLD: \"" . $preservedlabel . "\", NEW: \"" . $label . "\"\n";
123 }
124
125 if ($label eq '')
126 {
127 $label = "UNKNOWN";
128 }
129
130 $label = $dbh->quote($label);
131
132
133 unless ($STUDYFILEMAP{$sfid})
134 {
135 $newfile = 1;
136 # Certain things only need to be done once per file -
137 # namely, each file needs one dvobject and datafile each;
138 # same for the datatables and variables.
139 # Other things, like filemetadatas, need to be created one
140 # per version.
141
142 $newdatafileid = ($dvobjectoffset+$sfid);
143 $STUDYFILEMAP{$sfid} = $newdatafileid;
144 ##$dvobjectoffset++;
145
146 $fsname = $fslocation;
147
148 if ($fslocation =~/^http/ )
149 {
150 $fsize = 0;
151 $fmtime = &formatTimeStamp(time);
152 }
153 else
154 {
155 if ( -f $fslocation )
156 {
157 @fstats = stat($fslocation);
158 $fsize = $fstats[7];
159 $mtime = $fstats[9];
160
161 $fmtime = &formatTimeStamp($mtime);
162 $packlistentry = $fslocation;
163 $packlistentry =~s/.*\/DVN\/data\///;
164 print PL $packlistentry . "\n";
165 }
166 else
167 {
168 print STDERR "WARNING: file " . $fslocation . " not found!\n";
169 $fsize = 0;
170 $fmtime = &formatTimeStamp(time);
171 }
172
173 $fsname =~s/^.*\///g;
174 }
175
176 # dvobject:
177
178 print qq {INSERT INTO dvobject (id, dtype, owner_id, createdate, modificationtime) VALUES ($newdatafileid, 'DataFile', $dsid, '$fmtime', '$fmtime');} . "\n";
179
180 # datafile object:
181
182 print qq {INSERT INTO datafile (id, contenttype, filesystemname, filesize, md5, restricted) VALUES ($newdatafileid, '$type', '$fsname', $fsize, '$md5', TRUE);} . "\n";
183 }
184 else
185 {
186 $newdatafileid = $STUDYFILEMAP{$sfid};
187 $newfile = 0;
188 }
189
190 # file metadata object:
191 print qq {INSERT INTO filemetadata (id, description, label, restricted, version, datasetversion_id, datafile_id) VALUES ($fmid, $description, $label, TRUE, 1, $dsvid, $newdatafileid);} . "\n";
192 ##print qq {INSERT INTO filemetadata (id, description, label, restricted, version, datasetversion_id, datafile_id) VALUES ($fmid, $description, $label, TRUE, 1, $dsvid, $dvobjectoffset);} . "\n";
193
194 # and the category, if exists:
195
196 if ($category && $category ne "")
197 {
198 $category = $dbh->quote($category);
199 unless ($FILECATEGORIES{$category})
200 {
201 # this is a new category (for this dataset),
202 # so it needs to be created:
203
204 $filecatid++;
205
206 print qq{INSERT INTO datafilecategory (id, name, dataset_id) VALUES ($filecatid, $category, $newdatafileid);} . "\n";
207 #print qq{INSERT INTO datafilecategory (id, name, dataset_id) VALUES ($filecatid, $category, $dvobjectoffset);} . "\n";
208
209 $FILECATEGORIES{$category} = $filecatid;
210 }
211
212 my $fcid = $FILECATEGORIES{$category};
213 print qq{INSERT INTO filemetadata_datafilecategory (filecategories_id, filemetadatas_id) VALUES ($fcid, $fmid);} . "\n";
214
215 }
216
217
218 # subsettable files:
219 # (again, this only needs to be done once per file!)
220
221
222 if ($newfile && ($fileclass eq "TabularDataFile"))
223 {
224 #print STDERR "this is a subsettable file.\n";
225
226 # NOTE:
227 # there's only one datatable per file - make sure to only run this once!
228 # (i.e., not for every version!)
229
230 $sth1 = $dbh->prepare(qq {SELECT id, varquantity, casequantity, unf, recordspercase FROM datatable WHERE studyfile_id = $sfid});
231
232 $sth1->execute();
233
234 $count = 0;
235
236 while ( @dt = $sth1->fetchrow() )
237 {
238 $dtid = $dt[0];
239 $varquantity = $dt[1];
240 $casequantity = $dt[2];
241 $dtunf = $dt[3];
242 $recordspercase = $dt[4];
243
244 $count++;
245
246 unless ($unf eq $dtunf)
247 {
248 print STDERR "WARNING: unf mismatch, between studyfile and datatable: " + $unf + ":" + $dtunf + "\n";
249 }
250
251 # datatable object:
252
253
254 if ($recordspercase)
255 {
256 print qq {INSERT INTO datatable (id, varquantity, casequantity, unf, originalfileformat, recordspercase, datafile_id) VALUES ($dtid, $varquantity, $casequantity, '$unf', '$originalfiletype', $recordspercase, $newdatafileid);} . "\n";
257 #print qq {INSERT INTO datatable (id, varquantity, casequantity, unf, originalfileformat, recordspercase, datafile_id) VALUES ($dtid, $varquantity, $casequantity, '$unf', '$originalfiletype', $recordspercase, $dvobjectoffset);} . "\n";
258 }
259 else
260 {
261 print qq {INSERT INTO datatable (id, varquantity, casequantity, unf, originalfileformat, datafile_id) VALUES ($dtid, $varquantity, $casequantity, '$unf', '$originalfiletype', $newdatafileid);} . "\n";
262 #print qq {INSERT INTO datatable (id, varquantity, casequantity, unf, originalfileformat, datafile_id) VALUES ($dtid, $varquantity, $casequantity, '$unf', '$originalfiletype', $dvobjectoffset);} . "\n";
263 }
264 }
265
266 $sth1->finish;
267
268 unless ($count == 1)
269 {
270 print STDERR "WARNING: invalid numbe of datatables: " + $count +".\n";
271 }
272 else
273 {
274 # variables:
275 $sth1 = $dbh->prepare(qq {SELECT name, label, variableformattype_id, variableintervaltype_id, formatcategory, formatschema, formatschemaname, unf, fileorder, weighted, orderedfactor, numberofdecimalpoints, universe, filestartposition, fileendposition, recordsegmentnumber, id FROM datavariable WHERE datatable_id = $dtid});
276
277
278 $sth1->execute();
279
280 while ( @dv = $sth1->fetchrow() )
281 {
282 $varname = $dv[0];
283 $varname = $dbh->quote($varname);
284 $varlabel = $dv[1];
285 $varlabel = $dbh->quote($varlabel);
286 $variableformattype_id = $dv[2];
287 # the old school formattype_id and
288 # intervaltype_id need to be adjusted by 1,
289 # to match the new enum values used in the
290 # 4.0 datavariables:
291 $variableformattype_id--;
292 $variableintervaltype_id = $dv[3];
293 $variableintervaltype_id--;
294 $varformatcategory = $dv[4];
295 $varformatschema = $dv[5];
296 $varformatschemaname = $dv[6];
297 $varunf = $dv[7];
298 $varfileorder = $dv[8];
299 $varweighted = $dv[9];
300 if ($varweighted)
301 {
302 $varweighted = "TRUE";
303 }
304 else
305 {
306 $varweighted = "FALSE";
307 }
308 $varorderedfactor = $dv[10];
309 if ($varorderedfactor)
310 {
311 $varorderedfactor = "TRUE";
312 }
313 else
314 {
315 $varorderedfactor = "FALSE";
316 }
317
318 $varnumberofdecimalpoints = $dv[11];
319 $varuniverse = $dv[12];
320 $varfilestartposition = $dv[13];
321 $varfileendposition = $dv[14];
322 $varrecordsegmentnumber = $dv[15];
323 $varid = $dv[16];
324
325
326
327 # new datavariable object:
328
329 $newdvfields = "id, name, label, interval, type, unf, fileorder, orderedfactor, weighted, datatable_id";
330 $newdvvalues = qq {$varid, $varname, $varlabel, $variableintervaltype_id, $variableformattype_id, '$varunf', $varfileorder, $varorderedfactor, $varweighted, $dtid};
331
332 if ($varformatschemaname)
333 {
334 # becomes "format":
335 $newdvfields = $newdvfields . ", format";
336 $newdvvalues = qq{$newdvvalues, '$varformatschemaname'};
337 }
338
339 if ($varformatcategory)
340 {
341 $newdvfields = $newdvfields . ", formatcategory";
342 $newdvvalues = qq{$newdvvalues, '$varformatcategory'};
343 }
344
345 if ($varfilestartposition)
346 {
347 $newdvfields = $newdvfields . ", filestartposition";
348 $newdvvalues = qq{$newdvvalues, $varfilestartposition};
349 }
350
351 if ($varfileendposition)
352 {
353 $newdvfields = $newdvfields . ", fileendposition";
354 $newdvvalues = qq{$newdvvalues, $varfileendposition};
355 }
356
357 if ($varrecordsegmentnumber)
358 {
359 $newdvfields = $newdvfields . ", recordsegmentnumber";
360 $newdvvalues = qq{$newdvvalues, $varrecordsegmentnumber};
361 }
362
363 if ($varuniverse)
364 {
365 $newdvfields = $newdvfields . ", universe";
366 $newdvvalues = qq{$newdvvalues, '$varuniverse'};
367 }
368
369 if ($varnumberofdecimalpoints)
370 {
371 $newdvfields = $newdvfields . ", numberofdecimalpoints";
372 $newdvvalues = qq{$newdvvalues, $numberofdecimalpoints};
373 }
374
375
376 print qq {INSERT INTO datavariable ($newdvfields) VALUES ($newdvvalues);} . "\n";
377
378 # variable categories:
379 $sth2 = $dbh->prepare(qq {SELECT id, label, value, missing, catorder, frequency FROM variablecategory WHERE datavariable_id = $varid});
380 $sth2->execute();
381
382 while ( @vc = $sth2->fetchrow() )
383 {
384 $varcatid = $vc[0];
385 $varcatlabel = $vc[1];
386 $varcatvalue = $vc[2];
387 $varcatmissing = $vc[3];
388 if ($varcatmissing)
389 {
390 $varcatmissing = "true";
391 }
392 else
393 {
394 $varcatmissing = "false";
395 }
396 $varcatorder = $vc[4];
397 unless ($varcatorder)
398 {
399 if ($varcatorder eq "" || $varcatorder != 0)
400 {
401 $varcatorder = "null";
402 }
403 }
404 $varcatfreq = $vc[5];
405 unless ($varcatfreq)
406 {
407 if ($varcatfreq eq "" || $varcatfreq != 0)
408 {
409 $varcatfreq = "null";
410 }
411 }
412
413
414 # only migrate the *real* categories:
415 if ($varcatlabel)
416 {
417 $varcatlabel = $dbh->quote($varcatlabel);
418 unless ($varcatvalue || ($varcatvalue eq "") || ($varcatvalue == 0))
419 {
420 #print STDERR "WARNING: empty var cat value (" . $varcatlabel . ")\n";
421 print STDERR qq {INSERT INTO variablecategory (id, label, value, missing, catorder, frequency, datavariable_id) VALUES ($varcatid, $varcatlabel, $varcatvalue, $varcatmissing, $varcatorder, $varcatfreq, $varid);} . "\n";
422 }
423 else
424 {
425 $varcatvalue = $dbh->quote($varcatvalue);
426 print qq {INSERT INTO variablecategory (id, label, value, missing, catorder, frequency, datavariable_id) VALUES ($varcatid, $varcatlabel, $varcatvalue, $varcatmissing, $varcatorder, $varcatfreq, $varid);} . "\n";
427 }
428 }
429 else
430 {
431 #print STDERR "empty var cat label.\n";
432 }
433 }
434
435 $sth2->finish;
436 }
437
438 $sth1->finish;
439 }
440
441
442
443 }
444 }
445
446 $sth->finish;
447
448 }
449
450 $dbh->disconnect;
451
452 close PL;
453
454 exit 0;
455
456 sub formatTimeStamp () {
457 my ($mtime) = (@_);
458 my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime($mtime);
459
460 $year+=1900;
461 $mon++;
462
463 $fmt = $year . "-" . sprintf("%02d",$mon) . "-" . sprintf("%02d",$mday) . " " .
464 sprintf("%02d", $hour) . ":" . sprintf("%02d",$min) . ":" . sprintf("%02d",$sec);
465
466 return $fmt;
467 }
468
469
470
471
472
473