Mercurial > hg > foxridge-archiver
annotate makemeta-vlp.pl @ 60:5bee75ca9eb3 default tip
added old makemeta-quantum.pl that was not in CVS.
author | casties |
---|---|
date | Thu, 16 Mar 2017 18:29:58 +0100 |
parents | 2208ed7370cb |
children |
rev | line source |
---|---|
57 | 1 #!/usr/bin/perl -w |
21 | 2 |
3 use strict; | |
4 use XML::LibXML; | |
5 | |
22 | 6 use lib '/usr/local/mpiwg/archive'; |
21 | 7 use MPIWGStor; |
8 | |
# make output unbuffered
$|=1;

# program version
my $version = "0.2.7 (27.8.2010 ROC)";

# usage message; printed when the script is started without any arguments
my $help =
"use: makemeta-vlp [options] file.xml
  options:
  -debug  show debugging info
  -dry-run  simulate, don't do anything
  -replace  replace existing index files
  -online-mode  mode for creating online/permanent files
  -archive-mode  mode for creating archive/data files
  -access=free  adds free access tag for online-mode
  -texttool  adds texttool tag for online-mode
";
# announce program name and version in the log
logger("INFO", "makemeta-vlp $version");
26 | |
27 ########################################### | |
28 # mappings | |
29 | |
# Generic top-level mappings: FileMaker field name => element path in
# index.meta.  These apply to every entry regardless of its reference type.
my %gen_map = (
    Custom2_Language  => 'meta/lang',
    productionComment => 'meta/image-acquisition/production-comment',
    derivedFrom       => 'derived-from/archive-path',
);
# Field that selects the sub type; its value is looked up in %subtype_map
# and written as the type attribute of the given path.
my %type_map = (
    ReferenceType => 'meta/bib@type',
);
# Sub type mappings: reference type (value of the type switch field) =>
# { FileMaker field name => element path in index.meta }.
#
# Special keys:
#   '_name'   name of the bib type written into index.meta
#   '#<text>' constant field: <text> is written to the mapped path
#
# Types in parentheses, e.g. '(Book)', describe containers of which only the
# cover pages were scanned; they carry the constant "Cover pages only" note.
my %subtype_map = do {
    # fields shared by all monograph-like types
    my %monograph = (
        'Title'           => 'meta/bib/title',
        'Year'            => 'meta/bib/year',
        'Place_Published' => 'meta/bib/city',
        'Publisher'       => 'meta/bib/publisher',
        'Edition'         => 'meta/bib/edition',
        'Volume'          => 'meta/bib/volume',
        'NumberOfVolumes' => 'meta/bib/number-of-volumes',
        'Pages'           => 'meta/bib/number-of-pages',
    );
    # constant note for cover-only containers
    my %sections_extracted = (
        '#Cover pages only, book sections have been extracted' => 'meta/bib/comment',
    );

    (
        'Book' => {
            '_name'  => 'book',
            'Author' => 'meta/bib/author',
            %monograph,
        },
        '(Book)' => {
            '_name'  => 'book',
            'Author' => 'meta/bib/author',
            %monograph,
            %sections_extracted,
        },
        'Book Section' => {
            '_name'           => 'inbook',
            'Author'          => 'meta/bib/author',
            'Title'           => 'meta/bib/title',
            'Year'            => 'meta/bib/year',
            'SecondaryTitle'  => 'meta/bib/book-title',
            'SecondaryAuthor' => 'meta/bib/editor',
            'Volume'          => 'meta/bib/volume',
            'NumberOfVolumes' => 'meta/bib/number-of-volumes',
            'Pages'           => 'meta/bib/pages',
        },
        # The "Cover pages only" note belongs to the parenthesised container
        # type, like '(Book)' and '(JournalVolume)'; it used to be attached to
        # the plain 'Edited Book' type by mistake.
        'Edited Book' => {
            '_name'  => 'edited-book',
            'Author' => 'meta/bib/editor',
            %monograph,
        },
        '(Edited Book)' => {
            '_name'  => 'edited-book',
            'Author' => 'meta/bib/editor',
            %monograph,
            %sections_extracted,
        },
        'Journal Article' => {
            '_name'          => 'journal-article',
            'Author'         => 'meta/bib/author',
            'Title'          => 'meta/bib/title',
            'Year'           => 'meta/bib/year',
            'SecondaryTitle' => 'meta/bib/journal',
            'Volume'         => 'meta/bib/volume',
            'Number_Issue'   => 'meta/bib/issue',
            'Pages'          => 'meta/bib/pages',
        },
        '(JournalVolume)' => {
            '_name'           => 'journal-volume',
            'SecondaryTitle'  => 'meta/bib/title',
            'SecondaryAuthor' => 'meta/bib/editor',
            'Publisher'       => 'meta/bib/publisher',
            'Place_Published' => 'meta/bib/city',
            'Year'            => 'meta/bib/year',
            'Volume'          => 'meta/bib/volume',
            'Pages'           => 'meta/bib/number-of-pages',
            '#Cover pages only, articles have been extracted' => 'meta/bib/comment',
        },
        # NOTE(review): 'Journal' is written as bib type 'report' with the
        # secondary title as institution -- confirm this is intended.
        'Journal' => {
            '_name'           => 'report',
            'Title'           => 'meta/bib/title',
            'SecondaryTitle'  => 'meta/bib/institution',
            'Author'          => 'meta/bib/author',
            'Place_Published' => 'meta/bib/city',
            'Year'            => 'meta/bib/year',
            'Date'            => 'meta/bib/date',
            'Pages'           => 'meta/bib/pages',
        },
        # NOTE(review): the two article types below use 'Secondary_Title'
        # while every other type uses 'SecondaryTitle' -- verify the field
        # name against the FileMaker export.
        'Magazine Article' => {
            '_name'           => 'magazine-article',
            'Author'          => 'meta/bib/author',
            'Title'           => 'meta/bib/title',
            'Year'            => 'meta/bib/year',
            'Secondary_Title' => 'meta/bib/magazine',
            'Number_Issue'    => 'meta/bib/issue-number',
            'Date'            => 'meta/bib/issue-date',
            'Pages'           => 'meta/bib/pages',
        },
        'Newspaper Article' => {
            '_name'           => 'newspaper-article',
            'Author'          => 'meta/bib/author',
            'Title'           => 'meta/bib/title',
            'Year'            => 'meta/bib/year',
            'Secondary_Title' => 'meta/bib/newspaper',
            'Date'            => 'meta/bib/issue-date',
            'Pages'           => 'meta/bib/pages',
        },
        'Report' => {
            '_name'           => 'report',
            'Author'          => 'meta/bib/author',
            'Title'           => 'meta/bib/title',
            'Year'            => 'meta/bib/year',
            'Place_Published' => 'meta/bib/city',
            'Date'            => 'meta/bib/date',
            'SecondaryTitle'  => 'meta/bib/type',
            'Pages'           => 'meta/bib/pages',
        },
        'Trade Catalogue' => {
            '_name'           => 'report',
            'Author'          => 'meta/bib/author',
            'Title'           => 'meta/bib/title',
            'Year'            => 'meta/bib/year',
            'Place_Published' => 'meta/bib/city',
            'Date'            => 'meta/bib/date',
            'Volume'          => 'meta/bib/volume',
            'NumberOfVolumes' => 'meta/bib/number-of-volumes',
            'ReferenceType'   => 'meta/bib/type',
            'Pages'           => 'meta/bib/pages',
        },
        'Thesis' => {
            '_name'           => 'thesis',
            'Author'          => 'meta/bib/author',
            'Title'           => 'meta/bib/title',
            'Place_Published' => 'meta/bib/city',
            'Publisher'       => 'meta/bib/university',
            'Date'            => 'meta/bib/date',
            'TypeOfWork'      => 'meta/bib/type',
            'Pages'           => 'meta/bib/number-of-pages',
        },
        'Manuscript' => {
            '_name'           => 'manuscript',
            'Author'          => 'meta/bib/author',
            'Title'           => 'meta/bib/title',
            'Year'            => 'meta/bib/year',
            'Place_Published' => 'meta/bib/location',
            'Pages'           => 'meta/bib/pages',
        },
    );
};
# name of the FileMaker field holding the language
my $lang_field = 'Custom2_Language';

# language names as found in that field => ISO 639-1 codes
my %lang_map = qw(
    German   de
    English  en
    Italian  it
    French   fr
    Latin    la
    Japanese ja
    Dutch    nl
    Spanish  es
    Swedish  sv
    Russian  ru
    Polish   pl
    Greek    el
);

# storage fields: document ID and the flag read for free online access
my $arch_id_field     = 'ID';
my $access_free_field = 'online';
21 | 214 |
215 ####################################################### | |
216 # internal parameters | |
217 # | |
218 | |
# storage: base directories searched for the "lit<ID>" document folders
my $lib_arch_dir   = '/mpiwg/archive/data/vlp';
my $lib_online_dir = '/mpiwg/online/permanent/vlp';

# read command line parameters; without any, print the usage text and quit
my $args = MPIWGStor::parseargs();
unless (%$args) {
    print $help, "\n";
    exit 1;
}

# value of a command line option, or the given default if it is absent
my $option = sub {
    my ($key, $default) = @_;
    return exists $args->{$key} ? $args->{$key} : $default;
};

# debug level ($debug is not declared here -- presumably exported by MPIWGStor)
$debug = $option->('debug', 0);

# simulate action only
my $dry_run = $option->('dry-run', 0);
logger('DEBUG', "dry-run: $dry_run");

# replace existing index files
my $do_replace = $option->('replace', 0);
logger('DEBUG', "replace: $do_replace");

# use online mode
my $online_mode = $option->('online-mode', 0);
logger('DEBUG', "online_mode: $online_mode");

# use archive mode
my $archive_mode = $option->('archive-mode', 0);
logger('DEBUG', "archive_mode: $archive_mode");

# create texttool tag (online mode only); on unless the option says otherwise
my $texttool = $option->('texttool', 1);
logger('DEBUG', "texttool: $texttool");
# image dir for texttool, relative to the document directory
my $texttool_img_dir = "pages";

# access type: "free", an institution name, or empty for "not given"
my $access_type = $option->('access', "");

# index.meta namespace (not really implemented!)
my $namespace = "";

# bookkeeping: error and warning counters are reported at the end of the run
my $xml_changed = 0;
my $errcnt      = 0;
my $warncnt     = 0;
265 | |
266 ####################################################### | |
267 # check parameters that were passed to the program | |
268 # | |
# the input file is taken from the 'path' entry of the parsed arguments
my $infile = $$args{'path'};
if (! $infile) {
    logger("ABORT", "no input file given!");
    exit 1;
}
# collapse every run of slashes into a single one (the former substitution
# had no /g and only fixed the first "//" in the path)
$infile =~ s{/{2,}}{/}g;
if (! -f $infile) {
    logger("ABORT", "input file \'$infile\' doesn't exist!");
    exit 1;
}
280 | |
281 | |
282 ####################################################### | |
283 # subroutines | |
284 # | |
285 | |
286 | |
# Find the archive directory of a FileMaker entry.
#
# Reads the ID field (fm:$arch_id_field) of $input_node and checks for the
# directory "$lib_arch_dir/lit<ID>".
#
# Returns the directory path if it exists; returns nothing (undef in scalar
# context) when the ID is empty or the directory is missing.
sub find_arch_dir {
    my ($input_node) = @_;

    # clean the ID with sstrip exactly like find_permanent_dir does, so that
    # both modes resolve the same entry to the same "lit<ID>" name
    my $bib_id = sstrip($input_node->findvalue("fm:$arch_id_field"));
    return unless $bib_id;

    my $dir = "$lib_arch_dir/lit$bib_id";
    if (-d $dir) {
        logger('DEBUG', "directory $dir exists");
        return $dir;
    }
    return;
}
302 | |
# Find the online/permanent directory of a FileMaker entry.
#
# The stripped ID field (fm:$arch_id_field) of $input_node selects the
# directory "$lib_online_dir/lit<ID>".  A missing ID is logged as an error
# and counted in $errcnt.
#
# Returns the directory path if it exists, otherwise nothing.
sub find_permanent_dir {
    my ($input_node) = @_;

    my $dest_id = sstrip($input_node->findvalue("fm:$arch_id_field"));
    unless ($dest_id) {
        logger('ERROR', "no ID field for online permanent entry");
        $errcnt++;
        return;
    }

    my $candidate = join('/', $lib_online_dir, "lit$dest_id");
    return unless -d $candidate;

    logger('DEBUG', "directory $candidate exists");
    return $candidate;
}
319 | |
320 | |
# Convert the bibliographic fields of one FileMaker entry into index.meta
# elements below $index_root.
#
# Parameters:
#   $input_node  entry node whose child nodes are the FileMaker fields
#   $index_root  root element of the index.meta document being built
#   $index_doc   the index.meta document (not used here)
#
# Returns the number of fields converted, or 0 after logging an error for
# an unknown language or an unknown bib type ($errcnt is incremented).
sub convert_bib {
    my ($input_node, $index_root, $index_doc) = @_;
    my $converted = 0;
    my $bib_type  = "";

    # first pass: generic fields and the type switch
    FIELD:
    foreach my $child ($input_node->getChildNodes()) {
        my $field = $child->nodeName();
        my $text  = $child->textContent();

        if (exists $gen_map{$field}) {
            if ($field eq $lang_field) {
                # the language is given by name and stored as ISO code
                unless ($text) {
                    logger('WARNING', "no language tag");
                    $warncnt++;
                    next FIELD;
                }
                unless (exists $lang_map{$text}) {
                    logger('ERROR', "unknown language: $text! skipping...");
                    $errcnt++;
                    return 0;
                }
                $text = $lang_map{$text};
            }
            my $elem = create_element_path($gen_map{$field}, $index_root, $namespace);
            $elem->appendTextNode($text);
            $converted++;
            next FIELD;
        }

        next FIELD unless exists $type_map{$field};

        # type switch field: remember the type and write it as attribute
        $bib_type = $text;
        unless (exists $subtype_map{$text}) {
            logger('ERROR', "unknown bib type $text! skipping...");
            $errcnt++;
            return 0;
        }
        my $switch_path = $type_map{$field};
        my $index_type  = $subtype_map{$text}->{'_name'};
        create_element_path("$switch_path=$index_type", $index_root, $namespace);
        $converted++;
    }

    # without a type there are no type specific fields to convert
    return $converted unless $bib_type;

    # second pass: fields that depend on the bib type
    my $fields = $subtype_map{$bib_type};
    foreach my $child ($input_node->getChildNodes()) {
        my $field = $child->nodeName();
        my $text  = $child->textContent();
        next unless exists $fields->{$field};
        my $elem = create_element_path($fields->{$field}, $index_root, $namespace);
        $elem->appendTextNode($text);
        $converted++;
    }

    # constant fields: the key (minus the leading #) is the text to write
    foreach my $key (keys %$fields) {
        next unless $key =~ /^\#(.*)/;
        # copy the capture before the call so the callee cannot clobber it
        my $constant = $1;
        create_text_path($fields->{$key}, $constant, $index_root, $namespace);
    }

    return $converted;
}
391 | |
392 | |
393 | |
# Run process_fm_entry on every fm:ROW node found below $input_root,
# logging the zero-based number of each entry beforehand.
sub process_all_fm_entries {
    my ($input_root) = @_;
    my @rows = $input_root->findnodes('fm:ROW');

    foreach my $idx (0 .. $#rows) {
        logger('INFO', "processing entry $idx ...");
        process_fm_entry($rows[$idx]);
    }
}
404 | |
405 | |
# Create the index.meta file for one FileMaker entry.
#
# Builds a new <resource> document, fills in the standard fields, the access
# conditions, the optional texttool tag and the bibliographic data, and
# writes it to "<document directory>/index.meta" (only logged in dry-run
# mode).  The entry is skipped when the document directory cannot be found,
# when an index.meta already exists and -replace is not set, or when no
# bibliographic data could be converted.
sub process_fm_entry {
    my ($input_node) = @_;

    # empty document with root element <resource version="1.1" type="MPIWG">
    my $index_doc  = XML::LibXML::Document->createDocument('1.0', 'UTF-8');
    my $index_root = $index_doc->createElementNS($namespace, 'resource');
    foreach my $attr (['version', '1.1'], ['type', 'MPIWG']) {
        $index_root->addChild($index_doc->createAttributeNS($namespace, @$attr));
    }
    $index_doc->setDocumentElement($index_root);

    # try to find the document directory; only pure archive mode looks in
    # the archive, everything else uses the online/permanent storage
    my $doc_dir = ($archive_mode && !$online_mode)
        ? find_arch_dir($input_node)
        : find_permanent_dir($input_node);
    unless ($doc_dir) {
        logger('ERROR', "document directory not found! skipping...");
        $errcnt++;
        return;
    }

    # leave an existing index.meta alone unless replacing was requested
    if ((-f "$doc_dir/index.meta") and not $do_replace) {
        logger('DEBUG', "index file in $doc_dir exists");
        return;
    }

    # shorthand for adding a text element below the document root
    my $put = sub {
        my ($path, $text) = @_;
        return create_text_path($path, $text, $index_root, $namespace);
    };

    # add standard stuff to index.meta: name and date
    my ($docname) = split_file_path($doc_dir);
    $put->('name', $docname);
    $put->('archive-path', $doc_dir);
    $put->('archive-creation-date', stime(time));
    $put->('creator', 'vlp');
    $put->('description', 'a scanned document');
    if ($archive_mode) {
        # acquisition
        $put->('meta/acquisition/date', stime(time));
        $put->('meta/acquisition/provider/provider-id', 'vlp');
        $put->('meta/acquisition/provider/address', 'Max Planck Institute for the History of Science');
    }
    # media
    $put->('media-type', 'image');
    $put->('meta/content-type', 'scanned document');

    # access: an explicit -access option wins; otherwise online mode reads
    # the "online" field in the DB dump; otherwise no access tag is written
    my $write_access = 0;
    my $institution;    # stays undefined for free access
    if ($access_type) {
        $write_access = 1;
        $institution = $access_type unless $access_type eq "free";
    }
    elsif ($online_mode) {
        $write_access = 1;
        my $online = sstrip($input_node->findvalue("fm:$access_free_field"));
        $institution = 'mpiwg' unless $online;
    }
    if ($write_access) {
        if (defined $institution) {
            my $acc_tag = create_element_path('meta/access-conditions/access@type=institution', $index_root, $namespace);
            create_text_path('name', $institution, $acc_tag, $namespace);
        }
        else {
            create_element_path('meta/access-conditions/access@type=free', $index_root, $namespace);
        }
    }

    # texttool tag with image dir
    if ($online_mode && $texttool) {
        if (-d "$doc_dir/$texttool_img_dir") {
            $put->('meta/texttool/image', $texttool_img_dir);
        }
        else {
            logger('WARNING', "page image directory missing!");
            $warncnt++;
        }
    }

    # convert bib entries
    my $converted = convert_bib($input_node, $index_root, $index_doc);
    if ($converted == 0) {
        # error or nothing to convert
        logger('ERROR', "no bibliographic metadata!");
        $errcnt++;
        return;
    }

    # write new index.meta file
    if (!$dry_run) {
        write_xml($index_doc, "$doc_dir/index.meta");
    }
    else {
        logger('DEBUG', "would write $doc_dir/index.meta");
        logger('DEBUG', $index_doc->toString(1));
    }
}
501 | |
502 | |
503 | |
504 | |
505 | |
506 ####################################################### | |
507 # Main | |
508 # | |
509 | |
# load filemaker xml dump
my ($input_doc, $input_root) = read_xml($infile);

# bind the document's own namespace to the prefix 'fm', which all XPath
# expressions in this script use
my $fm_namespace = $input_root->namespaceURI();
$input_root->setNamespace($fm_namespace, 'fm', 1);

process_all_fm_entries($input_root);

# summary; any error makes the script exit with status 1
logger("INFO", $_) for ("$warncnt warnings", "$errcnt errors");
if ($errcnt > 0) {
    logger("ABORT", "there were errors!");
    exit 1;
}
logger("DONE", "done something successfully!");
528 |