Mercurial > hg > foxridge-archiver
annotate makemeta-vlp.pl @ 44:af4323868086
added production-comment, derived-from and access handling
author | casties |
---|---|
date | Mon, 11 Dec 2006 19:08:24 +0100 |
parents | 7bf843ac256b |
children | 173e9823761e |
rev | line source |
---|---|
21 | 1 #!/usr/local/bin/perl -w |
2 | |
3 use strict; | |
4 use XML::LibXML; | |
5 | |
22 | 6 use lib '/usr/local/mpiwg/archive'; |
21 | 7 use MPIWGStor; |
8 | |
9 # make output unbuffered | |
10 $|=1; | |
11 | |
# program version
my $version = "0.2.4 (5.12.2006 ROC)";
# usage text; printed when the script is started without any arguments
my $help =
"use: makemeta-vlp [options] file.xml
options:
  -debug  show debugging info
  -dry-run  simulate, don't do anything
  -replace  replace existing index files
  -online-mode  mode for creating online/permanent files
  -archive-mode  mode for creating archive/data files
  -access=free  adds free access tag for online-mode
";
logger("INFO", "makemeta-vlp $version");
25 | |
26 ########################################### | |
27 # mappings | |
28 | |
# Generic mappings at top level: name of a field in the FileMaker dump =>
# element path in index.meta.  convert_bib() applies these to every entry,
# whatever its bibliographic type.
my %gen_map = (
    Custom2_Language  => 'meta/lang',
    ProductionComment => 'meta/image-acquisition/production-comment',
    derivedFrom       => 'derived-from/archive-path',
);

# Sub type switch tag: the field whose value selects an entry of the sub type
# mappings, and the path (with attribute name) that receives the type.
my %type_map = (
    ReferenceType => 'meta/bib@type',
);
# Sub type mappings.
#
# One entry per value of the type field (see %type_map).  Each entry maps
# FileMaker field names to element paths in index.meta.  Special keys:
#   '_name'  - type name that convert_bib() appends to the type path
#   '#text'  - constant field: convert_bib() writes the text after the '#'
#              to the mapped path, independent of the input data
my %subtype_map = do {
    # field list shared by the plain and the parenthesised book types
    my %book_fields = (
        '_name'           => 'book',
        'Author'          => 'meta/bib/author',
        'Title'           => 'meta/bib/title',
        'Year'            => 'meta/bib/year',
        'Place_Published' => 'meta/bib/city',
        'Publisher'       => 'meta/bib/publisher',
        'Edition'         => 'meta/bib/edition',
        'Volume'          => 'meta/bib/volume',
        'NumberOfVolumes' => 'meta/bib/number-of-volumes',
        'Pages'           => 'meta/bib/number-of-pages',
    );
    # edited books differ from books only in type name and the author's role
    my %edited_book_fields = (
        %book_fields,
        '_name'  => 'edited-book',
        'Author' => 'meta/bib/editor',
    );
    # constant comment keys (see '#text' above)
    my $sections_extracted = '#Cover pages only, book sections have been extracted';
    my $articles_extracted = '#Cover pages only, articles have been extracted';

    (
        'Book'   => { %book_fields },
        '(Book)' => { %book_fields, $sections_extracted => 'meta/bib/comment' },
        'Book Section' => {
            '_name'           => 'inbook',
            'Author'          => 'meta/bib/author',
            'Title'           => 'meta/bib/title',
            'Year'            => 'meta/bib/year',
            'SecondaryTitle'  => 'meta/bib/book-title',
            'SecondaryAuthor' => 'meta/bib/editor',
            'Volume'          => 'meta/bib/volume',
            'NumberOfVolumes' => 'meta/bib/number-of-volumes',
            'Pages'           => 'meta/bib/pages',
        },
        # NOTE(review): for books the "cover pages only" comment is attached to
        # the parenthesised type, here it is attached to the plain type --
        # possibly swapped; confirm before changing.
        'Edited Book'   => { %edited_book_fields, $sections_extracted => 'meta/bib/comment' },
        '(Edited Book)' => { %edited_book_fields },
        'Journal Article' => {
            '_name'          => 'journal-article',
            'Author'         => 'meta/bib/author',
            'Title'          => 'meta/bib/title',
            'Year'           => 'meta/bib/year',
            'SecondaryTitle' => 'meta/bib/journal',
            'Volume'         => 'meta/bib/volume',
            'Number_Issue'   => 'meta/bib/issue',
            'Pages'          => 'meta/bib/pages',
        },
        '(JournalVolume)' => {
            '_name'             => 'journal-volume',
            'SecondaryTitle'    => 'meta/bib/title',
            'SecondaryAuthor'   => 'meta/bib/editor',
            'Publisher'         => 'meta/bib/publisher',
            'Place_Published'   => 'meta/bib/city',
            'Year'              => 'meta/bib/year',
            'Volume'            => 'meta/bib/volume',
            'Pages'             => 'meta/bib/number-of-pages',
            $articles_extracted => 'meta/bib/comment',
        },
        'Magazine Article' => {
            '_name'           => 'magazine-article',
            'Author'          => 'meta/bib/author',
            'Title'           => 'meta/bib/title',
            'Year'            => 'meta/bib/year',
            # NOTE(review): every other type spells this field 'SecondaryTitle'
            # (no underscore) -- verify against the FileMaker export.
            'Secondary_Title' => 'meta/bib/magazine',
            'Number_Issue'    => 'meta/bib/issue-number',
            'Date'            => 'meta/bib/issue-date',
            'Pages'           => 'meta/bib/pages',
        },
        'Report' => {
            '_name'           => 'report',
            'Author'          => 'meta/bib/author',
            'Title'           => 'meta/bib/title',
            'Year'            => 'meta/bib/year',
            'Place_Published' => 'meta/bib/city',
            'Date'            => 'meta/bib/date',
            'SecondaryTitle'  => 'meta/bib/type',
            'Pages'           => 'meta/bib/pages',
        },
        'Trade Catalogue' => {
            '_name'           => 'report',
            'Author'          => 'meta/bib/author',
            'Title'           => 'meta/bib/title',
            'Year'            => 'meta/bib/year',
            'Place_Published' => 'meta/bib/city',
            'Date'            => 'meta/bib/date',
            'Volume'          => 'meta/bib/volume',
            'NumberOfVolumes' => 'meta/bib/number-of-volumes',
            'ReferenceType'   => 'meta/bib/type',
            'Pages'           => 'meta/bib/pages',
        },
        'Thesis' => {
            '_name'           => 'thesis',
            'Author'          => 'meta/bib/author',
            'Title'           => 'meta/bib/title',
            'Place_Published' => 'meta/bib/city',
            'Publisher'       => 'meta/bib/university',
            'Date'            => 'meta/bib/date',
            'TypeOfWork'      => 'meta/bib/type',
            'Pages'           => 'meta/bib/number-of-pages',
        },
        'Manuscript' => {
            '_name'           => 'manuscript',
            'Author'          => 'meta/bib/author',
            'Title'           => 'meta/bib/title',
            'Year'            => 'meta/bib/year',
            'Place_Published' => 'meta/bib/location',
            'Pages'           => 'meta/bib/pages',
        },
    );
};
# name of the field that carries the document language
my $lang_field = 'Custom2_Language';

# language names as they appear in the dump => two-letter ISO codes;
# convert_bib() rejects entries whose language is not listed here
my %lang_map = (
    German   => 'de',
    English  => 'en',
    Italian  => 'it',
    French   => 'fr',
    Latin    => 'la',
    Japanese => 'ja',
    Dutch    => 'nl',
    Spanish  => 'es',
    Swedish  => 'sv',
);

# storage fields: the field whose value is used to build the directory name
my $arch_id_field = 'ID';
190 | |
191 ####################################################### | |
192 # internal parameters | |
193 # | |
194 | |
# storage: base directories for archive-mode and online-mode documents
my $lib_arch_dir   = '/mpiwg/archive/data/vlp';
my $lib_online_dir = '/mpiwg/online/permanent/vlp';

# read command line parameters; without any parameter just show the help
my $args = MPIWGStor::parseargs();
unless (%$args) {
    print $help, "\n";
    exit 1;
}

# debug level
# NOTE(review): $debug is not declared in this file; presumably it is
# exported by MPIWGStor -- confirm.
$debug = exists $args->{'debug'} ? $args->{'debug'} : 0;

# simulate action only
my $dry_run = exists $args->{'dry-run'} ? $args->{'dry-run'} : 0;
logger('DEBUG', "dry-run: $dry_run");

# replace existing index files
my $do_replace = exists $args->{'replace'} ? $args->{'replace'} : 0;
logger('DEBUG', "replace: $do_replace");

# use online mode
my $online_mode = exists $args->{'online-mode'} ? $args->{'online-mode'} : 0;
logger('DEBUG', "online_mode: $online_mode");

# use archive mode
my $archive_mode = exists $args->{'archive-mode'} ? $args->{'archive-mode'} : 0;
logger('DEBUG', "archive_mode: $archive_mode");

# access type ("free" or the name of an institution, see process_fm_entry)
my $access_type = exists $args->{'access'} ? $args->{'access'} : "";

# index.meta namespace (not really implemented!)
my $namespace = "";

# bookkeeping; the error and warning counters are reported at the end
my $xml_changed = 0;
my $errcnt      = 0;
my $warncnt     = 0;
235 | |
236 ####################################################### | |
237 # check parameters that were passed to the program | |
238 # | |
# the input file is taken from the 'path' parameter
my $infile = $$args{'path'};
if (! $infile) {
    logger("ABORT", "no input file given!");
    exit 1;
}
# strip double slashes: collapse every run of slashes, not just the first one
$infile =~ s{//+}{/}g;
if (! -f $infile) {
    logger("ABORT", "input file \'$infile\' doesn't exist!");
    exit 1;
}
250 | |
251 | |
252 ####################################################### | |
253 # subroutines | |
254 # | |
255 | |
256 | |
# Find the archive directory that belongs to one entry of the dump.
#
# Parameters:
#   $input_node - node of the entry; the value of its fm:$arch_id_field
#                 child is used as the document ID
# Returns:
#   "$lib_arch_dir/lit<ID>" if that directory exists; nothing (undef in
#   scalar context) if the entry has no ID or the directory is missing.
sub find_arch_dir {
    my ($input_node) = @_;

    # clean the ID with sstrip exactly as find_permanent_dir does, so both
    # modes derive the same directory name from the same entry
    my $bib_id = sstrip($input_node->findvalue("fm:$arch_id_field"));
    if (! $bib_id) {
        logger('DEBUG', "no ID field for archive entry");
        return;
    }

    my $dir = "$lib_arch_dir/lit$bib_id";
    if (! -d $dir) {
        logger('DEBUG', "directory $dir does not exist");
        return;
    }
    logger('DEBUG', "directory $dir exists");
    return $dir;
}
272 | |
# Build the online/permanent directory name for one entry of the dump.
#
# Parameters:
#   $input_node - node of the entry; the value of its fm:$arch_id_field
#                 child (passed through sstrip) is used as the document ID
# Returns:
#   "$lib_online_dir/lit<ID>" -- the directory is not checked for existence.
#   If the entry has no ID an error is logged, $errcnt is incremented and
#   nothing is returned.
sub find_permanent_dir {
    my ($input_node) = @_;

    my $entry_id = sstrip($input_node->findvalue("fm:$arch_id_field"));
    unless ($entry_id) {
        logger('ERROR', "no ID field for online permanent entry");
        $errcnt++;
        return;
    }

    return "$lib_online_dir/lit$entry_id";
}
285 | |
286 | |
# Convert the fields of one entry into index.meta elements.
#
# Parameters:
#   $input_node - node of the entry; its child nodes are the fields
#   $index_root - root element of the index.meta document being built
#   $index_doc  - the index.meta document (not used here)
# Returns:
#   the number of general, type and sub type fields that were written, or 0
#   if the language or the bibliographic type is unknown (an error is logged
#   and $errcnt incremented in that case).
sub convert_bib {
    my ($input_node, $index_root, $index_doc) = @_;
    my $cnt = 0;
    my $type = "";
    my $type_path = "";

    # first pass: general fields and the type switch
    for my $child ($input_node->getChildNodes()) {
        my $field = $child->nodeName();
        my $text = $child->textContent();

        if (exists $gen_map{$field}) {
            if ($field eq $lang_field) {
                # the language has to be translated to its iso code
                unless ($text) {
                    logger('WARNING', "no language tag");
                    $warncnt++;
                    next;
                }
                unless (exists $lang_map{$text}) {
                    logger('ERROR', "unknown language: $text! skipping...");
                    $errcnt++;
                    return 0;
                }
                $text = $lang_map{$text};
            }
            my $target = create_element_path($gen_map{$field}, $index_root, $namespace);
            $target->appendTextNode($text);
            $cnt++;
            next;
        }

        next unless exists $type_map{$field};

        # type field: remember the type for the second pass
        $type_path = $type_map{$field};
        $type = $text;
        unless (exists $subtype_map{$text}) {
            logger('ERROR', "unknown bib type $text! skipping...");
            $errcnt++;
            return 0;
        }
        my $indextype = $subtype_map{$text}->{'_name'};
        create_element_path("$type_path=$indextype", $index_root, $namespace);
        $cnt++;
    }

    # without a type there are no sub type fields to process
    return $cnt unless $type;

    # second pass: fields that belong to the sub type
    for my $child ($input_node->getChildNodes()) {
        my $field = $child->nodeName();
        my $text = $child->textContent();
        next unless exists $subtype_map{$type}->{$field};
        my $target = create_element_path($subtype_map{$type}->{$field}, $index_root, $namespace);
        $target->appendTextNode($text);
        $cnt++;
    }

    # append additional constant fields (keys beginning with #); these are
    # not included in the returned count
    for my $key (keys %{$subtype_map{$type}}) {
        next unless $key =~ /^\#(.*)/;
        # copy the capture before the call: @_ would alias $1
        my $const_text = $1;
        create_text_path($subtype_map{$type}->{$key}, $const_text, $index_root, $namespace);
    }

    return $cnt;
}
357 | |
358 | |
359 | |
# Process every fm:ROW entry found below the root of the dump.
#
# Parameters:
#   $input_root - root element of the dump; the 'fm' prefix must already be
#                 bound to the document's namespace
# Returns:
#   the number of entries that were handed to process_fm_entry (entries
#   that were skipped or failed are included in this count).
sub process_all_fm_entries {
    my ($input_root) = @_;
    my $cnt = 0;

    foreach my $n ($input_root->findnodes('fm:ROW')) {
        logger('INFO', "processing entry $cnt ...");
        process_fm_entry($n);
        $cnt++;
    }
    return $cnt;
}
370 | |
371 | |
# Create the index.meta file for one entry of the dump.
#
# Parameters:
#   $input_node - node of the entry
# Returns nothing.  The entry is skipped if its document directory cannot be
# determined (error), if an index.meta already exists and replacing was not
# requested, or if convert_bib reports no bibliographic metadata (error).
# With dry-run set the resulting XML is only logged, not written.
sub process_fm_entry {
    my ($input_node) = @_;

    # try to find the document directory; the archive directory is only used
    # when archive mode is set and online mode is not
    my $doc_dir = ($archive_mode && !$online_mode)
        ? find_arch_dir($input_node)
        : find_permanent_dir($input_node);
    if (! $doc_dir) {
        logger('ERROR', "document directory not found! skipping...");
        $errcnt++;
        return;
    }

    # keep an existing index.meta unless replacing was requested
    if (-f "$doc_dir/index.meta" && !$do_replace) {
        logger('DEBUG', "index file in $doc_dir exists");
        return;
    }

    # set up the new document only now that the entry will really be processed
    my $index_doc = XML::LibXML::Document->createDocument('1.0', 'UTF-8');
    my $index_root = $index_doc->createElementNS($namespace, 'resource');
    $index_root->addChild($index_doc->createAttributeNS($namespace, 'version', '1.1'));
    $index_root->addChild($index_doc->createAttributeNS($namespace, 'type', 'MPIWG'));
    $index_doc->setDocumentElement($index_root);

    # one timestamp for all dates so they cannot differ within one file
    my $now = stime(time);

    # add standard stuff to index.meta
    my ($docname) = split_file_path($doc_dir);
    # name and date
    create_text_path('name', $docname, $index_root, $namespace);
    create_text_path('archive-path', $doc_dir, $index_root, $namespace);
    create_text_path('archive-creation-date', $now, $index_root, $namespace);
    create_text_path('creator', 'vlp', $index_root, $namespace);
    create_text_path('description', 'a scanned document', $index_root, $namespace);
    if ($archive_mode) {
        # acquisition
        create_text_path('meta/acquisition/date', $now, $index_root, $namespace);
        create_text_path('meta/acquisition/provider/provider-id', 'vlp', $index_root, $namespace);
        create_text_path('meta/acquisition/provider/address', 'Max Planck Institute for the History of Science', $index_root, $namespace);
    }
    # media
    create_text_path('media-type', 'image', $index_root, $namespace);
    create_text_path('meta/content-type', 'scanned document', $index_root, $namespace);

    # access: "free", or any other value taken as the name of an institution
    if ($access_type eq "free") {
        create_element_path('meta/access-conditions/access@type=free', $index_root, $namespace);
    }
    elsif ($access_type) {
        my $acc_tag = create_element_path('meta/access-conditions/access@type=institution', $index_root, $namespace);
        create_text_path('name', $access_type, $acc_tag, $namespace);
    }

    # convert bib entries
    my $cnt = convert_bib($input_node, $index_root, $index_doc);
    if ($cnt == 0) {
        # error or nothing to convert
        # NOTE(review): convert_bib already increments $errcnt for an unknown
        # language or type, so those entries are counted twice.
        logger('ERROR', "no bibliographic metadata!");
        $errcnt++;
        return;
    }

    # write new index.meta file
    if ($dry_run) {
        logger('DEBUG', "would write $doc_dir/index.meta");
        logger('DEBUG', $index_doc->toString(1));
    }
    else {
        write_xml($index_doc, "$doc_dir/index.meta");
    }
    return;
}
448 | |
449 | |
450 | |
451 | |
452 | |
453 ####################################################### | |
454 # Main | |
455 # | |
456 | |
# load filemaker xml dump
my ($input_doc, $input_root) = read_xml($infile);

# bind the namespace of the dump to the prefix 'fm', which the subroutines
# use in their findnodes/findvalue expressions
my $fm_namespace = $input_root->namespaceURI();
$input_root->setNamespace($fm_namespace, 'fm', 1);

process_all_fm_entries($input_root);

# summary; any error makes the program exit with a non-zero status
logger("INFO", "$warncnt warnings");
logger("INFO", "$errcnt errors");
if ($errcnt > 0) {
    logger("ABORT", "there were errors!");
    exit 1;
}
logger("DONE", "done something successfully!");
475 |