Mercurial > hg > foxridge-archiver
annotate makemeta-vlp.pl @ 54:8e19bc5ca86a
added some more languages
author | casties |
---|---|
date | Mon, 02 Feb 2009 12:45:15 +0100 |
parents | 173e9823761e |
children | 9d7df218f94c |
rev | line source |
---|---|
21 | 1 #!/usr/local/bin/perl -w |
2 | |
3 use strict; | |
4 use XML::LibXML; | |
5 | |
22 | 6 use lib '/usr/local/mpiwg/archive'; |
21 | 7 use MPIWGStor; |
8 | |
# make output unbuffered
$|=1;

# program version (also written to the log at startup)
my $version = "0.2.6 (1.2.2009 ROC)";

# usage text; printed when the script is started without any arguments
my $help =
"use: makemeta-vlp [options] file.xml
options:
  -debug         show debugging info
  -dry-run       simulate, don't do anything
  -replace       replace existing index files
  -online-mode   mode for creating online/permanent files
  -archive-mode  mode for creating archive/data files
  -access=free   adds free access tag for online-mode
";
logger("INFO", "makemeta-vlp $version");
25 | |
###########################################
# mappings
#
# Every map translates a field name of the input record (left side) into
# an element path of the generated index.meta document (right side).

# generic mappings at top level
my %gen_map = (
    Custom2_Language  => 'meta/lang',
    productionComment => 'meta/image-acquisition/production-comment',
    derivedFrom       => 'derived-from/archive-path',
);

# sub type switch tag: the value of this field selects the entry of
# %subtype_map that is used for the record
my %type_map = (
    ReferenceType => 'meta/bib@type',
);

# fields shared by all four book-like sub types below
my %monograph_fields = (
    Title           => 'meta/bib/title',
    Year            => 'meta/bib/year',
    Place_Published => 'meta/bib/city',
    Publisher       => 'meta/bib/publisher',
    Edition         => 'meta/bib/edition',
    Volume          => 'meta/bib/volume',
    NumberOfVolumes => 'meta/bib/number-of-volumes',
    Pages           => 'meta/bib/number-of-pages',
);

# sub type mappings
#   '_name' is the value written into the bib type attribute;
#   keys starting with '#' are not input fields: the text after the '#'
#   is written as a constant into the given path.
my %subtype_map = (
    'Book' => {
        _name  => 'book',
        Author => 'meta/bib/author',
        %monograph_fields,
    },
    '(Book)' => {
        _name  => 'book',
        Author => 'meta/bib/author',
        %monograph_fields,
        '#Cover pages only, book sections have been extracted' => 'meta/bib/comment',
    },
    'Book Section' => {
        _name           => 'inbook',
        Author          => 'meta/bib/author',
        Title           => 'meta/bib/title',
        Year            => 'meta/bib/year',
        SecondaryTitle  => 'meta/bib/book-title',
        SecondaryAuthor => 'meta/bib/editor',
        Volume          => 'meta/bib/volume',
        NumberOfVolumes => 'meta/bib/number-of-volumes',
        Pages           => 'meta/bib/pages',
    },
    # NOTE(review): for books the "cover pages only" comment is attached to
    # the parenthesized type '(Book)', here it is attached to the plain
    # 'Edited Book' instead of '(Edited Book)' -- confirm this is intended.
    'Edited Book' => {
        _name  => 'edited-book',
        Author => 'meta/bib/editor',
        %monograph_fields,
        '#Cover pages only, book sections have been extracted' => 'meta/bib/comment',
    },
    '(Edited Book)' => {
        _name  => 'edited-book',
        Author => 'meta/bib/editor',
        %monograph_fields,
    },
    'Journal Article' => {
        _name          => 'journal-article',
        Author         => 'meta/bib/author',
        Title          => 'meta/bib/title',
        Year           => 'meta/bib/year',
        SecondaryTitle => 'meta/bib/journal',
        Volume         => 'meta/bib/volume',
        Number_Issue   => 'meta/bib/issue',
        Pages          => 'meta/bib/pages',
    },
    '(JournalVolume)' => {
        _name           => 'journal-volume',
        SecondaryTitle  => 'meta/bib/title',
        SecondaryAuthor => 'meta/bib/editor',
        Publisher       => 'meta/bib/publisher',
        Place_Published => 'meta/bib/city',
        Year            => 'meta/bib/year',
        Volume          => 'meta/bib/volume',
        Pages           => 'meta/bib/number-of-pages',
        '#Cover pages only, articles have been extracted' => 'meta/bib/comment',
    },
    # NOTE(review): every other sub type spells this field 'SecondaryTitle'
    # (no underscore) -- verify 'Secondary_Title' against the input data.
    'Magazine Article' => {
        _name           => 'magazine-article',
        Author          => 'meta/bib/author',
        Title           => 'meta/bib/title',
        Year            => 'meta/bib/year',
        Secondary_Title => 'meta/bib/magazine',
        Number_Issue    => 'meta/bib/issue-number',
        Date            => 'meta/bib/issue-date',
        Pages           => 'meta/bib/pages',
    },
    'Report' => {
        _name           => 'report',
        Author          => 'meta/bib/author',
        Title           => 'meta/bib/title',
        Year            => 'meta/bib/year',
        Place_Published => 'meta/bib/city',
        Date            => 'meta/bib/date',
        SecondaryTitle  => 'meta/bib/type',
        Pages           => 'meta/bib/pages',
    },
    'Trade Catalogue' => {
        _name           => 'report',
        Author          => 'meta/bib/author',
        Title           => 'meta/bib/title',
        Year            => 'meta/bib/year',
        Place_Published => 'meta/bib/city',
        Date            => 'meta/bib/date',
        Volume          => 'meta/bib/volume',
        NumberOfVolumes => 'meta/bib/number-of-volumes',
        ReferenceType   => 'meta/bib/type',
        Pages           => 'meta/bib/pages',
    },
    'Thesis' => {
        _name           => 'thesis',
        Author          => 'meta/bib/author',
        Title           => 'meta/bib/title',
        Place_Published => 'meta/bib/city',
        Publisher       => 'meta/bib/university',
        Date            => 'meta/bib/date',
        TypeOfWork      => 'meta/bib/type',
        Pages           => 'meta/bib/number-of-pages',
    },
    'Manuscript' => {
        _name           => 'manuscript',
        Author          => 'meta/bib/author',
        Title           => 'meta/bib/title',
        Year            => 'meta/bib/year',
        Place_Published => 'meta/bib/location',
        Pages           => 'meta/bib/pages',
    },
);

# language element
my $lang_field = 'Custom2_Language';

# languages to iso codes
my %lang_map = (
    German   => 'de',
    English  => 'en',
    Italian  => 'it',
    French   => 'fr',
    Latin    => 'la',
    Japanese => 'ja',
    Dutch    => 'nl',
    Spanish  => 'es',
    Swedish  => 'sv',
    Russian  => 'ru',
    Polish   => 'pl',
    Greek    => 'el',
);

# storage fields: name of the input field that holds the document ID
my $arch_id_field = 'ID';
193 | |
#######################################################
# internal parameters
#

# storage: base directories below which the "lit<ID>" document
# directories are looked up
my $lib_arch_dir = '/mpiwg/archive/data/vlp';
my $lib_online_dir = '/mpiwg/online/permanent/vlp';

# read command line parameters; show the usage text if there are none
my $args = MPIWGStor::parseargs;
if (not %$args) {
    print $help, "\n";
    exit 1;
}

# value of a command line option, or the given default if it was not passed
my $option = sub {
    my ($key, $default) = @_;
    return exists $args->{$key} ? $args->{$key} : $default;
};

# debug level
# NOTE(review): $debug is not declared in this file; presumably it is a
# variable exported by MPIWGStor -- confirm.
$debug = $option->('debug', 0);

# simulate action only
my $dry_run = $option->('dry-run', 0);
logger('DEBUG', "dry-run: $dry_run");

# replace existing index files
my $do_replace = $option->('replace', 0);
logger('DEBUG', "replace: $do_replace");

# use online mode
my $online_mode = $option->('online-mode', 0);
logger('DEBUG', "online_mode: $online_mode");

# use archive mode
my $archive_mode = $option->('archive-mode', 0);
logger('DEBUG', "archive_mode: $archive_mode");

# access type ("free", or a name that is written as institution access)
my $access_type = $option->('access', "");

# index.meta namespace (not really implemented!)
my $namespace = "";


# run state: error and warning counters are reported at the end of the run
my $xml_changed = 0;
my $errcnt = 0;
my $warncnt = 0;
238 | |
#######################################################
# check parameters that were passed to the program
#

# the input file is the 'path' entry of the parsed command line
my $infile = $$args{'path'};
if (! $infile) {
    logger("ABORT", "no input file given!");
    exit 1;
}
# strip double slashes: collapse every run of slashes, not only the first
$infile =~ s{//+}{/}g;
if (! -f $infile) {
    logger("ABORT", "input file \'$infile\' doesn't exist!");
    exit 1;
}
253 | |
254 | |
255 ####################################################### | |
256 # subroutines | |
257 # | |
258 | |
259 | |
# Finds the archive directory that belongs to a record.
#
#   $input_node  record node; its fm:<$arch_id_field> child holds the ID
#
# Returns "$lib_arch_dir/lit<ID>" if that directory exists. Returns
# nothing (undef in scalar context) if the record has no ID or the
# directory is missing; reporting that is left to the caller.
sub find_arch_dir {
    my ($input_node) = @_;

    # clean the ID with sstrip, exactly as find_permanent_dir does, so that
    # both lookups build their directory name from the same ID value
    my $bib_id = sstrip($input_node->findvalue("fm:$arch_id_field"));
    return unless $bib_id;

    my $dir = "$lib_arch_dir/lit$bib_id";
    return unless -d $dir;

    logger('DEBUG', "directory $dir exists");
    return $dir;
}
275 | |
# Builds the online/permanent directory name for a record.
#
#   $input_node  record node; its fm:<$arch_id_field> child holds the ID
#
# Returns "$lib_online_dir/lit<ID>"; the directory is not checked for
# existence. Without an ID an error is logged and counted and nothing
# (undef in scalar context) is returned.
sub find_permanent_dir {
    my ($input_node) = @_;

    my $dest_id = sstrip($input_node->findvalue("fm:$arch_id_field"));
    if ($dest_id) {
        return join('/', $lib_online_dir, "lit$dest_id");
    }

    logger('ERROR', "no ID field for online permanent entry");
    $errcnt++;
    return;
}
288 | |
289 | |
# Copies the bibliographic fields of one record into the index.meta tree.
#
#   $input_node  record node; each child node is one field
#   $index_root  root element of the index.meta document being built
#   $index_doc   the index.meta document (accepted, but not used here)
#
# Returns the number of elements created from record fields (constant
# '#' entries are not counted). Returns 0 as soon as an unknown language
# or an unknown bib type is met; that case is logged and counted in $errcnt.
sub convert_bib {
    my ($input_node, $index_root, $index_doc) = @_;
    my $created = 0;
    my $bib_type = "";

    # first pass: general fields and the type switch
    for my $field ($input_node->getChildNodes()) {
        my $tag = $field->nodeName();
        my $text = $field->textContent();

        if (exists $gen_map{$tag}) {
            if ($tag eq $lang_field) {
                # an empty language is only worth a warning
                unless ($text) {
                    logger('WARNING', "no language tag");
                    $warncnt++;
                    next;
                }
                # the language name has to be convertible to an iso code
                unless (exists $lang_map{$text}) {
                    logger('ERROR', "unknown language: $text! skipping...");
                    $errcnt++;
                    return 0;
                }
                $text = $lang_map{$text};
            }
            my $target = create_element_path($gen_map{$tag}, $index_root, $namespace);
            $target->appendTextNode($text);
            $created++;
            next;
        }

        next unless exists $type_map{$tag};

        # type field: its value must be one of the known sub types
        unless (exists $subtype_map{$text}) {
            logger('ERROR', "unknown bib type $text! skipping...");
            $errcnt++;
            return 0;
        }
        $bib_type = $text;
        my $type_attr = join('=', $type_map{$tag}, $subtype_map{$text}->{'_name'});
        create_element_path($type_attr, $index_root, $namespace);
        $created++;
    }

    # without a type field there are no sub type fields to convert
    return $created unless $bib_type;

    # second pass: fields that are specific to the sub type
    my $fields = $subtype_map{$bib_type};
    for my $field ($input_node->getChildNodes()) {
        my $tag = $field->nodeName();
        my $text = $field->textContent();
        next unless exists $fields->{$tag};

        create_element_path($fields->{$tag}, $index_root, $namespace)
            ->appendTextNode($text);
        $created++;
    }

    # append additional constant fields (beginning with #)
    for my $key (keys %$fields) {
        next unless $key =~ /^\#(.*)/;
        # copy the capture before the call so a later match cannot change it
        my $constant = $1;
        create_text_path($fields->{$key}, $constant, $index_root, $namespace);
    }

    return $created;
}
360 | |
361 | |
362 | |
# Runs process_fm_entry on every fm:ROW node below the given root.
#
#   $input_root  root element of the input document
#
# The zero-based position of each row is written to the log.
sub process_all_fm_entries {
    my ($input_root) = @_;

    my @rows = $input_root->findnodes('fm:ROW');
    for my $idx (0 .. $#rows) {
        logger('INFO', "processing entry $idx ...");
        process_fm_entry($rows[$idx]);
    }
}
373 | |
374 | |
# Builds the index.meta document for one record and writes it to
# "<document directory>/index.meta".
#
#   $input_node  record node to convert
#
# Returns early, after logging and counting an error in $errcnt, when no
# document directory or no bibliographic metadata is found. A record whose
# index.meta already exists is skipped (DEBUG message only) unless
# $do_replace is set. With $dry_run the document is only logged.
sub process_fm_entry {
    my ($input_node) = @_;

    # empty index.meta document with its 'resource' root element
    my $index_doc = XML::LibXML::Document->createDocument('1.0', 'UTF-8');
    my $index_root = $index_doc->createElementNS($namespace, 'resource');
    for my $attr (['version', '1.1'], ['type', 'MPIWG']) {
        $index_root->addChild($index_doc->createAttributeNS($namespace, @$attr));
    }
    $index_doc->setDocumentElement($index_root);

    # try to find the document directory; the archive directory is used
    # only when archive mode is set and online mode is not
    my $doc_dir = ($archive_mode && !$online_mode)
        ? find_arch_dir($input_node)
        : find_permanent_dir($input_node);
    if (! $doc_dir) {
        logger('ERROR', "document directory not found! skipping...");
        $errcnt++;
        return;
    }

    # leave an existing index.meta alone unless replacing was requested
    if ((-f "$doc_dir/index.meta") && !$do_replace) {
        logger('DEBUG', "index file in $doc_dir exists");
        return;
    }

    # standard stuff for index.meta, in document order
    my ($docname) = split_file_path($doc_dir);
    my @entries = (
        # name and date
        ['name', $docname],
        ['archive-path', $doc_dir],
        ['archive-creation-date', stime(time)],
        ['creator', 'vlp'],
        ['description', 'a scanned document'],
    );
    if ($archive_mode) {
        # acquisition
        push @entries,
            ['meta/acquisition/date', stime(time)],
            ['meta/acquisition/provider/provider-id', 'vlp'],
            ['meta/acquisition/provider/address', 'Max Planck Institute for the History of Science'];
    }
    # media
    push @entries,
        ['media-type', 'image'],
        ['meta/content-type', 'scanned document'];
    for my $entry (@entries) {
        my ($path, $text) = @$entry;
        create_text_path($path, $text, $index_root, $namespace);
    }

    # access: "free" gets its own type, any other value is stored as the
    # name of an institution
    if ($access_type) {
        my $is_free = $access_type eq "free";
        my $access_path = $is_free
            ? 'meta/access-conditions/access@type=free'
            : 'meta/access-conditions/access@type=institution';
        my $acc_tag = create_element_path($access_path, $index_root, $namespace);
        create_text_path('name', $access_type, $acc_tag, $namespace) unless $is_free;
    }

    # convert bib entries
    my $cnt = convert_bib($input_node, $index_root, $index_doc);
    if ($cnt == 0) {
        # error or nothing to convert
        logger('ERROR', "no bibliographic metadata!");
        $errcnt++;
        return;
    }

    # write new index.meta file
    if (not $dry_run) {
        write_xml($index_doc, "$doc_dir/index.meta");
    } else {
        logger('DEBUG', "would write $doc_dir/index.meta");
        logger('DEBUG', $index_doc->toString(1));
    }

}
451 | |
452 | |
453 | |
454 | |
455 | |
#######################################################
# Main
#

# load filemaker xml dump
my ($input_doc, $input_root) = read_xml($infile);

# set namespace prefix: bind 'fm' to the namespace of the dump's root
# element; the subs above address rows and fields as fm:ROW / fm:<field>
my $fm_namespace = $input_root->namespaceURI();
$input_root->setNamespace($fm_namespace, 'fm', 1);


process_all_fm_entries($input_root);


# summary; any error makes the whole run fail with exit code 1
for my $summary ("$warncnt warnings", "$errcnt errors") {
    logger("INFO", $summary);
}
if ($errcnt <= 0) {
    logger("DONE", "done something successfully!");
} else {
    logger("ABORT", "there were errors!");
    exit 1;
}
478 |