1: #!/usr/local/bin/perl -w
2:
3: use strict;
4: use XML::LibXML;
5:
6: use lib '/usr/local/mpiwg/archive';
7: use MPIWGStor;
8:
# make output (STDOUT) unbuffered
$| = 1;

# program version, reported in the startup log line below
my $version = "0.2.7 (27.8.2010 ROC)";

# usage text; printed when the script is started without any arguments
# NOTE(review): "-texttool" is described as opt-in, but the script sets
# $texttool to 1 when the option is absent -- confirm which one is intended.
my $help =
"use: makemeta-vlp [options] file.xml
options:
-debug show debugging info
-dry-run simulate, don't do anything
-replace replace existing index files
-online-mode mode for creating online/permanent files
-archive-mode mode for creating archive/data files
-access=free adds free access tag for online-mode
-texttool adds texttool tag for online-mode
";
logger("INFO", "makemeta-vlp $version");
26:
###########################################
# mappings
#
# Every table maps the name of a field in the FileMaker XML dump to a path
# inside the generated index.meta document. The paths are handed to
# create_element_path() / create_text_path() by convert_bib().

# generic mappings at top level (independent of the reference type)
my %gen_map = (
    'Custom2_Language'  => 'meta/lang',
    'productionComment' => 'meta/image-acquisition/production-comment',
    'derivedFrom'       => 'derived-from/archive-path'
);

# sub type switch tag: the field whose value selects an entry of %subtype_map;
# convert_bib() appends "=<_name>" to this path before creating the element
my %type_map = (
    'ReferenceType' => 'meta/bib@type'
);

# sub type mappings, keyed by the value of the type field
#
# Special keys inside each sub type:
#   '_name'    value appended to the type path (see %type_map)
#   '#<text>'  constant field: convert_bib() writes <text> (the key without
#              the leading '#') to the mapped path for every entry of the type
#
# Types written in parentheses describe containers whose parts were
# extracted into separate entries; they carry the constant "Cover pages
# only, ..." comment. The comment used to sit on 'Edited Book' instead of
# '(Edited Book)', unlike '(Book)' and '(JournalVolume)'; it now follows the
# same rule as the other container types.
#
# NOTE(review): 'Magazine Article' and 'Newspaper Article' use the field name
# 'Secondary_Title' while every other type uses 'SecondaryTitle' -- verify
# against the field names in the dump.
# NOTE(review): 'Journal' is mapped to the index type 'report' -- confirm
# that this is intended.
my %subtype_map = (
    'Book' => {
        '_name'           => 'book',
        'Author'          => 'meta/bib/author',
        'Title'           => 'meta/bib/title',
        'Year'            => 'meta/bib/year',
        'Place_Published' => 'meta/bib/city',
        'Publisher'       => 'meta/bib/publisher',
        'Edition'         => 'meta/bib/edition',
        'Volume'          => 'meta/bib/volume',
        'NumberOfVolumes' => 'meta/bib/number-of-volumes',
        'Pages'           => 'meta/bib/number-of-pages'
    },
    '(Book)' => {
        '_name'           => 'book',
        'Author'          => 'meta/bib/author',
        'Title'           => 'meta/bib/title',
        'Year'            => 'meta/bib/year',
        'Place_Published' => 'meta/bib/city',
        'Publisher'       => 'meta/bib/publisher',
        'Edition'         => 'meta/bib/edition',
        'Volume'          => 'meta/bib/volume',
        'NumberOfVolumes' => 'meta/bib/number-of-volumes',
        'Pages'           => 'meta/bib/number-of-pages',
        '#Cover pages only, book sections have been extracted' => 'meta/bib/comment'
    },
    'Book Section' => {
        '_name'           => 'inbook',
        'Author'          => 'meta/bib/author',
        'Title'           => 'meta/bib/title',
        'Year'            => 'meta/bib/year',
        'SecondaryTitle'  => 'meta/bib/book-title',
        'SecondaryAuthor' => 'meta/bib/editor',
        'Volume'          => 'meta/bib/volume',
        'NumberOfVolumes' => 'meta/bib/number-of-volumes',
        'Pages'           => 'meta/bib/pages'
    },
    'Edited Book' => {
        '_name'           => 'edited-book',
        'Author'          => 'meta/bib/editor',
        'Title'           => 'meta/bib/title',
        'Year'            => 'meta/bib/year',
        'Place_Published' => 'meta/bib/city',
        'Publisher'       => 'meta/bib/publisher',
        'Edition'         => 'meta/bib/edition',
        'Volume'          => 'meta/bib/volume',
        'NumberOfVolumes' => 'meta/bib/number-of-volumes',
        'Pages'           => 'meta/bib/number-of-pages'
    },
    '(Edited Book)' => {
        '_name'           => 'edited-book',
        'Author'          => 'meta/bib/editor',
        'Title'           => 'meta/bib/title',
        'Year'            => 'meta/bib/year',
        'Place_Published' => 'meta/bib/city',
        'Publisher'       => 'meta/bib/publisher',
        'Edition'         => 'meta/bib/edition',
        'Volume'          => 'meta/bib/volume',
        'NumberOfVolumes' => 'meta/bib/number-of-volumes',
        'Pages'           => 'meta/bib/number-of-pages',
        '#Cover pages only, book sections have been extracted' => 'meta/bib/comment'
    },
    'Journal Article' => {
        '_name'          => 'journal-article',
        'Author'         => 'meta/bib/author',
        'Title'          => 'meta/bib/title',
        'Year'           => 'meta/bib/year',
        'SecondaryTitle' => 'meta/bib/journal',
        'Volume'         => 'meta/bib/volume',
        'Number_Issue'   => 'meta/bib/issue',
        'Pages'          => 'meta/bib/pages'
    },
    '(JournalVolume)' => {
        '_name'           => 'journal-volume',
        'SecondaryTitle'  => 'meta/bib/title',
        'SecondaryAuthor' => 'meta/bib/editor',
        'Publisher'       => 'meta/bib/publisher',
        'Place_Published' => 'meta/bib/city',
        'Year'            => 'meta/bib/year',
        'Volume'          => 'meta/bib/volume',
        'Pages'           => 'meta/bib/number-of-pages',
        '#Cover pages only, articles have been extracted' => 'meta/bib/comment'
    },
    'Journal' => {
        '_name'           => 'report',
        'Title'           => 'meta/bib/title',
        'SecondaryTitle'  => 'meta/bib/institution',
        'Author'          => 'meta/bib/author',
        'Place_Published' => 'meta/bib/city',
        'Year'            => 'meta/bib/year',
        'Date'            => 'meta/bib/date',
        'Pages'           => 'meta/bib/pages',
    },
    'Magazine Article' => {
        '_name'           => 'magazine-article',
        'Author'          => 'meta/bib/author',
        'Title'           => 'meta/bib/title',
        'Year'            => 'meta/bib/year',
        'Secondary_Title' => 'meta/bib/magazine',
        'Number_Issue'    => 'meta/bib/issue-number',
        'Date'            => 'meta/bib/issue-date',
        'Pages'           => 'meta/bib/pages'
    },
    'Newspaper Article' => {
        '_name'           => 'newspaper-article',
        'Author'          => 'meta/bib/author',
        'Title'           => 'meta/bib/title',
        'Year'            => 'meta/bib/year',
        'Secondary_Title' => 'meta/bib/newspaper',
        'Date'            => 'meta/bib/issue-date',
        'Pages'           => 'meta/bib/pages'
    },
    'Report' => {
        '_name'           => 'report',
        'Author'          => 'meta/bib/author',
        'Title'           => 'meta/bib/title',
        'Year'            => 'meta/bib/year',
        'Place_Published' => 'meta/bib/city',
        'Date'            => 'meta/bib/date',
        'SecondaryTitle'  => 'meta/bib/type',
        'Pages'           => 'meta/bib/pages'
    },
    'Trade Catalogue' => {
        '_name'           => 'report',
        'Author'          => 'meta/bib/author',
        'Title'           => 'meta/bib/title',
        'Year'            => 'meta/bib/year',
        'Place_Published' => 'meta/bib/city',
        'Date'            => 'meta/bib/date',
        'Volume'          => 'meta/bib/volume',
        'NumberOfVolumes' => 'meta/bib/number-of-volumes',
        'ReferenceType'   => 'meta/bib/type',
        'Pages'           => 'meta/bib/pages'
    },
    'Thesis' => {
        '_name'           => 'thesis',
        'Author'          => 'meta/bib/author',
        'Title'           => 'meta/bib/title',
        'Place_Published' => 'meta/bib/city',
        'Publisher'       => 'meta/bib/university',
        'Date'            => 'meta/bib/date',
        'TypeOfWork'      => 'meta/bib/type',
        'Pages'           => 'meta/bib/number-of-pages'
    },
    'Manuscript' => {
        '_name'           => 'manuscript',
        'Author'          => 'meta/bib/author',
        'Title'           => 'meta/bib/title',
        'Year'            => 'meta/bib/year',
        'Place_Published' => 'meta/bib/location',
        'Pages'           => 'meta/bib/pages'
    }
);

# language element: the generic field whose value is translated via %lang_map
my $lang_field = 'Custom2_Language';

# languages to iso codes
my %lang_map = (
    'German'   => 'de',
    'English'  => 'en',
    'Italian'  => 'it',
    'French'   => 'fr',
    'Latin'    => 'la',
    'Japanese' => 'ja',
    'Dutch'    => 'nl',
    'Spanish'  => 'es',
    'Swedish'  => 'sv',
    'Russian'  => 'ru',
    'Polish'   => 'pl',
    'Greek'    => 'el'
);

# storage fields: dump fields holding the document ID and the free-access flag
my $arch_id_field = 'ID';
my $access_free_field = 'online';
214:
#######################################################
# internal parameters
#

# storage: base directories below which the "lit<ID>" document
# directories are looked up
my $lib_arch_dir = '/mpiwg/archive/data/vlp';
my $lib_online_dir = '/mpiwg/online/permanent/vlp';

# read command line parameters; without any argument just show the help
my $args = MPIWGStor::parseargs;
if (! scalar(%$args)) {
    print $help, "\n";
    exit 1;
}

# debug level
# NOTE(review): $debug is not declared in this file; presumably it is a
# variable exported by MPIWGStor -- confirm.
$debug = (exists $$args{'debug'}) ? $$args{'debug'} : 0;

# simulate action only
my $dry_run = (exists $$args{'dry-run'}) ? $$args{'dry-run'} : 0;
logger('DEBUG', "dry-run: $dry_run");

# replace existing index files
my $do_replace = (exists $$args{'replace'}) ? $$args{'replace'} : 0;
logger('DEBUG', "replace: $do_replace");

# use online mode
my $online_mode = (exists $$args{'online-mode'}) ? $$args{'online-mode'} : 0;
logger('DEBUG', "online_mode: $online_mode");

# use archive mode
my $archive_mode = (exists $$args{'archive-mode'}) ? $$args{'archive-mode'} : 0;
logger('DEBUG', "archive_mode: $archive_mode");

# create texttool tag (online mode only); on unless the option says otherwise
my $texttool = (exists $$args{'texttool'}) ? $$args{'texttool'} : 1;
logger('DEBUG', "texttool: $texttool");
# image dir for texttool (relative to the document directory)
my $texttool_img_dir = "pages";

# access type ("free", an institution name, or empty when not given)
my $access_type = (exists $$args{'access'}) ? $$args{'access'} : "";

# index.meta namespace (not really implemented!)
my $namespace = "";


# bookkeeping for the final report
my $xml_changed = 0;
my $errcnt = 0;
my $warncnt = 0;

#######################################################
# check parameters that were passed to the program
#
my $infile = $$args{'path'};
if (! $infile) {
    logger("ABORT", "no input file given!");
    exit 1;
}
# strip double slashes: collapse every run of slashes in the path
# (without /g only the first occurrence was replaced)
$infile =~ s{/{2,}}{/}g;
if (! -f $infile) {
    logger("ABORT", "input file \'$infile\' doesn't exist!");
    exit 1;
}
280:
281:
282: #######################################################
283: # subroutines
284: #
285:
286:
# Locate the archive directory that belongs to one entry of the dump.
#
# Reads the entry's ID field ("fm:$arch_id_field") and checks whether
# "$lib_arch_dir/lit<ID>" exists.
#
# Parameters:
#   $input_node - node of the entry; queried with findvalue()
# Returns:
#   the directory path if the directory exists, otherwise nothing
#   (undef in scalar context). A missing ID is only logged at DEBUG level;
#   counting the error is left to the caller.
sub find_arch_dir {
    my ($input_node) = @_;

    # clean the ID with sstrip() exactly like find_permanent_dir does, so both
    # lookups build the directory name from the same value
    # (sstrip presumably trims surrounding whitespace -- confirm in MPIWGStor)
    my $bib_id = sstrip($input_node->findvalue("fm:$arch_id_field"));
    if (! $bib_id) {
        logger('DEBUG', "no ID field for archive entry");
        return;
    }

    my $dir = "$lib_arch_dir/lit$bib_id";
    if (-d $dir) {
        logger('DEBUG', "directory $dir exists");
        return $dir;
    }
    return;
}
302:
# Locate the online/permanent directory that belongs to one entry.
#
# The entry's ID field ("fm:$arch_id_field") is cleaned with sstrip() and
# appended to "$lib_online_dir/lit".
#
# Parameters:
#   $input_node - node of the entry; queried with findvalue()
# Returns:
#   the directory path if it exists, otherwise nothing (undef in scalar
#   context). A missing ID is logged as an error and counted in $errcnt.
sub find_permanent_dir {
    my ($input_node) = @_;

    my $id = sstrip($input_node->findvalue("fm:$arch_id_field"));
    unless ($id) {
        logger('ERROR', "no ID field for online permanent entry");
        $errcnt++;
        return;
    }

    my $candidate = $lib_online_dir . '/lit' . $id;
    return unless -d $candidate;

    logger('DEBUG', "directory $candidate exists");
    return $candidate;
}
319:
320:
# Copy the bibliographic fields of one entry into the index.meta tree.
#
# Pass 1 handles the generic fields (%gen_map, with the language name
# translated through %lang_map) and the type switch (%type_map), which
# selects a table in %subtype_map. Pass 2 copies the fields of the selected
# sub type and then adds its constant fields (keys starting with '#').
#
# Parameters:
#   $input_node - node of the entry whose child nodes are converted
#   $index_root - root element of the index.meta document being built
#   $index_doc  - the index.meta document (not used by this sub)
# Returns:
#   the number of fields written (constant fields are not counted);
#   0 when the language or the bib type is unknown, which is also logged
#   and counted in $errcnt.
sub convert_bib {
    my ($input_node, $index_root, $index_doc) = @_;
    my $converted = 0;
    my $bib_type  = "";
    my @children  = $input_node->getChildNodes();

    # pass 1: generic fields and the type switch
    CHILD:
    for my $child (@children) {
        my $field = $child->nodeName();
        my $text  = $child->textContent();

        if (exists $gen_map{$field}) {
            if ($field eq $lang_field) {
                # an empty language is skipped with a warning only
                unless ($text) {
                    logger('WARNING', "no language tag");
                    $warncnt++;
                    next CHILD;
                }
                # convert to iso code; unknown languages abort the entry
                unless (exists $lang_map{$text}) {
                    logger('ERROR', "unknown language: $text! skipping...");
                    $errcnt++;
                    return 0;
                }
                $text = $lang_map{$text};
            }
            my $target = create_element_path($gen_map{$field}, $index_root, $namespace);
            $target->appendTextNode($text);
            $converted++;
            next CHILD;
        }

        next CHILD unless exists $type_map{$field};

        # type field: its value has to be one of the known sub types
        unless (exists $subtype_map{$text}) {
            logger('ERROR', "unknown bib type $text! skipping...");
            $errcnt++;
            return 0;
        }
        $bib_type = $text;
        my $attr_path = $type_map{$field} . '=' . $subtype_map{$text}->{'_name'};
        create_element_path($attr_path, $index_root, $namespace);
        $converted++;
    }

    # without a type field there is nothing more to convert
    return $converted unless $bib_type;

    # pass 2: fields that belong to the selected sub type
    my $field_map = $subtype_map{$bib_type};
    for my $child (@children) {
        my $field = $child->nodeName();
        my $text  = $child->textContent();
        next unless exists $field_map->{$field};
        my $target = create_element_path($field_map->{$field}, $index_root, $namespace);
        $target->appendTextNode($text);
        $converted++;
    }

    # additional constant fields (keys beginning with #)
    for my $key (keys %{$field_map}) {
        next unless $key =~ /^\#(.*)/;
        # copy the capture before calling out, $1 is a volatile global
        my $constant = $1;
        create_text_path($field_map->{$key}, $constant, $index_root, $namespace);
    }

    return $converted;
}
391:
392:
393:
# Run process_fm_entry() on every fm:ROW node found below the given node.
#
# Parameters:
#   $input_root - node that is searched with findnodes('fm:ROW')
# Each entry is announced in the log with its zero-based position.
sub process_all_fm_entries {
    my ($input_root) = @_;
    my @rows = $input_root->findnodes('fm:ROW');

    for my $idx (0 .. $#rows) {
        logger('INFO', "processing entry $idx ...");
        process_fm_entry($rows[$idx]);
    }
}
404:
405:
# Build the index.meta document for one entry and write it to disk.
#
# Steps: create an empty "resource" document, locate the document directory
# (archive or online/permanent tree, depending on the mode switches), skip
# the entry when an index.meta already exists and replacing was not
# requested, add the standard, access and texttool fields, convert the
# bibliographic fields with convert_bib() and finally write the file (in
# dry-run mode the document is only logged).
#
# Parameters:
#   $input_node - node of the entry to process
# Returns nothing; problems are logged and counted in $errcnt / $warncnt.
sub process_fm_entry {
    my ($input_node) = @_;

    # empty document with a "resource" root carrying version and type
    my $index_doc  = XML::LibXML::Document->createDocument('1.0', 'UTF-8');
    my $index_root = $index_doc->createElementNS($namespace, 'resource');
    for my $attr (['version', '1.1'], ['type', 'MPIWG']) {
        $index_root->addChild($index_doc->createAttributeNS($namespace, @{$attr}));
    }
    $index_doc->setDocumentElement($index_root);

    # try to find the document directory: only archive mode without online
    # mode uses the archive tree, everything else the online/permanent tree
    my $doc_dir = ($archive_mode && !$online_mode)
        ? find_arch_dir($input_node)
        : find_permanent_dir($input_node);
    unless ($doc_dir) {
        logger('ERROR', "document directory not found! skipping...");
        $errcnt++;
        return;
    }

    # an existing index.meta is only overwritten on request
    my $index_file = "$doc_dir/index.meta";
    if ((-f $index_file) and not $do_replace) {
        logger('DEBUG', "index file in $doc_dir exists");
        return;
    }

    # shorthand: text element below the document root
    my $add_text = sub {
        my ($path, $text) = @_;
        create_text_path($path, $text, $index_root, $namespace);
    };
    # shorthand: access tag, free when no institution name is given
    my $add_access = sub {
        my ($institution) = @_;
        if (defined $institution) {
            my $acc_tag = create_element_path('meta/access-conditions/access@type=institution', $index_root, $namespace);
            create_text_path('name', $institution, $acc_tag, $namespace);
        }
        else {
            create_element_path('meta/access-conditions/access@type=free', $index_root, $namespace);
        }
    };

    # standard stuff: name and date
    my ($docname) = split_file_path($doc_dir);
    $add_text->('name', $docname);
    $add_text->('archive-path', $doc_dir);
    $add_text->('archive-creation-date', stime(time));
    $add_text->('creator', 'vlp');
    $add_text->('description', 'a scanned document');

    # acquisition (archive mode only)
    if ($archive_mode) {
        $add_text->('meta/acquisition/date', stime(time));
        $add_text->('meta/acquisition/provider/provider-id', 'vlp');
        $add_text->('meta/acquisition/provider/address', 'Max Planck Institute for the History of Science');
    }

    # media
    $add_text->('media-type', 'image');
    $add_text->('meta/content-type', 'scanned document');

    # access: the command line option wins; otherwise online mode reads the
    # "online" field of the DB dump
    if ($access_type) {
        $add_access->($access_type eq "free" ? undef : $access_type);
    }
    elsif ($online_mode) {
        my $online = sstrip($input_node->findvalue("fm:$access_free_field"));
        $add_access->($online ? undef : 'mpiwg');
    }

    # texttool tag with image dir
    if ($online_mode && $texttool) {
        if (! -d "$doc_dir/$texttool_img_dir") {
            logger('WARNING', "page image directory missing!");
            $warncnt++;
        }
        else {
            $add_text->('meta/texttool/image', $texttool_img_dir);
        }
    }

    # convert bib entries; zero means error or nothing to convert
    my $converted = convert_bib($input_node, $index_root, $index_doc);
    if ($converted == 0) {
        logger('ERROR', "no bibliographic metadata!");
        $errcnt++;
        return;
    }

    # write new index.meta file
    if (! $dry_run) {
        write_xml($index_doc, $index_file);
    }
    else {
        logger('DEBUG', "would write $index_file");
        logger('DEBUG', $index_doc->toString(1));
    }

}
501:
502:
503:
504:
505:
#######################################################
# Main
#

# load filemaker xml dump
my ($input_doc, $input_root) = read_xml($infile);

# bind the prefix 'fm' to the namespace of the dump's root element; the
# XPath expressions in the subs above ("fm:ROW", "fm:<field>") use it
my $fm_namespace = $input_root->namespaceURI();
$input_root->setNamespace($fm_namespace, 'fm', 1);

# convert every entry of the dump
process_all_fm_entries($input_root);

# final report; any error turns into a non-zero exit status
logger("INFO", $_) for ("$warncnt warnings", "$errcnt errors");
if ($errcnt > 0) {
    logger("ABORT", "there were errors!");
    exit 1;
}
logger("DONE", "done something successfully!");
528:
# FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>