Annotation of foxridge-archiver/makemeta-vlp.pl, revision 1.8
1.1 casties 1: #!/usr/local/bin/perl -w
2:
3: use strict;
4: use XML::LibXML;
5:
1.2 casties 6: use lib '/usr/local/mpiwg/archive';
1.1 casties 7: use MPIWGStor;
8:
9: # make output unbuffered
10: $|=1;
11:
12: # program version
my $version = "0.2.6 (1.2.2009 ROC)";

# Usage text; it is printed when the program is called without arguments.
# (Fixed the "dont'do" typo in the -dry-run line.)
my $help =
"use: makemeta-vlp [options] file.xml
options:
-debug show debugging info
-dry-run simulate, don't do anything
-replace replace existing index files
-online-mode mode for creating online/permanent files
-archive-mode mode for creating archive/data files
-access=free adds free access tag for online-mode
";

# announce program name and version
logger("INFO", "makemeta-vlp $version");
25:
26: ###########################################
27: # mappings
28:
# Fields that are converted the same way for every entry, whatever its
# reference type: FileMaker field name => element path in index.meta.
my %gen_map = (
    derivedFrom       => 'derived-from/archive-path',
    productionComment => 'meta/image-acquisition/production-comment',
    Custom2_Language  => 'meta/lang',
);

# The field whose value selects the sub type mapping, together with the
# attribute path that receives the mapped type name.
my %type_map = (
    ReferenceType => 'meta/bib@type',
);
# Sub type mappings, selected by the value of the type field.
#
# Each entry maps FileMaker field names to element paths in index.meta.
# Two kinds of keys are special:
#   '_name'  - the bib type name written into the type attribute
#   '#text'  - a key starting with '#' is not a field name; the text after
#              the '#' is written as a constant value to the given path
#
# NOTE(review): the "Cover pages only" constant is attached to '(Book)' but
# not 'Book', whereas for the edited books it is attached to 'Edited Book'
# and not '(Edited Book)'. That looks swapped, but it is left unchanged here
# because it alters the generated metadata -- please confirm.
my %subtype_map = (
    'Book' => {
        _name           => 'book',
        Author          => 'meta/bib/author',
        Title           => 'meta/bib/title',
        Year            => 'meta/bib/year',
        Place_Published => 'meta/bib/city',
        Publisher       => 'meta/bib/publisher',
        Edition         => 'meta/bib/edition',
        Volume          => 'meta/bib/volume',
        NumberOfVolumes => 'meta/bib/number-of-volumes',
        Pages           => 'meta/bib/number-of-pages',
    },
    '(Book)' => {
        _name           => 'book',
        Author          => 'meta/bib/author',
        Title           => 'meta/bib/title',
        Year            => 'meta/bib/year',
        Place_Published => 'meta/bib/city',
        Publisher       => 'meta/bib/publisher',
        Edition         => 'meta/bib/edition',
        Volume          => 'meta/bib/volume',
        NumberOfVolumes => 'meta/bib/number-of-volumes',
        Pages           => 'meta/bib/number-of-pages',
        '#Cover pages only, book sections have been extracted' => 'meta/bib/comment',
    },
    'Book Section' => {
        _name           => 'inbook',
        Author          => 'meta/bib/author',
        Title           => 'meta/bib/title',
        Year            => 'meta/bib/year',
        SecondaryTitle  => 'meta/bib/book-title',
        SecondaryAuthor => 'meta/bib/editor',
        Volume          => 'meta/bib/volume',
        NumberOfVolumes => 'meta/bib/number-of-volumes',
        Pages           => 'meta/bib/pages',
    },
    'Edited Book' => {
        _name           => 'edited-book',
        Author          => 'meta/bib/editor',
        Title           => 'meta/bib/title',
        Year            => 'meta/bib/year',
        Place_Published => 'meta/bib/city',
        Publisher       => 'meta/bib/publisher',
        Edition         => 'meta/bib/edition',
        Volume          => 'meta/bib/volume',
        NumberOfVolumes => 'meta/bib/number-of-volumes',
        Pages           => 'meta/bib/number-of-pages',
        '#Cover pages only, book sections have been extracted' => 'meta/bib/comment',
    },
    '(Edited Book)' => {
        _name           => 'edited-book',
        Author          => 'meta/bib/editor',
        Title           => 'meta/bib/title',
        Year            => 'meta/bib/year',
        Place_Published => 'meta/bib/city',
        Publisher       => 'meta/bib/publisher',
        Edition         => 'meta/bib/edition',
        Volume          => 'meta/bib/volume',
        NumberOfVolumes => 'meta/bib/number-of-volumes',
        Pages           => 'meta/bib/number-of-pages',
    },
    'Journal Article' => {
        _name          => 'journal-article',
        Author         => 'meta/bib/author',
        Title          => 'meta/bib/title',
        Year           => 'meta/bib/year',
        SecondaryTitle => 'meta/bib/journal',
        Volume         => 'meta/bib/volume',
        Number_Issue   => 'meta/bib/issue',
        Pages          => 'meta/bib/pages',
    },
    '(JournalVolume)' => {
        _name           => 'journal-volume',
        SecondaryTitle  => 'meta/bib/title',
        SecondaryAuthor => 'meta/bib/editor',
        Publisher       => 'meta/bib/publisher',
        Place_Published => 'meta/bib/city',
        Year            => 'meta/bib/year',
        Volume          => 'meta/bib/volume',
        Pages           => 'meta/bib/number-of-pages',
        '#Cover pages only, articles have been extracted' => 'meta/bib/comment',
    },
    'Magazine Article' => {
        _name           => 'magazine-article',
        Author          => 'meta/bib/author',
        Title           => 'meta/bib/title',
        Year            => 'meta/bib/year',
        # Every other sub type reads this field as 'SecondaryTitle'; with
        # only the underscore spelling the magazine name was most likely
        # never mapped. The old spelling is kept in case an export uses it.
        SecondaryTitle  => 'meta/bib/magazine',
        Secondary_Title => 'meta/bib/magazine',
        Number_Issue    => 'meta/bib/issue-number',
        Date            => 'meta/bib/issue-date',
        Pages           => 'meta/bib/pages',
    },
    'Report' => {
        _name           => 'report',
        Author          => 'meta/bib/author',
        Title           => 'meta/bib/title',
        Year            => 'meta/bib/year',
        Place_Published => 'meta/bib/city',
        Date            => 'meta/bib/date',
        SecondaryTitle  => 'meta/bib/type',
        Pages           => 'meta/bib/pages',
    },
    'Trade Catalogue' => {
        _name           => 'report',
        Author          => 'meta/bib/author',
        Title           => 'meta/bib/title',
        Year            => 'meta/bib/year',
        Place_Published => 'meta/bib/city',
        Date            => 'meta/bib/date',
        Volume          => 'meta/bib/volume',
        NumberOfVolumes => 'meta/bib/number-of-volumes',
        ReferenceType   => 'meta/bib/type',
        Pages           => 'meta/bib/pages',
    },
    'Thesis' => {
        _name           => 'thesis',
        Author          => 'meta/bib/author',
        Title           => 'meta/bib/title',
        Place_Published => 'meta/bib/city',
        Publisher       => 'meta/bib/university',
        Date            => 'meta/bib/date',
        TypeOfWork      => 'meta/bib/type',
        Pages           => 'meta/bib/number-of-pages',
    },
    'Manuscript' => {
        _name           => 'manuscript',
        Author          => 'meta/bib/author',
        Title           => 'meta/bib/title',
        Year            => 'meta/bib/year',
        Place_Published => 'meta/bib/location',
        Pages           => 'meta/bib/pages',
    },
);
# name of the FileMaker field that carries the document language
my $lang_field = 'Custom2_Language';

# language names as they appear in that field => two-letter ISO codes
my %lang_map = (
    Dutch    => 'nl',
    English  => 'en',
    French   => 'fr',
    German   => 'de',
    Greek    => 'el',
    Italian  => 'it',
    Japanese => 'ja',
    Latin    => 'la',
    Polish   => 'pl',
    Russian  => 'ru',
    Spanish  => 'es',
    Swedish  => 'sv',
);

# name of the FileMaker field that holds the storage ID of an entry
my $arch_id_field = 'ID';
193:
194: #######################################################
195: # internal parameters
196: #
197:
# base directories of the VLP document storage
my $lib_arch_dir   = '/mpiwg/archive/data/vlp';
my $lib_online_dir = '/mpiwg/online/permanent/vlp';

# parse the command line; without any parameter just show the usage text
my $args = MPIWGStor::parseargs;
if (not %{$args}) {
    print $help, "\n";
    exit 1;
}

# debug level
# ($debug is not declared in this file -- presumably exported by MPIWGStor)
$debug = exists($args->{'debug'}) ? $args->{'debug'} : 0;

# -dry-run: simulate action only
my $dry_run = exists($args->{'dry-run'}) ? $args->{'dry-run'} : 0;
logger('DEBUG', "dry-run: $dry_run");

# -replace: overwrite existing index files
my $do_replace = exists($args->{'replace'}) ? $args->{'replace'} : 0;
logger('DEBUG', "replace: $do_replace");

# -online-mode: create files for online/permanent
my $online_mode = exists($args->{'online-mode'}) ? $args->{'online-mode'} : 0;
logger('DEBUG', "online_mode: $online_mode");

# -archive-mode: create files for archive/data
my $archive_mode = exists($args->{'archive-mode'}) ? $args->{'archive-mode'} : 0;
logger('DEBUG', "archive_mode: $archive_mode");

# -access=...: access type, empty when not given
my $access_type = exists($args->{'access'}) ? $args->{'access'} : "";

# index.meta namespace (not really implemented!)
my $namespace = "";

# bookkeeping shared by the subroutines
my $xml_changed = 0;
my $errcnt      = 0;
my $warncnt     = 0;
238:
239: #######################################################
240: # check parameters that were passed to the program
241: #
# The input file is taken from the 'path' entry of the parsed arguments.
my $infile = $$args{'path'};
if (! $infile) {
    logger("ABORT", "no input file given!");
    exit 1;
}
# Collapse every run of slashes into a single slash. The former
# s/\/\//\// had no /g and therefore only repaired the first "//".
$infile =~ s{//+}{/}g;
if (! -f $infile) {
    logger("ABORT", "input file \'$infile\' doesn't exist!");
    exit 1;
}
253:
254:
255: #######################################################
256: # subroutines
257: #
258:
259:
# Find the archive directory that belongs to a FileMaker entry.
#
# Parameter: $input_node - node of the entry; its ID is read with the XPath
#            "fm:$arch_id_field".
# Returns:   "$lib_arch_dir/lit<ID>" if that directory exists, otherwise
#            nothing (bare return).
sub find_arch_dir {
    my ($input_node) = @_;

    # Clean the ID with sstrip exactly like find_permanent_dir does, so both
    # lookups build the directory name from the same value (sstrip comes
    # from MPIWGStor; presumably it trims surrounding whitespace).
    my $bib_id = sstrip($input_node->findvalue("fm:$arch_id_field"));
    return unless $bib_id;

    my $dir = "$lib_arch_dir/lit$bib_id";
    if (-d $dir) {
        logger('DEBUG', "directory $dir exists");
        return $dir;
    }

    # the caller only reports that no directory was found; name it here
    logger('DEBUG', "directory $dir does not exist");
    return;
}
275:
# Build the online/permanent directory name for a FileMaker entry.
#
# Parameter: $input_node - node of the entry; its ID is read with the XPath
#            "fm:$arch_id_field" and passed through sstrip.
# Returns:   "$lib_online_dir/lit<ID>" (the directory is not checked for
#            existence), or nothing when the entry has no ID; in that case
#            an error is logged and counted.
sub find_permanent_dir {
    my ($input_node) = @_;

    my $dest_id = sstrip($input_node->findvalue("fm:$arch_id_field"));
    unless ($dest_id) {
        logger('ERROR', "no ID field for online permanent entry");
        $errcnt++;
        return;
    }

    return "$lib_online_dir/lit$dest_id";
}
288:
289:
# Convert the bibliographic fields of one FileMaker entry into elements
# below $index_root.
#
# Parameters: $input_node - node of the entry, its child nodes are the fields
#             $index_root - root element of the index.meta being built
#             $index_doc  - the index.meta document (not used here)
# Returns:    the number of fields that were converted, or 0 when the
#             language or the bib type of the entry is unknown (an error is
#             logged and counted in that case).
sub convert_bib {
    my ($input_node, $index_root, $index_doc) = @_;
    my $cnt = 0;
    my $type = "";
    my $type_path = "";

    # first pass: general fields and the type switch
    foreach my $child ($input_node->getChildNodes()) {
        my $field = $child->nodeName();
        my $text = $child->textContent();

        if (exists $gen_map{$field}) {
            if ($field eq $lang_field) {
                # the language has to be translated into its iso code
                unless ($text) {
                    logger('WARNING', "no language tag");
                    $warncnt++;
                    next;
                }
                unless (exists $lang_map{$text}) {
                    logger('ERROR', "unknown language: $text! skipping...");
                    $errcnt++;
                    return 0;
                }
                $text = $lang_map{$text};
            }
            my $target = create_element_path($gen_map{$field}, $index_root, $namespace);
            $target->appendTextNode($text);
            $cnt++;
            next;
        }

        # everything else is ignored in this pass, except the type field
        next unless exists $type_map{$field};

        $type_path = $type_map{$field};
        $type = $text;
        unless (exists $subtype_map{$text}) {
            logger('ERROR', "unknown bib type $text! skipping...");
            $errcnt++;
            return 0;
        }
        my $indextype = $subtype_map{$text}->{'_name'};
        create_element_path("$type_path=$indextype", $index_root, $namespace);
        $cnt++;
    }

    # without a type there are no sub type fields to convert
    return $cnt unless $type;

    # second pass: the fields that belong to the sub type
    foreach my $child ($input_node->getChildNodes()) {
        my $field = $child->nodeName();
        my $text = $child->textContent();
        next unless exists $subtype_map{$type}->{$field};

        my $target = create_element_path($subtype_map{$type}->{$field}, $index_root, $namespace);
        $target->appendTextNode($text);
        $cnt++;
    }

    # constant fields: the key is '#' followed by the text to be written
    foreach my $key (keys %{$subtype_map{$type}}) {
        next unless $key =~ /^\#(.*)/;
        my $constant = $1;
        create_text_path($subtype_map{$type}->{$key}, $constant, $index_root, $namespace);
    }

    return $cnt;
}
360:
361:
362:
# Run process_fm_entry on every fm:ROW node found below $input_root,
# logging the running (zero-based) number of each entry.
sub process_all_fm_entries {
    my ($input_root) = @_;

    my @rows = $input_root->findnodes('fm:ROW');
    for my $idx (0 .. $#rows) {
        logger('INFO', "processing entry $idx ...");
        process_fm_entry($rows[$idx]);
    }
}
373:
374:
# Create the index.meta file for one FileMaker entry.
#
# Parameter: $input_node - node of the entry.
# Returns nothing. The entry is skipped when its document directory cannot
# be determined, when an index.meta already exists and -replace was not
# given, or when convert_bib converted no field. With -dry-run the
# document is only logged instead of written.
sub process_fm_entry {
    my ($input_node) = @_;

    # new document with the root <resource version="1.1" type="MPIWG">
    my $index_doc = XML::LibXML::Document->createDocument('1.0', 'UTF-8');
    my $index_root = $index_doc->createElementNS($namespace, 'resource');
    for my $attr (['version', '1.1'], ['type', 'MPIWG']) {
        $index_root->addChild($index_doc->createAttributeNS($namespace, @{$attr}));
    }
    $index_doc->setDocumentElement($index_root);

    # the archive directory is only used for archive mode without online
    # mode, every other combination uses the online/permanent directory
    my $doc_dir = ($archive_mode and not $online_mode)
        ? find_arch_dir($input_node)
        : find_permanent_dir($input_node);
    unless ($doc_dir) {
        logger('ERROR', "document directory not found! skipping...");
        $errcnt++;
        return;
    }

    # an existing index.meta is only replaced on request
    if (-f "$doc_dir/index.meta" and not $do_replace) {
        logger('DEBUG', "index file in $doc_dir exists");
        return;
    }

    # standard elements: name, dates, creator
    my ($docname, $docpath) = split_file_path($doc_dir);
    my @standard = (
        ['name',                  $docname],
        ['archive-path',          $doc_dir],
        ['archive-creation-date', stime(time)],
        ['creator',               'vlp'],
        ['description',           'a scanned document'],
    );
    # acquisition information is added in archive mode only
    if ($archive_mode) {
        push @standard,
            ['meta/acquisition/date', stime(time)],
            ['meta/acquisition/provider/provider-id', 'vlp'],
            ['meta/acquisition/provider/address', 'Max Planck Institute for the History of Science'];
    }
    # media
    push @standard,
        ['media-type',        'image'],
        ['meta/content-type', 'scanned document'];
    for my $entry (@standard) {
        my ($path, $text) = @{$entry};
        create_text_path($path, $text, $index_root, $namespace);
    }

    # access: "free", or the name of an institution
    if (not $access_type) {
        # no access tag requested
    }
    elsif ($access_type eq "free") {
        create_element_path('meta/access-conditions/access@type=free', $index_root, $namespace);
    }
    else {
        my $acc_tag = create_element_path('meta/access-conditions/access@type=institution', $index_root, $namespace);
        create_text_path('name', $access_type, $acc_tag, $namespace);
    }

    # bibliographic fields; 0 means error or nothing to convert
    my $cnt = convert_bib($input_node, $index_root, $index_doc);
    if ($cnt == 0) {
        logger('ERROR', "no bibliographic metadata!");
        $errcnt++;
        return;
    }

    # write the new index.meta file, or just show it
    unless ($dry_run) {
        write_xml($index_doc, "$doc_dir/index.meta");
        return;
    }
    logger('DEBUG', "would write $doc_dir/index.meta");
    logger('DEBUG', $index_doc->toString(1));

}
451:
452:
453:
454:
455:
456: #######################################################
457: # Main
458: #
459:
# read the FileMaker XML dump
my ($input_doc, $input_root) = read_xml($infile);

# give the namespace of the root element the prefix 'fm'; the XPath
# expressions in the subroutines above address the nodes as fm:...
my $fm_namespace = $input_root->namespaceURI();
$input_root->setNamespace($fm_namespace, 'fm', 1);

# convert all entries of the dump
process_all_fm_entries($input_root);

# summary; any error makes the program exit with status 1
for my $summary ("$warncnt warnings", "$errcnt errors") {
    logger("INFO", $summary);
}
if ($errcnt > 0) {
    logger("ABORT", "there were errors!");
    exit 1;
}
logger("DONE", "done something successfully!");
478:
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>