Annotation of foxridge-archiver/makemeta-vlp.pl, revision 1.7
1.1 casties 1: #!/usr/local/bin/perl -w
2:
3: use strict;
4: use XML::LibXML;
5:
1.2 casties 6: use lib '/usr/local/mpiwg/archive';
1.1 casties 7: use MPIWGStor;
8:
9: # make output unbuffered
10: $|=1;
11:
# program version
my $version = "0.2.5 (14.12.2006 ROC)";
# usage text; it is printed further down when no command line
# arguments were given at all
my $help =
"use: makemeta-vlp [options] file.xml
options:
-debug show debugging info
-dry-run simulate, don't do anything
-replace replace existing index files
-online-mode mode for creating online/permanent files
-archive-mode mode for creating archive/data files
-access=free adds free access tag for online-mode
";
# announce program name and version at startup
logger("INFO", "makemeta-vlp $version");
25:
26: ###########################################
27: # mappings
28:
# generic mappings at top level:
# input field name => element path in index.meta
my %gen_map = (
    'Custom2_Language'  => 'meta/lang',
    'productionComment' => 'meta/image-acquisition/production-comment',
    'derivedFrom'       => 'derived-from/archive-path',
);

# sub type switch tag: the input field whose value selects one of the
# entries of %subtype_map, and the attribute path the type is written to
my %type_map = (
    'ReferenceType' => 'meta/bib@type',
);

# field groups shared by several sub types
my %author_title_year = (
    'Author' => 'meta/bib/author',
    'Title'  => 'meta/bib/title',
    'Year'   => 'meta/bib/year',
);
my %monograph_fields = (
    'Title'           => 'meta/bib/title',
    'Year'            => 'meta/bib/year',
    'Place_Published' => 'meta/bib/city',
    'Publisher'       => 'meta/bib/publisher',
    'Edition'         => 'meta/bib/edition',
    'Volume'          => 'meta/bib/volume',
    'NumberOfVolumes' => 'meta/bib/number-of-volumes',
    'Pages'           => 'meta/bib/number-of-pages',
);

# keys starting with '#' are not input fields: the text after the '#'
# is written as a constant value to the mapped path
my $sections_extracted = '#Cover pages only, book sections have been extracted';
my $articles_extracted = '#Cover pages only, articles have been extracted';

# sub type mappings:
# value of the type field => { '_name' => type name in index.meta,
#                              input field name => element path, ... }
my %subtype_map = (
    'Book' => {
        '_name'  => 'book',
        'Author' => 'meta/bib/author',
        %monograph_fields,
    },
    '(Book)' => {
        '_name'  => 'book',
        'Author' => 'meta/bib/author',
        %monograph_fields,
        $sections_extracted => 'meta/bib/comment',
    },
    'Book Section' => {
        '_name' => 'inbook',
        %author_title_year,
        'SecondaryTitle'  => 'meta/bib/book-title',
        'SecondaryAuthor' => 'meta/bib/editor',
        'Volume'          => 'meta/bib/volume',
        'NumberOfVolumes' => 'meta/bib/number-of-volumes',
        'Pages'           => 'meta/bib/pages',
    },
    # NOTE(review): here the plain 'Edited Book' carries the constant
    # comment and '(Edited Book)' does not, the opposite of
    # 'Book' / '(Book)' -- confirm that this is intended
    'Edited Book' => {
        '_name'  => 'edited-book',
        'Author' => 'meta/bib/editor',
        %monograph_fields,
        $sections_extracted => 'meta/bib/comment',
    },
    '(Edited Book)' => {
        '_name'  => 'edited-book',
        'Author' => 'meta/bib/editor',
        %monograph_fields,
    },
    'Journal Article' => {
        '_name' => 'journal-article',
        %author_title_year,
        'SecondaryTitle' => 'meta/bib/journal',
        'Volume'         => 'meta/bib/volume',
        'Number_Issue'   => 'meta/bib/issue',
        'Pages'          => 'meta/bib/pages',
    },
    '(JournalVolume)' => {
        '_name'           => 'journal-volume',
        'SecondaryTitle'  => 'meta/bib/title',
        'SecondaryAuthor' => 'meta/bib/editor',
        'Publisher'       => 'meta/bib/publisher',
        'Place_Published' => 'meta/bib/city',
        'Year'            => 'meta/bib/year',
        'Volume'          => 'meta/bib/volume',
        'Pages'           => 'meta/bib/number-of-pages',
        $articles_extracted => 'meta/bib/comment',
    },
    'Magazine Article' => {
        '_name' => 'magazine-article',
        %author_title_year,
        # NOTE(review): every other sub type spells this field
        # 'SecondaryTitle' (no underscore) -- verify against the input data
        'Secondary_Title' => 'meta/bib/magazine',
        'Number_Issue'    => 'meta/bib/issue-number',
        'Date'            => 'meta/bib/issue-date',
        'Pages'           => 'meta/bib/pages',
    },
    'Report' => {
        '_name' => 'report',
        %author_title_year,
        'Place_Published' => 'meta/bib/city',
        'Date'            => 'meta/bib/date',
        'SecondaryTitle'  => 'meta/bib/type',
        'Pages'           => 'meta/bib/pages',
    },
    'Trade Catalogue' => {
        '_name' => 'report',
        %author_title_year,
        'Place_Published' => 'meta/bib/city',
        'Date'            => 'meta/bib/date',
        'Volume'          => 'meta/bib/volume',
        'NumberOfVolumes' => 'meta/bib/number-of-volumes',
        'ReferenceType'   => 'meta/bib/type',
        'Pages'           => 'meta/bib/pages',
    },
    'Thesis' => {
        '_name'           => 'thesis',
        'Author'          => 'meta/bib/author',
        'Title'           => 'meta/bib/title',
        'Place_Published' => 'meta/bib/city',
        'Publisher'       => 'meta/bib/university',
        'Date'            => 'meta/bib/date',
        'TypeOfWork'      => 'meta/bib/type',
        'Pages'           => 'meta/bib/number-of-pages',
    },
    'Manuscript' => {
        '_name' => 'manuscript',
        %author_title_year,
        'Place_Published' => 'meta/bib/location',
        'Pages'           => 'meta/bib/pages',
    },
);

# language element
my $lang_field = 'Custom2_Language';
# languages to iso codes
my %lang_map = (
    'German'   => 'de',
    'English'  => 'en',
    'Italian'  => 'it',
    'French'   => 'fr',
    'Latin'    => 'la',
    'Japanese' => 'ja',
    'Dutch'    => 'nl',
    'Spanish'  => 'es',
    'Swedish'  => 'sv',
);

# storage fields
my $arch_id_field = 'ID';
190:
191: #######################################################
192: # internal parameters
193: #
194:
# storage: base directories below which the document directories
# ("lit<ID>") are looked up
my $lib_arch_dir = '/mpiwg/archive/data/vlp';
my $lib_online_dir = '/mpiwg/online/permanent/vlp';

# read command line parameters
my $args = MPIWGStor::parseargs;
if (! scalar(%$args)) {
    # no arguments at all: show usage and give up
    print $help, "\n";
    exit 1;
}

# debug level
# NOTE(review): $debug is not declared in this file; presumably it is
# the variable exported by MPIWGStor -- confirm
$debug = (exists $args->{'debug'}) ? $args->{'debug'} : 0;

# simulate action only
my $dry_run = (exists $args->{'dry-run'}) ? $args->{'dry-run'} : 0;
logger('DEBUG', "dry-run: $dry_run");

# replace existing index files
my $do_replace = (exists $args->{'replace'}) ? $args->{'replace'} : 0;
logger('DEBUG', "replace: $do_replace");

# use online mode
my $online_mode = (exists $args->{'online-mode'}) ? $args->{'online-mode'} : 0;
logger('DEBUG', "online_mode: $online_mode");

# use archive mode
my $archive_mode = (exists $args->{'archive-mode'}) ? $args->{'archive-mode'} : 0;
logger('DEBUG', "archive_mode: $archive_mode");

# access type ("free" or the name of an institution; empty: no access tag)
my $access_type = (exists $args->{'access'}) ? $args->{'access'} : "";

# index.meta namespace (not really implemented!)
my $namespace = "";


my $xml_changed = 0;
# global error and warning counters, reported at the end of the run
my $errcnt = 0;
my $warncnt = 0;

#######################################################
# check parameters that were passed to the program
#
my $infile = $args->{'path'};
if (! $infile) {
    logger("ABORT", "no input file given!");
    exit 1;
}
# strip double slashes: collapse every run of slashes into a single one
# (the previous pattern had no /g and only fixed the first occurrence)
$infile =~ s{/+}{/}g;
if (! -f $infile) {
    logger("ABORT", "input file \'$infile\' doesn't exist!");
    exit 1;
}
250:
251:
252: #######################################################
253: # subroutines
254: #
255:
256:
# find_arch_dir($input_node)
#
# Looks up the archive directory "$lib_arch_dir/lit<ID>" for an input
# entry, where <ID> is the content of the entry's fm:$arch_id_field
# child element.
#
# Returns the directory path if the ID is set and the directory exists,
# otherwise nothing (undef in scalar context).
sub find_arch_dir {
    my ($input_node) = @_;

    # clean the ID with sstrip, the same way find_permanent_dir does, so
    # that both lookups build the directory name from the same value
    my $bib_id = sstrip($input_node->findvalue("fm:$arch_id_field"));
    return unless $bib_id;

    my $dir = "$lib_arch_dir/lit$bib_id";
    return unless -d $dir;

    logger('DEBUG', "directory $dir exists");
    return $dir;
}
272:
# find_permanent_dir($input_node)
#
# Builds the online/permanent directory name "$lib_online_dir/lit<ID>"
# from the entry's fm:$arch_id_field child element. The directory is
# not checked for existence.
#
# Returns the path, or nothing (after logging an error and increasing
# $errcnt) if the entry has no ID.
sub find_permanent_dir {
    my ($input_node) = @_;
    my $dest_id = sstrip($input_node->findvalue('fm:' . $arch_id_field));

    if ($dest_id) {
        return "$lib_online_dir/lit$dest_id";
    }

    logger('ERROR', "no ID field for online permanent entry");
    $errcnt++;
    return;
}
285:
286:
# convert_bib($input_node, $index_root, $index_doc)
#
# Converts the child elements of one input entry into elements below
# $index_root, using %gen_map, %type_map and %subtype_map.
#
#   $input_node  the input entry whose child nodes are converted
#   $index_root  root element of the index.meta document being built
#   $index_doc   the index.meta document (not used in here)
#
# Returns the number of converted fields. Returns 0 (after logging an
# error and increasing $errcnt) on an unknown language or an unknown
# bibliographic type.
sub convert_bib {
    my ($input_node, $index_root, $index_doc) = @_;
    my $converted = 0;
    my $bib_type = "";

    # first pass: general fields and the type switch
    NODE:
    foreach my $child ($input_node->getChildNodes()) {
        my $field = $child->nodeName();
        my $text = $child->textContent();

        if (exists $gen_map{$field}) {
            if ($field eq $lang_field) {
                # the language has to be translated to its iso code
                unless ($text) {
                    logger('WARNING', "no language tag");
                    $warncnt++;
                    next NODE;
                }
                unless (exists $lang_map{$text}) {
                    logger('ERROR', "unknown language: $text! skipping...");
                    $errcnt++;
                    return 0;
                }
                $text = $lang_map{$text};
            }
            my $target = create_element_path($gen_map{$field}, $index_root, $namespace);
            $target->appendTextNode($text);
            $converted++;
            next NODE;
        }

        # everything else is only interesting if it is the type field
        next NODE unless exists $type_map{$field};

        $bib_type = $text;
        unless (exists $subtype_map{$text}) {
            logger('ERROR', "unknown bib type $text! skipping...");
            $errcnt++;
            return 0;
        }
        # write the type name as attribute value ("path@attr=value")
        my $type_attr = $type_map{$field} . '=' . $subtype_map{$text}->{'_name'};
        create_element_path($type_attr, $index_root, $namespace);
        $converted++;
    }

    # without a type there are no sub type fields to process
    return $converted unless $bib_type;

    # second pass: fields that depend on the bibliographic type
    my $field_map = $subtype_map{$bib_type};
    foreach my $child ($input_node->getChildNodes()) {
        my $field = $child->nodeName();
        next unless exists $field_map->{$field};
        my $target = create_element_path($field_map->{$field}, $index_root, $namespace);
        $target->appendTextNode($child->textContent());
        $converted++;
    }

    # append additional constant fields (keys beginning with #); these
    # are not counted as converted fields
    foreach my $key (keys %$field_map) {
        next unless $key =~ /^\#(.*)/;
        my $constant = $1;
        create_text_path($field_map->{$key}, $constant, $index_root, $namespace);
    }

    return $converted;
}
357:
358:
359:
# process_all_fm_entries($input_root)
#
# Hands every fm:ROW child of $input_root to process_fm_entry, logging
# the running number (starting at 0) of each entry before it is
# processed.
sub process_all_fm_entries {
    my ($input_root) = @_;
    my @rows = $input_root->findnodes('fm:ROW');

    for my $index (0 .. $#rows) {
        logger('INFO', "processing entry $index ...");
        process_fm_entry($rows[$index]);
    }
}
370:
371:
# process_fm_entry($input_node)
#
# Builds a new index.meta document for one input entry and writes it to
# "<document directory>/index.meta" (or only logs it when $dry_run is
# set).
#
# The entry is skipped (with $errcnt increased) if no document
# directory is found or if convert_bib converts nothing. It is skipped
# silently if an index.meta already exists and $do_replace is not set.
sub process_fm_entry {
    my ($input_node) = @_;

    # empty index.meta document: <resource version="1.1" type="MPIWG">
    my $index_doc = XML::LibXML::Document->createDocument('1.0', 'UTF-8');
    my $index_root = $index_doc->createElementNS($namespace, 'resource');
    foreach my $attr ([ 'version', '1.1' ], [ 'type', 'MPIWG' ]) {
        $index_root->addChild($index_doc->createAttributeNS($namespace, @$attr));
    }
    $index_doc->setDocumentElement($index_root);

    # try to find the document directory: the archive directory is only
    # used in archive mode without online mode, in every other case the
    # online/permanent directory is taken
    my $doc_dir = ($archive_mode and not $online_mode)
        ? find_arch_dir($input_node)
        : find_permanent_dir($input_node);
    unless ($doc_dir) {
        logger('ERROR', "document directory not found! skipping...");
        $errcnt++;
        return;
    }

    # leave an existing index.meta alone unless told to replace it
    if (-f "$doc_dir/index.meta" and not $do_replace) {
        logger('DEBUG', "index file in $doc_dir exists");
        return;
    }

    # shortcut: create a text element below the document root
    my $add_text = sub { create_text_path(@_, $index_root, $namespace) };

    # add standard stuff to index.meta
    my ($docname) = split_file_path($doc_dir);
    # name and date
    $add_text->('name', $docname);
    $add_text->('archive-path', $doc_dir);
    $add_text->('archive-creation-date', stime(time));
    $add_text->('creator', 'vlp');
    $add_text->('description', 'a scanned document');
    if ($archive_mode) {
        # acquisition
        $add_text->('meta/acquisition/date', stime(time));
        $add_text->('meta/acquisition/provider/provider-id', 'vlp');
        $add_text->('meta/acquisition/provider/address', 'Max Planck Institute for the History of Science');
    }
    # media
    $add_text->('media-type', 'image');
    $add_text->('meta/content-type', 'scanned document');

    # access: "free", or an institution whose name is the given value
    if ($access_type and $access_type eq "free") {
        create_element_path('meta/access-conditions/access@type=free', $index_root, $namespace);
    } elsif ($access_type) {
        my $acc_tag = create_element_path('meta/access-conditions/access@type=institution', $index_root, $namespace);
        create_text_path('name', $access_type, $acc_tag, $namespace);
    }

    # convert bib entries
    my $converted = convert_bib($input_node, $index_root, $index_doc);
    if ($converted == 0) {
        # error or nothing to convert
        logger('ERROR', "no bibliographic metadata!");
        $errcnt++;
        return;
    }

    # write new index.meta file
    my $index_file = "$doc_dir/index.meta";
    if ($dry_run) {
        logger('DEBUG', "would write $index_file");
        logger('DEBUG', $index_doc->toString(1));
    } else {
        write_xml($index_doc, $index_file);
    }

}
448:
449:
450:
451:
452:
453: #######################################################
454: # Main
455: #
456:
# load filemaker xml dump
my ($input_doc, $input_root) = read_xml($infile);
# bind the prefix "fm" to the namespace of the root element; the
# "fm:..." paths used by the subroutines rely on it
$input_root->setNamespace($input_root->namespaceURI(), 'fm', 1);


process_all_fm_entries($input_root);


# final report; any error makes the program exit with status 1
logger("INFO", "$warncnt warnings");
logger("INFO", "$errcnt errors");
if ($errcnt > 0) {
    logger("ABORT", "there were errors!");
    exit 1;
}
logger("DONE", "done something successfully!");
475:
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>