1: #!/usr/local/bin/perl -w
2:
3: use strict;
4: use XML::LibXML;
5:
6: use lib '/usr/local/mpiwg/archive';
7: use MPIWGStor;
8:
# Make output unbuffered: with $| set, STDOUT is flushed after every
# write, so log lines show up immediately.
$| = 1;

# Program version, shown in the startup banner below.
my $version = "0.2.3 (27.6.2006 ROC)";

# Usage text; printed further down when no command line parameters
# were parsed.
my $help =
"use: makemeta-vlp [options] file.xml
options:
-debug show debugging info
-dry-run simulate, don't do anything
-replace replace existing index files
-online-mode mode for creating online/permanent files
-archive-mode mode for creating archive/data files
";

# Startup banner. logger() is not defined in this file (presumably it is
# exported by MPIWGStor).
logger("INFO", "makemeta-vlp $version");
24:
###########################################
# mappings
#
# Each table maps the name of a field (child element) of an input entry
# to the slash-separated element path that receives its text in the
# index.meta document.

# generic mappings at top level (looked up for every entry by convert_bib)
my %gen_map = ('Custom2_Language' => 'meta/lang');

# sub type switch tag: the text of this field selects one of the
# %subtype_map entries below
my %type_map = ('ReferenceType' => 'meta/bib@type');

# sub type mappings, keyed by the text of the switch field:
#   '_name'     value convert_bib appends to the type path
#               ("meta/bib@type=<_name>")
#   '#<text>'   constant entry: <text> itself is written to the target path
#   other keys  input field name => target path
my %subtype_map = do {
    # author/title(/year) fields shared by most sub types
    my %author_title = (
        'Author' => 'meta/bib/author',
        'Title'  => 'meta/bib/title',
    );
    my %author_title_year = (%author_title, 'Year' => 'meta/bib/year');

    # publication details common to the four book variants
    my %book_details = (
        'Title'           => 'meta/bib/title',
        'Year'            => 'meta/bib/year',
        'Place_Published' => 'meta/bib/city',
        'Publisher'       => 'meta/bib/publisher',
        'Edition'         => 'meta/bib/edition',
        'Volume'          => 'meta/bib/volume',
        'NumberOfVolumes' => 'meta/bib/number-of-volumes',
        'Pages'           => 'meta/bib/number-of-pages',
    );
    my %book = (
        '_name'  => 'book',
        'Author' => 'meta/bib/author',
        %book_details,
    );
    # for edited books the Author field holds the editor
    my %edited_book = (
        '_name'  => 'edited-book',
        'Author' => 'meta/bib/editor',
        %book_details,
    );
    my %sections_extracted = (
        '#Cover pages only, book sections have been extracted' => 'meta/bib/comment',
    );

    (
        'Book'   => { %book },
        '(Book)' => { %book, %sections_extracted },
        'Book Section' => {
            '_name' => 'inbook',
            %author_title_year,
            'SecondaryTitle'  => 'meta/bib/book-title',
            'SecondaryAuthor' => 'meta/bib/editor',
            'Volume'          => 'meta/bib/volume',
            'NumberOfVolumes' => 'meta/bib/number-of-volumes',
            'Pages'           => 'meta/bib/pages',
        },
        # NOTE(review): for books and journal volumes the constant
        # "Cover pages only" comment sits on the parenthesized type, here it
        # sits on the plain 'Edited Book' and '(Edited Book)' has none --
        # possibly swapped; confirm before changing.
        'Edited Book'   => { %edited_book, %sections_extracted },
        '(Edited Book)' => { %edited_book },
        'Journal Article' => {
            '_name' => 'journal-article',
            %author_title_year,
            'SecondaryTitle' => 'meta/bib/journal',
            'Volume'         => 'meta/bib/volume',
            'Number_Issue'   => 'meta/bib/issue',
            'Pages'          => 'meta/bib/pages',
        },
        '(JournalVolume)' => {
            '_name'           => 'journal-volume',
            'SecondaryTitle'  => 'meta/bib/title',
            'SecondaryAuthor' => 'meta/bib/editor',
            'Publisher'       => 'meta/bib/publisher',
            'Place_Published' => 'meta/bib/city',
            'Year'            => 'meta/bib/year',
            'Volume'          => 'meta/bib/volume',
            'Pages'           => 'meta/bib/number-of-pages',
            '#Cover pages only, articles have been extracted' => 'meta/bib/comment',
        },
        'Magazine Article' => {
            '_name' => 'magazine-article',
            %author_title_year,
            # NOTE(review): every other sub type spells this field
            # 'SecondaryTitle' (no underscore) -- verify against the dump.
            'Secondary_Title' => 'meta/bib/magazine',
            'Number_Issue'    => 'meta/bib/issue-number',
            'Date'            => 'meta/bib/issue-date',
            'Pages'           => 'meta/bib/pages',
        },
        'Report' => {
            '_name' => 'report',
            %author_title_year,
            'Place_Published' => 'meta/bib/city',
            'Date'            => 'meta/bib/date',
            'SecondaryTitle'  => 'meta/bib/type',
            'Pages'           => 'meta/bib/pages',
        },
        # trade catalogues are written as reports; the original type name
        # goes into meta/bib/type
        'Trade Catalogue' => {
            '_name' => 'report',
            %author_title_year,
            'Place_Published' => 'meta/bib/city',
            'Date'            => 'meta/bib/date',
            'Volume'          => 'meta/bib/volume',
            'NumberOfVolumes' => 'meta/bib/number-of-volumes',
            'ReferenceType'   => 'meta/bib/type',
            'Pages'           => 'meta/bib/pages',
        },
        'Thesis' => {
            '_name' => 'thesis',
            %author_title,
            'Place_Published' => 'meta/bib/city',
            'Publisher'       => 'meta/bib/university',
            'Date'            => 'meta/bib/date',
            'TypeOfWork'      => 'meta/bib/type',
            'Pages'           => 'meta/bib/number-of-pages',
        },
        'Manuscript' => {
            '_name' => 'manuscript',
            %author_title_year,
            'Place_Published' => 'meta/bib/location',
            'Pages'           => 'meta/bib/pages',
        },
    );
};

# language element (this field's text is translated through %lang_map)
my $lang_field = 'Custom2_Language';

# languages to iso codes
my %lang_map = (
    German   => 'de',
    English  => 'en',
    Italian  => 'it',
    French   => 'fr',
    Latin    => 'la',
    Japanese => 'ja',
    Dutch    => 'nl',
    Spanish  => 'es',
    Swedish  => 'sv',
);

# storage fields: name of the field holding the document ID
my $arch_id_field = 'ID';
187:
#######################################################
# internal parameters
#

# storage
# archive base directory; find_arch_dir looks for "lit<ID>" folders in it
my $lib_arch_dir = '/mpiwg/archive/data/vlp';
# NOTE(review): not referenced anywhere in the visible code --
# find_permanent_dir uses its own hard-coded '/mpiwg/online/permanent'
# (without the 'vlp' component). Confirm which base path is intended.
my $lib_online_dir = '/mpiwg/online/permanent/vlp';

# read command line parameters; if nothing was parsed just show the help
my $args = MPIWGStor::parseargs();
if (! scalar(%$args)) {
    print $help, "\n";
    exit 1;
}

# debug level
# ($debug is not declared in this file; presumably it is a variable
# exported by MPIWGStor -- confirm)
$debug = (exists $args->{'debug'}) ? $args->{'debug'} : 0;

# simulate action only
my $dry_run = (exists $args->{'dry-run'}) ? $args->{'dry-run'} : 0;
logger('DEBUG', "dry-run: $dry_run");

# replace existing index files
my $do_replace = (exists $args->{'replace'}) ? $args->{'replace'} : 0;
logger('DEBUG', "replace: $do_replace");

# use online mode
my $online_mode = (exists $args->{'online-mode'}) ? $args->{'online-mode'} : 0;
logger('DEBUG', "online_mode: $online_mode");

# use archive mode
my $archive_mode = (exists $args->{'archive-mode'}) ? $args->{'archive-mode'} : 0;
logger('DEBUG', "archive_mode: $archive_mode");

# index.meta namespace (not really implemented!)
my $namespace = "";

# run state: counters reported at the end of the run
# NOTE(review): $xml_changed is never read or written in the visible code
my $xml_changed = 0;
my $errcnt = 0;
my $warncnt = 0;

#######################################################
# check parameters that were passed to the program
#
my $infile = $args->{'path'};
if (! $infile) {
    logger("ABORT", "no input file given!");
    exit 1;
}
# strip double slashes: collapse every run of slashes into a single one
# (the /g matters -- without it only the first "//" would be replaced)
$infile =~ s{/+}{/}g;
if (! -f $infile) {
    logger("ABORT", "input file \'$infile\' doesn't exist!");
    exit 1;
}
244:
245:
246: #######################################################
247: # subroutines
248: #
249:
250:
# Locate the archive directory of one input entry.
#
# Parameters:
#   $input_node - node of the entry; the text of its fm:<$arch_id_field>
#                 child is used as document ID
# Returns:
#   "$lib_arch_dir/lit<ID>" if that directory exists, otherwise nothing
#   (undef in scalar context). Errors are left to the caller to report.
sub find_arch_dir {
    my ($input_node) = @_;

    # Normalize the ID with sstrip() exactly as find_permanent_dir does
    # (sstrip is not defined in this file; presumably it trims surrounding
    # whitespace), so both lookups derive the same folder name from a field.
    my $bib_id = sstrip($input_node->findvalue("fm:$arch_id_field"));
    if (! $bib_id) {
        logger('DEBUG', "no ID field for archive entry");
        return;
    }

    my $dir = "$lib_arch_dir/lit$bib_id";
    if (-d $dir) {
        logger('DEBUG', "directory $dir exists");
        return $dir;
    }
    logger('DEBUG', "directory $dir does not exist");
    return;
}
266:
# Build the online/permanent directory name of one input entry.
#
# Parameters:
#   $input_node - node of the entry; the text of its fm:<$arch_id_field>
#                 child, passed through sstrip(), is used as document ID
# Returns:
#   "/mpiwg/online/permanent/lit<ID>" -- the directory is not checked for
#   existence -- or nothing if the entry has no ID; in that case an ERROR
#   is logged and $errcnt is incremented.
#
# NOTE(review): the base path is hard-coded here and lacks the 'vlp'
# component of the file-level $lib_online_dir, which is otherwise unused.
# Confirm which of the two is intended.
sub find_permanent_dir {
    my ($input_node) = @_;
    my $permanent_root = '/mpiwg/online/permanent';

    my $entry_id = sstrip($input_node->findvalue("fm:$arch_id_field"));
    unless ($entry_id) {
        logger('ERROR', "no ID field for online permanent entry");
        $errcnt++;
        return;
    }
    return "$permanent_root/lit$entry_id";
}
279:
280:
# Copy the bibliographic fields of one input entry into the index.meta tree.
#
# Parameters:
#   $input_node - node of the entry; its child nodes are the fields
#   $index_root - root element of the index.meta document being built
#   $index_doc  - the index.meta document (not used in this sub)
# Returns:
#   the number of fields written (general fields, the type switch and
#   sub type fields; the constant '#' entries are not counted), or 0 if
#   the language or the bib type is unknown. Both error cases log an
#   ERROR and increment $errcnt; an empty language only logs a WARNING,
#   increments $warncnt and skips that field.
sub convert_bib {
    my ($input_node, $index_root, $index_doc) = @_;
    my $converted = 0;
    my $bib_type = "";

    # pass 1: general fields and the type switch
    CHILD:
    for my $child ($input_node->getChildNodes()) {
        my $field = $child->nodeName();
        my $text = $child->textContent();

        if (exists $gen_map{$field}) {
            # is a general field
            if ($field eq $lang_field) {
                # language field: must be present and known
                unless ($text) {
                    logger('WARNING', "no language tag");
                    $warncnt++;
                    next CHILD;
                }
                unless (exists $lang_map{$text}) {
                    logger('ERROR', "unknown language: $text! skipping...");
                    $errcnt++;
                    return 0;
                }
                # convert to iso code
                $text = $lang_map{$text};
            }
            my $target = create_element_path($gen_map{$field}, $index_root, $namespace);
            $target->appendTextNode($text);
            $converted++;
            next CHILD;
        }

        next CHILD unless exists $type_map{$field};

        # is a type field: remember the type and check it against the
        # known sub types
        $bib_type = $text;
        unless (exists $subtype_map{$text}) {
            logger('ERROR', "unknown bib type $text! skipping...");
            $errcnt++;
            return 0;
        }
        my $type_spec = $type_map{$field} . '=' . $subtype_map{$text}->{'_name'};
        create_element_path($type_spec, $index_root, $namespace);
        $converted++;
    }

    # without a (true) type value there are no sub type fields to process
    return $converted unless $bib_type;

    # pass 2: fields of the selected sub type
    my $field_map = $subtype_map{$bib_type};
    for my $child ($input_node->getChildNodes()) {
        my $field = $child->nodeName();
        next unless exists $field_map->{$field};
        my $target = create_element_path($field_map->{$field}, $index_root, $namespace);
        $target->appendTextNode($child->textContent());
        $converted++;
    }

    # append additional constant fields (keys beginning with #): the rest
    # of the key is the text to write
    for my $key (keys %$field_map) {
        if (my ($constant) = $key =~ /^\#(.*)/) {
            create_text_path($field_map->{$key}, $constant, $index_root, $namespace);
        }
    }

    return $converted;
}
351:
352:
353:
# Run process_fm_entry() on every entry of the input dump.
#
# Parameters:
#   $input_root - root element of the input document; each of its fm:ROW
#                 children is one entry
# Returns:
#   the number of fm:ROW entries visited (whether or not an index.meta
#   file was produced for them)
sub process_all_fm_entries {
    my ($input_root) = @_;
    my $cnt = 0;

    foreach my $row ($input_root->findnodes('fm:ROW')) {
        # entries are numbered from 0 in the log
        logger('INFO', "processing entry $cnt ...");
        process_fm_entry($row);
        $cnt++;
    }

    # explicit result instead of falling off the end of the loop
    return $cnt;
}
364:
365:
# Create the index.meta file for one input entry.
#
# Parameters:
#   $input_node - node of the entry (one fm:ROW)
#
# Steps: build an empty <resource> document, determine the document
# directory, skip the entry if an index.meta already exists there and
# -replace was not given, add the standard fields, convert the
# bibliographic fields with convert_bib() and finally write the file
# (or, with -dry-run, only log it at DEBUG level).
# A missing directory or missing bibliographic data logs an ERROR,
# increments $errcnt and skips the entry.
sub process_fm_entry {
    my ($input_node) = @_;

    # empty index.meta skeleton: resource element with version and type
    my $index_doc = XML::LibXML::Document->createDocument('1.0', 'UTF-8');
    my $index_root = $index_doc->createElementNS($namespace, 'resource');
    for my $attr (['version', '1.1'], ['type', 'MPIWG']) {
        $index_root->addChild($index_doc->createAttributeNS($namespace, @$attr));
    }
    $index_doc->setDocumentElement($index_root);

    # try to find the document directory: the archive directory is used
    # only when archive mode is set and online mode is not
    my $doc_dir = (!$online_mode && $archive_mode)
        ? find_arch_dir($input_node)
        : find_permanent_dir($input_node);
    unless ($doc_dir) {
        logger('ERROR', "document directory not found! skipping...");
        $errcnt++;
        return;
    }

    # leave an existing index.meta alone unless replacing was requested
    if (-f "$doc_dir/index.meta" and not $do_replace) {
        logger('DEBUG', "index file in $doc_dir exists");
        return;
    }

    # standard stuff for index.meta, as [path, text] pairs in output order
    my ($docname) = split_file_path($doc_dir);
    my @fields = (
        # name and date
        ['name', $docname],
        ['archive-path', $doc_dir],
        ['archive-creation-date', stime(time)],
        ['creator', 'vlp'],
        ['description', 'a scanned document'],
    );
    if ($archive_mode) {
        push @fields, (
            # acquisition
            ['meta/acquisition/date', stime(time)],
            ['meta/acquisition/provider/provider-id', 'vlp'],
            ['meta/acquisition/provider/address', 'Max Planck Institute for the History of Science'],
            # image acquisition
            ['meta/image-acquisition/device', 'Flatbed Scanner'],
            ['meta/image-acquisition/image-type', 'Greyscale'],
            ['meta/image-acquisition/production-comment', "Raw scans in 'raw' folder, cleaned pages in 'pages' folder."],
        );
    }
    push @fields, (
        # media
        ['media-type', 'image'],
        ['meta/content-type', 'scanned document'],
    );
    for my $field (@fields) {
        create_text_path(@$field, $index_root, $namespace);
    }

    # convert bib entries; 0 means error or nothing to convert
    my $converted = convert_bib($input_node, $index_root, $index_doc);
    if ($converted == 0) {
        logger('ERROR', "no bibliographic metadata!");
        $errcnt++;
        return;
    }

    # write new index.meta file (only logged in a dry run)
    if (not $dry_run) {
        write_xml($index_doc, "$doc_dir/index.meta");
    } else {
        logger('DEBUG', "would write $doc_dir/index.meta");
        logger('DEBUG', $index_doc->toString(1));
    }
}
437:
438:
439:
440:
441:
#######################################################
# Main
#

# load filemaker xml dump
my ($input_doc, $input_root) = read_xml($infile);

# set namespace prefix: give the root element's own namespace the prefix
# 'fm' (presumably so that the 'fm:...' XPath expressions used in the
# subroutines resolve)
my $fm_namespace = $input_root->namespaceURI();
$input_root->setNamespace($fm_namespace, 'fm', 1);

process_all_fm_entries($input_root);

# summary of the run
for my $summary ("$warncnt warnings", "$errcnt errors") {
    logger("INFO", $summary);
}

# any counted error makes the script exit with status 1
if ($errcnt > 0) {
    logger("ABORT", "there were errors!");
    exit 1;
}
logger("DONE", "done something successfully!");
464:
# FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>