Annotation of foxridge-archiver/makemeta-vlp.pl, revision 1.4
1.1 casties 1: #!/usr/local/bin/perl -w
2:
3: use strict;
4: use XML::LibXML;
5:
1.2 casties 6: use lib '/usr/local/mpiwg/archive';
1.1 casties 7: use MPIWGStor;
8:
# make output unbuffered so log lines appear immediately
$| = 1;

# program version
my $version = "0.2.2 (20.6.2006 ROC)";

# usage text, printed when the program is called without any arguments
my $help = <<'END_HELP';
use: makemeta-vlp [options] file.xml
options:
-debug show debugging info
-dry-run simulate, don't do anything
-replace replace existing index files
-online-mode mode for creating online/permanent files
-archive-mode mode for creating archive/data files
END_HELP

# announce program and version at start-up
logger("INFO", "makemeta-vlp $version");
24:
###########################################
# mappings
#
# All tables map a FileMaker field name (the element name in the XML dump)
# to the path of the index.meta element that receives the field's text.
# They are read by convert_bib().

# generic mappings at top level
my %gen_map = (
    'Custom2_Language' => 'meta/lang',
);

# sub type switch tag: the value of this field selects an entry of
# %subtype_map; the entry's '_name' is written to the given attribute path
my %type_map = (
    'ReferenceType' => 'meta/bib@type',
);

# Constant field for the parenthesized types "(Book)" and "(Edited Book)".
# A key beginning with '#' is not a FileMaker field: convert_bib() writes the
# text after the '#' to the given path for every entry of that type.
my %cover_only_comment = (
    '#Cover pages only, book sections have been extracted' => 'meta/bib/comment',
);

# fields shared by 'Book' and '(Book)'
my %book_fields = (
    '_name'           => 'book',
    'Author'          => 'meta/bib/author',
    'Title'           => 'meta/bib/title',
    'Year'            => 'meta/bib/year',
    'Place_Published' => 'meta/bib/city',
    'Publisher'       => 'meta/bib/publisher',
    'Edition'         => 'meta/bib/edition',
    'Volume'          => 'meta/bib/volume',
    'NumberOfVolumes' => 'meta/bib/number-of-volumes',
    'Pages'           => 'meta/bib/number-of-pages',
);

# fields shared by 'Edited Book' and '(Edited Book)': same as a book, but
# the type name differs and the 'Author' field holds the editor
my %edited_book_fields = (
    %book_fields,
    '_name'  => 'edited-book',
    'Author' => 'meta/bib/editor',
);

# sub type mappings: ReferenceType value => field mapping for that type.
# '_name' is the type name written to the index.meta file.
my %subtype_map = (
    'Book'   => { %book_fields },
    '(Book)' => { %book_fields, %cover_only_comment },
    'Book Section' => {
        '_name'           => 'inbook',
        'Author'          => 'meta/bib/author',
        'Title'           => 'meta/bib/title',
        'Year'            => 'meta/bib/year',
        'SecondaryTitle'  => 'meta/bib/book-title',
        'SecondaryAuthor' => 'meta/bib/editor',
        'Volume'          => 'meta/bib/volume',
        'NumberOfVolumes' => 'meta/bib/number-of-volumes',
        'Pages'           => 'meta/bib/pages',
    },
    # The cover-pages comment belongs to the parenthesized type, exactly as
    # for 'Book' / '(Book)'; it used to be attached to 'Edited Book' instead.
    'Edited Book'   => { %edited_book_fields },
    '(Edited Book)' => { %edited_book_fields, %cover_only_comment },
    'Journal Article' => {
        '_name'          => 'journal-article',
        'Author'         => 'meta/bib/author',
        'Title'          => 'meta/bib/title',
        'Year'           => 'meta/bib/year',
        'SecondaryTitle' => 'meta/bib/journal',
        'Volume'         => 'meta/bib/volume',
        'Number_Issue'   => 'meta/bib/issue',
        'Pages'          => 'meta/bib/pages',
    },
    'Magazine Article' => {
        '_name'           => 'magazine-article',
        'Author'          => 'meta/bib/author',
        'Title'           => 'meta/bib/title',
        'Year'            => 'meta/bib/year',
        # Every other type spells this field 'SecondaryTitle'; the old
        # spelling 'Secondary_Title' is kept as well so that a dump using
        # it still maps.  NOTE(review): confirm the real field name and
        # drop the unused spelling.
        'SecondaryTitle'  => 'meta/bib/magazine',
        'Secondary_Title' => 'meta/bib/magazine',
        'Number_Issue'    => 'meta/bib/issue-number',
        'Date'            => 'meta/bib/issue-date',
        'Pages'           => 'meta/bib/pages',
    },
    'Report' => {
        '_name'           => 'report',
        'Author'          => 'meta/bib/author',
        'Title'           => 'meta/bib/title',
        'Year'            => 'meta/bib/year',
        'Place_Published' => 'meta/bib/city',
        'Date'            => 'meta/bib/date',
        'SecondaryTitle'  => 'meta/bib/type',
        'Pages'           => 'meta/bib/pages',
    },
    'Trade Catalogue' => {
        '_name'           => 'report',
        'Author'          => 'meta/bib/author',
        'Title'           => 'meta/bib/title',
        'Year'            => 'meta/bib/year',
        'Place_Published' => 'meta/bib/city',
        'Date'            => 'meta/bib/date',
        'Volume'          => 'meta/bib/volume',
        'NumberOfVolumes' => 'meta/bib/number-of-volumes',
        'ReferenceType'   => 'meta/bib/type',
        'Pages'           => 'meta/bib/pages',
    },
    'Thesis' => {
        '_name'           => 'thesis',
        'Author'          => 'meta/bib/author',
        'Title'           => 'meta/bib/title',
        'Place_Published' => 'meta/bib/city',
        'Publisher'       => 'meta/bib/university',
        'Date'            => 'meta/bib/date',
        'TypeOfWork'      => 'meta/bib/type',
        'Pages'           => 'meta/bib/number-of-pages',
    },
    'Manuscript' => {
        '_name'           => 'manuscript',
        'Author'          => 'meta/bib/author',
        'Title'           => 'meta/bib/title',
        'Year'            => 'meta/bib/year',
        'Place_Published' => 'meta/bib/location',
        'Pages'           => 'meta/bib/pages',
    },
);

# language element: the generic field whose value is translated via %lang_map
my $lang_field = 'Custom2_Language';

# languages to iso codes
my %lang_map = (
    'German'   => 'de',
    'English'  => 'en',
    'Italian'  => 'it',
    'French'   => 'fr',
    'Latin'    => 'la',
    'Japanese' => 'ja',
    'Dutch'    => 'nl',
    'Spanish'  => 'es',
    'Swedish'  => 'sv',
);

# storage fields: the field that holds the ID used to build directory names
my $arch_id_field = 'ID';
176:
#######################################################
# internal parameters
#

# storage: base directories of the archive and online trees
my $lib_arch_dir = '/mpiwg/archive/data/vlp';
my $lib_online_dir = '/mpiwg/online/permanent/vlp';

# read command line parameters; without any, print the usage text and quit
my $args = MPIWGStor::parseargs();
if (! scalar(%$args)) {
    print $help, "\n";
    exit 1;
}

# debug level
# ($debug is not declared in this file; presumably it is exported by
# MPIWGStor -- confirm there)
$debug = (exists $args->{'debug'}) ? $args->{'debug'} : 0;

# simulate action only
my $dry_run = (exists $args->{'dry-run'}) ? $args->{'dry-run'} : 0;
logger('DEBUG', "dry-run: $dry_run");

# replace existing index files
my $do_replace = (exists $args->{'replace'}) ? $args->{'replace'} : 0;
logger('DEBUG', "replace: $do_replace");

# use online mode
my $online_mode = (exists $args->{'online-mode'}) ? $args->{'online-mode'} : 0;
logger('DEBUG', "online_mode: $online_mode");

# use archive mode
my $archive_mode = (exists $args->{'archive-mode'}) ? $args->{'archive-mode'} : 0;
logger('DEBUG', "archive_mode: $archive_mode");

# index.meta namespace (not really implemented!)
my $namespace = "";

# run-wide state: change flag plus error and warning counters
my $xml_changed = 0;
my $errcnt = 0;
my $warncnt = 0;

#######################################################
# check parameters that were passed to the program
#
my $infile = $args->{'path'};
if (! $infile) {
    logger("ABORT", "no input file given!");
    exit 1;
}
# strip double slashes: collapse every run of slashes to a single one
# (without /g only the first "//" in the path was fixed)
$infile =~ s{/{2,}}{/}g;
if (! -f $infile) {
    logger("ABORT", "input file \'$infile\' doesn't exist!");
    exit 1;
}
233:
234:
235: #######################################################
236: # subroutines
237: #
238:
239:
# Find the existing archive directory for one FileMaker entry.
#
# Parameters:
#   $input_node - XML node of the entry; its fm:<$arch_id_field> child
#                 holds the document ID
#
# Returns "$lib_arch_dir/lit<ID>" if that directory exists, otherwise
# nothing (undef in scalar context).  Failures are only logged at DEBUG
# level here; counting the error is left to the caller.
sub find_arch_dir {
    my ($input_node) = @_;

    # Clean the ID with sstrip() exactly as find_permanent_dir() does, so
    # that both modes build the directory name from the same value.
    my $bib_id = sstrip($input_node->findvalue("fm:$arch_id_field"));
    if (! $bib_id) {
        logger('DEBUG', "no ID field for archive entry");
        return;
    }

    my $dir = "$lib_arch_dir/lit$bib_id";
    if (-d $dir) {
        logger('DEBUG', "directory $dir exists");
        return $dir;
    }

    logger('DEBUG', "directory $dir does not exist");
    return;
}
255:
# Build the online/permanent directory name for one FileMaker entry.
#
# Parameters:
#   $input_node - XML node of the entry; its fm:<$arch_id_field> child
#                 holds the document ID
#
# Returns "/mpiwg/online/permanent/lit<ID>"; the directory is not checked
# for existence.  Without an ID an ERROR is logged, $errcnt is incremented
# and nothing is returned.
#
# NOTE(review): the file-level $lib_online_dir ('/mpiwg/online/permanent/vlp')
# is never used and the base path below has no '/vlp' component, unlike the
# archive path in find_arch_dir() -- confirm which base is intended.
sub find_permanent_dir {
    my ($input_node) = @_;

    my $dest_id = sstrip($input_node->findvalue("fm:$arch_id_field"));
    unless ($dest_id) {
        logger('ERROR', "no ID field for online permanent entry");
        $errcnt++;
        return;
    }

    return '/mpiwg/online/permanent' . "/lit$dest_id";
}
268:
269:
# Convert the fields of one FileMaker entry into index.meta elements.
#
# Parameters:
#   $input_node - entry node; each child node is one FileMaker field
#   $index_root - root element of the index.meta document being built
#   $index_doc  - the index.meta document (accepted but not used here)
#
# Returns the number of mapped fields that were written.  Returns 0 when
# nothing matched, and also when the language or the bib type is unknown;
# in those two cases an ERROR is logged and $errcnt is incremented.
sub convert_bib {
    my ($input_node, $index_root, $index_doc) = @_;
    my $written = 0;
    my $bib_type = "";

    # first pass: generic fields and the type switch field
    FIELD:
    foreach my $field ($input_node->getChildNodes()) {
        my $tag = $field->nodeName();
        my $text = $field->textContent();

        if (exists $gen_map{$tag}) {
            if ($tag eq $lang_field) {
                # an empty language is only a warning; the field is skipped
                unless ($text) {
                    logger('WARNING', "no language tag");
                    $warncnt++;
                    next FIELD;
                }
                # a language without iso code aborts the whole entry
                unless (exists $lang_map{$text}) {
                    logger('ERROR', "unknown language: $text! skipping...");
                    $errcnt++;
                    return 0;
                }
                $text = $lang_map{$text};
            }
            my $target = create_element_path($gen_map{$tag}, $index_root, $namespace);
            $target->appendTextNode($text);
            $written++;
            next FIELD;
        }

        next FIELD unless exists $type_map{$tag};

        # type switch field: remember the type and write its index name
        $bib_type = $text;
        unless (exists $subtype_map{$text}) {
            logger('ERROR', "unknown bib type $text! skipping...");
            $errcnt++;
            return 0;
        }
        my $attr_path = $type_map{$tag} . '=' . $subtype_map{$text}->{'_name'};
        create_element_path($attr_path, $index_root, $namespace);
        $written++;
    }

    # without a type there are no type specific fields to convert
    return $written unless $bib_type;

    # second pass: fields that belong to the selected type
    my $type_fields = $subtype_map{$bib_type};
    foreach my $field ($input_node->getChildNodes()) {
        my $tag = $field->nodeName();
        next unless exists $type_fields->{$tag};
        my $target = create_element_path($type_fields->{$tag}, $index_root, $namespace);
        $target->appendTextNode($field->textContent());
        $written++;
    }

    # append additional constant fields (keys beginning with #); they are
    # not included in the returned count
    foreach my $key (keys %$type_fields) {
        next unless $key =~ /^\#(.*)/;
        # copy the capture before calling out, it may be overwritten there
        my $constant_text = $1;
        create_text_path($type_fields->{$key}, $constant_text, $index_root, $namespace);
    }

    return $written;
}
340:
341:
342:
# Process every fm:ROW entry found below the given root node.
#
# Parameters:
#   $input_root - root node of the FileMaker XML dump
#
# Returns the number of entries handed to process_fm_entry(), whether or
# not they were converted successfully.
sub process_all_fm_entries {
    my ($input_root) = @_;
    my $cnt = 0;

    foreach my $n ($input_root->findnodes('fm:ROW')) {
        logger('INFO', "processing entry $cnt ...");
        process_fm_entry($n);
        $cnt++;
    }

    # the count used to be computed but never returned
    return $cnt;
}
353:
354:
# Create the index.meta file for one FileMaker entry.
#
# Parameters:
#   $input_node - XML node of the entry (one fm:ROW)
#
# Returns nothing.  The entry is skipped when its directory cannot be
# determined, when it has no bibliographic metadata (both counted in
# $errcnt), or when an index.meta already exists and -replace was not
# given.  With -dry-run the document is only logged, not written.
sub process_fm_entry {
    my ($input_node) = @_;

    # try to find the document directory; the archive directory is used
    # only in archive mode without online mode, everything else goes to the
    # online/permanent tree
    my $doc_dir = ($archive_mode && ! $online_mode)
        ? find_arch_dir($input_node)
        : find_permanent_dir($input_node);
    if (! $doc_dir) {
        logger('ERROR', "document directory not found! skipping...");
        $errcnt++;
        return;
    }

    # keep an existing index.meta unless replacing was requested
    if (-f "$doc_dir/index.meta" and not $do_replace) {
        logger('DEBUG', "index file in $doc_dir exists");
        return;
    }

    # start a new index.meta document; it is built only after the checks
    # above so that skipped entries cost nothing
    my $index_doc = XML::LibXML::Document->createDocument('1.0', 'UTF-8');
    my $index_root = $index_doc->createElementNS($namespace, 'resource');
    $index_root->addChild($index_doc->createAttributeNS($namespace, 'version', '1.1'));
    $index_root->addChild($index_doc->createAttributeNS($namespace, 'type', 'MPIWG'));
    $index_doc->setDocumentElement($index_root);

    # add standard stuff to index.meta
    my ($docname) = split_file_path($doc_dir);
    # one timestamp for the whole file, so both dates below always agree
    my $now = stime(time);
    # name and date
    create_text_path('name', $docname, $index_root, $namespace);
    create_text_path('archive-path', $doc_dir, $index_root, $namespace);
    create_text_path('archive-creation-date', $now, $index_root, $namespace);
    create_text_path('creator', 'vlp', $index_root, $namespace);
    create_text_path('description', 'a scanned document', $index_root, $namespace);
    if ($archive_mode) {
        # acquisition
        create_text_path('meta/acquisition/date', $now, $index_root, $namespace);
        create_text_path('meta/acquisition/provider/provider-id', 'vlp', $index_root, $namespace);
        create_text_path('meta/acquisition/provider/address', 'Max Planck Institute for the History of Science', $index_root, $namespace);
        # image acquisition
        create_text_path('meta/image-acquisition/device', 'Flatbed Scanner', $index_root, $namespace);
        create_text_path('meta/image-acquisition/image-type', 'Greyscale', $index_root, $namespace);
        create_text_path('meta/image-acquisition/production-comment', 'Raw scans in \'raw\' folder, cleaned pages in \'pages\' folder.', $index_root, $namespace);
    }
    # media
    create_text_path('media-type', 'image', $index_root, $namespace);
    create_text_path('meta/content-type', 'scanned document', $index_root, $namespace);

    # convert bib entries
    my $cnt = convert_bib($input_node, $index_root, $index_doc);
    if ($cnt == 0) {
        # error or nothing to convert
        logger('ERROR', "no bibliographic metadata!");
        $errcnt++;
        return;
    }

    # write new index.meta file
    if ($dry_run) {
        logger('DEBUG', "would write $doc_dir/index.meta");
        logger('DEBUG', $index_doc->toString(1));
    } else {
        write_xml($index_doc, "$doc_dir/index.meta");
    }

    return;
}
426:
427:
428:
429:
430:
#######################################################
# Main
#

# load filemaker xml dump
my ($input_doc, $input_root) = read_xml($infile);

# register the root element's own namespace under the prefix 'fm', which
# the XPath expressions in the subroutines above use
$input_root->setNamespace($input_root->namespaceURI(), 'fm', 1);

# convert all entries of the dump
process_all_fm_entries($input_root);

# summary; any error makes the program exit with a non-zero status
logger("INFO", "$warncnt warnings");
logger("INFO", "$errcnt errors");
if ($errcnt > 0) {
    logger("ABORT", "there were errors!");
    exit 1;
}
logger("DONE", "done something successfully!");
453:
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>