1: #!/usr/local/bin/perl -w
2:
3: use strict;
4: use XML::LibXML;
5:
6: use lib '/usr/local/mpiwg/archive';
7: use MPIWGStor;
8:
# Turn on autoflush for the currently selected output handle so that
# messages show up immediately.
$| = 1;

# Program version, reported in the startup log line below.
my $version = "0.2.4 (16.5.2006 ROC)";

# Usage text; printed when the script is started without any arguments.
my $help =
"use: makemeta-lib [options] file.xml
options:
-debug show debugging info
-dry-run simulate, don't do anything
-online-mode mode for creating online/permanent files
-cw-mode mode for copying einstein_cw archive documents
-digifiles-mode mode for copying files from digifiles
-map-file=mapfile.xml digilib mapping file (for digifiles mode)
-access=free adds free access tag (use access=mpiwg for restricted access)
";
logger("INFO", "makemeta-lib $version");
26:
###########################################
# mappings
#
# Lookup tables that translate field names of the input document into
# element paths of the generated index.meta.

# Fields handled the same way for every entry, whatever its reference type.
# Production_Comment and Postproduction both end up in production-comment.
my %gen_map = (
    Device             => 'meta/image-acquisition/device',
    Image_Type         => 'meta/image-acquisition/image-type',
    Production_Comment => 'meta/image-acquisition/production-comment',
    Postproduction     => 'meta/image-acquisition/production-comment',
    Language           => 'meta/lang',
);

# Field whose value selects the sub type table, and the attribute path
# that receives the sub type name.
my %type_map = (
    Reference_Type => 'meta/bib@type',
);

# Entries shared by all sub types; individual sub types may override them
# (a later key in the hash constructor wins).
my %bib_core = (
    Author => 'meta/bib/author',
    Title  => 'meta/bib/title',
    Year   => 'meta/bib/year',
);

# Per reference type: '_name' is the type name written to index.meta,
# every other key maps an input field to its element path.
my %subtype_map = (
    'Book' => {
        %bib_core,
        _name           => 'book',
        Place_Published => 'meta/bib/city',
        Publisher       => 'meta/bib/publisher',
        Edition         => 'meta/bib/edition',
    },
    'Journal Article' => {
        %bib_core,
        _name           => 'journal-article',
        Secondary_Title => 'meta/bib/journal',
        Volume          => 'meta/bib/volume',
        Number          => 'meta/bib/issue',
        Pages           => 'meta/bib/pages',
    },
    'In Book' => {
        %bib_core,
        _name           => 'inbook',
        Secondary_Title => 'meta/bib/book-title',
        Pages           => 'meta/bib/pages',
    },
    'Newspaper Article' => {
        %bib_core,
        _name           => 'newspaper-article',
        Secondary_Title => 'meta/bib/newspaper',
        Place_Published => 'meta/bib/city',
        Number          => 'meta/bib/issue-date',
        Pages           => 'meta/bib/pages',
    },
    'Edited Book' => {
        %bib_core,
        _name           => 'edited-book',
        Author          => 'meta/bib/editor',    # overrides the shared entry
        Place_Published => 'meta/bib/city',
        Publisher       => 'meta/bib/publisher',
        Edition         => 'meta/bib/edition',
    },
    'Manuscript' => {
        %bib_core,
        _name           => 'manuscript',
        Place_Published => 'meta/bib/location',
    },
);

# Name of the input field that holds the language.
my $lang_field = 'Language';

# Language names as found in the input => ISO language codes.
my %lang_map = (
    German   => 'de',
    English  => 'en',
    Italian  => 'it',
    French   => 'fr',
    Latin    => 'la',
    Japanese => 'ja',
    Dutch    => 'nl',
    Spanish  => 'es',
);

# Names of the input fields that describe where a document is stored.
my $arch_id_field    = 'ID_Archive';
my $online_url_field = 'URL';
my $online_id_field  = 'ID_OnlinePermanent';
115:
#######################################################
# internal parameters
#

# storage locations
my $lib_arch_dir     = '/mpiwg/archive/data/library';
my $lib_online_dir   = '/mpiwg/online/permanent';
my $lib_digilib_path = 'permanent';
my $digifiles_dir    = "/net/digifiles.mpiwg-berlin.mpg.de/Volumes/raid";

# read command line parameters; without any, show the usage text and quit
my $args = MPIWGStor::parseargs;
if (! scalar(%$args)) {
    print $help, "\n";
    exit 1;
}

# debug level
# NOTE(review): $debug is not declared in this file -- presumably it is a
# variable exported by MPIWGStor; confirm.
$debug = exists $args->{'debug'} ? $args->{'debug'} : 0;

# simulate action only
my $dry_run = exists $args->{'dry-run'} ? $args->{'dry-run'} : 0;
logger('DEBUG', "dry-run: $dry_run");

# use online mode
my $online_mode = exists $args->{'online-mode'} ? $args->{'online-mode'} : 0;
logger('DEBUG', "online_mode: $online_mode");

# use einstein-cw mode
my $cw_mode = exists $args->{'cw-mode'} ? $args->{'cw-mode'} : 0;
logger('DEBUG', "cw_mode: $cw_mode");

# use digifiles mode
my $digifiles_mode = exists $args->{'digifiles-mode'} ? $args->{'digifiles-mode'} : 0;
logger('DEBUG', "digifiles_mode: $digifiles_mode");

# digilib mapping file; the document and its root element are created in
# the main section when digifiles mode is active
my $map_file_name = exists $args->{'map-file'} ? $args->{'map-file'} : "";
logger('DEBUG', "map_file_name: $map_file_name");
my $mapping_doc;
my $mapping_root;

# access type ("free" or the name of an institution, empty for no tag)
my $access_type = exists $args->{'access'} ? $args->{'access'} : "";

# index.meta namespace (not really implemented!)
my $namespace = "";

# set when XML was modified / counters reported at the end of the run
my $xml_changed = 0;
my $errcnt = 0;
my $warncnt = 0;

#######################################################
# check parameters that were passed to the program
#
my $infile = $args->{'path'};
if (! $infile) {
    logger("ABORT", "no input file given!");
    exit 1;
}
# collapse every run of slashes into a single one (the former
# substitution lacked /g and only touched the first occurrence)
$infile =~ s{/{2,}}{/}g;
if (! -f $infile) {
    logger("ABORT", "input file \'$infile\' doesn't exist!");
    exit 1;
}
182:
183:
184: #######################################################
185: # subroutines
186: #
187:
188:
#
# add_digilib_mapping($src_dir, $dest_dir)
#
# Appends a <mapping link="$src_dir" dir="$dest_dir"/> element to the
# digilib mapping document and writes the document to $map_file_name.
# Aborts the program when no mapping file name was configured.
#
sub add_digilib_mapping {
    my ($src_dir, $dest_dir) = @_;

    my $mapping = $mapping_root->addNewChild($namespace, 'mapping');
    foreach my $pair ([ 'link', $src_dir ], [ 'dir', $dest_dir ]) {
        my ($attr_name, $attr_value) = @$pair;
        my $attr = $mapping_doc->createAttributeNS($namespace, $attr_name, $attr_value);
        $mapping->addChild($attr);
    }

    # without a file name there is nowhere to store the mapping
    unless ($map_file_name) {
        logger('ABORT', "unable to write mapping file!");
        exit 1;
    }
    write_xml($mapping_doc, $map_file_name);
}
201:
#
# $dir = find_digifiles_dir($input_node)
#
# Digifiles mode: locates the source directory of the entry below
# $digifiles_dir, copies its content to
# $lib_online_dir/library/<ID>/pageimg, records a digilib mapping for it
# and removes the source directory afterwards.
#
# Returns the new document directory. Returns nothing (and counts an
# error) when the source directory or the ID field is missing. Aborts the
# program when the copy fails. A failure to remove the source directory is
# counted as an error, but the new directory is still returned.
#
# In dry-run mode nothing is copied or removed; the mapping is still
# passed to add_digilib_mapping().
#
# External commands are run with the list form of system(), so directory
# names coming from the input document never pass through a shell.
#
sub find_digifiles_dir {
    my ($input_node) = @_;
    my $digifiles_base = $digifiles_dir;

    my $src_dir = find_online_dir($input_node, $digifiles_base, '');
    if (! $src_dir) {
        logger('ERROR', "no online directory for digifiles entry");
        $errcnt++;
        return;
    }
    my $dest_id = sstrip($input_node->findvalue("fm:$online_id_field"));
    if (! $dest_id) {
        logger('ERROR', "no ID field for digifiles entry");
        $errcnt++;
        return;
    }

    my $src_path = "$digifiles_base/$src_dir";
    my $dir      = "$lib_online_dir/library/$dest_id";
    my $img_dir  = "$dir/pageimg";
    my $map_dir  = "$lib_digilib_path/library/$dest_id";

    if ($dry_run) {
        logger('DEBUG', "would move $src_path to $dir");
        add_digilib_mapping($src_dir, "$map_dir/pageimg");
        return $dir;
    }

    logger('INFO', "moving $src_path to $dir");
    logger('DEBUG', "mkdir $img_dir");
    my $copied = 0;
    if (system('mkdir', '-p', $img_dir) == 0
        && system('chmod', '-R', '0775', $dir) == 0) {
        logger('DEBUG', "cp $src_path $img_dir");
        # List the entries ourselves instead of relying on a shell glob;
        # like the former "*" pattern this skips names starting with a dot.
        my @entries;
        if (opendir(my $dh, $src_path)) {
            @entries = map { "$src_path/$_" } sort grep { !/^\./ } readdir($dh);
            closedir($dh);
        }
        # an empty source directory counts as a failed copy, as before
        if (@entries && system('cp', '-rp', @entries, "$img_dir/") == 0) {
            $copied = 1 if (-d $img_dir);
        }
    }
    if (! $copied) {
        logger('ABORT', "unable to copy directory $src_dir to $dir!");
        exit 1;
    }

    logger('DEBUG', "directory $dir OK");
    add_digilib_mapping($src_dir, "$map_dir/pageimg");
    if (system('rm', '-rf', $src_path) == 0) {
        logger('DEBUG', "directory $src_path removed");
    } else {
        logger('ERROR', "unable to remove source directory $src_path!");
        $errcnt++;
    }
    return $dir;
}
248:
#
# $dir = find_cw_dir($input_node)
#
# Einstein-cw mode: locates the source directory of the entry below the
# inbox directory and renames it to $lib_arch_dir/<ID>.
#
# Returns the new document directory. Returns nothing (and counts an
# error) when the source directory or the ID field is missing. Aborts the
# program when the rename fails. In dry-run mode nothing is renamed.
#
sub find_cw_dir {
    my ($input_node) = @_;
    my $cw_base = '/mpiwg/archive/data/library/inbox/zwischen_backup';

    my $src_dir = find_online_dir($input_node, $cw_base, 'pageimg');
    # Without this check an unresolved entry would make the rename below
    # operate on "$cw_base/" itself, i.e. on the whole inbox directory.
    if (! $src_dir) {
        logger('ERROR', "no source directory for einstein-cw entry");
        $errcnt++;
        return;
    }
    my $dest_id = sstrip($input_node->findvalue("fm:$arch_id_field"));
    if (! $dest_id) {
        logger('ERROR', "no ID field for einstein-cw entry");
        $errcnt++;
        return;
    }

    my $dir = "$lib_arch_dir/$dest_id";
    if ($dry_run) {
        logger('DEBUG', "would move $cw_base/$src_dir to $dir");
        return $dir;
    }

    logger('DEBUG', "moving $cw_base/$src_dir to $dir");
    if (! rename("$cw_base/$src_dir", $dir)) {
        logger('ABORT', "unable to rename directory $cw_base/$src_dir to $dir!");
        exit 1;
    }
    if (-d $dir) {
        logger('DEBUG', "directory $dir OK");
        return $dir;
    }
    return;
}
277:
#
# $dir = find_permanent_dir($input_node)
#
# Online mode: returns the full path of the entry's existing directory
# below $lib_online_dir. Returns nothing (and counts an error) when the
# directory cannot be found or the entry has no online ID.
#
sub find_permanent_dir {
    my ($input_node) = @_;
    my $online_base = $lib_online_dir;

    my $src_dir = find_online_dir($input_node, $online_base, 'pageimg');
    my $dest_id = sstrip($input_node->findvalue("fm:$online_id_field"));
    # report the two failures separately; a missing directory used to be
    # logged as a missing ID field
    if (! $dest_id) {
        logger('ERROR', "no ID field for online permanent entry");
        $errcnt++;
        return;
    }
    if (! $src_dir) {
        logger('ERROR', "no online directory for online permanent entry");
        $errcnt++;
        return;
    }
    return "$online_base/$src_dir";
}
291:
#
# $dir = find_online_dir($input_node, $base_dir, $page_dir)
#
# Takes the path from the $online_url_field of the $input_node document
# and looks in the directory $base_dir for it ($lib_online_dir when
# $base_dir is empty). Trailing slashes and, if given, a trailing
# "/$page_dir" component are stripped from the path first.
# Returns the directory path sans $base_dir if it exists, nothing otherwise.
#
sub find_online_dir {
    my ($input_node, $base_dir, $page_dir) = @_;
    $base_dir = $lib_online_dir unless ($base_dir);

    my $online_url = $input_node->findvalue("fm:$online_url_field");
    logger('DEBUG', "checking URL: $online_url");
    my $online_dir;
    if ($online_url =~ /fn=permanent\/(.+)/) {
        # new style digilib URL
        $online_dir = $1;
    } elsif ($online_url =~ /\?([^\+]+)\+/) {
        # old style digilib URL
        $online_dir = $1;
    }
    return unless ($online_dir);

    # strip all ending slashes, not just one
    $online_dir =~ s{/+$}{};
    if ($page_dir) {
        # \Q..\E: the directory name is meant literally, not as a pattern
        $online_dir =~ s{/\Q$page_dir\E$}{};
    }
    if (-d "$base_dir/$online_dir") {
        logger('DEBUG', "directory $base_dir/$online_dir exists");
        return $online_dir;
    }
    return;
}
327:
#
# $dir = find_arch_dir($input_node)
#
# Builds "$lib_arch_dir/<value of the $arch_id_field>" for the entry and
# returns that path if it is an existing directory, nothing otherwise.
#
sub find_arch_dir {
    my ($input_node) = @_;

    my $arch_id = $input_node->findvalue("fm:$arch_id_field");
    return unless ($arch_id);

    my $candidate = "$lib_arch_dir/$arch_id";
    return unless (-d $candidate);

    logger('DEBUG', "directory $candidate exists");
    return $candidate;
}
343:
344:
#
# $cnt = convert_bib($input_node, $index_root, $index_doc)
#
# Converts the child elements of $input_node into index.meta elements
# below $index_root, using %gen_map, %type_map and %subtype_map.
# $index_doc is accepted but not used here.
#
# Returns the number of elements created. Returns 0 (and counts an error)
# when the entry has an unknown language or an unknown reference type;
# elements created before the error remain in $index_root.
#
sub convert_bib {
    my ($input_node, $index_root, $index_doc) = @_;
    my $cnt = 0;
    my $type = "";

    # first pass: general fields and the reference type switch
    foreach my $n ($input_node->getChildNodes()) {
        my $name = $n->nodeName();
        my $val = $n->textContent();
        #logger('DEBUG', " NODE: $name = '$val'");
        if (exists $gen_map{$name}) {
            # is a general field
            if ($name eq $lang_field) {
                # language field -> convert to iso code
                if (! exists $lang_map{$val}) {
                    logger('ERROR', "unknown language: $val! skipping...");
                    $errcnt++;
                    return 0;
                }
                $val = $lang_map{$val};
            }
            create_element_path($gen_map{$name}, $index_root, $namespace)
                ->appendTextNode($val);
            $cnt++;
        } elsif (exists $type_map{$name}) {
            # is a type field -> check with known types
            if (! exists $subtype_map{$val}) {
                # double quotes, so that the offending value is really shown
                logger('ERROR', "unknown bib type $val! skipping...");
                $errcnt++;
                return 0;
            }
            $type = $val;
            my $type_path = $type_map{$name};
            my $indextype = $subtype_map{$val}->{'_name'};
            create_element_path("$type_path=$indextype", $index_root, $namespace);
            $cnt++;
        }
    }

    # second pass: fields that depend on the reference type
    if ($type) {
        my $field_map = $subtype_map{$type};
        foreach my $n ($input_node->getChildNodes()) {
            my $name = $n->nodeName();
            my $val = $n->textContent();
            #logger('DEBUG', " NODE: $name = '$val'");
            if (exists $field_map->{$name}) {
                create_element_path($field_map->{$name}, $index_root, $namespace)
                    ->appendTextNode($val);
                $cnt++;
            }
        }
    }
    return $cnt;
}
402:
403:
404:
#
# process_all_fm_entries($input_root)
#
# Runs process_fm_entry() on every fm:ROW node found from $input_root,
# logging a running number (starting at 0) for each entry.
#
sub process_all_fm_entries {
    my ($input_root) = @_;

    my @rows = $input_root->findnodes('fm:ROW');
    for my $idx (0 .. $#rows) {
        logger('INFO', "processing entry $idx ...");
        process_fm_entry($rows[$idx]);
    }
}
415:
416:
#
# process_fm_entry($input_node)
#
# Creates the index.meta document for one entry: finds the document
# directory according to the active mode, fills in the standard elements,
# the optional derived-from and access elements and the bibliographic
# data, and writes the result to <document directory>/index.meta.
# In dry-run mode the document is only logged.
# Entries without a document directory or without bibliographic metadata
# are counted as errors and skipped.
#
sub process_fm_entry {
    my ($input_node) = @_;

    # empty document: <resource version="1.1" type="MPIWG"/>
    my $index_doc  = XML::LibXML::Document->createDocument('1.0', 'UTF-8');
    my $index_root = $index_doc->createElementNS($namespace, 'resource');
    foreach my $attr ([ 'version', '1.1' ], [ 'type', 'MPIWG' ]) {
        $index_root->addChild($index_doc->createAttributeNS($namespace, @$attr));
    }
    $index_doc->setDocumentElement($index_root);

    # try to find the document directory (first matching mode wins)
    my $doc_dir = "";
    my $derived_from = "";
    if ($online_mode) {
        $doc_dir = find_permanent_dir($input_node);
        $derived_from = find_arch_dir($input_node);
    } elsif ($cw_mode) {
        $doc_dir = find_cw_dir($input_node);
    } elsif ($digifiles_mode) {
        $doc_dir = find_digifiles_dir($input_node);
    } else {
        $doc_dir = find_arch_dir($input_node);
    }
    unless ($doc_dir) {
        logger('ERROR', "document directory not found! skipping...");
        $errcnt++;
        return;
    }

    # standard stuff for index.meta, in document order
    my ($docname) = split_file_path($doc_dir);
    my @standard = (
        # name and date
        [ 'name',                  $docname ],
        [ 'archive-path',          $doc_dir ],
        [ 'archive-creation-date', stime(time) ],
        [ 'creator',               'digigroup' ],
        [ 'description',           'a scanned document' ],
        # acquisition
        [ 'meta/acquisition/date',                 stime(time) ],
        [ 'meta/acquisition/provider/provider-id', 'digigroup' ],
        [ 'meta/acquisition/provider/address',     'Max Planck Institute for the History of Science' ],
        # media
        [ 'media-type',        'image' ],
        [ 'meta/content-type', 'scanned document' ],
    );
    foreach my $entry (@standard) {
        my ($path, $text) = @$entry;
        create_text_path($path, $text, $index_root, $namespace);
    }

    # derived-from
    create_text_path('derived-from/archive-path', $derived_from, $index_root, $namespace)
        if ($derived_from);

    # access: "free", or an institution named by the access type
    if ($access_type eq "free") {
        create_element_path('access-conditions/access@type=free', $index_root, $namespace);
    } elsif ($access_type) {
        my $acc_tag = create_element_path('access-conditions/access@type=institution', $index_root, $namespace);
        create_text_path('name', $access_type, $acc_tag, $namespace);
    }

    # convert bib entries; 0 means error or nothing to convert
    my $cnt = convert_bib($input_node, $index_root, $index_doc);
    unless ($cnt) {
        logger('ERROR', "no bibliographic metadata!");
        $errcnt++;
        return;
    }

    # write new index.meta file
    if ($dry_run) {
        logger('DEBUG', "would write $doc_dir/index.meta");
        logger('DEBUG', $index_doc->toString(1));
    } else {
        write_xml($index_doc, "$doc_dir/index.meta");
    }
}
491:
492:
493:
494:
495:
#######################################################
# Main
#

# load filemaker xml dump
my ($input_doc, $input_root) = read_xml($infile);

# bind the root element's namespace to the prefix "fm", which the
# "fm:..." expressions in the subroutines rely on
my $fm_namespace = $input_root->namespaceURI();
$input_root->setNamespace($fm_namespace, 'fm', 1);

# digifiles mode: start with an empty digilib mapping document; it gets
# one <mapping link="..." dir="..."/> element per processed directory, e.g.
#<mapping link="exp1/archimedes_image_repository/archimedes_large/achil_propo_087_la_1545" dir="permanent/archimedes_repository/large/achil_propo_087_la_1545"/>
if ($digifiles_mode) {
    $mapping_doc  = XML::LibXML::Document->createDocument('1.0', 'UTF-8');
    $mapping_root = $mapping_doc->createElementNS($namespace, 'digilib-aliases');
    $mapping_doc->setDocumentElement($mapping_root);
}

process_all_fm_entries($input_root);

# report the counters; any error makes the whole run fail
logger("INFO", "$warncnt warnings");
logger("INFO", "$errcnt errors");
if ($errcnt > 0) {
    logger("ABORT", "there were errors!");
    exit 1;
}
logger("DONE", "done something successfully!");
526:
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>