Annotation of foxridge-archiver/makemeta-lib.pl, revision 1.10
1.1 casties 1: #!/usr/local/bin/perl -w
2:
3: use strict;
4: use XML::LibXML;
5:
1.7 casties 6: use lib '/usr/local/mpiwg/archive';
1.1 casties 7: use MPIWGStor;
8:
9: # make output unbuffered
10: $|=1;
11:
# program version
my $version = "0.2.5 (9.6.2006 ROC)";
# usage text; printed when the program is started without any arguments
my $help =
"use: makemeta-lib [options] file.xml
options:
-debug show debugging info
-dry-run simulate, don't do anything
-online-mode mode for creating online/permanent files
-online-base=dir base directory for online ids (for online mode)
-cw-mode mode for copying einstein_cw archive documents
-digifiles-mode mode for copying files from digifiles
-map-file=mapfile.xml digilib mapping file (for digifiles mode)
-access=free adds free access tag (use access=mpiwg for restricted access)
";
# announce program name and version in the log
logger("INFO", "makemeta-lib $version");
27:
###########################################
# mappings
#
# The tables below translate field names of the input rows into element
# paths of the generated index.meta document.

# fields that are mapped the same way for every document type
my %gen_map = (
    'Device'             => 'meta/image-acquisition/device',
    'Image_Type'         => 'meta/image-acquisition/image-type',
    'Production_Comment' => 'meta/image-acquisition/production-comment',
    'Postproduction'     => 'meta/image-acquisition/production-comment',
    'Language'           => 'meta/lang',
);

# field whose value selects the bibliographic sub type
my %type_map = (
    'Reference_Type' => 'meta/bib@type',
);

# per sub type: '_name' is the type name written to index.meta, every other
# key is an input field name mapped to its target path
my %subtype_map = (
    'Book' => {
        '_name'           => 'book',
        'Author'          => 'meta/bib/author',
        'Title'           => 'meta/bib/title',
        'Year'            => 'meta/bib/year',
        'Place_Published' => 'meta/bib/city',
        'Publisher'       => 'meta/bib/publisher',
        'Edition'         => 'meta/bib/edition',
    },
    'Journal Article' => {
        '_name'           => 'journal-article',
        'Author'          => 'meta/bib/author',
        'Title'           => 'meta/bib/title',
        'Year'            => 'meta/bib/year',
        'Secondary_Title' => 'meta/bib/journal',
        'Volume'          => 'meta/bib/volume',
        'Number'          => 'meta/bib/issue',
        'Pages'           => 'meta/bib/pages',
    },
    'In Book' => {
        '_name'           => 'inbook',
        'Author'          => 'meta/bib/author',
        'Title'           => 'meta/bib/title',
        'Year'            => 'meta/bib/year',
        'Secondary_Title' => 'meta/bib/book-title',
        'Pages'           => 'meta/bib/pages',
    },
    'Newspaper Article' => {
        '_name'           => 'newspaper-article',
        'Author'          => 'meta/bib/author',
        'Title'           => 'meta/bib/title',
        'Year'            => 'meta/bib/year',
        'Secondary_Title' => 'meta/bib/newspaper',
        'Place_Published' => 'meta/bib/city',
        'Number'          => 'meta/bib/issue-date',
        'Pages'           => 'meta/bib/pages',
    },
    'Edited Book' => {
        '_name'           => 'edited-book',
        'Author'          => 'meta/bib/editor',
        'Title'           => 'meta/bib/title',
        'Year'            => 'meta/bib/year',
        'Place_Published' => 'meta/bib/city',
        'Publisher'       => 'meta/bib/publisher',
        'Edition'         => 'meta/bib/edition',
    },
    'Manuscript' => {
        '_name'           => 'manuscript',
        'Author'          => 'meta/bib/author',
        'Title'           => 'meta/bib/title',
        'Year'            => 'meta/bib/year',
        'Place_Published' => 'meta/bib/location',
    },
);

# name of the field holding the language
my $lang_field = 'Language';

# language names as they appear in the input => iso codes
my %lang_map = (
    'German'   => 'de',
    'English'  => 'en',
    'Italian'  => 'it',
    'French'   => 'fr',
    'Latin'    => 'la',
    'Japanese' => 'ja',
    'Dutch'    => 'nl',
    'Spanish'  => 'es',
);

# names of the fields that hold storage information
my $arch_id_field    = 'ID_Archive';
my $online_url_field = 'URL';
my $online_id_field  = 'ID_OnlinePermanent';
1.1 casties 116:
#######################################################
# internal parameters
#

# storage locations
my $lib_arch_dir     = '/mpiwg/archive/data/library';
my $lib_online_dir   = '/mpiwg/online/permanent';
my $lib_digilib_path = 'permanent';
my $digifiles_dir    = '/net/digifiles.mpiwg-berlin.mpg.de/Volumes/raid';

# read command line parameters; show the usage text if there are none
my $args = MPIWGStor::parseargs;
unless (scalar(%$args)) {
    print $help, "\n";
    exit 1;
}

# debug level
$debug = exists $args->{'debug'} ? $args->{'debug'} : 0;

# dry-run: only simulate actions
my $dry_run = exists $args->{'dry-run'} ? $args->{'dry-run'} : 0;
logger('DEBUG', "dry-run: $dry_run");

# online mode and its optional base directory
my $online_mode = exists $args->{'online-mode'} ? $args->{'online-mode'} : 0;
logger('DEBUG', "online_mode: $online_mode");
my $online_base_dir = exists $args->{'online-base'} ? $args->{'online-base'} : "";
logger('DEBUG', "online_base_dir: $online_base_dir");

# einstein-cw mode
my $cw_mode = exists $args->{'cw-mode'} ? $args->{'cw-mode'} : 0;
logger('DEBUG', "cw_mode: $cw_mode");

# digifiles mode and the digilib mapping file that goes with it
my $digifiles_mode = exists $args->{'digifiles-mode'} ? $args->{'digifiles-mode'} : 0;
logger('DEBUG', "digifiles_mode: $digifiles_mode");
my $map_file_name = exists $args->{'map-file'} ? $args->{'map-file'} : "";
logger('DEBUG', "map_file_name: $map_file_name");
# mapping document and its root element; created later, in digifiles mode only
my $mapping_doc;
my $mapping_root;

# access type ("free", or the name of an institution)
my $access_type = exists $args->{'access'} ? $args->{'access'} : "";

# index.meta namespace (not really implemented!)
my $namespace = "";

# bookkeeping
my $xml_changed = 0;
my $errcnt      = 0;
my $warncnt     = 0;

#######################################################
# check parameters that were passed to the program
#
my $infile = $args->{'path'};
unless ($infile) {
    logger("ABORT", "no input file given!");
    exit 1;
}
# strip double slashes
$infile = sstrip($infile, 1);
unless (-f $infile) {
    logger("ABORT", "input file \'$infile\' doesn't exist!");
    exit 1;
}
186:
187:
188: #######################################################
189: # subroutines
190: #
191:
1.5 casties 192:
#
# add_digilib_mapping($src_dir, $dest_dir)
#
# Appends a 'mapping' element with the attributes link=$src_dir and
# dir=$dest_dir to the mapping document and writes the document to
# $map_file_name. Aborts the program if no mapping file name was given.
#
sub add_digilib_mapping {
    my ($src_dir, $dest_dir) = @_;

    my $mapping   = $mapping_root->addNewChild($namespace, 'mapping');
    my $link_attr = $mapping_doc->createAttributeNS($namespace, 'link', $src_dir);
    $mapping->addChild($link_attr);
    my $dir_attr  = $mapping_doc->createAttributeNS($namespace, 'dir', $dest_dir);
    $mapping->addChild($dir_attr);

    unless ($map_file_name) {
        logger('ABORT', "unable to write mapping file!");
        exit 1;
    }
    write_xml($mapping_doc, $map_file_name);
}
205:
#
# @paths = _visible_dir_entries($dir_path)
#
# Returns the full paths of all entries of $dir_path whose names do not
# start with a dot (what the shell glob "$dir_path/*" would expand to).
# Returns an empty list if the directory can not be read.
#
sub _visible_dir_entries {
    my ($dir_path) = @_;
    opendir(my $dh, $dir_path) or return;
    my @entries = map { "$dir_path/$_" } sort grep { !/^\./ } readdir($dh);
    closedir($dh);
    return @entries;
}

#
# $dir = find_digifiles_dir($input_node)
#
# Moves the scans of a digifiles entry into the online/permanent library
# and registers the new location in the digilib mapping file.
#
# The source directory is taken from the URL field of $input_node (relative
# to the digifiles volume), the destination is
# $lib_online_dir/library/<online id>. The files are copied to the "pageimg"
# subdirectory of the destination and the source directory is removed
# afterwards.
#
# Returns the destination directory. Returns nothing (and counts an error)
# if the source directory or the ID is missing. Aborts the program if the
# files can not be copied. A failure to remove the source directory is only
# counted as an error; the destination is still returned.
#
# In dry-run mode nothing is copied, but add_digilib_mapping is still
# called, so the mapping file is written.
#
# All external commands are run with the list form of system() so that
# whitespace or shell metacharacters in the paths are passed literally
# instead of being interpreted by a shell.
#
sub find_digifiles_dir {
    my ($input_node) = @_;
    my $digifiles_base = '/net/digifiles.mpiwg-berlin.mpg.de/Volumes/raid';
    my $src_dir = find_online_dir($input_node, $digifiles_base, '');
    if (! $src_dir) {
        logger('ERROR', "no online directory for digifiles entry");
        $errcnt++;
        return;
    }
    my $dest_id = sstrip($input_node->findvalue("fm:$online_id_field"));
    if (! $dest_id) {
        logger('ERROR', "no ID field for digifiles entry");
        $errcnt++;
        return;
    }
    my $dir = "$lib_online_dir/library/$dest_id";
    my $map_dir = "$lib_digilib_path/library/$dest_id";
    my $src_path = "$digifiles_base/$src_dir";
    my $img_dir = "$dir/pageimg";

    if ($dry_run) {
        logger('DEBUG', "would move $src_path to $dir");
        add_digilib_mapping($src_dir, "$map_dir/pageimg");
        return $dir;
    }

    logger('INFO', "moving $src_path to $dir");
    logger('DEBUG', "mkdir $img_dir");
    if (system('mkdir', '-p', $img_dir) == 0
        && system('chmod', '-R', '0775', $dir) == 0) {
        logger('DEBUG', "cp $src_path $img_dir");
        # an empty or unreadable source directory counts as a failed copy
        my @entries = _visible_dir_entries($src_path);
        if (@entries
            && system('cp', '-rp', @entries, "$img_dir/") == 0
            && -d $img_dir) {
            logger('DEBUG', "directory $dir OK");
            add_digilib_mapping($src_dir, "$map_dir/pageimg");
            if (system('rm', '-rf', $src_path) == 0) {
                logger('DEBUG', "directory $src_path removed");
                return $dir;
            }
            logger('ERROR', "unable to remove source directory $src_path!");
            $errcnt++;
            return $dir;
        }
    }
    logger('ABORT', "unable to copy directory $src_dir to $dir!");
    exit 1;
}
252:
#
# $dir = find_cw_dir($input_node)
#
# Moves the directory of an einstein-cw entry from the inbox to its place
# in the archive library.
#
# The source directory is taken from the URL field of $input_node (relative
# to the inbox directory, without a trailing "pageimg"), the destination is
# $lib_arch_dir/<archive id>.
#
# Returns the destination directory (in dry-run mode without moving
# anything). Returns nothing (and counts an error) if the source directory
# or the ID is missing. Aborts the program if the directory can not be
# renamed.
#
sub find_cw_dir {
    my ($input_node) = @_;
    my $cw_base = '/mpiwg/archive/data/library/inbox/zwischen_backup';
    my $src_dir = find_online_dir($input_node, $cw_base, 'pageimg');
    # without a source directory the rename below would act on $cw_base itself
    if (! $src_dir) {
        logger('ERROR', "no online directory for einstein-cw entry");
        $errcnt++;
        return;
    }
    my $dest_id = sstrip($input_node->findvalue("fm:$arch_id_field"));
    if (! $dest_id) {
        logger('ERROR', "no ID field for einstein-cw entry");
        $errcnt++;
        return;
    }
    my $dir = "$lib_arch_dir/$dest_id";
    if ($dry_run) {
        logger('DEBUG', "would move $cw_base/$src_dir to $dir");
        return $dir;
    } else {
        logger('DEBUG', "moving $cw_base/$src_dir to $dir");
        if (rename "$cw_base/$src_dir", $dir) {
            if (-d $dir) {
                logger('DEBUG', "directory $dir OK");
                return $dir;
            }
        } else {
            logger('ABORT', "unable to rename directory $cw_base/$src_dir to $dir!");
            exit 1;
        }
    }
    return;
}
281:
#
# $dir = find_permanent_dir($input_node)
#
# Determines the online/permanent directory of an entry.
#
# If both the -online-base option and the online ID field of $input_node
# are set, the directory is $online_base_dir/<online id> (its existence is
# not checked). Otherwise the directory is derived from the URL field via
# find_online_dir. Returns nothing (and counts an error) if neither works.
#
sub find_permanent_dir {
    my ($input_node) = @_;
    my $permanent_base = '/mpiwg/online/permanent';

    # first choice: base directory from the command line plus online ID
    my $id = sstrip($input_node->findvalue("fm:$online_id_field"));
    if ($online_base_dir && $id) {
        my $id_path = sstrip("$online_base_dir/$id", 1);
        return $id_path;
    }

    # second choice: directory named in the online URL
    my $rel_dir = find_online_dir($input_node, $permanent_base, 'pageimg');
    if ($rel_dir) {
        my $url_path = sstrip("$permanent_base/$rel_dir", 1);
        return $url_path;
    }

    logger('ERROR', "no ID or URL for online permanent entry");
    $errcnt++;
    return;
}
301:
#
# $dir = find_online_dir($input_node, $base_dir, $page_dir)
#
# Takes the path from the $online_url_field of the $input_node document
# and looks in the directory $base_dir for it ($lib_online_dir if
# $base_dir is empty). Two URL forms are recognised: everything after
# "fn=permanent/" (new style), or the text between "?" and the first "+"
# (old style). All trailing slashes are removed, then a trailing
# "/$page_dir" is stripped if $page_dir is set; $page_dir is matched
# literally, not as a regular expression.
#
# Returns the directory path sans $base_dir if it exists, otherwise
# nothing.
#
sub find_online_dir {
    my ($input_node, $base_dir, $page_dir) = @_;
    $base_dir = $lib_online_dir unless ($base_dir);

    my $online_url = $input_node->findvalue("fm:$online_url_field");
    logger('DEBUG', "checking URL: $online_url");
    my $online_dir;
    if ($online_url =~ /fn=permanent\/(.+)/) {
        # new style digilib URL
        $online_dir = $1;
    } elsif ($online_url =~ /\?([^\+]+)\+/) {
        # old style digilib URL
        $online_dir = $1;
    }
    #logger('DEBUG', "online_dir1: $online_dir");
    if ($online_dir) {
        # strip all ending slashes, otherwise "x/pageimg//" keeps its page_dir
        $online_dir =~ s/\/+$//;
        if ($page_dir) {
            # strip page_dir (quoted so that it is taken literally)
            $online_dir =~ s/\/\Q${page_dir}\E$//;
        }
        #logger("DEBUG", "dir: $base_dir/$online_dir");
        if (-d "$base_dir/$online_dir") {
            logger('DEBUG', "directory $base_dir/$online_dir exists");
            return $online_dir;
        }
    }
    return;
}
338:
#
# $dir = find_arch_dir($input_node)
#
# Builds the archive directory $lib_arch_dir/<archive id> from the archive
# ID field of $input_node. Returns the directory if it exists, otherwise
# nothing.
#
sub find_arch_dir {
    my ($input_node) = @_;

    my $arch_id = $input_node->findvalue("fm:$arch_id_field");
    return unless ($arch_id);

    my $candidate = "$lib_arch_dir/$arch_id";
    return unless (-d $candidate);

    logger('DEBUG', "directory $candidate exists");
    return $candidate;
}
354:
355:
#
# $cnt = convert_bib($input_node, $index_root, $index_doc)
#
# Converts the child elements of $input_node into index.meta elements below
# $index_root ($index_doc is accepted but not used).
#
# In a first pass the general fields (%gen_map) and the type field
# (%type_map) are converted; language names are replaced by their iso code.
# In a second pass the fields of the bibliographic sub type that was found
# in the first pass (%subtype_map) are converted.
#
# Returns the number of converted fields, or 0 if the language or the
# bibliographic type is unknown (an error is logged and counted).
#
sub convert_bib {
    my ($input_node, $index_root, $index_doc) = @_;
    my $cnt = 0;
    my $type = "";
    my $type_path = "";

    # process general stuff first
    foreach my $n ($input_node->getChildNodes()) {
        my $name = $n->nodeName();
        my $val = $n->textContent();
        #logger('DEBUG', " NODE: $name = '$val'");
        if (exists $gen_map{$name}) {
            # is a general field
            if ($name eq $lang_field) {
                # language field -> convert to iso code
                if (exists $lang_map{$val}) {
                    $val = $lang_map{$val};
                } else {
                    logger('ERROR', "unknown language: $val! skipping...");
                    $errcnt++;
                    return 0;
                }
            }
            create_element_path($gen_map{$name}, $index_root, $namespace)
                ->appendTextNode($val);
            $cnt++;
        } elsif (exists $type_map{$name}) {
            # is a type field
            $type_path = $type_map{$name};
            $type = $val;
            # check with known types
            if (exists $subtype_map{$val}) {
                my $indextype = $subtype_map{$val}->{'_name'};
                create_element_path("$type_path=$indextype", $index_root, $namespace);
                $cnt++;
            } else {
                # double quotes, so that the offending type shows up in the log
                logger('ERROR', "unknown bib type $val! skipping...");
                $errcnt++;
                return 0;
            }
        }
    }
    # process sub type fields
    if ($type) {
        foreach my $n ($input_node->getChildNodes()) {
            my $name = $n->nodeName();
            my $val = $n->textContent();
            #logger('DEBUG', " NODE: $name = '$val'");
            if (exists $subtype_map{$type}->{$name}) {
                create_element_path($subtype_map{$type}->{$name}, $index_root, $namespace)
                    ->appendTextNode($val);
                $cnt++;
            }
        }
    }
    return $cnt;
}
413:
414:
415:
#
# process_all_fm_entries($input_root)
#
# Calls process_fm_entry for every fm:ROW element found from $input_root,
# logging the (zero based) number of each entry.
#
sub process_all_fm_entries {
    my ($input_root) = @_;

    my @rows = $input_root->findnodes('fm:ROW');
    for my $idx (0 .. $#rows) {
        logger('INFO', "processing entry $idx ...");
        process_fm_entry($rows[$idx]);
    }
}
426:
427:
#
# process_fm_entry($input_node)
#
# Creates the index.meta file for one entry.
#
# The document directory is determined according to the selected mode
# (online, einstein-cw, digifiles or plain archive). A new 'resource'
# document is filled with the standard fields, the optional derived-from
# and access information and the converted bibliographic data, and is then
# written to <document directory>/index.meta (only logged in dry-run mode).
#
# Logs and counts an error and returns without writing anything if the
# document directory is not found or no bibliographic data was converted.
#
sub process_fm_entry {
    my ($input_node) = @_;

    # empty index.meta document: <resource version="1.1" type="MPIWG">
    my $meta_doc = XML::LibXML::Document->createDocument('1.0', 'UTF-8');
    my $resource = $meta_doc->createElementNS($namespace, 'resource');
    $resource->addChild($meta_doc->createAttributeNS($namespace, 'version', '1.1'));
    $resource->addChild($meta_doc->createAttributeNS($namespace, 'type', 'MPIWG'));
    $meta_doc->setDocumentElement($resource);

    # try to find the document directory
    my $archive_origin = "";
    my $target_dir = "";
    if ($online_mode) {
        $target_dir = find_permanent_dir($input_node);
        $archive_origin = find_arch_dir($input_node);
    } elsif ($cw_mode) {
        $target_dir = find_cw_dir($input_node);
    } elsif ($digifiles_mode) {
        $target_dir = find_digifiles_dir($input_node);
    } else {
        $target_dir = find_arch_dir($input_node);
    }
    unless ($target_dir) {
        logger('ERROR', "document directory not found! skipping...");
        $errcnt++;
        return;
    }

    # add standard stuff to index.meta
    my ($docname) = split_file_path($target_dir);
    my @standard_fields = (
        # name and date
        ['name', $docname],
        ['archive-path', $target_dir],
        ['archive-creation-date', stime(time)],
        ['creator', 'digigroup'],
        ['description', 'a scanned document'],
        # acquisition
        ['meta/acquisition/date', stime(time)],
        ['meta/acquisition/provider/provider-id', 'digigroup'],
        ['meta/acquisition/provider/address', 'Max Planck Institute for the History of Science'],
        # media
        ['media-type', 'image'],
        ['meta/content-type', 'scanned document'],
    );
    foreach my $field (@standard_fields) {
        create_text_path(@$field, $resource, $namespace);
    }

    # derived-from
    if ($archive_origin) {
        create_text_path('derived-from/archive-path', $archive_origin, $resource, $namespace);
    }

    # access: either free or restricted to the named institution
    if ($access_type) {
        if ($access_type ne "free") {
            my $access_elem = create_element_path('access-conditions/access@type=institution', $resource, $namespace);
            create_text_path('name', $access_type, $access_elem, $namespace);
        } else {
            create_element_path('access-conditions/access@type=free', $resource, $namespace);
        }
    }

    # convert bib entries
    my $converted = convert_bib($input_node, $resource, $meta_doc);
    if ($converted == 0) {
        # error or nothing to convert
        logger('ERROR', "no bibliographic metadata!");
        $errcnt++;
        return;
    }

    # write new index.meta file
    my $meta_file = "$target_dir/index.meta";
    if ($dry_run) {
        logger('DEBUG', "would write $meta_file");
        logger('DEBUG', $meta_doc->toString(1));
    } else {
        write_xml($meta_doc, $meta_file);
    }
}
502:
503:
504:
505:
506:
#######################################################
# Main
#

# load the filemaker xml dump and bind its namespace to the prefix 'fm',
# which the XPath expressions in the subroutines use
my ($input_doc, $input_root) = read_xml($infile);
my $fm_namespace = $input_root->namespaceURI();
$input_root->setNamespace($fm_namespace, 'fm', 1);

# digifiles mode needs a digilib mapping document to collect entries like
#<mapping link="exp1/archimedes_image_repository/archimedes_large/achil_propo_087_la_1545" dir="permanent/archimedes_repository/large/achil_propo_087_la_1545"/>
if ($digifiles_mode) {
    $mapping_doc = XML::LibXML::Document->createDocument('1.0', 'UTF-8');
    $mapping_root = $mapping_doc->createElementNS($namespace, 'digilib-aliases');
    $mapping_doc->setDocumentElement($mapping_root);
}

# convert all entries of the dump
process_all_fm_entries($input_root);

# summary; any error makes the program exit with status 1
logger("INFO", "$warncnt warnings");
logger("INFO", "$errcnt errors");
if ($errcnt > 0) {
    logger("ABORT", "there were errors!");
    exit 1;
}
logger("DONE", "done something successfully!");
537:
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>