1: #!/usr/bin/perl -w
2:
3: use strict;
4: use XML::LibXML;
5:
6: use lib '/usr/local/mpiwg/archive';
7: use MPIWGStor;
8:
# make output unbuffered (autoflush on the currently selected output handle)
$| = 1;

# program version
my $version = "0.2.6 (11.12.2006 ROC)";
# usage text; printed when the program is called without any parameters
my $help =
"use: makemeta-lib [options] file.xml
options:
-debug show debugging info
-dry-run simulate, don't do anything
-online-mode mode for creating online/permanent files
-online-base=dir base directory for online ids (for online mode)
-cw-mode mode for copying einstein_cw archive documents
-digifiles-mode mode for copying files from digifiles
-map-file=mapfile.xml digilib mapping file (for digifiles mode)
-access=free adds free access tag (use access=mpiwg for restricted access)
";
# announce program name and version in the log
logger("INFO", "makemeta-lib $version");
27:
###########################################
# mappings
#
# All tables map a field (element) name of the FileMaker XML dump to the
# path of the corresponding element in the index.meta document.

# generic mappings at top level
my %gen_map = (
    'Device'             => 'meta/image-acquisition/device',
    'Image_Type'         => 'meta/image-acquisition/image-type',
    'Production_Comment' => 'meta/image-acquisition/production-comment',
    'Postproduction'     => 'meta/image-acquisition/production-comment',
    'Language'           => 'meta/lang',
);

# sub type switch tag
my %type_map = (
    'Reference_Type' => 'meta/bib@type',
);

# fields that (nearly) all bibliographic sub types have in common
my %bib_common = (
    'Author' => 'meta/bib/author',
    'Title'  => 'meta/bib/title',
    'Year'   => 'meta/bib/year',
);

# sub type mappings; the key is the value of the type field,
# '_name' is the type name written to index.meta
my %subtype_map = (
    'Book' => {
        %bib_common,
        '_name'           => 'book',
        'Place_Published' => 'meta/bib/city',
        'Publisher'       => 'meta/bib/publisher',
        'Edition'         => 'meta/bib/edition',
    },
    'Journal Article' => {
        %bib_common,
        '_name'           => 'journal-article',
        'Secondary_Title' => 'meta/bib/journal',
        'Volume'          => 'meta/bib/volume',
        'Number'          => 'meta/bib/issue',
        'Pages'           => 'meta/bib/pages',
    },
    'In Book' => {
        %bib_common,
        '_name'           => 'inbook',
        'Secondary_Title' => 'meta/bib/book-title',
        'Pages'           => 'meta/bib/pages',
    },
    'Newspaper Article' => {
        %bib_common,
        '_name'           => 'newspaper-article',
        'Secondary_Title' => 'meta/bib/newspaper',
        'Place_Published' => 'meta/bib/city',
        'Number'          => 'meta/bib/issue-date',
        'Pages'           => 'meta/bib/pages',
    },
    'Edited Book' => {
        %bib_common,
        '_name'           => 'edited-book',
        # the "author" of an edited book is its editor (overrides the common entry)
        'Author'          => 'meta/bib/editor',
        'Place_Published' => 'meta/bib/city',
        'Publisher'       => 'meta/bib/publisher',
        'Edition'         => 'meta/bib/edition',
    },
    'Manuscript' => {
        %bib_common,
        '_name'           => 'manuscript',
        'Place_Published' => 'meta/bib/location',
    },
);

# language element
my $lang_field = 'Language';
# languages to iso codes
my %lang_map = (
    'German'   => 'de',
    'English'  => 'en',
    'Italian'  => 'it',
    'French'   => 'fr',
    'Latin'    => 'la',
    'Japanese' => 'ja',
    'Dutch'    => 'nl',
    'Spanish'  => 'es',
    'Swedish'  => 'sv',
);

# storage fields
my $arch_id_field    = 'ID_Archive';
my $online_url_field = 'URL';
my $online_id_field  = 'ID_OnlinePermanent';
117:
#######################################################
# internal parameters
#

# storage locations
my $lib_arch_dir     = '/mpiwg/archive/data/library';
my $lib_online_dir   = '/mpiwg/online/permanent';
my $lib_digilib_path = 'permanent';
my $digifiles_dir    = "/net/digifiles.mpiwg-berlin.mpg.de/Volumes/raid";

# read command line parameters; show the help text when there are none
my $args = MPIWGStor::parseargs;
unless (scalar(%$args)) {
    print $help, "\n";
    exit 1;
}

# value of the command line option $key, or $default when it was not given
my $option = sub {
    my ($key, $default) = @_;
    return exists $args->{$key} ? $args->{$key} : $default;
};

# debug level
# NOTE(review): $debug is not declared in this file; presumably it is
# exported by MPIWGStor -- confirm.
$debug = $option->('debug', 0);

# simulate action only
my $dry_run = $option->('dry-run', 0);
logger('DEBUG', "dry-run: $dry_run");

# use online mode
my $online_mode = $option->('online-mode', 0);
logger('DEBUG', "online_mode: $online_mode");
# online base dir
my $online_base_dir = $option->('online-base', "");
logger('DEBUG', "online_base_dir: $online_base_dir");

# use einstein-cw mode
my $cw_mode = $option->('cw-mode', 0);
logger('DEBUG', "cw_mode: $cw_mode");

# use digifiles mode
my $digifiles_mode = $option->('digifiles-mode', 0);
logger('DEBUG', "digifiles_mode: $digifiles_mode");
# digilib mapping file; document and root element are created in the main part
my $map_file_name = $option->('map-file', "");
logger('DEBUG', "map_file_name: $map_file_name");
my $mapping_doc;
my $mapping_root;

# access type
my $access_type = $option->('access', "");

# index.meta namespace (not really implemented!)
my $namespace = "";

# state and counters shared by the subroutines below
my $xml_changed = 0;
my $errcnt      = 0;
my $warncnt     = 0;

#######################################################
# check parameters that were passed to the program
#
my $infile = $args->{'path'};
unless ($infile) {
    logger("ABORT", "no input file given!");
    exit 1;
}
# strip double slashes
$infile = sstrip($infile, 1);
unless (-f $infile) {
    logger("ABORT", "input file \'$infile\' doesn't exist!");
    exit 1;
}
187:
188:
189: #######################################################
190: # subroutines
191: #
192:
193:
#
# add_digilib_mapping($src_dir, $dest_dir)
#
# Appends a 'mapping' element with the attributes link=$src_dir and
# dir=$dest_dir to the digilib mapping document and writes the whole
# document to $map_file_name. Aborts the program when no mapping file
# name was given.
#
sub add_digilib_mapping {
    my ($src_dir, $dest_dir) = @_;

    my $mapping = $mapping_root->addNewChild($namespace, 'mapping');
    for my $attr ([ 'link', $src_dir ], [ 'dir', $dest_dir ]) {
        my ($attr_name, $attr_value) = @$attr;
        $mapping->addChild(
            $mapping_doc->createAttributeNS($namespace, $attr_name, $attr_value));
    }

    unless ($map_file_name) {
        logger('ABORT', "unable to write mapping file!");
        exit 1;
    }
    write_xml($mapping_doc, $map_file_name);
}
206:
#
# $dir = find_digifiles_dir($input_node)
#
# Digifiles mode: takes the source directory from the URL field of
# $input_node (relative to $digifiles_dir), copies its contents to
# $lib_online_dir/library/<online id>/pageimg, adds a digilib mapping for it
# and removes the source directory.
# Returns the new document directory, or nothing (after logging an error
# and incrementing $errcnt) when the source directory or the ID is missing.
# Aborts the program when the directory can not be created or copied.
# In dry-run mode nothing is copied or removed.
#
sub find_digifiles_dir {
    my ($input_node) = @_;
    my $digifiles_base = $digifiles_dir;

    my $src_dir = find_online_dir($input_node, $digifiles_base, '');
    if (! $src_dir) {
        logger('ERROR', "no online directory for digifiles entry");
        $errcnt++;
        return;
    }
    my $dest_id = sstrip($input_node->findvalue("fm:$online_id_field"));
    if (! $dest_id) {
        logger('ERROR', "no ID field for digifiles entry");
        $errcnt++;
        return;
    }

    my $src_path = "$digifiles_base/$src_dir";
    my $dir      = "$lib_online_dir/library/$dest_id";
    my $img_dir  = "$dir/pageimg";
    my $map_dir  = "$lib_digilib_path/library/$dest_id";

    if ($dry_run) {
        logger('DEBUG', "would move $src_path to $dir");
        # NOTE(review): the mapping file is (re)written even in dry-run mode
        # -- confirm that this is intended.
        add_digilib_mapping($src_dir, "$map_dir/pageimg");
        return $dir;
    }

    logger('INFO', "moving $src_path to $dir");
    logger('DEBUG', "mkdir $img_dir");
    # The paths come from the input XML, so all commands are run in list
    # form: no shell is involved and blanks or shell metacharacters in a
    # path can not change the command.
    my $ok = system('mkdir', '-p', $img_dir) == 0
        && system('chmod', '-R', '0775', $dir) == 0;
    if ($ok) {
        logger('DEBUG', "cp $src_path $img_dir");
        # "src/." copies the complete directory content, including hidden
        # files, which would otherwise be lost when the source is removed.
        $ok = system('cp', '-rp', "$src_path/.", "$img_dir/") == 0
            && -d $img_dir;
    }
    if (! $ok) {
        logger('ABORT', "unable to copy directory $src_dir to $dir!");
        exit 1;
    }

    logger('DEBUG', "directory $dir OK");
    add_digilib_mapping($src_dir, "$map_dir/pageimg");
    # the copy is in place; a failing cleanup is an error but the entry is
    # still processed
    if (system('rm', '-rf', $src_path) == 0) {
        logger('DEBUG', "directory $src_path removed");
    } else {
        logger('ERROR', "unable to remove source directory $src_path!");
        $errcnt++;
    }
    return $dir;
}
253:
#
# $dir = find_cw_dir($input_node)
#
# Einstein-cw mode: takes the source directory from the URL field of
# $input_node (relative to the cw inbox directory) and renames it to
# $lib_arch_dir/<archive id>.
# Returns the new document directory, or nothing when the source directory
# or the ID is missing (an error is logged and $errcnt incremented) or when
# the renamed path is not a directory. Aborts the program when the rename
# fails. In dry-run mode nothing is renamed.
#
sub find_cw_dir {
    my ($input_node) = @_;
    my $cw_base = '/mpiwg/archive/data/library/inbox/zwischen_backup';

    my $src_dir = find_online_dir($input_node, $cw_base, 'pageimg');
    # without this check an entry without usable URL would rename the
    # whole $cw_base directory
    if (! $src_dir) {
        logger('ERROR', "no online directory for einstein-cw entry");
        $errcnt++;
        return;
    }
    my $dest_id = sstrip($input_node->findvalue("fm:$arch_id_field"));
    if (! $dest_id) {
        logger('ERROR', "no ID field for einstein-cw entry");
        $errcnt++;
        return;
    }

    my $dir = "$lib_arch_dir/$dest_id";
    if ($dry_run) {
        logger('DEBUG', "would move $cw_base/$src_dir to $dir");
        return $dir;
    }

    logger('DEBUG', "moving $cw_base/$src_dir to $dir");
    if (! rename "$cw_base/$src_dir", $dir) {
        logger('ABORT', "unable to rename directory $cw_base/$src_dir to $dir!");
        exit 1;
    }
    if (-d $dir) {
        logger('DEBUG', "directory $dir OK");
        return $dir;
    }
    return;
}
282:
#
# $dir = find_permanent_dir($input_node)
#
# Online mode: returns $online_base_dir/<online id> when both the base
# directory option and the online ID field are set. Otherwise the directory
# is taken from the URL field (see find_online_dir) below the online
# permanent directory. Returns nothing (after logging an error and
# incrementing $errcnt) when neither is available.
#
sub find_permanent_dir {
    my ($input_node) = @_;
    my $permanent_base = '/mpiwg/online/permanent';
    my $result;

    my $online_id = sstrip($input_node->findvalue("fm:$online_id_field"));
    if ($online_base_dir && $online_id) {
        # preferred: online_base_dir + online_id
        $result = sstrip("$online_base_dir/$online_id", 1);
    } else {
        # fallback: directory from the online URL
        my $url_dir = find_online_dir($input_node, $permanent_base, 'pageimg');
        unless ($url_dir) {
            logger('ERROR', "no ID or URL for online permanent entry");
            $errcnt++;
            return;
        }
        $result = sstrip("$permanent_base/$url_dir", 1);
    }
    return $result;
}
302:
#
# $dir = find_online_dir($input_node, $base_dir, $page_dir)
#
# Takes the path from the $online_url_field of the $input_node document
# and looks in the directory $base_dir (default: $lib_online_dir) for it.
# Trailing slashes and a trailing "/$page_dir" are stripped from the path.
# Returns the directory path sans $base_dir if it exists below $base_dir,
# nothing otherwise. Paths with ".." components are rejected because the
# callers rename or remove the returned directory.
#
sub find_online_dir {
    my ($input_node, $base_dir, $page_dir) = @_;
    $base_dir = $lib_online_dir unless ($base_dir);

    my $online_url = $input_node->findvalue("fm:$online_url_field");
    logger('DEBUG', "checking URL: $online_url");
    my $online_dir;
    if ($online_url =~ m{fn=permanent/([^&]+)}) {
        # new style digilib URL; the path ends at the next URL parameter
        $online_dir = $1;
    } elsif ($online_url =~ m{\?([^+]+)\+}) {
        # old style digilib URL
        $online_dir = $1;
    }
    return unless ($online_dir);

    # strip ending slashes
    $online_dir =~ s{/+$}{};
    if ($page_dir) {
        # strip page_dir (taken literally, not as a pattern)
        $online_dir =~ s{/\Q$page_dir\E$}{};
    }
    # never leave $base_dir
    if (grep { $_ eq '..' } split(m{/}, $online_dir)) {
        logger('ERROR', "illegal path in URL: $online_url");
        return;
    }
    if (-d "$base_dir/$online_dir") {
        logger('DEBUG', "directory $base_dir/$online_dir exists");
        return $online_dir;
    }
    return;
}
339:
#
# $dir = find_arch_dir($input_node)
#
# Returns $lib_arch_dir/<archive id> for the archive ID field of
# $input_node if that directory exists, nothing otherwise.
#
sub find_arch_dir {
    my ($input_node) = @_;

    my $archive_id = $input_node->findvalue("fm:$arch_id_field");
    return unless ($archive_id);

    my $candidate = "$lib_arch_dir/$archive_id";
    return unless (-d $candidate);

    logger('DEBUG', "directory $candidate exists");
    return $candidate;
}
355:
356:
#
# $cnt = convert_bib($input_node, $index_root, $index_doc)
#
# Converts the child elements of $input_node into index.meta elements
# below $index_root: first the general fields (%gen_map) and the type
# field (%type_map), then the fields of the bibliographic sub type
# (%subtype_map). Language names are converted to ISO codes (%lang_map).
# Returns the number of converted fields; returns 0 (after logging an error
# and incrementing $errcnt) for an unknown language or bibliographic type.
# $index_doc is currently not used.
#
sub convert_bib {
    my ($input_node, $index_root, $index_doc) = @_;
    my $cnt = 0;
    my $type = "";

    # process general stuff first
    foreach my $n ($input_node->getChildNodes()) {
        my $name = $n->nodeName();
        my $val = $n->textContent();
        #logger('DEBUG', " NODE: $name = '$val'");
        if (exists $gen_map{$name}) {
            # is a general field
            if ($name eq $lang_field) {
                # language field -> convert to iso code
                if (exists $lang_map{$val}) {
                    $val = $lang_map{$val};
                } else {
                    logger('ERROR', "unknown language: $val! skipping...");
                    $errcnt++;
                    return 0;
                }
            }
            create_element_path($gen_map{$name}, $index_root, $namespace)
                ->appendTextNode($val);
            $cnt++;
        } elsif (exists $type_map{$name}) {
            # is a type field
            my $type_path = $type_map{$name};
            $type = $val;
            # check with known types
            if (exists $subtype_map{$val}) {
                my $indextype = $subtype_map{$val}->{'_name'};
                create_element_path("$type_path=$indextype", $index_root, $namespace);
                $cnt++;
            } else {
                # double quotes, so that the offending type shows up in the log
                logger('ERROR', "unknown bib type $val! skipping...");
                $errcnt++;
                return 0;
            }
        }
    }

    # process sub type fields; a second pass is needed because the type
    # field may come after the fields that depend on it
    if ($type) {
        foreach my $n ($input_node->getChildNodes()) {
            my $name = $n->nodeName();
            my $val = $n->textContent();
            #logger('DEBUG', " NODE: $name = '$val'");
            if (exists $subtype_map{$type}->{$name}) {
                create_element_path($subtype_map{$type}->{$name}, $index_root, $namespace)
                    ->appendTextNode($val);
                $cnt++;
            }
        }
    }
    return $cnt;
}
414:
415:
416:
#
# process_all_fm_entries($input_root)
#
# Calls process_fm_entry for every fm:ROW element below $input_root,
# logging the (zero based) number of each entry.
#
sub process_all_fm_entries {
    my ($input_root) = @_;

    my @rows = $input_root->findnodes('fm:ROW');
    for my $idx (0 .. $#rows) {
        logger('INFO', "processing entry $idx ...");
        process_fm_entry($rows[$idx]);
    }
}
427:
428:
#
# process_fm_entry($input_node)
#
# Creates the index.meta document for one entry of the FileMaker dump:
# finds (or, depending on the mode, moves) the document directory, adds the
# standard resource fields, the optional derived-from and access
# information and the converted bibliographic fields, and writes the result
# to index.meta in the document directory. In dry-run mode the document is
# only logged. On errors the entry is skipped and $errcnt is incremented.
#
sub process_fm_entry {
    my ($input_node) = @_;

    # empty index.meta document: <resource version="1.1" type="MPIWG"/>
    my $index_doc = XML::LibXML::Document->createDocument('1.0', 'UTF-8');
    my $index_root = $index_doc->createElementNS($namespace, 'resource');
    for my $attr ([ 'version', '1.1' ], [ 'type', 'MPIWG' ]) {
        my ($attr_name, $attr_value) = @$attr;
        $index_root->addChild(
            $index_doc->createAttributeNS($namespace, $attr_name, $attr_value));
    }
    $index_doc->setDocumentElement($index_root);

    # try to find the document directory
    my $doc_dir = "";
    my $derived_from = "";
    if ($online_mode) {
        $doc_dir = find_permanent_dir($input_node);
        $derived_from = find_arch_dir($input_node);
    } else {
        my $find_dir =
              $cw_mode        ? \&find_cw_dir
            : $digifiles_mode ? \&find_digifiles_dir
            :                   \&find_arch_dir;
        $doc_dir = $find_dir->($input_node);
    }
    unless ($doc_dir) {
        logger('ERROR', "document directory not found! skipping...");
        $errcnt++;
        return;
    }

    # standard stuff for index.meta, in document order
    my ($docname) = split_file_path($doc_dir);
    my @standard_fields = (
        # name and date
        [ 'name', $docname ],
        [ 'archive-path', $doc_dir ],
        [ 'archive-creation-date', stime(time) ],
        [ 'creator', 'digigroup' ],
        [ 'description', 'a scanned document' ],
        # acquisition
        [ 'meta/acquisition/date', stime(time) ],
        [ 'meta/acquisition/provider/provider-id', 'digigroup' ],
        [ 'meta/acquisition/provider/address', 'Max Planck Institute for the History of Science' ],
        # media
        [ 'media-type', 'image' ],
        [ 'meta/content-type', 'scanned document' ],
    );
    # derived-from
    push @standard_fields, [ 'derived-from/archive-path', $derived_from ]
        if $derived_from;
    for my $field (@standard_fields) {
        create_text_path(@$field, $index_root, $namespace);
    }

    # access
    if ($access_type eq "free") {
        create_element_path('meta/access-conditions/access@type=free', $index_root, $namespace);
    } elsif ($access_type) {
        # restricted access: the access type is the name of the institution
        my $acc_tag = create_element_path('meta/access-conditions/access@type=institution', $index_root, $namespace);
        create_text_path('name', $access_type, $acc_tag, $namespace);
    }

    # convert bib entries
    my $converted = convert_bib($input_node, $index_root, $index_doc);
    if ($converted == 0) {
        # error or nothing to convert
        logger('ERROR', "no bibliographic metadata!");
        $errcnt++;
        return;
    }

    # write new index.meta file
    if ($dry_run) {
        logger('DEBUG', "would write $doc_dir/index.meta");
        logger('DEBUG', $index_doc->toString(1));
    } else {
        write_xml($index_doc, "$doc_dir/index.meta");
    }
}
503:
504:
505:
506:
507:
#######################################################
# Main
#

# load filemaker xml dump
my ($input_doc, $input_root) = read_xml($infile);
# set namespace prefix, so that the XPath expressions can use "fm:"
my $fm_namespace = $input_root->namespaceURI();
$input_root->setNamespace($fm_namespace, 'fm', 1);

# create digilib mapping file for digifiles mode
if ($digifiles_mode) {
    # The mapping file is written for every processed entry. Fail before
    # any directory is touched instead of after the first copy.
    if (! $map_file_name) {
        logger('ABORT', "no mapping file given for digifiles mode!");
        exit 1;
    }
    $mapping_doc = XML::LibXML::Document->createDocument('1.0', 'UTF-8');
    $mapping_root = $mapping_doc->createElementNS($namespace, 'digilib-aliases');
    $mapping_doc->setDocumentElement($mapping_root);
    # the entries added later look like:
    #<mapping link="exp1/archimedes_image_repository/archimedes_large/achil_propo_087_la_1545" dir="permanent/archimedes_repository/large/achil_propo_087_la_1545"/>
}

process_all_fm_entries($input_root);

# summary; the exit status is non-zero when any entry had an error
logger("INFO", "$warncnt warnings");
logger("INFO", "$errcnt errors");
if ($errcnt > 0) {
    logger("ABORT", "there were errors!");
    exit 1;
} else {
    logger("DONE", "done something successfully!");
}
538:
# FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>