Annotation of foxridge-archiver/makemeta-lib.pl, revision 1.7
1.1 casties 1: #!/usr/local/bin/perl -w
2:
3: use strict;
4: use XML::LibXML;
5:
1.7 ! casties 6: use lib '/usr/local/mpiwg/archive';
1.1 casties 7: use MPIWGStor;
8:
# Turn off output buffering so that messages appear as soon as they are
# printed.
$| = 1;

# Program version, reported in the startup log line below.
my $version = "0.2.2 (31.8.2005 ROC)";

# Usage text; printed when the script is started without any arguments.
my $help = <<'END_HELP';
use: makemeta-lib [options] file.xml
options:
-debug show debugging info
-dry-run simulate, dont'do anything
-online-mode mode for creating online/permanent files
-cw-mode mode for copying einstein_cw archive documents
-digifiles-mode mode for copying files from digifiles
-map-file=mapfile.xml digilib mapping file (for digifiles mode)
END_HELP

logger("INFO", "makemeta-lib $version");
25:
###########################################
# mappings

# Input fields that are copied to a fixed index.meta path regardless of
# the bibliographic type (see convert_bib).
my %gen_map = (
    Device             => 'meta/image-acquisition/device',
    Image_Type         => 'meta/image-acquisition/image-type',
    Production_Comment => 'meta/image-acquisition/production-comment',
    Postproduction     => 'meta/image-acquisition/production-comment',
    Language           => 'meta/lang',
);

# Input field whose value selects the bibliographic sub type; the mapped
# path is combined with the sub type's '_name' as "path=name".
my %type_map = (
    Reference_Type => 'meta/bib@type',
);
# sub type mappings
#
# Keyed by the value of the type field. Each entry maps input field names
# to index.meta paths; the special key '_name' holds the type name that is
# written to index.meta.
my %subtype_map = do {
    # Fields that every sub type shares. Individual entries may override
    # a key by listing it again after the shared fields.
    my %shared = (
        Author => 'meta/bib/author',
        Title  => 'meta/bib/title',
        Year   => 'meta/bib/year',
    );

    (
        'Book' => {
            %shared,
            _name           => 'book',
            Place_Published => 'meta/bib/city',
            Publisher       => 'meta/bib/publisher',
            Edition         => 'meta/bib/edition',
        },
        'Journal Article' => {
            %shared,
            _name           => 'journal-article',
            Secondary_Title => 'meta/bib/journal',
            Volume          => 'meta/bib/volume',
            Number          => 'meta/bib/issue',
            Pages           => 'meta/bib/pages',
        },
        'In Book' => {
            %shared,
            _name           => 'inbook',
            Secondary_Title => 'meta/bib/book-title',
            Pages           => 'meta/bib/pages',
        },
        'Newspaper Article' => {
            %shared,
            _name           => 'newspaper-article',
            Secondary_Title => 'meta/bib/newspaper',
            Place_Published => 'meta/bib/city',
            Number          => 'meta/bib/issue-date',
            Pages           => 'meta/bib/pages',
        },
        'Edited Book' => {
            %shared,
            _name           => 'edited-book',
            Author          => 'meta/bib/editor',    # overrides the shared author path
            Place_Published => 'meta/bib/city',
            Publisher       => 'meta/bib/publisher',
            Edition         => 'meta/bib/edition',
        },
        'Manuscript' => {
            %shared,
            _name           => 'manuscript',
            Place_Published => 'meta/bib/location',
        },
    );
};
# Name of the input field that holds the document language.
my $lang_field = 'Language';

# Language names as they appear in the input, mapped to ISO codes.
my %lang_map = (
    Dutch    => 'nl',
    English  => 'en',
    French   => 'fr',
    German   => 'de',
    Italian  => 'it',
    Japanese => 'ja',
    Latin    => 'la',
    Spanish  => 'es',
);

# Names of the input fields used to locate a document's directory.
my $arch_id_field    = 'ID_Archive';
my $online_url_field = 'URL';
my $online_id_field  = 'ID_OnlinePermanent';
1.1 casties 114:
#######################################################
# internal parameters
#

# Storage locations.
my $lib_arch_dir     = '/mpiwg/archive/data/library';
my $lib_online_dir   = '/mpiwg/online/permanent';
my $lib_digilib_path = 'permanent';
my $digifiles_dir    = "/net/digifiles.mpiwg-berlin.mpg.de/Volumes/raid";

# Read the command line; without any arguments just show the usage text.
my $args = MPIWGStor::parseargs();
unless (scalar(%$args)) {
    print $help, "\n";
    exit 1;
}

# Returns the value of a command line option, or $default when the option
# was not given at all (an option that is present is returned as-is).
my $option = sub {
    my ($name, $default) = @_;
    return exists $args->{$name} ? $args->{$name} : $default;
};

# Debug level. $debug is not declared in this file; presumably it is
# provided by MPIWGStor -- TODO confirm.
$debug = $option->('debug', 0);

# Simulate actions only.
my $dry_run = $option->('dry-run', 0);
logger('DEBUG', "dry-run: $dry_run");

# Online (permanent) mode.
my $online_mode = $option->('online-mode', 0);
logger('DEBUG', "online_mode: $online_mode");

# einstein-cw mode.
my $cw_mode = $option->('cw-mode', 0);
logger('DEBUG', "cw_mode: $cw_mode");

# digifiles mode.
my $digifiles_mode = $option->('digifiles-mode', 0);
logger('DEBUG', "digifiles_mode: $digifiles_mode");

# digilib mapping file; the document and its root element are created
# later, in the main section, when digifiles mode is active.
my $map_file_name = $option->('map-file', "");
logger('DEBUG', "map_file_name: $map_file_name");
my ($mapping_doc, $mapping_root);

# index.meta namespace (not really implemented!)
my $namespace = "";

# Bookkeeping counters.
my $xml_changed = 0;
my $errcnt      = 0;
my $warncnt     = 0;
163:
#######################################################
# check parameters that were passed to the program
#

# The input file is taken from the 'path' entry of the parsed arguments.
my $infile = $$args{'path'};
if (! $infile) {
    logger("ABORT", "no input file given!");
    exit 1;
}

# Collapse every run of repeated slashes into a single one. The /g flag
# matters: without it only the first "//" in the path would be fixed.
$infile =~ s{/{2,}}{/}g;

if (! -f $infile) {
    logger("ABORT", "input file \'$infile\' doesn't exist!");
    exit 1;
}
178:
179:
180: #######################################################
181: # subroutines
182: #
183:
1.5 casties 184:
#
# add_digilib_mapping($src_dir, $dest_dir)
#
# Appends a 'mapping' element with the attributes link=$src_dir and
# dir=$dest_dir to the digilib mapping document and writes the document
# to $map_file_name. Aborts the program when no mapping file name is set.
#
sub add_digilib_mapping {
    my ($src_dir, $dest_dir) = @_;

    my $mapping = $mapping_root->addNewChild($namespace, 'mapping');
    # attributes are added in a fixed order: link first, then dir
    for my $pair ([ 'link', $src_dir ], [ 'dir', $dest_dir ]) {
        my ($attr_name, $attr_value) = @$pair;
        my $attr = $mapping_doc->createAttributeNS($namespace, $attr_name, $attr_value);
        $mapping->addChild($attr);
    }

    unless ($map_file_name) {
        logger('ABORT', "unable to write mapping file!");
        exit 1;
    }
    write_xml($mapping_doc, $map_file_name);
}
197:
#
# $dir = find_digifiles_dir($input_node)
#
# Handles one entry in digifiles mode: locates the source directory below
# the digifiles volume (via find_online_dir), copies its contents to
# "$lib_online_dir/library/<id>/pageimg", records a digilib mapping and
# removes the source directory.
#
# In dry-run mode nothing is copied or removed, but the mapping is still
# recorded through add_digilib_mapping.
#
# Returns the destination directory, or nothing (after logging an error and
# incrementing $errcnt) when the source directory or the ID is missing.
# Aborts the program when the copy fails. A failed removal of the source
# is counted as an error but the destination is still returned.
#
sub find_digifiles_dir {
    my ($input_node) = @_;
    my $digifiles_base = $digifiles_dir;

    my $src_dir = find_online_dir($input_node, $digifiles_base, '');
    if (! $src_dir) {
        logger('ERROR', "no online directory for digifiles entry");
        $errcnt++;
        return;
    }
    my $dest_id = sstrip($input_node->findvalue("fm:$online_id_field"));
    if (! $dest_id) {
        logger('ERROR', "no ID field for digifiles entry");
        $errcnt++;
        return;
    }

    my $src_path = "$digifiles_base/$src_dir";
    my $dir      = "$lib_online_dir/library/$dest_id";
    my $map_dir  = "$lib_digilib_path/library/$dest_id";

    if ($dry_run) {
        logger('DEBUG', "would move $src_path to $dir");
        add_digilib_mapping($src_dir, "$map_dir/pageimg");
        return $dir;
    }

    logger('INFO', "moving $src_path to $dir");
    if (! _copy_digifiles_pages($src_path, $dir)) {
        logger('ABORT', "unable to copy directory $src_dir to $dir!");
        exit 1;
    }
    logger('DEBUG', "directory $dir OK");
    add_digilib_mapping($src_dir, "$map_dir/pageimg");

    # List-form system() passes the path as a single argument without
    # going through a shell, so spaces and metacharacters are harmless.
    if (system('rm', '-rf', $src_path) == 0) {
        logger('DEBUG', "directory $src_path removed");
    } else {
        logger('ERROR', "unable to remove source directory $src_path!");
        $errcnt++;
    }
    return $dir;
}

#
# $ok = _copy_digifiles_pages($src_path, $dir)
#
# Creates "$dir/pageimg", makes $dir group-writable (chmod -R 0775) and
# copies all non-hidden entries of $src_path into "$dir/pageimg".
# Returns 1 on success and 0 on any failure.
#
sub _copy_digifiles_pages {
    my ($src_path, $dir) = @_;
    my $page_dir = "$dir/pageimg";

    logger('DEBUG', "mkdir $page_dir");
    return 0 unless system('mkdir', '-p', $page_dir) == 0;
    return 0 unless system('chmod', '-R', '0775', $dir) == 0;

    logger('DEBUG', "cp $src_path $page_dir");
    opendir(my $dh, $src_path) or return 0;
    # Same selection a shell "*" glob would make: skip names starting
    # with a dot.
    my @entries = map { "$src_path/$_" } sort grep { !/^\./ } readdir($dh);
    closedir($dh);
    # Nothing to copy is treated as a failure, as an unmatched glob was.
    return 0 unless @entries;

    return 0 unless system('cp', '-rp', @entries, "$page_dir/") == 0;
    return (-d $page_dir) ? 1 : 0;
}
244:
#
# $dir = find_cw_dir($input_node)
#
# Handles one entry in einstein-cw mode: locates the source directory below
# the cw inbox (via find_online_dir) and renames it to
# "$lib_arch_dir/<archive id>". In dry-run mode the rename is only logged.
#
# Returns the destination directory, or nothing when the source directory
# or the ID is missing (error logged, $errcnt incremented) or when the
# destination is not a directory after the rename. Aborts the program when
# the rename itself fails.
#
sub find_cw_dir {
    my ($input_node) = @_;
    my $cw_base = '/mpiwg/archive/data/library/inbox/zwischen_backup';

    my $src_dir = find_online_dir($input_node, $cw_base, 'pageimg');
    # Without this check an entry with no usable URL would make the code
    # below rename the whole "$cw_base/" directory.
    if (! $src_dir) {
        logger('ERROR', "no online directory for einstein-cw entry");
        $errcnt++;
        return;
    }
    my $dest_id = sstrip($input_node->findvalue("fm:$arch_id_field"));
    if (! $dest_id) {
        logger('ERROR', "no ID field for einstein-cw entry");
        $errcnt++;
        return;
    }

    my $dir = "$lib_arch_dir/$dest_id";
    if ($dry_run) {
        logger('DEBUG', "would move $cw_base/$src_dir to $dir");
        return $dir;
    }

    logger('DEBUG', "moving $cw_base/$src_dir to $dir");
    if (! rename("$cw_base/$src_dir", $dir)) {
        logger('ABORT', "unable to rename directory $cw_base/$src_dir to $dir!");
        exit 1;
    }
    if (-d $dir) {
        logger('DEBUG', "directory $dir OK");
        return $dir;
    }
    return;
}
273:
#
# $dir = find_permanent_dir($input_node)
#
# Handles one entry in online mode: resolves the entry's URL to an existing
# directory below the online permanent base (via find_online_dir) and
# returns its full path. Nothing is moved or copied.
#
# Returns nothing (error logged, $errcnt incremented) when no directory
# could be resolved or when the entry has no online ID.
#
sub find_permanent_dir {
    my ($input_node) = @_;
    my $online_base = '/mpiwg/online/permanent';

    my $src_dir = find_online_dir($input_node, $online_base, 'pageimg');
    # Without this check the base directory itself would be returned and
    # the caller would write index.meta directly into it.
    if (! $src_dir) {
        logger('ERROR', "no online directory for online permanent entry");
        $errcnt++;
        return;
    }
    my $dest_id = sstrip($input_node->findvalue("fm:$online_id_field"));
    if (! $dest_id) {
        logger('ERROR', "no ID field for online permanent entry");
        $errcnt++;
        return;
    }
    return "$online_base/$src_dir";
}
287:
#
# $dir = find_online_dir($input_node, $base_dir, $page_dir)
#
# Takes the path from the $online_url_field of the $input_node document
# and looks in the directory $base_dir for it ($base_dir defaults to
# $lib_online_dir). Strips trailing slashes and, if given, a trailing
# "/$page_dir" component.
# Returns the directory path sans $base_dir if it exists, nothing otherwise.
#
sub find_online_dir {
    my ($input_node, $base_dir, $page_dir) = @_;
    $base_dir = $lib_online_dir unless ($base_dir);

    my $online_url = $input_node->findvalue("fm:$online_url_field");
    logger('DEBUG', "checking URL: $online_url");

    my $online_dir;
    if ($online_url =~ m{fn=permanent/([^&]+)}) {
        # new style digilib URL; the path ends at the next query
        # parameter, so a following "&name=value" is not part of it
        $online_dir = $1;
    } elsif ($online_url =~ m{\?([^+]+)\+}) {
        # old style digilib URL
        $online_dir = $1;
    }
    return unless $online_dir;

    # strip all ending slashes
    $online_dir =~ s{/+$}{};
    if ($page_dir) {
        # \Q..\E: match $page_dir literally, not as a regex
        $online_dir =~ s{/\Q$page_dir\E$}{};
    }

    if (-d "$base_dir/$online_dir") {
        logger('DEBUG', "directory $base_dir/$online_dir exists");
        return $online_dir;
    }
    return;
}
323:
#
# $dir = find_arch_dir($input_node)
#
# Default mode: builds "$lib_arch_dir/<value of $arch_id_field>" and
# returns it if that directory exists. Returns nothing when the field is
# empty or the directory is missing.
#
sub find_arch_dir {
    my ($input_node) = @_;

    my $bib_dir = $input_node->findvalue("fm:$arch_id_field");
    return unless $bib_dir;

    my $candidate = "$lib_arch_dir/$bib_dir";
    return unless -d $candidate;

    logger('DEBUG', "directory $candidate exists");
    return $candidate;
}
339:
340:
#
# $cnt = convert_bib($input_node, $index_root, $index_doc)
#
# Converts the child elements of $input_node into index.meta elements below
# $index_root, driven by %gen_map, %type_map and %subtype_map.
# $index_doc is part of the interface but is not used here.
#
# Works in two passes because the type field may appear anywhere among the
# children: the first pass handles general fields and determines the type,
# the second pass handles the fields specific to that type.
#
# Returns the number of elements created. Returns 0 (after logging an
# error and incrementing $errcnt) on an unknown language or an unknown
# bibliographic type; elements created before the error stay in the tree.
#
sub convert_bib {
    my ($input_node, $index_root, $index_doc) = @_;
    my $cnt = 0;
    my $type = "";
    my $type_path = "";

    # process general stuff first
    foreach my $n ($input_node->getChildNodes()) {
        my $name = $n->nodeName();
        my $val = $n->textContent();
        #logger('DEBUG', " NODE: $name = '$val'");
        if (exists $gen_map{$name}) {
            # is a general field
            if ($name eq $lang_field) {
                # language field -> convert to iso code
                if (! exists $lang_map{$val}) {
                    logger('ERROR', "unknown language: $val! skipping...");
                    $errcnt++;
                    return 0;
                }
                $val = $lang_map{$val};
            }
            create_element_path($gen_map{$name}, $index_root, $namespace)
                ->appendTextNode($val);
            $cnt++;
        } elsif (exists $type_map{$name}) {
            # is a type field
            $type_path = $type_map{$name};
            $type = $val;
            # check with known types
            if (! exists $subtype_map{$val}) {
                # double quotes so that the offending value shows up in the log
                logger('ERROR', "unknown bib type $val! skipping...");
                $errcnt++;
                return 0;
            }
            my $indextype = $subtype_map{$val}->{'_name'};
            create_element_path("$type_path=$indextype", $index_root, $namespace);
            $cnt++;
        }
    }

    # process sub type fields
    if ($type) {
        my $field_map = $subtype_map{$type};
        foreach my $n ($input_node->getChildNodes()) {
            my $name = $n->nodeName();
            my $val = $n->textContent();
            #logger('DEBUG', " NODE: $name = '$val'");
            if (exists $field_map->{$name}) {
                create_element_path($field_map->{$name}, $index_root, $namespace)
                    ->appendTextNode($val);
                $cnt++;
            }
        }
    }
    return $cnt;
}
398:
399:
400:
#
# $cnt = process_all_fm_entries($input_root)
#
# Runs process_fm_entry on every 'fm:ROW' element found from $input_root,
# logging a zero-based running number for each entry.
# Returns the number of entries processed.
#
sub process_all_fm_entries {
    my ($input_root) = @_;
    my $cnt = 0;

    foreach my $row ($input_root->findnodes('fm:ROW')) {
        logger('INFO', "processing entry $cnt ...");
        process_fm_entry($row);
        $cnt++;
    }
    return $cnt;
}
411:
412:
#
# process_fm_entry($input_node)
#
# Builds an index.meta document for one input entry and writes it to
# "<document directory>/index.meta" (in dry-run mode the document is only
# logged). The document directory is determined by the active mode:
# online, einstein-cw, digifiles, or -- by default -- the archive lookup.
#
# Returns nothing. When the directory cannot be found or the entry yields
# no bibliographic metadata, an error is logged, $errcnt is incremented
# and no file is written.
#
sub process_fm_entry {
    my ($input_node) = @_;
    my $index_doc = XML::LibXML::Document->createDocument('1.0', 'UTF-8');
    my $index_root = $index_doc->createElementNS($namespace, 'resource');
    $index_root->addChild($index_doc->createAttributeNS($namespace, 'version', '1.1'));
    $index_root->addChild($index_doc->createAttributeNS($namespace, 'type', 'MPIWG'));
    $index_doc->setDocumentElement($index_root);

    # try to find the document directory
    my $doc_dir = "";
    if ($online_mode) {
        $doc_dir = find_permanent_dir($input_node);
    } elsif ($cw_mode) {
        $doc_dir = find_cw_dir($input_node);
    } elsif ($digifiles_mode) {
        $doc_dir = find_digifiles_dir($input_node);
    } else {
        $doc_dir = find_arch_dir($input_node);
    }
    if (! $doc_dir) {
        logger('ERROR', "document directory not found! skipping...");
        $errcnt++;
        return;
    }

    # add standard stuff to index.meta
    my ($docname) = split_file_path($doc_dir);
    # Take the timestamp once so that creation date and acquisition date
    # always agree.
    my $now = stime(time);
    # name and date
    create_text_path('name', $docname, $index_root, $namespace);
    create_text_path('archive-path', $doc_dir, $index_root, $namespace);
    create_text_path('archive-creation-date', $now, $index_root, $namespace);
    create_text_path('creator', 'digigroup', $index_root, $namespace);
    create_text_path('description', 'a scanned document', $index_root, $namespace);
    # acquisition
    create_text_path('meta/acquisition/date', $now, $index_root, $namespace);
    create_text_path('meta/acquisition/provider/provider-id', 'digigroup', $index_root, $namespace);
    create_text_path('meta/acquisition/provider/address', 'Max Planck Institute for the History of Science', $index_root, $namespace);
    # media
    create_text_path('media-type', 'image', $index_root, $namespace);
    create_text_path('meta/content-type', 'scanned document', $index_root, $namespace);

    # convert bib entries
    my $cnt = convert_bib($input_node, $index_root, $index_doc);
    if ($cnt == 0) {
        # error or nothing to convert
        logger('ERROR', "no bibliographic metadata!");
        $errcnt++;
        return;
    }

    # write new index.meta file
    if ($dry_run) {
        logger('DEBUG', "would write $doc_dir/index.meta");
        logger('DEBUG', $index_doc->toString(1));
    } else {
        write_xml($index_doc, "$doc_dir/index.meta");
    }
    return;
}
472:
473:
474:
475:
476:
#######################################################
# Main
#

# load filemaker xml dump
my ($input_doc, $input_root) = read_xml($infile);
# Bind the document's own namespace to the prefix 'fm', which the XPath
# expressions in the subroutines above rely on.
my $fm_namespace = $input_root->namespaceURI();
$input_root->setNamespace($fm_namespace, 'fm', 1);

# create digilib mapping file for digifiles mode
if ($digifiles_mode) {
    # Check for the mapping file before touching any data: the mapping is
    # written after an entry's files have been copied, so discovering the
    # missing file name only then would leave a half-finished move behind.
    if (! $map_file_name) {
        logger('ABORT', "no mapping file given for digifiles mode!");
        exit 1;
    }
    $mapping_doc = XML::LibXML::Document->createDocument('1.0', 'UTF-8');
    $mapping_root = $mapping_doc->createElementNS($namespace, 'digilib-aliases');
    $mapping_doc->setDocumentElement($mapping_root);
    # example of an entry added later by add_digilib_mapping:
    #<mapping link="exp1/archimedes_image_repository/archimedes_large/achil_propo_087_la_1545" dir="permanent/archimedes_repository/large/achil_propo_087_la_1545"/>
}

process_all_fm_entries($input_root);

# Final report; any counted error turns into a non-zero exit status.
logger("INFO", "$warncnt warnings");
logger("INFO", "$errcnt errors");
if ($errcnt > 0) {
    logger("ABORT", "there were errors!");
    exit 1;
} else {
    logger("DONE", "done something successfully!");
}
507:
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>