1: #!/usr/local/bin/perl -w
2:
3: use strict;
4: use XML::LibXML;
5:
6: use lib '/usr/local/mpiwg/archive_devel';
7: use MPIWGStor;
8:
# make output unbuffered so that log lines show up immediately
$| = 1;

# program version
my $version = "0.2.1 (15.8.2005)";

# usage text; printed when the program is started without any arguments
my $help =
"use: makemeta-lib [options] file.xml
options:
-debug show debugging info
-dry-run simulate, don't do anything
-cw-mode mode for copying einstein_cq documents
-digifiles-mode mode for copying files from digifiles
-map-file=mapfile.xml digilib mapping file (for digifiles mode)
";
logger("INFO", "makemeta-lib $version");
24:
###########################################
# mappings
#
# Each table maps a field name of the input XML to a slash-separated
# element path that is created in index.meta.

# fields that are converted the same way for every entry
my %gen_map = (
    Device             => 'meta/image-acquisition/device',
    Image_Type         => 'meta/image-acquisition/image-type',
    Production_Comment => 'meta/image-acquisition/production-comment',
    Postproduction     => 'meta/image-acquisition/production-comment',
    Language           => 'meta/lang',
);

# field that selects the bibliographic sub type
my %type_map = (
    Reference_Type => 'meta/bib@type',
);

# entries shared by most sub types; a sub type may override single keys
my %bib_core = (
    Author => 'meta/bib/author',
    Title  => 'meta/bib/title',
    Year   => 'meta/bib/year',
);

# per sub type field mappings; '_name' is the type name written to index.meta
my %subtype_map = (
    'Book' => {
        %bib_core,
        _name           => 'book',
        Place_Published => 'meta/bib/city',
        Publisher       => 'meta/bib/publisher',
        Edition         => 'meta/bib/edition',
    },
    'Journal Article' => {
        %bib_core,
        _name           => 'journal-article',
        Secondary_Title => 'meta/bib/journal',
        Volume          => 'meta/bib/volume',
        Number          => 'meta/bib/issue',
        Pages           => 'meta/bib/pages',
    },
    'In Book' => {
        %bib_core,
        _name           => 'inbook',
        Secondary_Title => 'meta/bib/book-title',
        Pages           => 'meta/bib/pages',
    },
    'Newspaper Article' => {
        %bib_core,
        _name           => 'newspaper-article',
        Secondary_Title => 'meta/bib/newspaper',
        Place_Published => 'meta/bib/city',
        Number          => 'meta/bib/issue-date',
        Pages           => 'meta/bib/pages',
    },
    'Edited Book' => {
        %bib_core,
        _name           => 'edited-book',
        # for an edited book the Author field holds the editor
        Author          => 'meta/bib/editor',
        Place_Published => 'meta/bib/city',
        Publisher       => 'meta/bib/publisher',
        Edition         => 'meta/bib/edition',
    },
    'Manuscript' => {
        %bib_core,
        _name           => 'manuscript',
        Place_Published => 'meta/bib/location',
    },
);

# name of the language field
my $lang_field = 'Language';

# language names to iso codes
my %lang_map = (
    German   => 'de',
    English  => 'en',
    Italian  => 'it',
    French   => 'fr',
    Latin    => 'la',
    Japanese => 'ja',
    Dutch    => 'nl',
    Spanish  => 'es',
);

# names of the storage related fields
my $arch_id_field    = 'ID_Archive';
my $online_url_field = 'URL';
my $online_id_field  = 'ID_OnlinePermanent';
113:
#######################################################
# internal parameters
#

# storage
my $lib_arch_dir = '/mpiwg/archive/data/library';
my $lib_online_dir = '/mpiwg/online/permanent';
my $lib_digilib_path = 'permanent';
my $digifiles_dir = "/net/digifiles.mpiwg-berlin.mpg.de/Volumes/raid";

# read command line parameters; without any parameter just show the help text
my $args = MPIWGStor::parseargs;
if (! scalar(%$args)) {
    print $help, "\n";
    exit 1;
}

# debug level
# NOTE(review): $debug is not declared in this file; presumably it is a
# variable exported by MPIWGStor -- confirm
$debug = (exists $$args{'debug'}) ? $$args{'debug'} : 0;

# simulate action only
my $dry_run = (exists $$args{'dry-run'}) ? $$args{'dry-run'} : 0;
logger('DEBUG', "dry-run: $dry_run");

# use einstein-cw mode
my $cw_mode = (exists $$args{'cw-mode'}) ? $$args{'cw-mode'} : 0;
logger('DEBUG', "cw_mode: $cw_mode");

# use digifiles mode
my $digifiles_mode = (exists $$args{'digifiles-mode'}) ? $$args{'digifiles-mode'} : 0;
logger('DEBUG', "digifiles_mode: $digifiles_mode");
# digilib mapping file
my $map_file_name = (exists $$args{'map-file'}) ? $$args{'map-file'} : "";
logger('DEBUG', "map_file_name: $map_file_name");
# mapping document and its root element; only created in digifiles mode
my $mapping_doc;
my $mapping_root;

# index.meta namespace (not really implemented!)
my $namespace = "";


# set when XML was modified (not used anywhere in this file)
my $xml_changed = 0;
# error and warning counters, reported at the end of the run
my $errcnt = 0;
my $warncnt = 0;

#######################################################
# check parameters that were passed to the program
#
my $infile = $$args{'path'};
if (! $infile) {
    logger("ABORT", "no input file given!");
    exit 1;
}
# collapse every run of slashes into a single slash
# (needs /g, otherwise only the first occurrence would be fixed)
$infile =~ s{//+}{/}g;
if (! -f $infile) {
    logger("ABORT", "input file \'$infile\' doesn't exist!");
    exit 1;
}
173:
174:
175: #######################################################
176: # subroutines
177: #
178:
179:
#
# add_digilib_mapping($src_dir, $dest_dir)
#
# Appends a <mapping link="$src_dir" dir="$dest_dir"/> element to the
# digilib mapping document and writes the whole document to $map_file_name.
# Aborts the program if no mapping file name was given.
#
sub add_digilib_mapping {
    my ($src_dir, $dest_dir) = @_;

    my $mapping = $mapping_root->addNewChild($namespace, 'mapping');
    my $link_attr = $mapping_doc->createAttributeNS($namespace, 'link', $src_dir);
    $mapping->addChild($link_attr);
    my $dir_attr = $mapping_doc->createAttributeNS($namespace, 'dir', $dest_dir);
    $mapping->addChild($dir_attr);

    # nowhere to write to -> give up
    unless ($map_file_name) {
        logger('ABORT', "unable to write mapping file!");
        exit 1;
    }
    write_xml($mapping_doc, $map_file_name);
}
192:
#
# $ok = _copy_dir_contents($from, $to)
#
# Copies every entry of directory $from (dot files included) into the
# existing directory $to using "cp -rp".
# Returns true on success; false if $from can not be read, is empty or
# cp reports an error.
#
sub _copy_dir_contents {
    my ($from, $to) = @_;
    opendir(my $dh, $from) or return 0;
    my @entries = map { "$from/$_" } grep { $_ ne '.' && $_ ne '..' } readdir($dh);
    closedir($dh);
    # nothing to copy counts as failure (a shell glob would not match either)
    return 0 unless @entries;
    # list form of system: no shell involved, so blanks or meta characters
    # in file names can do no harm
    return system('cp', '-rp', @entries, "$to/") == 0;
}

#
# $dir = find_digifiles_dir($input_node)
#
# Digifiles mode: takes the source directory from the URL field of
# $input_node (relative to $digifiles_dir), copies its content to
# "$lib_online_dir/library/<online id>/pageimg", registers a digilib mapping
# and removes the source directory.
# In dry-run mode nothing is copied or removed, but the mapping is still
# added (and the mapping file written by add_digilib_mapping).
# Returns the new document directory, or nothing if the entry has no usable
# source directory or ID. Aborts the program if copying fails.
#
sub find_digifiles_dir {
    my ($input_node) = @_;
    my $digifiles_base = $digifiles_dir;
    my $src_dir = find_online_dir($input_node, $digifiles_base, '');
    if (! $src_dir) {
        logger('ERROR', "no online directory for digifiles entry");
        $errcnt++;
        return;
    }
    # the path comes from the URL field and is later passed to "rm -rf",
    # so it must not be able to climb out of the base directory
    if ($src_dir =~ m{(?:\A|/)\.\.(?:/|\z)}) {
        logger('ERROR', "invalid online directory '$src_dir' for digifiles entry");
        $errcnt++;
        return;
    }
    my $dest_id = sstrip($input_node->findvalue("fm:$online_id_field"));
    if (! $dest_id) {
        logger('ERROR', "no ID field for digifiles entry");
        $errcnt++;
        return;
    }
    my $src_path = "$digifiles_base/$src_dir";
    my $dir = "$lib_online_dir/library/$dest_id";
    my $map_dir = "$lib_digilib_path/library/$dest_id";
    if ($dry_run) {
        logger('DEBUG', "would move $digifiles_base/$src_dir to $dir");
        add_digilib_mapping($src_dir, "$map_dir/pageimg");
        return $dir;
    }

    logger('INFO', "moving $digifiles_base/$src_dir to $dir");
    logger('DEBUG', "mkdir $dir/pageimg");
    if (system('mkdir', '-p', "$dir/pageimg") == 0
        && system('chmod', '-R', '0775', $dir) == 0) {
        logger('DEBUG', "cp $digifiles_base/$src_dir $dir/pageimg");
        if (_copy_dir_contents($src_path, "$dir/pageimg") && -d "$dir/pageimg") {
            logger('DEBUG', "directory $dir OK");
            add_digilib_mapping($src_dir, "$map_dir/pageimg");
            # the copy is complete, now the source can go
            if (system('rm', '-rf', $src_path) == 0) {
                logger('DEBUG', "directory $digifiles_base/$src_dir removed");
            } else {
                # not fatal: the document has already arrived at $dir
                logger('ERROR', "unable to remove source directory $digifiles_base/$src_dir!");
                $errcnt++;
            }
            return $dir;
        }
    }
    logger('ABORT', "unable to copy directory $src_dir to $dir!");
    exit 1;
}
239:
#
# $dir = find_cw_dir($input_node)
#
# Einstein-cw mode: takes the source directory from the URL field of
# $input_node (relative to the inbox directory, without trailing "pageimg")
# and renames it to "$lib_arch_dir/<archive id>".
# In dry-run mode the directory is not renamed.
# Returns the new document directory, or nothing if the entry has no usable
# source directory or ID. Aborts the program if the rename fails.
#
sub find_cw_dir {
    my ($input_node) = @_;
    my $cw_base = '/mpiwg/archive/data/library/inbox/zwischen_backup';
    my $src_dir = find_online_dir($input_node, $cw_base, 'pageimg');
    # without this check an empty $src_dir would make the rename below
    # operate on the inbox directory itself
    if (! $src_dir) {
        logger('ERROR', "no online directory for einstein-cw entry");
        $errcnt++;
        return;
    }
    my $dest_id = sstrip($input_node->findvalue("fm:$arch_id_field"));
    if (! $dest_id) {
        logger('ERROR', "no ID field for einstein-cw entry");
        $errcnt++;
        return;
    }
    my $dir = "$lib_arch_dir/$dest_id";
    if ($dry_run) {
        logger('DEBUG', "would move $cw_base/$src_dir to $dir");
        return $dir;
    } else {
        logger('DEBUG', "moving $cw_base/$src_dir to $dir");
        if (rename "$cw_base/$src_dir", $dir) {
            if (-d $dir) {
                logger('DEBUG', "directory $dir OK");
                return $dir;
            }
        } else {
            logger('ABORT', "unable to rename directory $cw_base/$src_dir to $dir!");
            exit 1;
        }
    }
    # rename reported success but the target is not a directory
    return;
}
268:
#
# $dir = find_online_dir($input_node, $base_dir, $page_dir)
#
# Takes the path from the $online_url_field of the $input_node document
# and looks in the directory $base_dir for it ($lib_online_dir is used if
# $base_dir is empty). If $page_dir is given, a trailing "/$page_dir" is
# stripped from the path first.
# Returns the directory path sans $base_dir if it exists below $base_dir,
# otherwise nothing.
#
sub find_online_dir {
    my ($input_node, $base_dir, $page_dir) = @_;
    $base_dir = $lib_online_dir unless ($base_dir);

    my $online_url = $input_node->findvalue("fm:$online_url_field");
    logger('DEBUG', "checking URL: $online_url");
    my $online_dir;
    if ($online_url =~ /fn=permanent\/(.+)/) {
        # new style digilib URL: everything after "fn=permanent/"
        # NOTE(review): (.+) also swallows any parameters that follow fn=...
        # -- confirm that fn is always the last parameter
        $online_dir = $1;
    } elsif ($online_url =~ /\?([^\+]+)\+/) {
        # old style digilib URL: the part between "?" and the first "+"
        $online_dir = $1;
    }
    #logger('DEBUG', "online_dir1: $online_dir");
    if ($online_dir) {
        if ($page_dir) {
            # \Q...\E: $page_dir is a literal directory name, not a pattern
            $online_dir =~ s/\/\Q$page_dir\E$//;
        }
        #logger("DEBUG", "dir: $base_dir/$online_dir");
        if (-d "$base_dir/$online_dir") {
            logger('DEBUG', "directory $base_dir/$online_dir exists");
            return $online_dir;
        }
    }
    return;
}
303:
#
# $dir = find_arch_dir($input_node)
#
# Default mode: builds "$lib_arch_dir/<archive id>" from the $arch_id_field
# of $input_node.
# Returns the directory if it exists, otherwise nothing.
#
sub find_arch_dir {
    my ($input_node) = @_;

    my $archive_id = $input_node->findvalue("fm:$arch_id_field");
    #logger('DEBUG', "bibdir: $archive_id");
    return unless $archive_id;

    my $candidate = "$lib_arch_dir/$archive_id";
    return unless -d $candidate;

    logger('DEBUG', "directory $candidate exists");
    return $candidate;
}
319:
320:
#
# $cnt = convert_bib($input_node, $index_root, $index_doc)
#
# Converts the child elements of $input_node into index.meta elements below
# $index_root. The first pass handles the general fields (%gen_map) and the
# type field (%type_map), the second pass the fields of the sub type that
# was found (%subtype_map). $index_doc is accepted but not used.
# Returns the number of elements created; 0 if nothing was converted or if
# the language or the bibliographic type is unknown.
#
sub convert_bib {
    my ($input_node, $index_root, $index_doc) = @_;
    my $cnt = 0;
    my $type = "";
    my $type_path = "";

    # process general stuff first
    foreach my $n ($input_node->getChildNodes()) {
        my $name = $n->nodeName();
        my $val = $n->textContent();
        #logger('DEBUG', " NODE: $name = '$val'");
        if (exists $gen_map{$name}) {
            # is a general field
            if ($name eq $lang_field) {
                # language field -> convert to iso code
                if (exists $lang_map{$val}) {
                    $val = $lang_map{$val};
                } else {
                    logger('ERROR', "unknown language: $val! skipping...");
                    $errcnt++;
                    return 0;
                }
            }
            create_element_path($gen_map{$name}, $index_root, $namespace)
                ->appendTextNode($val);
            $cnt++;
        } elsif (exists $type_map{$name}) {
            # is a type field
            $type_path = $type_map{$name};
            $type = $val;
            # check with known types
            if (exists $subtype_map{$val}) {
                my $indextype = $subtype_map{$val}->{'_name'};
                # "path@attr=value" sets the type attribute
                create_element_path("$type_path=$indextype", $index_root, $namespace);
                $cnt++;
            } else {
                # double quotes, so that the offending value shows up in the log
                logger('ERROR', "unknown bib type $val! skipping...");
                $errcnt++;
                return 0;
            }
        }
    }
    # process sub type fields
    if ($type) {
        foreach my $n ($input_node->getChildNodes()) {
            my $name = $n->nodeName();
            my $val = $n->textContent();
            #logger('DEBUG', " NODE: $name = '$val'");
            if (exists $subtype_map{$type}->{$name}) {
                create_element_path($subtype_map{$type}->{$name}, $index_root, $namespace)
                    ->appendTextNode($val);
                $cnt++;
            }
        }
    }
    return $cnt;
}
378:
379:
380:
#
# process_all_fm_entries($input_root)
#
# Calls process_fm_entry for every fm:ROW element found from $input_root,
# logging the running number (starting at 0) of each entry.
#
sub process_all_fm_entries {
    my ($input_root) = @_;

    my @rows = $input_root->findnodes('fm:ROW');
    for my $idx (0 .. $#rows) {
        logger('INFO', "processing entry $idx ...");
        process_fm_entry($rows[$idx]);
    }
}
391:
392:
#
# process_fm_entry($input_node)
#
# Builds a new index.meta document for one entry: locates the document
# directory (depending on mode), adds the standard elements, converts the
# bibliographic fields and writes "<document directory>/index.meta".
# In dry-run mode the document is only logged.
# Logs an error, counts it in $errcnt and returns early if no directory or
# no bibliographic metadata was found.
#
sub process_fm_entry {
    my ($input_node) = @_;

    # skeleton document: <resource version="1.1" type="MPIWG"/>
    my $index_doc = XML::LibXML::Document->createDocument('1.0', 'UTF-8');
    my $index_root = $index_doc->createElementNS($namespace, 'resource');
    foreach my $attr (['version', '1.1'], ['type', 'MPIWG']) {
        $index_root->addChild($index_doc->createAttributeNS($namespace, @$attr));
    }
    $index_doc->setDocumentElement($index_root);

    # try to find the document directory
    my $doc_dir = $cw_mode        ? find_cw_dir($input_node)
                : $digifiles_mode ? find_digifiles_dir($input_node)
                :                   find_arch_dir($input_node);
    unless ($doc_dir) {
        logger('ERROR', "document directory not found! skipping...");
        $errcnt++;
        return;
    }

    # standard stuff for index.meta, in document order
    my ($docname) = split_file_path($doc_dir);
    my @standard_fields = (
        # name and date
        ['name', $docname],
        ['archive-path', $doc_dir],
        ['archive-creation-date', stime(time)],
        ['creator', 'digigroup'],
        ['description', 'a scanned document'],
        # acquisition
        ['meta/acquisition/date', stime(time)],
        ['meta/acquisition/provider/provider-id', 'digigroup'],
        ['meta/acquisition/provider/address', 'Max Planck Institute for the History of Science'],
        # media
        ['media-type', 'image'],
        ['meta/content-type', 'scanned document'],
    );
    foreach my $field (@standard_fields) {
        create_text_path(@$field, $index_root, $namespace);
    }

    # convert bib entries
    my $converted = convert_bib($input_node, $index_root, $index_doc);
    if ($converted == 0) {
        # error or nothing to convert
        logger('ERROR', "no bibliographic metadata!");
        $errcnt++;
        return;
    }

    # write new index.meta file
    my $meta_file = "$doc_dir/index.meta";
    if ($dry_run) {
        logger('DEBUG', "would write $meta_file");
        logger('DEBUG', $index_doc->toString(1));
    } else {
        write_xml($index_doc, $meta_file);
    }

}
450:
451:
452:
453:
454:
#######################################################
# Main
#

# digifiles mode can not work without a mapping file. Check this before any
# directory is touched; add_digilib_mapping would notice the problem only
# after the first directory has already been copied.
if ($digifiles_mode && ! $map_file_name) {
    logger("ABORT", "digifiles mode needs a mapping file (-map-file=...)!");
    exit 1;
}

# load filemaker xml dump
my ($input_doc, $input_root) = read_xml($infile);
# set namespace prefix, so that XPath expressions can use "fm:"
my $fm_namespace = $input_root->namespaceURI();
$input_root->setNamespace($fm_namespace, 'fm', 1);

# create digilib mapping file for digifiles mode
if ($digifiles_mode) {
    $mapping_doc = XML::LibXML::Document->createDocument('1.0', 'UTF-8');
    $mapping_root = $mapping_doc->createElementNS($namespace, 'digilib-aliases');
    $mapping_doc->setDocumentElement($mapping_root);
    # entries are added by add_digilib_mapping and look like this:
    #<mapping link="exp1/archimedes_image_repository/archimedes_large/achil_propo_087_la_1545" dir="permanent/archimedes_repository/large/achil_propo_087_la_1545"/>

}

process_all_fm_entries($input_root);


# summary; any error makes the program exit with status 1
logger("INFO", "$warncnt warnings");
logger("INFO", "$errcnt errors");
if ($errcnt > 0) {
    logger("ABORT", "there were errors!");
    exit 1;
} else {
    logger("DONE", "done something successfully!");
}
485:
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>