1: #!/usr/local/bin/perl -w
2:
3: use strict;
4: use XML::LibXML;
5:
6: use lib '/usr/local/mpiwg/archive';
7: use MPIWGStor;
8:
# disable buffering on the currently selected output handle
$| = 1;

# program version, reported once at startup
my $version = "0.1.1 (1.6.2005)";
logger('INFO', "makemeta-lib $version");
15:
###########################################
# mappings
#
# The keys are element names found in the input XML; the values are the
# path strings handed to create_element_path() when index.meta is built.

# generic mappings at top level
# (Production_Comment and Postproduction share one target path)
my %gen_map = (
    'Device'             => 'meta/image-acquisition/device',
    'Image_Type'         => 'meta/image-acquisition/image-type',
    'Production_Comment' => 'meta/image-acquisition/production-comment',
    'Postproduction'     => 'meta/image-acquisition/production-comment',
    'Language'           => 'meta/lang',
);

# sub type switch tag: the field whose value selects an entry of %subtype_map
my %type_map = (
    'Reference_Type' => 'meta/bib@type',
);

# fields that every sub type maps in the same way
# (a sub type may override an entry by listing the key again)
my %bib_common = (
    'Author' => 'meta/bib/author',
    'Title'  => 'meta/bib/title',
    'Year'   => 'meta/bib/year',
);

# sub type mappings; '_name' is the type name written to index.meta
my %subtype_map = (
    'Book' => {
        %bib_common,
        '_name'           => 'book',
        'Place_Published' => 'meta/bib/city',
        'Publisher'       => 'meta/bib/publisher',
        'Edition'         => 'meta/bib/edition',
    },
    'Journal Article' => {
        %bib_common,
        '_name'           => 'journal-article',
        'Secondary_Title' => 'meta/bib/journal',
        'Volume'          => 'meta/bib/volume',
        'Number'          => 'meta/bib/issue',
        'Pages'           => 'meta/bib/pages',
    },
    'In Book' => {
        %bib_common,
        '_name'           => 'inbook',
        'Secondary_Title' => 'meta/bib/book-title',
        'Pages'           => 'meta/bib/pages',
    },
    'Newspaper Article' => {
        %bib_common,
        '_name'           => 'newspaper-article',
        'Secondary_Title' => 'meta/bib/newspaper',
        'Place_Published' => 'meta/bib/city',
        'Number'          => 'meta/bib/issue-date',
        'Pages'           => 'meta/bib/pages',
    },
    'Edited Book' => {
        %bib_common,
        '_name'           => 'edited-book',
        # for an edited book the Author field holds the editor
        'Author'          => 'meta/bib/editor',
        'Place_Published' => 'meta/bib/city',
        'Publisher'       => 'meta/bib/publisher',
        'Edition'         => 'meta/bib/edition',
    },
    'Manuscript' => {
        %bib_common,
        '_name'           => 'manuscript',
        'Place_Published' => 'meta/bib/location',
    },
);

# language element
my $lang_field = 'Language';

# languages to iso codes
my %lang_map = (
    'German'   => 'de',
    'English'  => 'en',
    'Italian'  => 'it',
    'French'   => 'fr',
    'Latin'    => 'la',
    'Japanese' => 'ja',
    'Dutch'    => 'nl',
    'Spanish'  => 'es',
);

# storage fields
my $arch_id_field    = 'ID_Archive';
my $online_url_field = 'URL';
103:
#######################################################
# internal parameters
#

# base directories of the archive and the online storage
my $lib_arch_dir   = '/mpiwg/archive/data/library';
my $lib_online_dir = '/mpiwg/online/permanent';

# command line parameters as a hash reference
my $args = MPIWGStor::parseargs();

# debug level, 0 unless given
# NOTE(review): $debug is not declared in this file; presumably it is
# exported by MPIWGStor -- confirm.
$debug = exists $args->{'debug'} ? $args->{'debug'} : 0;

# einstein-cw mode switch, off unless given
my $cw_mode = exists $args->{'cw-mode'} ? $args->{'cw-mode'} : 0;

# index.meta namespace (not really implemented!)
my $namespace = "";

# bookkeeping for the run
my $xml_changed = 0;
my $errcnt      = 0;
my $warncnt     = 0;
128:
#######################################################
# check parameters that were passed to the program
#
# The 'path' argument names the input file. The program stops with exit
# code 1 when it is missing or does not point to a plain file.
my $infile = $$args{'path'};
if (! $infile) {
    logger("ABORT", "no input file given!");
    exit 1;
}
# collapse every run of slashes into a single one
# (without /g only the first occurrence would be replaced)
$infile =~ s{//+}{/}g;
if (! -f $infile) {
    logger("ABORT", "input file \'$infile\' doesn't exist!");
    exit 1;
}
143:
144:
145: #######################################################
146: # subroutines
147: #
148:
# Move the directory of an einstein-cw entry into the library archive.
#
# The source directory is looked up with find_online_dir() below the
# zwischen_backup inbox and renamed to "$lib_arch_dir/<archive id>".
#
# Parameters:
#   $input_node - node of one entry; queried with findvalue()
#
# Returns the new directory name on success. Returns nothing when
#   - no source directory was found (ERROR logged, $errcnt incremented),
#   - the entry has no archive id   (ERROR logged, $errcnt incremented),
#   - the rename succeeded but the target is not a directory.
# A failing rename is fatal: ABORT is logged and the program exits with 1.
sub find_cw_dir {
    my ($input_node) = @_;

    my $src_dir = find_online_dir($input_node,
        '/mpiwg/archive/data/library/inbox/zwischen_backup');
    # find_online_dir() returns nothing when the directory does not exist;
    # skip the entry instead of passing an undefined value to rename
    if (! $src_dir) {
        logger('ERROR', "no source directory for einstein-cw entry");
        $errcnt++;
        return;
    }

    my $dest_id = $input_node->findvalue("fm:$arch_id_field");
    if (! $dest_id) {
        logger('ERROR', "no ID field for einstein-cw entry");
        $errcnt++;
        return;
    }

    my $dir = "$lib_arch_dir/$dest_id";
    logger('DEBUG', "moving $src_dir to $dir");
    if (! rename $src_dir, $dir) {
        # report the system error so the cause is visible in the log
        logger('ABORT', "unable to rename directory $src_dir to $dir! ($!)");
        exit 1;
    }

    if (-d $dir) {
        logger('DEBUG', "directory $dir OK");
        return $dir;
    }
    return;
}
171:
# Derive a document directory from the URL field of an entry.
#
# Parameters:
#   $input_node - node of one entry; queried with findvalue()
#   $base_dir   - directory to look in; $lib_online_dir when false
#
# Returns "$base_dir/<part of the URL between 'fn=permanent/' and
# '/pageimg'>" if that directory exists, nothing otherwise.
sub find_online_dir {
    my ($input_node, $base_dir) = @_;
    $base_dir ||= $lib_online_dir;

    my $online_url = $input_node->findvalue("fm:$online_url_field");
    return unless $online_url =~ /fn=permanent\/(.+)\/pageimg/;

    my $dir = "$base_dir/$1";
    return unless -d $dir;

    logger('DEBUG', "directory $dir exists");
    return $dir;
}
188:
# Look up the archive directory that belongs to an entry.
#
# Parameters:
#   $input_node - node of one entry; queried with findvalue()
#
# Returns "$lib_arch_dir/<archive id>" if the entry has an archive id and
# that directory exists, nothing otherwise.
sub find_arch_dir {
    my ($input_node) = @_;

    my $bib_dir = $input_node->findvalue("fm:$arch_id_field");
    return unless $bib_dir;

    my $dir = "$lib_arch_dir/$bib_dir";
    return unless -d $dir;

    logger('DEBUG', "directory $dir exists");
    return $dir;
}
204:
205:
# Copy the mapped fields of one input entry into the index.meta tree.
#
# First pass: fields listed in %gen_map are written, and the field listed
# in %type_map selects the bibliographic sub type. Second pass: fields
# listed for that sub type in %subtype_map are written.
#
# Parameters:
#   $input_node - node of one entry; its child nodes are the fields
#   $index_root - root element of the index.meta document being built
#   $index_doc  - the index.meta document (not used here)
#
# Returns the number of fields written. Returns 0 when the language or
# the bib type is unknown; an ERROR is logged and $errcnt is incremented
# in that case. Elements created before the error stay in $index_root.
sub convert_bib {
    my ($input_node, $index_root, $index_doc) = @_;
    my $cnt = 0;
    my $type = "";

    # process general stuff first
    foreach my $n ($input_node->getChildNodes()) {
        my $name = $n->nodeName();
        my $val = $n->textContent();
        #logger('DEBUG', "  NODE: $name = '$val'");
        if (exists $gen_map{$name}) {
            # is a general field
            if ($name eq $lang_field) {
                # language field -> convert to iso code
                if (! exists $lang_map{$val}) {
                    logger('ERROR', "unknown language: $val! skipping...");
                    $errcnt++;
                    return 0;
                }
                $val = $lang_map{$val};
            }
            create_element_path($gen_map{$name}, $index_root, $namespace)
                ->appendTextNode($val);
            $cnt++;
        } elsif (exists $type_map{$name}) {
            # is a type field -> check with known types
            if (! exists $subtype_map{$val}) {
                # double quotes, so that the offending type shows up in the log
                logger('ERROR', "unknown bib type $val! skipping...");
                $errcnt++;
                return 0;
            }
            $type = $val;
            my $type_path = $type_map{$name};
            my $indextype = $subtype_map{$val}->{'_name'};
            create_element_path("$type_path=$indextype", $index_root, $namespace);
            $cnt++;
        }
    }

    # process sub type fields
    if ($type) {
        my $field_map = $subtype_map{$type};
        foreach my $n ($input_node->getChildNodes()) {
            my $name = $n->nodeName();
            # '_name' is the internal type name, not a field mapping
            next if $name eq '_name';
            next unless exists $field_map->{$name};
            #logger('DEBUG', "  NODE: $name = '$val'");
            create_element_path($field_map->{$name}, $index_root, $namespace)
                ->appendTextNode($n->textContent());
            $cnt++;
        }
    }
    return $cnt;
}
263:
264:
265:
# Run process_fm_entry() on every 'fm:ROW' node found below the given node.
#
# Parameters:
#   $input_root - node that is searched for 'fm:ROW' nodes
#
# Returns the number of entries that were processed.
sub process_all_fm_entries {
    my ($input_root) = @_;
    my $cnt = 0;

    foreach my $n ($input_root->findnodes('fm:ROW')) {
        logger('INFO', "processing entry $cnt ...");
        process_fm_entry($n);
        # advance the counter, otherwise every entry is logged as entry 0
        $cnt++;
    }
    return $cnt;
}
275:
276:
# Build an index.meta document for one entry and write it to the entry's
# document directory.
#
# The directory comes from find_cw_dir() in einstein-cw mode, otherwise
# from find_arch_dir(). A fixed set of standard fields is written first,
# then the bibliographic fields via convert_bib().
#
# Parameters:
#   $input_node - node of one entry
#
# Returns nothing. When the directory is not found or convert_bib()
# returns 0, an ERROR is logged, $errcnt is incremented and no file is
# written.
sub process_fm_entry {
    my ($input_node) = @_;
    my $index_doc = XML::LibXML::Document->createDocument('1.0', 'UTF-8');
    my $index_root = $index_doc->createElementNS($namespace, 'resource');
    $index_root->addChild($index_doc->createAttributeNS($namespace, 'version', '1.1'));
    $index_root->addChild($index_doc->createAttributeNS($namespace, 'type', 'MPIWG'));
    $index_doc->setDocumentElement($index_root);

    # try to find the document directory
    my $doc_dir = $cw_mode ? find_cw_dir($input_node)
                           : find_arch_dir($input_node);
    if (! $doc_dir) {
        logger('ERROR', "document directory not found! skipping...");
        $errcnt++;
        return;
    }

    # add standard stuff to index.meta
    my ($docname) = split_file_path($doc_dir);
    # one timestamp for the whole record, so that the creation date and
    # the acquisition date cannot differ
    my $now = stime(time);
    my @std_fields = (
        # name and date
        [ 'name',                  $docname ],
        [ 'archive-path',          $doc_dir ],
        [ 'archive-creation-date', $now ],
        [ 'creator',               'digigroup' ],
        [ 'description',           'a scanned document' ],
        # acquisition
        [ 'meta/acquisition/date',                 $now ],
        [ 'meta/acquisition/provider/provider-id', 'digigroup' ],
        [ 'meta/acquisition/provider/address',
          'Max Planck Institute for the History of Science' ],
        # media
        [ 'media-type',        'image' ],
        [ 'meta/content-type', 'scanned document' ],
    );
    # written in list order, which is the order of the original calls
    foreach my $field (@std_fields) {
        my ($path, $text) = @$field;
        create_text_path($path, $text, $index_root, $namespace);
    }

    # convert bib entries
    my $cnt = convert_bib($input_node, $index_root, $index_doc);
    if ($cnt == 0) {
        # error or nothing to convert
        logger('ERROR', "no bibliographic metadata!");
        $errcnt++;
        return;
    }

    # write new index.meta file
    write_xml($index_doc, "$doc_dir/index.meta");
    return;
}
327:
328:
329:
330:
331:
#######################################################
# Main
#

# load filemaker xml dump
my ($input_doc, $input_root) = read_xml($infile);

# make the root's namespace available under the prefix 'fm', which the
# XPath expressions in this file use
my $fm_namespace = $input_root->namespaceURI();
$input_root->setNamespace($fm_namespace, 'fm', 1);

process_all_fm_entries($input_root);

# summary; any error makes the program end with exit code 1
logger("INFO", "$warncnt warnings");
logger("INFO", "$errcnt errors");
if ($errcnt > 0) {
    logger("ABORT", "there were errors!");
    exit 1;
}
logger("DONE", "done something successfully!");
353:
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>