1: #!/usr/local/bin/perl -w
2:
3: use strict;
4: use XML::LibXML;
5:
6: use lib '/usr/local/mpiwg/archive';
7: use MPIWGStor;
8:
# make output unbuffered
$| = 1;

#######################################################
# internal parameters
#

# program version (announced in the log as soon as the script starts)
my $version = "0.1.0 (24.5.2005)";
logger("INFO", "makemeta-lib $version");

#
# mappings
#
# Keys are child element names of an input row, values are the element
# paths created below the index.meta root (see convert_bib).
#
# generic mappings at top level
my %gen_map = (
    Device             => 'meta/image-acquisition/device',
    Image_Type         => 'meta/image-acquisition/image-type',
    Production_Comment => 'meta/image-acquisition/production-comment',
    Postproduction     => 'meta/image-acquisition/production-comment',
    Language           => 'meta/lang',
);

# sub type switch tag: the field whose value selects an entry of %subtype_map
# NOTE(review): the '@type' suffix is presumably interpreted by
# create_element_path as an attribute -- confirm in MPIWGStor.
my %type_map = (
    Reference_Type => 'meta/bib@type',
);

# sub type mappings; '_name' holds the type name written to index.meta,
# all other keys are field names valid for that type
my %subtype_map = (
    'Book' => {
        _name           => 'book',
        Author          => 'meta/bib/author',
        Title           => 'meta/bib/title',
        Year            => 'meta/bib/year',
        Place_Published => 'meta/bib/city',
        Publisher       => 'meta/bib/publisher',
        Edition         => 'meta/bib/edition',
    },
    'Journal Article' => {
        _name           => 'journal-article',
        Author          => 'meta/bib/author',
        Title           => 'meta/bib/title',
        Year            => 'meta/bib/year',
        Secondary_Title => 'meta/bib/journal',
        Volume          => 'meta/bib/volume',
        Number          => 'meta/bib/issue',
        Pages           => 'meta/bib/pages',
    },
    'In Book' => {
        _name           => 'inbook',
        Author          => 'meta/bib/author',
        Title           => 'meta/bib/title',
        Year            => 'meta/bib/year',
        Secondary_Title => 'meta/bib/book-title',
        Pages           => 'meta/bib/pages',
    },
    'Newspaper Article' => {
        _name           => 'newspaper-article',
        Author          => 'meta/bib/author',
        Title           => 'meta/bib/title',
        Year            => 'meta/bib/year',
        Secondary_Title => 'meta/bib/newspaper',
        Place_Published => 'meta/bib/city',
        Number          => 'meta/bib/issue-date',
        Pages           => 'meta/bib/pages',
    },
    'Edited Book' => {
        _name           => 'edited-book',
        Author          => 'meta/bib/editor',
        Title           => 'meta/bib/title',
        Year            => 'meta/bib/year',
        Place_Published => 'meta/bib/city',
        Publisher       => 'meta/bib/publisher',
        Edition         => 'meta/bib/edition',
    },
    'Manuscript' => {
        _name           => 'manuscript',
        Author          => 'meta/bib/author',
        Title           => 'meta/bib/title',
        Year            => 'meta/bib/year',
        Place_Published => 'meta/bib/location',
    },
);

# language element (its value is translated through %lang_map)
my $lang_field = 'Language';

# languages to iso codes
my %lang_map = (
    German  => 'de',
    English => 'en',
    Italian => 'it',
    French  => 'fr',
    Latin   => 'la',
);

# storage fields
my $arch_id_field    = 'ID_Archive';
my $online_url_field = 'URL';

# more storage
my $lib_arch_dir   = '/mpiwg/archive/data/library';
my $lib_online_dir = '/mpiwg/online/permanent';
108:
109:
# read command line parameters (hash reference of option name => value)
my $args = MPIWGStor::parseargs();

# debug level, 0 unless given on the command line
# NOTE(review): $debug is not declared in this file; presumably it is a
# variable exported by MPIWGStor -- confirm.
$debug = exists $args->{'debug'} ? $args->{'debug'} : 0;

# use einstein-cw mode, off unless given on the command line
my $cw_mode = exists $args->{'cw-mode'} ? $args->{'cw-mode'} : 0;

# index.meta namespace (not really implemented!)
my $namespace = "";

# run-wide bookkeeping: change flag, error counter and warning counter
my ($xml_changed, $errcnt, $warncnt) = (0, 0, 0);
126:
#######################################################
# check parameters that were passed to the program
#

# the FileMaker XML dump to read; mandatory
my $infile = $$args{'path'};
if (! $infile) {
    logger("ABORT", "no input file given!");
    exit 1;
}
# strip double slashes: collapse every run of slashes, not just the first
# one (the substitution needs /g to reach later occurrences in the path)
$infile =~ s{/{2,}}{/}g;
if (! -f $infile) {
    logger("ABORT", "input file \'$infile\' doesn't exist!");
    exit 1;
}
141:
142:
143: #######################################################
144: # subroutines
145: #
146:
# Locate the directory of an einstein-cw entry in the inbox and move it to
# its final place "$lib_arch_dir/<archive id>".
#
# Parameters:
#   $input_node - row node of the input dump; queried for the fm:URL and
#                 fm:ID_Archive fields
# Returns the new directory name, or nothing when the entry has no ID, no
# source directory could be found, or the target is not a directory after
# the move. The first two cases are logged and counted in $errcnt.
# Aborts the whole program (exit 1) when the rename itself fails.
sub find_cw_dir {
    my ($input_node) = @_;
    my $src_dir = find_online_dir($input_node, '/mpiwg/archive/data/library/inbox/zwischen_backup');
    my $dest_id = $input_node->findvalue("fm:$arch_id_field");
    if (! $dest_id) {
        logger('ERROR', "no ID field for einstein-cw entry");
        $errcnt++;
        return;
    }
    # find_online_dir returns nothing when the URL does not match or the
    # directory is missing; do not hand an undefined name to rename
    if (! $src_dir) {
        logger('ERROR', "no source directory for einstein-cw entry $dest_id");
        $errcnt++;
        return;
    }
    my $dir = "$lib_arch_dir/$dest_id";
    logger('DEBUG', "moving $src_dir to $dir");
    if (! rename $src_dir, $dir) {
        # include the system error so the cause of the failure is visible
        logger('ABORT', "unable to rename directory $src_dir to $dir! ($!)");
        exit 1;
    }
    if (-d $dir) {
        logger('DEBUG', "directory $dir OK");
        return $dir;
    }
    return;
}
169:
# Derive a document directory from the online URL field of an entry.
#
# Parameters:
#   $input_node - row node of the input dump; queried for the fm:URL field
#   $base_dir   - optional base directory, defaults to $lib_online_dir
# Returns "$base_dir/<path taken from the URL>" when the URL has the form
# "...fn=permanent/<path>/pageimg..." and that directory exists; returns
# nothing otherwise.
sub find_online_dir {
    my ($input_node, $base_dir) = @_;
    $base_dir ||= $lib_online_dir;

    my $url = $input_node->findvalue("fm:$online_url_field");
    # only URLs pointing into the "permanent" tree carry a usable path
    return unless $url =~ /fn=permanent\/(.+)\/pageimg/;

    my $candidate = "$base_dir/$1";
    return unless -d $candidate;

    logger('DEBUG', "directory $candidate exists");
    return $candidate;
}
186:
# Look up the archive directory of an entry by its archive ID field.
#
# Parameters:
#   $input_node - row node of the input dump; queried for the fm:ID_Archive
#                 field
# Returns "$lib_arch_dir/<id>" when the field is non-empty and the directory
# exists; returns nothing otherwise.
sub find_arch_dir {
    my ($input_node) = @_;

    my $archive_id = $input_node->findvalue("fm:$arch_id_field");
    return unless $archive_id;

    my $candidate = "$lib_arch_dir/$archive_id";
    return unless -d $candidate;

    logger('DEBUG', "directory $candidate exists");
    return $candidate;
}
202:
203:
# Copy the bibliographic fields of one input row into an index.meta tree.
#
# Works in two passes over the child nodes of the row: the first pass
# handles the generic fields (%gen_map) and the type switch field
# (%type_map), the second pass handles the fields that belong to the
# selected bib type (%subtype_map).
#
# Parameters:
#   $input_node - row node of the input dump
#   $index_root - root element of the index.meta document to fill
#   $index_doc  - the index.meta document (currently unused)
# Returns the number of fields written, or 0 when an unknown language or
# unknown bib type was found (logged and counted in $errcnt).
sub convert_bib {
    my ($input_node, $index_root, $index_doc) = @_;
    my $cnt = 0;
    my $type = "";

    # process general stuff first
    foreach my $n ($input_node->getChildNodes()) {
        my $name = $n->nodeName();
        my $val = $n->textContent();
        #logger('DEBUG', "  NODE: $name = '$val'");
        if (exists $gen_map{$name}) {
            # is a general field
            if ($name eq $lang_field) {
                # language field -> convert to iso code
                if (! exists $lang_map{$val}) {
                    logger('ERROR', "unknown language: $val! skipping...");
                    $errcnt++;
                    return 0;
                }
                $val = $lang_map{$val};
            }
            create_element_path($gen_map{$name}, $index_root, $namespace)
                ->appendTextNode($val);
            $cnt++;
        } elsif (exists $type_map{$name}) {
            # is a type field -> check with known types
            if (! exists $subtype_map{$val}) {
                # double quotes so that the offending type shows up in the log
                logger('ERROR', "unknown bib type $val! skipping...");
                $errcnt++;
                return 0;
            }
            $type = $val;
            my $type_path = $type_map{$name};
            my $indextype = $subtype_map{$val}->{'_name'};
            create_element_path("$type_path=$indextype", $index_root, $namespace);
            $cnt++;
        }
    }

    # process sub type fields
    if ($type) {
        my $fields = $subtype_map{$type};
        foreach my $n ($input_node->getChildNodes()) {
            my $name = $n->nodeName();
            # '_name' is the table's own bookkeeping key, not an input field
            next if $name eq '_name';
            next unless exists $fields->{$name};
            #logger('DEBUG', "  NODE: $name = '$val'");
            create_element_path($fields->{$name}, $index_root, $namespace)
                ->appendTextNode($n->textContent());
            $cnt++;
        }
    }
    return $cnt;
}
261:
262:
263:
# Run process_fm_entry on every fm:ROW child of the input root.
#
# Parameters:
#   $input_root - root element of the input dump, with the 'fm' prefix bound
# Returns the number of rows visited (whether or not they produced a file).
sub process_all_fm_entries {
    my ($input_root) = @_;
    my $cnt = 0;

    foreach my $n ($input_root->findnodes('fm:ROW')) {
        logger('INFO', "processing entry $cnt ...");
        process_fm_entry($n);
        # advance the counter so that each log line names its own entry
        $cnt++;
    }
    return $cnt;
}
273:
274:
# Build and write the index.meta file for a single input row.
#
# Parameters:
#   $input_node - row node of the input dump
# The document directory is determined by find_cw_dir (einstein-cw mode) or
# find_arch_dir; the file is written to "<directory>/index.meta".
# Returns early, after logging and counting an error in $errcnt, when no
# directory is found or convert_bib reports nothing converted.
sub process_fm_entry {
    my ($input_node) = @_;

    # fresh document with a <resource version="1.1" type="MPIWG"> root
    my $index_doc = XML::LibXML::Document->createDocument('1.0', 'UTF-8');
    my $index_root = $index_doc->createElementNS($namespace, 'resource');
    foreach my $attr (['version', '1.1'], ['type', 'MPIWG']) {
        $index_root->addChild($index_doc->createAttributeNS($namespace, @$attr));
    }
    $index_doc->setDocumentElement($index_root);

    # try to find the document directory
    my $doc_dir = $cw_mode ? find_cw_dir($input_node) : find_arch_dir($input_node);
    unless ($doc_dir) {
        logger('ERROR', "document directory not found! skipping...");
        $errcnt++;
        return;
    }

    # standard stuff for index.meta as [path, text] pairs, in output order
    my ($docname) = split_file_path($doc_dir);
    my @standard_fields = (
        # name and date
        ['name', $docname],
        ['archive-path', $doc_dir],
        ['archive-creation-date', stime(time)],
        ['creator', 'digigroup'],
        ['description', 'a scanned document'],
        # acquisition
        ['meta/acquisition/date', stime(time)],
        ['meta/acquisition/provider/provider-id', 'digigroup'],
        ['meta/acquisition/provider/address', 'Max Planck Institute for the History of Science'],
        # media
        ['media-type', 'image'],
        ['meta/content-type', 'scanned document'],
    );
    foreach my $field (@standard_fields) {
        my ($path, $text) = @$field;
        create_text_path($path, $text, $index_root, $namespace);
    }

    # convert bib entries; a count of 0 means error or nothing to convert
    my $converted = convert_bib($input_node, $index_root, $index_doc);
    if ($converted == 0) {
        logger('ERROR', "no bibliographic metadata!");
        $errcnt++;
        return;
    }

    # write new index.meta file
    write_xml($index_doc, "$doc_dir/index.meta");
}
325:
326:
327:
328:
329:
#######################################################
# Main
#

# load filemaker xml dump
my ($input_doc, $input_root) = read_xml($infile);

# bind the dump's own namespace to the prefix 'fm', which the XPath
# expressions in the subroutines above rely on
my $fm_namespace = $input_root->namespaceURI();
$input_root->setNamespace($fm_namespace, 'fm', 1);

process_all_fm_entries($input_root);

# summary; any counted error turns into a non-zero exit status
logger("INFO", "$warncnt warnings");
logger("INFO", "$errcnt errors");
if ($errcnt > 0) {
    logger("ABORT", "there were errors!");
    exit 1;
}
logger("DONE", "done something successfully!");
351:
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>