1: #!/usr/local/bin/perl -w
2:
3: use strict;
4: use XML::LibXML;
5:
6: use lib '/usr/local/mpiwg/archive_devel';
7: use MPIWGStor;
8:
# Make output unbuffered ($| applies to the currently selected handle).
$| = 1;

# Program version, reported in the startup log line below.
my $version = "0.2 (19.9.2005 ROC)";

# Usage text; it is printed further down when no arguments were parsed.
my $help =
"use: makemeta-vlp [options] file.xml
options:
-debug show debugging info
-dry-run simulate, don't do anything
-replace replace existing index files
-online-mode mode for creating online/permanent files
-archive-mode mode for creating archive/data files
";

# Announce the program and its version.
# NOTE(review): logger() is not defined in this file; presumably it is
# exported by MPIWGStor -- confirm.
logger("INFO", "makemeta-vlp $version");
24:
###########################################
# mappings
#
# The keys of these tables are element names found in the input XML rows;
# the values are element paths inside the generated index.meta document.

# generic fields that are handled the same way for every bib type
my %gen_map = (
    'Custom2_Language' => 'meta/lang',
);

# field that selects the bib sub type
my %type_map = (
    'ReferenceType' => 'meta/bib@type',
);

# per sub type field mappings; '_name' is the type name written to index.meta
# NOTE(review): some types use 'Secondary_Title' and others 'SecondaryTitle'
# as input field name -- verify against the actual input schema.
my %subtype_map = (
    'Book' => {
        '_name'           => 'book',
        'Author'          => 'meta/bib/author',
        'Title'           => 'meta/bib/title',
        'Year'            => 'meta/bib/year',
        'Place_Published' => 'meta/bib/city',
        'Publisher'       => 'meta/bib/publisher',
        'Edition'         => 'meta/bib/edition',
        'Volume'          => 'meta/bib/volume',
        'NumberOfVolumes' => 'meta/bib/number-of-volumes',
        'Pages'           => 'meta/bib/number-of-pages',
    },
    'Book Section' => {
        '_name'           => 'inbook',
        'Author'          => 'meta/bib/author',
        'Title'           => 'meta/bib/title',
        'Year'            => 'meta/bib/year',
        'Secondary_Title' => 'meta/bib/book-title',
        'SecondaryAuthor' => 'meta/bib/editor',
        'Volume'          => 'meta/bib/volume',
        'NumberOfVolumes' => 'meta/bib/number-of-volumes',
        'Pages'           => 'meta/bib/pages',
    },
    'Edited Book' => {
        '_name'           => 'edited-book',
        'Author'          => 'meta/bib/editor',
        'Title'           => 'meta/bib/title',
        'Year'            => 'meta/bib/year',
        'Place_Published' => 'meta/bib/city',
        'Publisher'       => 'meta/bib/publisher',
        'Edition'         => 'meta/bib/edition',
        'Volume'          => 'meta/bib/volume',
        'NumberOfVolumes' => 'meta/bib/number-of-volumes',
        'Pages'           => 'meta/bib/number-of-pages',
    },
    'Journal Article' => {
        '_name'          => 'journal-article',
        'Author'         => 'meta/bib/author',
        'Title'          => 'meta/bib/title',
        'Year'           => 'meta/bib/year',
        'SecondaryTitle' => 'meta/bib/journal',
        'Volume'         => 'meta/bib/volume',
        'Number_Issue'   => 'meta/bib/issue',
        'Pages'          => 'meta/bib/pages',
    },
    'Magazine Article' => {
        '_name'           => 'magazine-article',
        'Author'          => 'meta/bib/author',
        'Title'           => 'meta/bib/title',
        'Year'            => 'meta/bib/year',
        'Secondary_Title' => 'meta/bib/magazine',
        'Number_Issue'    => 'meta/bib/issue-number',
        'Date'            => 'meta/bib/issue-date',
        'Pages'           => 'meta/bib/pages',
    },
    'Report' => {
        '_name'           => 'report',
        'Author'          => 'meta/bib/author',
        'Title'           => 'meta/bib/title',
        'Year'            => 'meta/bib/year',
        'Place_Published' => 'meta/bib/city',
        'Date'            => 'meta/bib/date',
        'SecondaryTitle'  => 'meta/bib/type',
        'Pages'           => 'meta/bib/pages',
    },
    # trade catalogues are written with the same type name as reports
    'Trade Catalogue' => {
        '_name'           => 'report',
        'Author'          => 'meta/bib/author',
        'Title'           => 'meta/bib/title',
        'Year'            => 'meta/bib/year',
        'Place_Published' => 'meta/bib/city',
        'Date'            => 'meta/bib/date',
        'Volume'          => 'meta/bib/volume',
        'NumberOfVolumes' => 'meta/bib/number-of-volumes',
        'ReferenceType'   => 'meta/bib/type',
        'Pages'           => 'meta/bib/pages',
    },
    'Thesis' => {
        '_name'           => 'thesis',
        'Author'          => 'meta/bib/author',
        'Title'           => 'meta/bib/title',
        'Place_Published' => 'meta/bib/city',
        'Publisher'       => 'meta/bib/university',
        'Date'            => 'meta/bib/date',
        'TypeOfWork'      => 'meta/bib/type',
        'Pages'           => 'meta/bib/number-of-pages',
    },
    'Manuscript' => {
        '_name'           => 'manuscript',
        'Author'          => 'meta/bib/author',
        'Title'           => 'meta/bib/title',
        'Year'            => 'meta/bib/year',
        'Place_Published' => 'meta/bib/location',
        'Pages'           => 'meta/bib/pages',
    },
);

# name of the input element that carries the language
my $lang_field = 'Custom2_Language';

# language names as they appear in the input => ISO 639-1 codes
my %lang_map = (
    'German'   => 'de',
    'English'  => 'en',
    'Italian'  => 'it',
    'French'   => 'fr',
    'Latin'    => 'la',
    'Japanese' => 'ja',
    'Dutch'    => 'nl',
    'Spanish'  => 'es',
    'Swedish'  => 'sv',
);

# name of the input element that carries the storage ID
my $arch_id_field = 'ID';
150:
#######################################################
# internal parameters
#

# base directories of the storage areas
my $lib_arch_dir   = '/mpiwg/archive/data/vlp';
# NOTE(review): $lib_online_dir is not referenced anywhere else in the
# visible file (find_permanent_dir uses its own base path) -- confirm.
my $lib_online_dir = '/mpiwg/online/permanent/vlp';

# parse the command line; without any parsed arguments show the usage text
my $args = MPIWGStor::parseargs;
if (not %$args) {
    print $help, "\n";
    exit 1;
}

# look up a command line option, falling back to 0 when it was not given
my $option = sub {
    my ($key) = @_;
    return exists $args->{$key} ? $args->{$key} : 0;
};

# debug level
# NOTE(review): $debug is not declared in this file; presumably it is a
# variable exported by MPIWGStor -- confirm.
$debug = $option->('debug');

# only simulate, do not write anything
my $dry_run = $option->('dry-run');
logger('DEBUG', "dry-run: $dry_run");

# overwrite index files that already exist
my $do_replace = $option->('replace');
logger('DEBUG', "replace: $do_replace");

# create files for the online/permanent area
my $online_mode = $option->('online-mode');
logger('DEBUG', "online_mode: $online_mode");

# create files for the archive/data area
my $archive_mode = $option->('archive-mode');
logger('DEBUG', "archive_mode: $archive_mode");

# index.meta namespace (not really implemented!)
my $namespace = "";

# run state: change flag plus error and warning counters
my $xml_changed = 0;
my $errcnt      = 0;
my $warncnt     = 0;
192:
#######################################################
# check parameters that were passed to the program
#

# the input file is taken from the 'path' entry of the parsed arguments
my $infile = $$args{'path'};
if (! $infile) {
    logger("ABORT", "no input file given!");
    exit 1;
}

# Collapse every run of repeated slashes into a single one. The previous
# substitution had no /g modifier and therefore only fixed the first
# occurrence in the path.
$infile =~ s{//+}{/}g;

# the input has to be an existing plain file
if (! -f $infile) {
    logger("ABORT", "input file \'$infile\' doesn't exist!");
    exit 1;
}
207:
208:
209: #######################################################
210: # subroutines
211: #
212:
213:
# Locate the archive directory that belongs to one input entry.
#
# Reads the value of the ID element ("fm:$arch_id_field") below $input_node
# and checks for the directory "$lib_arch_dir/lit<ID>".
#
# Returns the directory path if that directory exists. Returns nothing
# (undef in scalar context) when the entry has no ID or the directory is
# missing; reporting that is left to the caller.
sub find_arch_dir {
    my ($input_node) = @_;

    # Clean the ID with sstrip() exactly like find_permanent_dir does, so
    # that both lookups build their path from the same value.
    # NOTE(review): sstrip() comes from outside this file; presumably it
    # strips surrounding whitespace -- confirm.
    my $bib_id = sstrip($input_node->findvalue("fm:$arch_id_field"));
    return unless $bib_id;

    my $dir = "$lib_arch_dir/lit$bib_id";
    return unless -d $dir;

    logger('DEBUG', "directory $dir exists");
    return $dir;
}
229:
# Build the online/permanent directory name for one input entry.
#
# The ID is read from "fm:$arch_id_field" below $input_node and passed
# through sstrip(). Returns "<online base>/lit<ID>" without checking that
# the directory exists. If there is no ID an error is logged, $errcnt is
# incremented and nothing is returned.
sub find_permanent_dir {
    my ($input_node) = @_;
    my $online_base = '/mpiwg/online/permanent';

    my $raw_id  = $input_node->findvalue("fm:$arch_id_field");
    my $dest_id = sstrip($raw_id);

    if ($dest_id) {
        return "$online_base/lit$dest_id";
    }

    # no usable ID: report it and give up on this entry
    logger('ERROR', "no ID field for online permanent entry");
    $errcnt++;
    return;
}
242:
243:
# Copy the bibliographic fields of one input entry into the index document.
#
# Parameters:
#   $input_node - input element whose child elements are the bib fields
#   $index_root - root element of the index.meta document being built
#   $index_doc  - the index.meta document (not used by this routine)
#
# Works in two passes over the children of $input_node: the first handles
# the generic fields (%gen_map) and the type field (%type_map), the second
# the fields listed in %subtype_map for the type that was found.
#
# Returns the number of elements created. Returns 0 after logging an error
# and incrementing $errcnt when the language or the bib type is unknown.
sub convert_bib {
    my ($input_node, $index_root, $index_doc) = @_;
    my $created   = 0;
    my $bib_type  = "";
    my $type_path = "";

    # the input node is not modified here, so one fetch serves both passes
    my @children = $input_node->getChildNodes();

    # pass 1: generic fields and the type switch
    for my $child (@children) {
        my $field = $child->nodeName();
        my $text  = $child->textContent();

        if (exists $gen_map{$field}) {
            if ($field eq $lang_field) {
                # an empty language is only worth a warning
                if (not $text) {
                    logger('WARNING', "no language tag");
                    $warncnt++;
                    next;
                }
                # a language without a known iso code aborts the entry
                if (not exists $lang_map{$text}) {
                    logger('ERROR', "unknown language: $text! skipping...");
                    $errcnt++;
                    return 0;
                }
                $text = $lang_map{$text};
            }
            my $target = create_element_path($gen_map{$field}, $index_root, $namespace);
            $target->appendTextNode($text);
            $created++;
            next;
        }

        next if not exists $type_map{$field};

        # type field: remember the type and check it against the known ones
        $type_path = $type_map{$field};
        $bib_type  = $text;
        if (not exists $subtype_map{$text}) {
            logger('ERROR', "unknown bib type $text! skipping...");
            $errcnt++;
            return 0;
        }
        my $indextype = $subtype_map{$text}->{'_name'};
        # the path passed on has the form "<type path>=<index type name>"
        create_element_path("$type_path=$indextype", $index_root, $namespace);
        $created++;
    }

    # without a type there are no sub type fields to convert
    return $created if not $bib_type;

    # pass 2: fields that belong to the detected sub type
    my $field_map = $subtype_map{$bib_type};
    for my $child (@children) {
        my $field = $child->nodeName();
        next if not exists $field_map->{$field};

        my $target = create_element_path($field_map->{$field}, $index_root, $namespace);
        $target->appendTextNode($child->textContent());
        $created++;
    }

    return $created;
}
307:
308:
309:
# Run process_fm_entry on every 'fm:ROW' node found below $input_root,
# logging the zero-based position of each entry first.
# The return value is not meaningful; the visible caller ignores it.
sub process_all_fm_entries {
    my ($input_root) = @_;

    my @rows = $input_root->findnodes('fm:ROW');
    for my $index (0 .. $#rows) {
        logger('INFO', "processing entry $index ...");
        process_fm_entry($rows[$index]);
    }
}
320:
321:
# Create the index.meta file for a single input entry.
#
# Steps: build an empty 'resource' document, determine the document
# directory, skip the entry if an index.meta already exists there and
# -replace was not given, add the standard fields, convert the bib fields
# and finally write the file (or only log it when in dry-run mode).
#
# Errors are logged and counted in $errcnt; the entry is then skipped.
# The return value is not meaningful.
sub process_fm_entry {
    my ($input_node) = @_;

    # empty index document: <resource version="1.1" type="MPIWG">
    my $index_doc  = XML::LibXML::Document->createDocument('1.0', 'UTF-8');
    my $index_root = $index_doc->createElementNS($namespace, 'resource');
    for my $attr (['version', '1.1'], ['type', 'MPIWG']) {
        $index_root->addChild($index_doc->createAttributeNS($namespace, @$attr));
    }
    $index_doc->setDocumentElement($index_root);

    # try to find the document directory; the archive lookup is only used
    # when archive mode is set and online mode is not
    my $doc_dir = ($archive_mode && !$online_mode)
        ? find_arch_dir($input_node)
        : find_permanent_dir($input_node);
    if (! $doc_dir) {
        logger('ERROR', "document directory not found! skipping...");
        $errcnt++;
        return;
    }

    # leave an existing index.meta alone unless it should be replaced
    if (-f "$doc_dir/index.meta" and not $do_replace) {
        logger('DEBUG', "index file in $doc_dir exists");
        return;
    }

    # standard fields as [path, text] pairs, in the order they are written
    my ($docname) = split_file_path($doc_dir);
    my @fields = (
        ['name',                  $docname],
        ['archive-path',          $doc_dir],
        ['archive-creation-date', stime(time)],
        ['creator',               'vlp'],
        ['description',           'a scanned document'],
    );
    if ($archive_mode) {
        push @fields, (
            # acquisition
            ['meta/acquisition/date', stime(time)],
            ['meta/acquisition/provider/provider-id', 'vlp'],
            ['meta/acquisition/provider/address', 'Max Planck Institute for the History of Science'],
            # image acquisition
            ['meta/image-acquisition/device', 'Flatbed Scanner'],
            ['meta/image-acquisition/image-type', 'Greyscale'],
            ['meta/image-acquisition/production-comment', 'Raw scans in \'raw\' folder, cleaned pages in \'pages\' folder.'],
        );
    }
    push @fields, (
        # media
        ['media-type',        'image'],
        ['meta/content-type', 'scanned document'],
    );
    for my $field (@fields) {
        create_text_path(@$field, $index_root, $namespace);
    }

    # convert bib entries; a count of 0 means error or nothing to convert
    my $converted = convert_bib($input_node, $index_root, $index_doc);
    if ($converted == 0) {
        logger('ERROR', "no bibliographic metadata!");
        $errcnt++;
        return;
    }

    # write the new index.meta file, or just show it when simulating
    if (not $dry_run) {
        write_xml($index_doc, "$doc_dir/index.meta");
        return;
    }
    logger('DEBUG', "would write $doc_dir/index.meta");
    logger('DEBUG', $index_doc->toString(1));
}
393:
394:
395:
396:
397:
#######################################################
# Main
#

# read the input XML file; read_xml hands back the document and its root
my ($input_doc, $input_root) = read_xml($infile);

# Register the root element's own namespace under the prefix 'fm'.
# NOTE(review): presumably this is what lets the 'fm:...' XPath
# expressions used in the subroutines resolve -- confirm.
my $fm_uri = $input_root->namespaceURI();
$input_root->setNamespace($fm_uri, 'fm', 1);

# convert all entries of the input file
process_all_fm_entries($input_root);

# summary; any counted error makes the program exit with status 1
logger("INFO", $warncnt . " warnings");
logger("INFO", $errcnt . " errors");
if ($errcnt > 0) {
    logger("ABORT", "there were errors!");
    exit 1;
}
logger("DONE", "done something successfully!");
420:
# FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>