Annotation of foxridge-archiver/makemeta-vlp.pl, revision 1.3
1.1 casties 1: #!/usr/local/bin/perl -w
2:
3: use strict;
4: use XML::LibXML;
5:
1.2 casties 6: use lib '/usr/local/mpiwg/archive';
1.1 casties 7: use MPIWGStor;
8:
# make output unbuffered so log lines appear immediately
$| = 1;

# program version (printed in the start-up banner)
my $version = "0.2.1 (12.6.2006 ROC)";

# usage text, printed when the program is called without any arguments
my $help =
"use: makemeta-vlp [options] file.xml
options:
-debug show debugging info
-dry-run simulate, don't do anything
-replace replace existing index files
-online-mode mode for creating online/permanent files
-archive-mode mode for creating archive/data files
";

# start-up banner
logger("INFO", "makemeta-vlp $version");
24:
###########################################
# mappings
#
# All tables map a field (element) name of the FileMaker XML dump to
# the element path that receives the field's value in index.meta.

# generic mappings at top level (independent of the reference type)
my %gen_map = (
    'Custom2_Language' => 'meta/lang'
);

# sub type switch tag: the field whose value selects the entry of
# %subtype_map; the mapped path gets the sub type's '_name' as value
my %type_map = (
    'ReferenceType' => 'meta/bib@type'
);

# sub type mappings, keyed by the value of the type field.
# Special keys inside each sub type:
#   '_name'  the bib type name written to index.meta
#   '#text'  a key starting with '#' is a constant field: the text
#            after the '#' is written to the mapped path
my %subtype_map = (
    'Book' => {
        '_name' => 'book',
        'Author' => 'meta/bib/author',
        'Title' => 'meta/bib/title',
        'Year' => 'meta/bib/year',
        'Place_Published' => 'meta/bib/city',
        'Publisher' => 'meta/bib/publisher',
        'Edition' => 'meta/bib/edition',
        'Volume' => 'meta/bib/volume',
        'NumberOfVolumes' => 'meta/bib/number-of-volumes',
        'Pages' => 'meta/bib/number-of-pages'
    },
    # same as 'Book' plus a constant comment field
    '(Book)' => {
        '_name' => 'book',
        'Author' => 'meta/bib/author',
        'Title' => 'meta/bib/title',
        'Year' => 'meta/bib/year',
        'Place_Published' => 'meta/bib/city',
        'Publisher' => 'meta/bib/publisher',
        'Edition' => 'meta/bib/edition',
        'Volume' => 'meta/bib/volume',
        'NumberOfVolumes' => 'meta/bib/number-of-volumes',
        'Pages' => 'meta/bib/number-of-pages',
        '#Cover pages only, book sections have been extracted' => 'meta/bib/comment'
    },
    'Book Section' => {
        '_name' => 'inbook',
        'Author' => 'meta/bib/author',
        'Title' => 'meta/bib/title',
        'Year' => 'meta/bib/year',
        'SecondaryTitle' => 'meta/bib/book-title',
        'SecondaryAuthor' => 'meta/bib/editor',
        'Volume' => 'meta/bib/volume',
        'NumberOfVolumes' => 'meta/bib/number-of-volumes',
        'Pages' => 'meta/bib/pages'
    },
    'Edited Book' => {
        '_name' => 'edited-book',
        'Author' => 'meta/bib/editor',
        'Title' => 'meta/bib/title',
        'Year' => 'meta/bib/year',
        'Place_Published' => 'meta/bib/city',
        'Publisher' => 'meta/bib/publisher',
        'Edition' => 'meta/bib/edition',
        'Volume' => 'meta/bib/volume',
        'NumberOfVolumes' => 'meta/bib/number-of-volumes',
        'Pages' => 'meta/bib/number-of-pages'
    },
    'Journal Article' => {
        '_name' => 'journal-article',
        'Author' => 'meta/bib/author',
        'Title' => 'meta/bib/title',
        'Year' => 'meta/bib/year',
        'SecondaryTitle' => 'meta/bib/journal',
        'Volume' => 'meta/bib/volume',
        'Number_Issue' => 'meta/bib/issue',
        'Pages' => 'meta/bib/pages'
    },
    'Magazine Article' => {
        '_name' => 'magazine-article',
        'Author' => 'meta/bib/author',
        'Title' => 'meta/bib/title',
        'Year' => 'meta/bib/year',
        # Every other type spells this field 'SecondaryTitle'; the
        # underscore variant is kept as well so that a dump which really
        # uses it keeps working.
        # NOTE(review): confirm the field name against the FileMaker export.
        'SecondaryTitle' => 'meta/bib/magazine',
        'Secondary_Title' => 'meta/bib/magazine',
        'Number_Issue' => 'meta/bib/issue-number',
        'Date' => 'meta/bib/issue-date',
        'Pages' => 'meta/bib/pages'
    },
    'Report' => {
        '_name' => 'report',
        'Author' => 'meta/bib/author',
        'Title' => 'meta/bib/title',
        'Year' => 'meta/bib/year',
        'Place_Published' => 'meta/bib/city',
        'Date' => 'meta/bib/date',
        'SecondaryTitle' => 'meta/bib/type',
        'Pages' => 'meta/bib/pages'
    },
    # trade catalogues are written as reports, with the original
    # reference type kept in meta/bib/type
    'Trade Catalogue' => {
        '_name' => 'report',
        'Author' => 'meta/bib/author',
        'Title' => 'meta/bib/title',
        'Year' => 'meta/bib/year',
        'Place_Published' => 'meta/bib/city',
        'Date' => 'meta/bib/date',
        'Volume' => 'meta/bib/volume',
        'NumberOfVolumes' => 'meta/bib/number-of-volumes',
        'ReferenceType' => 'meta/bib/type',
        'Pages' => 'meta/bib/pages'
    },
    'Thesis' => {
        '_name' => 'thesis',
        'Author' => 'meta/bib/author',
        'Title' => 'meta/bib/title',
        'Place_Published' => 'meta/bib/city',
        'Publisher' => 'meta/bib/university',
        'Date' => 'meta/bib/date',
        'TypeOfWork' => 'meta/bib/type',
        'Pages' => 'meta/bib/number-of-pages'
    },
    'Manuscript' => {
        '_name' => 'manuscript',
        'Author' => 'meta/bib/author',
        'Title' => 'meta/bib/title',
        'Year' => 'meta/bib/year',
        'Place_Published' => 'meta/bib/location',
        'Pages' => 'meta/bib/pages'
    }
);

# language element (its value is translated through %lang_map)
my $lang_field = 'Custom2_Language';

# languages to iso codes
my %lang_map = (
    'German' => 'de',
    'English' => 'en',
    'Italian' => 'it',
    'French' => 'fr',
    'Latin' => 'la',
    'Japanese' => 'ja',
    'Dutch' => 'nl',
    'Spanish' => 'es',
    'Swedish' => 'sv'
);

# storage fields: the field holding the document ID used in directory names
my $arch_id_field = 'ID';
163:
#######################################################
# internal parameters
#

# storage: base directories of the archive and the online copies
my $lib_arch_dir = '/mpiwg/archive/data/vlp';
my $lib_online_dir = '/mpiwg/online/permanent/vlp';

# read command line parameters; show the help text if there are none
my $args = MPIWGStor::parseargs;
if (! scalar(%$args)) {
    print $help, "\n";
    exit 1;
}

# debug level
# ($debug is not declared in this file; presumably it is a variable
# exported by MPIWGStor -- TODO confirm)
$debug = (exists $$args{'debug'}) ? $$args{'debug'} : 0;

# simulate action only
my $dry_run = (exists $$args{'dry-run'}) ? $$args{'dry-run'} : 0;
logger('DEBUG', "dry-run: $dry_run");

# replace existing index files
my $do_replace = (exists $$args{'replace'}) ? $$args{'replace'} : 0;
logger('DEBUG', "replace: $do_replace");

# use online mode
my $online_mode = (exists $$args{'online-mode'}) ? $$args{'online-mode'} : 0;
logger('DEBUG', "online_mode: $online_mode");

# use archive mode
my $archive_mode = (exists $$args{'archive-mode'}) ? $$args{'archive-mode'} : 0;
logger('DEBUG', "archive_mode: $archive_mode");

# index.meta namespace (not really implemented!)
my $namespace = "";


# global counters, reported at the end of the run
my $xml_changed = 0;
my $errcnt = 0;
my $warncnt = 0;

#######################################################
# check parameters that were passed to the program
#
my $infile = $$args{'path'};
if (! $infile) {
    logger("ABORT", "no input file given!");
    exit 1;
}
# strip double slashes: collapse every run of slashes in the path,
# not only the first one
$infile =~ s{//+}{/}g;
if (! -f $infile) {
    logger("ABORT", "input file \'$infile\' doesn't exist!");
    exit 1;
}
220:
221:
222: #######################################################
223: # subroutines
224: #
225:
226:
# Find the archive directory that belongs to a FileMaker entry.
#
# $input_node  node of one entry; its child "fm:$arch_id_field" holds
#              the document ID
#
# Returns "$lib_arch_dir/lit<ID>" if that directory exists. Returns
# nothing (undef in scalar context) if the entry has no ID or the
# directory is missing; the caller reports the error.
sub find_arch_dir {
    my ($input_node) = @_;

    # clean the ID with sstrip exactly like find_permanent_dir does, so
    # that both modes derive the same directory name from the same entry
    my $bib_id = sstrip($input_node->findvalue("fm:$arch_id_field"));
    return if (! $bib_id);

    my $dir = "$lib_arch_dir/lit$bib_id";
    #logger('DEBUG', "bibdir: $dir");
    if (-d $dir) {
        logger('DEBUG', "directory $dir exists");
        return $dir;
    }
    return;
}
242:
# Build the online/permanent directory name for a FileMaker entry.
#
# $input_node  node of one entry; its child "fm:$arch_id_field" holds
#              the document ID
#
# Returns "/mpiwg/online/permanent/lit<ID>" without checking that the
# directory exists. If the entry has no ID an error is logged, $errcnt
# is incremented and nothing is returned.
#
# NOTE(review): the base directory is hard-coded here and differs from
# the file-level $lib_online_dir -- confirm which one is intended.
sub find_permanent_dir {
    my ($input_node) = @_;
    my $online_base = '/mpiwg/online/permanent';

    my $dest_id = sstrip($input_node->findvalue("fm:$arch_id_field"));
    if ($dest_id) {
        return "$online_base/lit$dest_id";
    }

    logger('ERROR', "no ID field for online permanent entry");
    $errcnt++;
    return;
}
255:
256:
# Convert the bibliographic fields of one FileMaker entry into
# index.meta elements below $index_root.
#
# $input_node  node of the entry; its children are the fields
# $index_root  root element of the index.meta document being built
# $index_doc   the index.meta document (not used here)
#
# Returns the number of converted fields, or 0 if the entry has an
# unknown language or an unknown bib type (an error is logged and
# $errcnt is incremented in that case). Constant fields (keys starting
# with '#') are written but not counted.
sub convert_bib {
    my ($input_node, $index_root, $index_doc) = @_;
    my $converted = 0;
    my $type = "";
    my $type_path = "";

    # first pass: general fields and the type switch
    for my $node ($input_node->getChildNodes()) {
        my $field = $node->nodeName();
        my $text = $node->textContent();
        #logger('DEBUG', " NODE: $field = '$text'");

        if (exists $gen_map{$field}) {
            # is a general field
            if ($field eq $lang_field) {
                # language field: an empty one is only worth a warning
                if (! $text) {
                    logger('WARNING', "no language tag");
                    $warncnt++;
                    next;
                }
                # convert to iso code; give up on unknown languages
                if (! exists $lang_map{$text}) {
                    logger('ERROR', "unknown language: $text! skipping...");
                    $errcnt++;
                    return 0;
                }
                $text = $lang_map{$text};
            }
            my $target = create_element_path($gen_map{$field}, $index_root, $namespace);
            $target->appendTextNode($text);
            $converted++;
            next;
        }

        next if (! exists $type_map{$field});

        # is a type field: remember it and check with known types
        $type_path = $type_map{$field};
        $type = $text;
        if (! exists $subtype_map{$text}) {
            logger('ERROR', "unknown bib type $text! skipping...");
            $errcnt++;
            return 0;
        }
        my $indextype = $subtype_map{$text}->{'_name'};
        create_element_path("$type_path=$indextype", $index_root, $namespace);
        $converted++;
    }

    # nothing more to do without a type
    return $converted if (! $type);

    # second pass: fields of the selected sub type
    my $fields = $subtype_map{$type};
    for my $node ($input_node->getChildNodes()) {
        my $field = $node->nodeName();
        next if (! exists $fields->{$field});
        my $target = create_element_path($fields->{$field}, $index_root, $namespace);
        $target->appendTextNode($node->textContent());
        $converted++;
    }

    # append additional constant fields (beginning with #)
    for my $key (keys %$fields) {
        next unless (my ($constant) = $key =~ /^\#(.*)/);
        create_text_path($fields->{$key}, $constant, $index_root, $namespace);
    }

    return $converted;
}
327:
328:
329:
# Process every entry (fm:ROW child) of the FileMaker dump.
#
# $input_root  element whose fm:ROW children are the entries
#
# Each entry is announced in the log with its zero-based position and
# handed to process_fm_entry.
sub process_all_fm_entries {
    my ($input_root) = @_;

    my @rows = $input_root->findnodes('fm:ROW');
    for my $idx (0 .. $#rows) {
        logger('INFO', "processing entry $idx ...");
        process_fm_entry($rows[$idx]);
    }
}
340:
341:
# Create the index.meta file for one FileMaker entry.
#
# $input_node  node of the entry
#
# Determines the document directory (archive directory only in pure
# archive mode, online/permanent directory otherwise), fills a new
# index.meta document with the standard fields and the converted
# bibliographic fields and writes it to "<dir>/index.meta". In dry-run
# mode the document is only logged. An existing index.meta is left
# alone unless replacing was requested. Errors are logged and counted
# in $errcnt.
sub process_fm_entry {
    my ($input_node) = @_;

    # empty index.meta document with its <resource> root element
    my $index_doc = XML::LibXML::Document->createDocument('1.0', 'UTF-8');
    my $index_root = $index_doc->createElementNS($namespace, 'resource');
    for my $attr (['version', '1.1'], ['type', 'MPIWG']) {
        $index_root->addChild($index_doc->createAttributeNS($namespace, @$attr));
    }
    $index_doc->setDocumentElement($index_root);

    # try to find the document directory; online mode wins over archive
    # mode and is also the default
    my $doc_dir = ($archive_mode && ! $online_mode)
        ? find_arch_dir($input_node)
        : find_permanent_dir($input_node);
    if (! $doc_dir) {
        logger('ERROR', "document directory not found! skipping...");
        $errcnt++;
        return;
    }

    # keep an existing index.meta unless we are told to replace it
    if (-f "$doc_dir/index.meta" and not $do_replace) {
        logger('DEBUG', "index file in $doc_dir exists");
        return;
    }

    # standard stuff for index.meta, as [path, text] pairs in output order
    my ($docname) = split_file_path($doc_dir);
    my @fields = (
        # name and date
        ['name', $docname],
        ['archive-path', $doc_dir],
        ['archive-creation-date', stime(time)],
        ['creator', 'vlp'],
        ['description', 'a scanned document'],
    );
    if ($archive_mode) {
        push @fields, (
            # acquisition
            ['meta/acquisition/date', stime(time)],
            ['meta/acquisition/provider/provider-id', 'vlp'],
            ['meta/acquisition/provider/address', 'Max Planck Institute for the History of Science'],
            # image acquisition
            ['meta/image-acquisition/device', 'Flatbed Scanner'],
            ['meta/image-acquisition/image-type', 'Greyscale'],
            ['meta/image-acquisition/production-comment', 'Raw scans in \'raw\' folder, cleaned pages in \'pages\' folder.'],
        );
    }
    push @fields, (
        # media
        ['media-type', 'image'],
        ['meta/content-type', 'scanned document'],
    );
    for my $field (@fields) {
        create_text_path(@$field, $index_root, $namespace);
    }

    # convert bib entries
    my $cnt = convert_bib($input_node, $index_root, $index_doc);
    if ($cnt == 0) {
        # error or nothing to convert
        logger('ERROR', "no bibliographic metadata!");
        $errcnt++;
        return;
    }

    # write new index.meta file (or just show it in dry-run mode)
    my $index_file = "$doc_dir/index.meta";
    if ($dry_run) {
        logger('DEBUG', "would write $index_file");
        logger('DEBUG', $index_doc->toString(1));
    } else {
        write_xml($index_doc, $index_file);
    }

}
413:
414:
415:
416:
417:
#######################################################
# Main
#

# load filemaker xml dump
my ($input_doc, $input_root) = read_xml($infile);

# bind the root element's namespace to the prefix 'fm', which the
# XPath expressions in the subroutines rely on
my $fm_namespace = $input_root->namespaceURI();
$input_root->setNamespace($fm_namespace, 'fm', 1);


# convert all entries of the dump
process_all_fm_entries($input_root);


# summary; any error makes the whole run fail
for my $summary ([$warncnt, 'warnings'], [$errcnt, 'errors']) {
    logger("INFO", "$summary->[0] $summary->[1]");
}
if ($errcnt > 0) {
    logger("ABORT", "there were errors!");
    exit 1;
}
logger("DONE", "done something successfully!");
440:
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>