21
|
1 #!/usr/local/bin/perl -w
|
|
2
|
|
3 use strict;
|
|
4 use XML::LibXML;
|
|
5
|
|
6 use lib '/usr/local/mpiwg/archive_devel';
|
|
7 use MPIWGStor;
|
|
8
|
|
# make output unbuffered so that log lines appear immediately
$| = 1;

# program version (shown in the start-up log line)
my $version = "0.2 (19.9.2005 ROC)";

# usage text; printed when the program is started without any arguments
my $help =
"use: makemeta-vlp [options] file.xml
options:
-debug show debugging info
-dry-run simulate, don't do anything
-replace replace existing index files
-online-mode mode for creating online/permanent files
-archive-mode mode for creating archive/data files
";

# announce ourselves before anything else is logged
logger("INFO", "makemeta-vlp $version");
|
|
24
|
|
###########################################
# mappings
#
# Every table maps an element name found in a row of the input dump to
# the element path that is created in the generated index.meta.

# fields that are converted the same way for every reference type
my %gen_map = (
    Custom2_Language => 'meta/lang',
);

# field whose value selects the reference type; the target path names
# the attribute that receives the type's '_name'
my %type_map = (
    ReferenceType => 'meta/bib@type',
);

# per-type field tables, keyed by the value of the type field.
# '_name' is the type name written to the index file, all other keys are
# input element names.
my %subtype_map = (
    'Book' => {
        _name           => 'book',
        Author          => 'meta/bib/author',
        Title           => 'meta/bib/title',
        Year            => 'meta/bib/year',
        Place_Published => 'meta/bib/city',
        Publisher       => 'meta/bib/publisher',
        Edition         => 'meta/bib/edition',
        Volume          => 'meta/bib/volume',
        NumberOfVolumes => 'meta/bib/number-of-volumes',
        Pages           => 'meta/bib/number-of-pages',
    },
    'Book Section' => {
        _name           => 'inbook',
        Author          => 'meta/bib/author',
        Title           => 'meta/bib/title',
        Year            => 'meta/bib/year',
        Secondary_Title => 'meta/bib/book-title',
        SecondaryAuthor => 'meta/bib/editor',
        Volume          => 'meta/bib/volume',
        NumberOfVolumes => 'meta/bib/number-of-volumes',
        Pages           => 'meta/bib/pages',
    },
    'Edited Book' => {
        _name           => 'edited-book',
        Author          => 'meta/bib/editor',
        Title           => 'meta/bib/title',
        Year            => 'meta/bib/year',
        Place_Published => 'meta/bib/city',
        Publisher       => 'meta/bib/publisher',
        Edition         => 'meta/bib/edition',
        Volume          => 'meta/bib/volume',
        NumberOfVolumes => 'meta/bib/number-of-volumes',
        Pages           => 'meta/bib/number-of-pages',
    },
    'Journal Article' => {
        _name          => 'journal-article',
        Author         => 'meta/bib/author',
        Title          => 'meta/bib/title',
        Year           => 'meta/bib/year',
        SecondaryTitle => 'meta/bib/journal',
        Volume         => 'meta/bib/volume',
        Number_Issue   => 'meta/bib/issue',
        Pages          => 'meta/bib/pages',
    },
    'Magazine Article' => {
        _name           => 'magazine-article',
        Author          => 'meta/bib/author',
        Title           => 'meta/bib/title',
        Year            => 'meta/bib/year',
        Secondary_Title => 'meta/bib/magazine',
        Number_Issue    => 'meta/bib/issue-number',
        Date            => 'meta/bib/issue-date',
        Pages           => 'meta/bib/pages',
    },
    'Report' => {
        _name           => 'report',
        Author          => 'meta/bib/author',
        Title           => 'meta/bib/title',
        Year            => 'meta/bib/year',
        Place_Published => 'meta/bib/city',
        Date            => 'meta/bib/date',
        SecondaryTitle  => 'meta/bib/type',
        Pages           => 'meta/bib/pages',
    },
    # NOTE(review): shares the '_name' of 'Report' -- confirm this is intended
    'Trade Catalogue' => {
        _name           => 'report',
        Author          => 'meta/bib/author',
        Title           => 'meta/bib/title',
        Year            => 'meta/bib/year',
        Place_Published => 'meta/bib/city',
        Date            => 'meta/bib/date',
        Volume          => 'meta/bib/volume',
        NumberOfVolumes => 'meta/bib/number-of-volumes',
        ReferenceType   => 'meta/bib/type',
        Pages           => 'meta/bib/pages',
    },
    'Thesis' => {
        _name           => 'thesis',
        Author          => 'meta/bib/author',
        Title           => 'meta/bib/title',
        Place_Published => 'meta/bib/city',
        Publisher       => 'meta/bib/university',
        Date            => 'meta/bib/date',
        TypeOfWork      => 'meta/bib/type',
        Pages           => 'meta/bib/number-of-pages',
    },
    'Manuscript' => {
        _name           => 'manuscript',
        Author          => 'meta/bib/author',
        Title           => 'meta/bib/title',
        Year            => 'meta/bib/year',
        Place_Published => 'meta/bib/location',
        Pages           => 'meta/bib/pages',
    },
);

# name of the input element that holds the language
my $lang_field = 'Custom2_Language';

# language names as they appear in the input => ISO codes
my %lang_map = (
    German   => 'de',
    English  => 'en',
    Italian  => 'it',
    French   => 'fr',
    Latin    => 'la',
    Japanese => 'ja',
    Dutch    => 'nl',
    Spanish  => 'es',
    Swedish  => 'sv',
);

# name of the input element that holds the document ID
my $arch_id_field = 'ID';
|
|
150
|
|
#######################################################
# internal parameters
#

# storage
my $lib_arch_dir = '/mpiwg/archive/data/vlp';
# NOTE(review): $lib_online_dir is not referenced by any code visible in
# this file; find_permanent_dir uses its own base path -- confirm.
my $lib_online_dir = '/mpiwg/online/permanent/vlp';

# read command line parameters; show the usage text if none were given
my $args = MPIWGStor::parseargs;
if (! scalar(%$args)) {
    print $help, "\n";
    exit 1;
}

# debug level
# NOTE(review): $debug is not declared in this file; presumably it is a
# variable exported by MPIWGStor -- confirm.
$debug = (exists $$args{'debug'}) ? $$args{'debug'} : 0;

# simulate action only
my $dry_run = (exists $$args{'dry-run'}) ? $$args{'dry-run'} : 0;
logger('DEBUG', "dry-run: $dry_run");

# replace existing index files
my $do_replace = (exists $$args{'replace'}) ? $$args{'replace'} : 0;
logger('DEBUG', "replace: $do_replace");

# use online mode
my $online_mode = (exists $$args{'online-mode'}) ? $$args{'online-mode'} : 0;
logger('DEBUG', "online_mode: $online_mode");

# use archive mode
my $archive_mode = (exists $$args{'archive-mode'}) ? $$args{'archive-mode'} : 0;
logger('DEBUG', "archive_mode: $archive_mode");

# index.meta namespace (not really implemented!)
my $namespace = "";

# global state shared by the subroutines below
my $xml_changed = 0;
my $errcnt = 0;
my $warncnt = 0;

#######################################################
# check parameters that were passed to the program
#
my $infile = $$args{'path'};
if (! $infile) {
    logger("ABORT", "no input file given!");
    exit 1;
}
# strip double slashes: collapse every run of slashes, not just the first
$infile =~ s{/{2,}}{/}g;
if (! -f $infile) {
    logger("ABORT", "input file \'$infile\' doesn't exist!");
    exit 1;
}
|
|
207
|
|
208
|
|
209 #######################################################
|
|
210 # subroutines
|
|
211 #
|
|
212
|
|
213
|
|
#
# find_arch_dir($input_node)
#
# Looks up the archive directory for one input row.
#
# The value of the ID element (fm:$arch_id_field) of $input_node is
# appended to "$lib_arch_dir/lit".  Returns that path if it exists as a
# directory; returns nothing (undef in scalar context) if the row has no
# ID or the directory does not exist.
#
sub find_arch_dir {
    my ($input_node) = @_;

    # run the ID through sstrip, exactly as find_permanent_dir does for
    # the same field, so both lookups build the path from the same value
    my $bib_id = sstrip($input_node->findvalue("fm:$arch_id_field"));
    if (! $bib_id) {
        return;
    }

    my $dir = "$lib_arch_dir/lit$bib_id";
    if (-d $dir) {
        logger('DEBUG', "directory $dir exists");
        return $dir;
    }
    return;
}
|
|
229
|
|
#
# find_permanent_dir($input_node)
#
# Builds the online/permanent directory name for one input row from the
# stripped value of its ID element (fm:$arch_id_field).
#
# Returns the path (without checking that it exists).  If the row has no
# ID an error is logged, $errcnt is incremented and nothing is returned.
#
sub find_permanent_dir {
    my ($input_node) = @_;

    my $id = sstrip($input_node->findvalue("fm:$arch_id_field"));
    unless ($id) {
        logger('ERROR', "no ID field for online permanent entry");
        $errcnt++;
        return;
    }

    # NOTE(review): this base has no '/vlp' component, unlike the
    # file-level $lib_online_dir -- confirm which one is intended.
    my $base = '/mpiwg/online/permanent';
    return join('/', $base, "lit$id");
}
|
|
242
|
|
243
|
|
#
# convert_bib($input_node, $index_root, $index_doc)
#
# Copies the bibliographic fields of one input row into the index.meta
# tree below $index_root.  ($index_doc is accepted but not used.)
#
# Pass 1 handles the generic fields (%gen_map) and the type field
# (%type_map); pass 2 handles the fields of the reference type found in
# pass 1 (%subtype_map).
#
# Returns the number of elements created.  Returns 0 -- after logging an
# error and incrementing $errcnt -- on an unknown language or an unknown
# reference type.
#
sub convert_bib {
    my ($input_node, $index_root, $index_doc) = @_;
    my $converted = 0;
    my $bib_type = "";

    # pass 1: generic fields and the type switch
    NODE:
    for my $child ($input_node->getChildNodes()) {
        my $field = $child->nodeName();
        my $text = $child->textContent();

        if (exists $gen_map{$field}) {
            if ($field eq $lang_field) {
                # the language is written as ISO code
                unless ($text) {
                    logger('WARNING', "no language tag");
                    $warncnt++;
                    next NODE;
                }
                unless (exists $lang_map{$text}) {
                    logger('ERROR', "unknown language: $text! skipping...");
                    $errcnt++;
                    return 0;
                }
                $text = $lang_map{$text};
            }
            my $elem = create_element_path($gen_map{$field}, $index_root, $namespace);
            $elem->appendTextNode($text);
            $converted++;
            next NODE;
        }

        next NODE unless exists $type_map{$field};

        # type field: remember the type and write its index name
        $bib_type = $text;
        unless (exists $subtype_map{$text}) {
            logger('ERROR', "unknown bib type $text! skipping...");
            $errcnt++;
            return 0;
        }
        my $type_spec = $type_map{$field} . '=' . $subtype_map{$text}->{'_name'};
        create_element_path($type_spec, $index_root, $namespace);
        $converted++;
    }

    # without a type there are no type specific fields to convert
    return $converted unless $bib_type;

    # pass 2: fields of the selected type (the type is known to be in
    # %subtype_map, otherwise pass 1 would have returned)
    my $fields = $subtype_map{$bib_type};
    for my $child ($input_node->getChildNodes()) {
        my $field = $child->nodeName();
        next unless exists $fields->{$field};
        my $elem = create_element_path($fields->{$field}, $index_root, $namespace);
        $elem->appendTextNode($child->textContent());
        $converted++;
    }
    return $converted;
}
|
|
307
|
|
308
|
|
309
|
|
#
# process_all_fm_entries($input_root)
#
# Calls process_fm_entry for every fm:ROW element found with
# findnodes('fm:ROW') on $input_root, logging a (zero-based) running
# number for each one.
#
# Returns the number of rows that were handed to process_fm_entry.
#
sub process_all_fm_entries {
    my ($input_root) = @_;
    my $cnt = 0;

    foreach my $n ($input_root->findnodes('fm:ROW')) {
        logger('INFO', "processing entry $cnt ...");
        process_fm_entry($n);
        $cnt++;
    }
    # return the count explicitly instead of the implicit value of the loop
    return $cnt;
}
|
|
320
|
|
321
|
|
#
# process_fm_entry($input_node)
#
# Creates the index.meta document for one input row and writes it to
# "<document directory>/index.meta".
#
# The document directory comes from find_arch_dir when $archive_mode is
# set (and $online_mode is not), otherwise from find_permanent_dir.
# An existing index.meta is only overwritten when $do_replace is set.
# With $dry_run the document is logged instead of written.
#
# Returns nothing.  Logs an error and increments $errcnt if the directory
# cannot be determined or convert_bib reports no converted fields.
#
sub process_fm_entry {
    my ($input_node) = @_;
    my $index_doc = XML::LibXML::Document->createDocument('1.0', 'UTF-8');
    my $index_root = $index_doc->createElementNS($namespace, 'resource');
    $index_root->addChild($index_doc->createAttributeNS($namespace, 'version', '1.1'));
    $index_root->addChild($index_doc->createAttributeNS($namespace, 'type', 'MPIWG'));
    $index_doc->setDocumentElement($index_root);

    # try to find the document directory
    # (online mode wins over archive mode and is also the default)
    my $doc_dir = "";
    if ($archive_mode and not $online_mode) {
        $doc_dir = find_arch_dir($input_node);
    } else {
        $doc_dir = find_permanent_dir($input_node);
    }
    if (! $doc_dir) {
        logger('ERROR', "document directory not found! skipping...");
        $errcnt++;
        return;
    }

    # leave an existing index.meta alone unless replacing was requested
    if (-f "$doc_dir/index.meta" and not $do_replace) {
        logger('DEBUG', "index file in $doc_dir exists");
        return;
    }

    # one timestamp for the whole document, so that all dates written
    # below are identical
    my $now = stime(time);

    # add standard stuff to index.meta
    # (only the first value returned by split_file_path is needed)
    my ($docname) = split_file_path($doc_dir);
    # name and date
    create_text_path('name', $docname, $index_root, $namespace);
    create_text_path('archive-path', $doc_dir, $index_root, $namespace);
    create_text_path('archive-creation-date', $now, $index_root, $namespace);
    create_text_path('creator', 'vlp', $index_root, $namespace);
    create_text_path('description', 'a scanned document', $index_root, $namespace);
    if ($archive_mode) {
        # acquisition
        create_text_path('meta/acquisition/date', $now, $index_root, $namespace);
        create_text_path('meta/acquisition/provider/provider-id', 'vlp', $index_root, $namespace);
        create_text_path('meta/acquisition/provider/address', 'Max Planck Institute for the History of Science', $index_root, $namespace);
        # image acquisition
        create_text_path('meta/image-acquisition/device', 'Flatbed Scanner', $index_root, $namespace);
        create_text_path('meta/image-acquisition/image-type', 'Greyscale', $index_root, $namespace);
        create_text_path('meta/image-acquisition/production-comment', 'Raw scans in \'raw\' folder, cleaned pages in \'pages\' folder.', $index_root, $namespace);
    }
    # media
    create_text_path('media-type', 'image', $index_root, $namespace);
    create_text_path('meta/content-type', 'scanned document', $index_root, $namespace);

    # convert bib entries
    my $cnt = convert_bib($input_node, $index_root, $index_doc);
    if ($cnt == 0) {
        # error or nothing to convert
        logger('ERROR', "no bibliographic metadata!");
        $errcnt++;
        return;
    }

    # write new index.meta file
    if ($dry_run) {
        logger('DEBUG', "would write $doc_dir/index.meta");
        logger('DEBUG', $index_doc->toString(1));
    } else {
        write_xml($index_doc, "$doc_dir/index.meta");
    }
    return;
}
|
|
393
|
|
394
|
|
395
|
|
396
|
|
397
|
|
#######################################################
# Main
#

# parse the input dump given on the command line
my ($input_doc, $input_root) = read_xml($infile);

# bind the root element's namespace to the prefix 'fm', which the
# XPath expressions in the subroutines use
my $fm_namespace = $input_root->namespaceURI();
$input_root->setNamespace($fm_namespace, 'fm', 1);

# convert every row of the dump
process_all_fm_entries($input_root);

# summary; any error makes the program exit with status 1
logger("INFO", "$warncnt warnings");
logger("INFO", "$errcnt errors");
if ($errcnt > 0) {
    logger("ABORT", "there were errors!");
    exit 1;
}
logger("DONE", "done something successfully!");
|
|
420
|