#!/usr/local/bin/perl -w

use strict;
use XML::LibXML;

use lib '/usr/local/mpiwg/archive';
use MPIWGStor;

# make output unbuffered
$|=1;

#######################################################
# internal parameters
#

# program version
my $version = "0.1.0 (24.5.2005)";
logger("INFO", "makemeta-lib $version");

#
# mappings
#

# generic mappings at top level
# (input field name => element path below the index.meta root)
my %gen_map = (
    'Device'             => 'meta/image-acquisition/device',
    'Image_Type'         => 'meta/image-acquisition/image-type',
    'Production_Comment' => 'meta/image-acquisition/production-comment',
    'Postproduction'     => 'meta/image-acquisition/production-comment',
    'Language'           => 'meta/lang'
);

# sub type switch tag
# (the value of this input field selects one of the %subtype_map entries)
my %type_map = (
    'Reference_Type' => 'meta/bib@type'
);

# sub type mappings
# (per reference type: '_name' is the value written to the type attribute,
#  all other keys map input field names to element paths)
my %subtype_map = (
    'Book' => {
        '_name'           => 'book',
        'Author'          => 'meta/bib/author',
        'Title'           => 'meta/bib/title',
        'Year'            => 'meta/bib/year',
        'Place_Published' => 'meta/bib/city',
        'Publisher'       => 'meta/bib/publisher',
        'Edition'         => 'meta/bib/edition'
    },
    'Journal Article' => {
        '_name'           => 'journal-article',
        'Author'          => 'meta/bib/author',
        'Title'           => 'meta/bib/title',
        'Year'            => 'meta/bib/year',
        'Secondary_Title' => 'meta/bib/journal',
        'Volume'          => 'meta/bib/volume',
        'Number'          => 'meta/bib/issue',
        'Pages'           => 'meta/bib/pages'
    },
    'In Book' => {
        '_name'           => 'inbook',
        'Author'          => 'meta/bib/author',
        'Title'           => 'meta/bib/title',
        'Year'            => 'meta/bib/year',
        'Secondary_Title' => 'meta/bib/book-title',
        'Pages'           => 'meta/bib/pages'
    },
    'Newspaper Article' => {
        '_name'           => 'newspaper-article',
        'Author'          => 'meta/bib/author',
        'Title'           => 'meta/bib/title',
        'Year'            => 'meta/bib/year',
        'Secondary_Title' => 'meta/bib/newspaper',
        'Place_Published' => 'meta/bib/city',
        'Number'          => 'meta/bib/issue-date',
        'Pages'           => 'meta/bib/pages'
    },
    'Edited Book' => {
        '_name'           => 'edited-book',
        'Author'          => 'meta/bib/editor',
        'Title'           => 'meta/bib/title',
        'Year'            => 'meta/bib/year',
        'Place_Published' => 'meta/bib/city',
        'Publisher'       => 'meta/bib/publisher',
        'Edition'         => 'meta/bib/edition'
    },
    'Manuscript' => {
        '_name'           => 'manuscript',
        'Author'          => 'meta/bib/author',
        'Title'           => 'meta/bib/title',
        'Year'            => 'meta/bib/year',
        'Place_Published' => 'meta/bib/location',
    }
);

# language element
my $lang_field = 'Language';

# languages to iso codes
my %lang_map = (
    'German'  => 'de',
    'English' => 'en',
    'Italian' => 'it',
    'French'  => 'fr',
    'Latin'   => 'la'
);

# storage fields
my $arch_id_field = 'ID_Archive';
my $online_url_field = 'URL';

# more storage
my $lib_arch_dir = '/mpiwg/archive/data/library';
my $lib_online_dir = '/mpiwg/online/permanent';

# read command line parameters
my $args = MPIWGStor::parseargs();

# debug level
# NOTE(review): $debug is not declared in this file; under "use strict" it
# must come from elsewhere, presumably exported by MPIWGStor -- confirm.
$debug = (exists $$args{'debug'}) ? $$args{'debug'} : 0;

# use einstein-cw mode
my $cw_mode = (exists $$args{'cw-mode'}) ? $$args{'cw-mode'} : 0;

# index.meta namespace (not really implemented!)
my $namespace = "";

my $xml_changed = 0;
my $errcnt = 0;
my $warncnt = 0;

#######################################################
# check parameters that were passed to the program
#
my $infile = $$args{'path'};
if (! $infile) {
    logger("ABORT", "no input file given!");
    exit 1;
}
# strip double slashes (every run of slashes, not just the first one)
$infile =~ s{/{2,}}{/}g;
if (! -f $infile) {
    logger("ABORT", "input file \'$infile\' doesn't exist!");
    exit 1;
}


#######################################################
# subroutines
#

#
# find_cw_dir($input_node)
#
# Einstein-cw mode: locates the source directory of the entry below the
# inbox backup directory (via find_online_dir) and renames it to
# "$lib_arch_dir/<archive id>".
#
# Returns the new directory name if the rename succeeded and the directory
# exists. Returns nothing (and counts an error) if the entry has no archive
# id or no source directory could be found. Aborts the program if the
# rename itself fails.
#
sub find_cw_dir {
    my ($input_node) = @_;

    my $src_dir = find_online_dir($input_node, '/mpiwg/archive/data/library/inbox/zwischen_backup');
    my $dest_id = $input_node->findvalue("fm:$arch_id_field");
    if (! $dest_id) {
        logger('ERROR', "no ID field for einstein-cw entry");
        $errcnt++;
        return;
    }
    # find_online_dir returns nothing when the URL does not match or the
    # directory is missing; do not pass an undefined name on to rename
    if (! $src_dir) {
        logger('ERROR', "no source directory found for einstein-cw entry $dest_id");
        $errcnt++;
        return;
    }

    my $dir = "$lib_arch_dir/$dest_id";
    logger('DEBUG', "moving $src_dir to $dir");
    if (rename $src_dir, $dir) {
        if (-d $dir) {
            logger('DEBUG', "directory $dir OK");
            return $dir;
        }
    } else {
        logger('ABORT', "unable to rename directory $src_dir to $dir!");
        exit 1;
    }
    return;
}

#
# find_online_dir($input_node, $base_dir)
#
# Extracts the directory part between "fn=permanent/" and "/pageimg" from
# the entry's URL field and looks for it below $base_dir (default:
# $lib_online_dir).
#
# Returns the directory name if it exists, nothing otherwise.
#
sub find_online_dir {
    my ($input_node, $base_dir) = @_;
    $base_dir = $lib_online_dir unless ($base_dir);

    my $online_url = $input_node->findvalue("fm:$online_url_field");
    if ($online_url =~ /fn=permanent\/(.+)\/pageimg/) {
        my $online_dir = $1;
        #logger("DEBUG", "dir: $base_dir/$online_dir");
        my $dir = "$base_dir/$online_dir";
        if (-d $dir) {
            logger('DEBUG', "directory $dir exists");
            return $dir;
        }
    }
    return;
}

#
# find_arch_dir($input_node)
#
# Looks for the directory "$lib_arch_dir/<archive id>" where the archive id
# is the value of the entry's archive id field.
#
# Returns the directory name if it exists, nothing otherwise.
#
sub find_arch_dir {
    my ($input_node) = @_;
    my $dir = "";

    my $bib_dir = $input_node->findvalue("fm:$arch_id_field");
    #logger('DEBUG', "bibdir: $bib_dir");
    if ($bib_dir) {
        $dir = "$lib_arch_dir/$bib_dir";
        if (-d $dir) {
            logger('DEBUG', "directory $dir exists");
            return $dir;
        }
    }
    return;
}

#
# convert_bib($input_node, $index_root, $index_doc)
#
# Converts the child nodes of $input_node into index.meta elements below
# $index_root. The first pass handles the general fields (%gen_map) and the
# type switch field (%type_map); the second pass handles the fields of the
# selected reference type (%subtype_map).
#
# Returns the number of converted fields, or 0 (after counting an error) if
# the language or the reference type is unknown.
#
sub convert_bib {
    my ($input_node, $index_root, $index_doc) = @_;
    my $cnt = 0;
    my $type = "";
    my $type_path = "";

    # process general stuff first
    foreach my $n ($input_node->getChildNodes()) {
        my $name = $n->nodeName();
        my $val = $n->textContent();
        #logger('DEBUG', " NODE: $name = '$val'");
        if (exists $gen_map{$name}) {
            # is a general field
            if ($name eq $lang_field) {
                # language field -> convert to iso code
                if (exists $lang_map{$val}) {
                    $val = $lang_map{$val};
                } else {
                    logger('ERROR', "unknown language: $val! skipping...");
                    $errcnt++;
                    return 0;
                }
            }
            create_element_path($gen_map{$name}, $index_root, $namespace)
                ->appendTextNode($val);
            $cnt++;
        } elsif (exists $type_map{$name}) {
            # is a type field
            $type_path = $type_map{$name};
            $type = $val;
            # check with known types
            if (exists $subtype_map{$val}) {
                my $indextype = $subtype_map{$val}->{'_name'};
                create_element_path("$type_path=$indextype", $index_root, $namespace);
                $cnt++;
            } else {
                # double quotes so that the offending type really is reported
                logger('ERROR', "unknown bib type $val! skipping...");
                $errcnt++;
                return 0;
            }
        }
    }

    # process sub type fields
    if ($type) {
        foreach my $n ($input_node->getChildNodes()) {
            my $name = $n->nodeName();
            my $val = $n->textContent();
            #logger('DEBUG', " NODE: $name = '$val'");
            if (exists $subtype_map{$type}->{$name}) {
                create_element_path($subtype_map{$type}->{$name}, $index_root, $namespace)
                    ->appendTextNode($val);
                $cnt++;
            }
        }
    }
    return $cnt;
}

#
# process_all_fm_entries($input_root)
#
# Calls process_fm_entry for every fm:ROW node below $input_root and logs
# the (zero-based) number of each entry.
#
sub process_all_fm_entries {
    my ($input_root) = @_;
    my $cnt = 0;

    foreach my $n ($input_root->findnodes('fm:ROW')) {
        logger('INFO', "processing entry $cnt ...");
        process_fm_entry($n);
        # advance the counter so that each entry is logged with its own number
        $cnt++;
    }
}

#
# process_fm_entry($input_node)
#
# Creates a new index.meta document for one entry: finds the document
# directory (find_cw_dir in einstein-cw mode, find_arch_dir otherwise), adds
# the standard elements, converts the bibliographic fields and writes the
# result to "<document directory>/index.meta".
#
# Counts an error and returns without writing anything if the document
# directory is not found or no bibliographic field could be converted.
#
sub process_fm_entry {
    my ($input_node) = @_;
    my $index_doc = XML::LibXML::Document->createDocument('1.0', 'UTF-8');
    my $index_root = $index_doc->createElementNS($namespace, 'resource');
    $index_root->addChild($index_doc->createAttributeNS($namespace, 'version', '1.1'));
    $index_root->addChild($index_doc->createAttributeNS($namespace, 'type', 'MPIWG'));
    $index_doc->setDocumentElement($index_root);

    # try to find the document directory
    my $doc_dir = "";
    if ($cw_mode) {
        $doc_dir = find_cw_dir($input_node);
    } else {
        $doc_dir = find_arch_dir($input_node);
    }
    if (! $doc_dir) {
        logger('ERROR', "document directory not found! skipping...");
        $errcnt++;
        return;
    }

    # add standard stuff to index.meta
    my ($docname, $docpath) = split_file_path($doc_dir);
    # name and date
    create_text_path('name', $docname, $index_root, $namespace);
    create_text_path('archive-path', $doc_dir, $index_root, $namespace);
    create_text_path('archive-creation-date', stime(time), $index_root, $namespace);
    create_text_path('creator', 'digigroup', $index_root, $namespace);
    create_text_path('description', 'a scanned document', $index_root, $namespace);
    # acquisition
    create_text_path('meta/acquisition/date', stime(time), $index_root, $namespace);
    create_text_path('meta/acquisition/provider/provider-id', 'digigroup', $index_root, $namespace);
    create_text_path('meta/acquisition/provider/address', 'Max Planck Institute for the History of Science', $index_root, $namespace);
    # media
    create_text_path('media-type', 'image', $index_root, $namespace);
    create_text_path('meta/content-type', 'scanned document', $index_root, $namespace);

    # convert bib entries
    my $cnt = convert_bib($input_node, $index_root, $index_doc);
    if ($cnt == 0) {
        # error or nothing to convert
        logger('ERROR', "no bibliographic metadata!");
        $errcnt++;
        return;
    }

    # write new index.meta file
    write_xml($index_doc, "$doc_dir/index.meta");
}


#######################################################
# Main
#

# load filemaker xml dump
my ($input_doc, $input_root) = read_xml($infile);
# set namespace prefix
# (the XPath expressions above address the input elements as fm:...)
my $fm_namespace = $input_root->namespaceURI();
$input_root->setNamespace($fm_namespace, 'fm', 1);

process_all_fm_entries($input_root);

logger("INFO", "$warncnt warnings");
logger("INFO", "$errcnt errors");
if ($errcnt > 0) {
    logger("ABORT", "there were errors!");
    exit 1;
} else {
    logger("DONE", "done something successfully!");
}