Mercurial > hg > foxridge-archiver
changeset 12:620aad237f57
new tool makemeta-lib.pl to create index meta entries from the libraries online sources database
author | casties |
---|---|
date | Thu, 26 May 2005 16:31:28 +0200 |
parents | b19df18aa19a |
children | f33fb7f1898e |
files | MPIWGStor.pm makemeta-lib.pl |
diffstat | 2 files changed, 424 insertions(+), 7 deletions(-) [+] |
line wrap: on
line diff
--- a/MPIWGStor.pm Wed Mar 23 13:12:08 2005 +0100 +++ b/MPIWGStor.pm Thu May 26 16:31:28 2005 +0200 @@ -1,4 +1,3 @@ -#!/usr/local/bin/perl -w # # Library with definitions and common routines for MPIWG storage system @@ -13,14 +12,15 @@ $file_perm); use Exporter; -$VERSION = 0.4; # ROC 20.1.2005 +$VERSION = 0.6; # ROC 26.5.2005 @ISA = qw(Exporter); @EXPORT = qw($debug %junk_files %index_files $index_file_perm $archived_index_file_perm $file_perm &parseargs &logger &stime &s2stime &ymd_date &split_file_path &sstrip - &valid_file_name &valid_dir_name &park_file &unpark_file &read_xml &write_xml); + &valid_file_name &valid_dir_name &park_file &unpark_file + &read_xml &write_xml &add_text_element &create_element_path &create_text_path); # debug level $debug = 0; @@ -264,15 +264,15 @@ exit 1; } - logger("INFO", "index file: $file"); + logger("DEBUG", "index file: $file"); eval { $document = $parser->parse_file($file) }; # catch parsing errors if ($@) { - logger("ABORT", "XML syntax error in index file!!"); + logger("ABORT", "XML syntax error in file $file!!"); exit 1; } my $rootnode = $document->documentElement; - logger("INFO", "root element: \'" . $rootnode->nodeName . "\'"); + logger("DEBUG", "root element: \'" . $rootnode->nodeName . "\'"); return ($document, $rootnode); } @@ -293,7 +293,8 @@ } } # write new file - if ($document->toFile($filename)) { + my ($fn, $dir) = split_file_path($filename); + if (-d $dir && $document->toFile($filename)) { logger("INFO", "written new file $filename"); chmod $index_file_perm, $filename; } else { @@ -302,6 +303,71 @@ } } +# +# $elem = add_text_element($node, $name, $value, $namespace) +# +# creates an XML element with the name $name and the text content +# $value attached to the node $node and returns it. +# +sub add_text_element { + my ($node, $name, $value, $namespace) = @_; + + my $doc = $node->ownerDocument; + my $text = $doc->createTextNode($value); + my $newnode = $node->addNewChild($namespace, $name); + $newnode->addChild($text); + + return $newnode; +} + +# +# $elem = create_element_path($path, $root, $namespace) +# +# creates and returns a DOM element at the given path from the +# given root. path is e.g. meta/bib@type=book. elements are separated +# by /, an additional attribute can be specified after the @. +# +sub create_element_path { + my ($path, $root, $namespace) = @_; + my $attribute = ""; + # get attribute + if ($path =~ /^(.+)@(.+)$/) { + $attribute = $2; + $path = $1; + } + my $point = $root; + for my $p (split /\//, $path) { + # check if the next path element exists + my $n = ($point->findnodes($p))[0]; + if ($n) { + $point = $n; + } else { + # create if it doesn't exist + $point = $point->addNewChild($namespace, $p); + } + } + # add the attribute + if ($attribute) { + my $dom = $root->getOwner(); + my ($attkey, $attval) = split /=/, $attribute; #/ silly fontlock... + $point->addChild($dom->createAttributeNS($namespace, $attkey, $attval)); + } + return $point; +} + +# +# $elem = create_text_path($path, $text, $root, $namespace) +# +# creates and returns a DOM text element with the given content at the +# given path from the given root. +# +sub create_text_path { + my ($path, $text, $root, $namespace) = @_; + my $elem = create_element_path($path, $root, $namespace)->appendTextNode($text); + return $elem; +} + + # module init return 1;
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/makemeta-lib.pl Thu May 26 16:31:28 2005 +0200 @@ -0,0 +1,351 @@ +#!/usr/local/bin/perl -w + +use strict; +use XML::LibXML; + +use lib '/usr/local/mpiwg/archive'; +use MPIWGStor; + +# make output unbuffered +$|=1; + +####################################################### +# internal parameters +# + +# program version +my $version = "0.1.0 (24.5.2005)"; +logger("INFO", "makemeta-lib $version"); + +# +# mappings +# +# generic mappings at top level +my %gen_map = ( + 'Device' => 'meta/image-acquisition/device', + 'Image_Type' => 'meta/image-acquisition/image-type', + 'Production_Comment' => 'meta/image-acquisition/production-comment', + 'Postproduction' => 'meta/image-acquisition/production-comment', + 'Language' => 'meta/lang' + ); +# sub type switch tag +my %type_map = ( + 'Reference_Type' => 'meta/bib@type' + ); +# sub type mappings +my %subtype_map = ( + 'Book' => { + '_name' => 'book', + 'Author' => 'meta/bib/author', + 'Title' => 'meta/bib/title', + 'Year' => 'meta/bib/year', + 'Place_Published' => 'meta/bib/city', + 'Publisher' => 'meta/bib/publisher', + 'Edition' => 'meta/bib/edition' + }, + 'Journal Article' => { + '_name' => 'journal-article', + 'Author' => 'meta/bib/author', + 'Title' => 'meta/bib/title', + 'Year' => 'meta/bib/year', + 'Secondary_Title' => 'meta/bib/journal', + 'Volume' => 'meta/bib/volume', + 'Number' => 'meta/bib/issue', + 'Pages' => 'meta/bib/pages' + }, + 'In Book' => { + '_name' => 'inbook', + 'Author' => 'meta/bib/author', + 'Title' => 'meta/bib/title', + 'Year' => 'meta/bib/year', + 'Secondary_Title' => 'meta/bib/book-title', + 'Pages' => 'meta/bib/pages' + }, + 'Newspaper Article' => { + '_name' => 'newspaper-article', + 'Author' => 'meta/bib/author', + 'Title' => 'meta/bib/title', + 'Year' => 'meta/bib/year', + 'Secondary_Title' => 'meta/bib/newspaper', + 'Place_Published' => 'meta/bib/city', + 'Number' => 'meta/bib/issue-date', + 'Pages' => 'meta/bib/pages' + }, + 'Edited Book' => { + '_name' => 'edited-book', + 'Author' => 'meta/bib/editor', + 'Title' => 'meta/bib/title', + 'Year' => 'meta/bib/year', + 'Place_Published' => 'meta/bib/city', + 'Publisher' => 'meta/bib/publisher', + 'Edition' => 'meta/bib/edition' + }, + 'Manuscript' => { + '_name' => 'manuscript', + 'Author' => 'meta/bib/author', + 'Title' => 'meta/bib/title', + 'Year' => 'meta/bib/year', + 'Place_Published' => 'meta/bib/location', + } + ); +# language element +my $lang_field = 'Language'; +# languages to iso codes +my %lang_map = ( + 'German' => 'de', + 'English' => 'en', + 'Italian' => 'it', + 'French' => 'fr', + 'Latin' => 'la' + ); +# storage fields +my $arch_id_field = 'ID_Archive'; +my $online_url_field = 'URL'; + +# more storage +my $lib_arch_dir = '/mpiwg/archive/data/library'; +my $lib_online_dir = '/mpiwg/online/permanent'; + + +# read command line parameters +my $args = MPIWGStor::parseargs; + +# debug level +$debug = (exists $$args{'debug'}) ? $$args{'debug'} : 0; + +# use einstein-cw mode +my $cw_mode = (exists $$args{'cw-mode'}) ? $$args{'cw-mode'} : 0; + +# index.meta namespace (not really implemented!) +my $namespace = ""; + + +my $xml_changed = 0; +my $errcnt = 0; +my $warncnt = 0; + +####################################################### +# check parameters that were passed to the program +# +my $infile = $$args{'path'}; +if (! $infile) { + logger("ABORT", "no input file given!"); + exit 1; +} +# strip double slashes +$infile =~ s/\/\//\//; +if (! -f $infile) { + logger("ABORT", "input file \'$infile\' doesn't exist!"); + exit 1; +} + + +####################################################### +# subroutines +# + +sub find_cw_dir { + my ($input_node) = @_; + my $src_dir = find_online_dir($input_node, '/mpiwg/archive/data/library/inbox/zwischen_backup'); + my $dest_id = $input_node->findvalue("fm:$arch_id_field"); + if (! $dest_id) { + logger('ERROR', "no ID field for einstein-cw entry"); + $errcnt++; + return; + } + my $dir = "$lib_arch_dir/$dest_id"; + logger('DEBUG', "moving $src_dir to $dir"); + if (rename $src_dir, $dir) { + if (-d $dir) { + logger('DEBUG', "directory $dir OK"); + return $dir; + } + } else { + logger('ABORT', "unable to rename directory $src_dir to $dir!"); + exit 1; + } + return; +} + +sub find_online_dir { + my ($input_node, $base_dir) = @_; + $base_dir = $lib_online_dir unless ($base_dir); + + my $online_url = $input_node->findvalue("fm:$online_url_field"); + if ($online_url =~ /fn=permanent\/(.+)\/pageimg/) { + my $online_dir = $1; + #logger("DEBUG", "dir: $base_dir/$online_dir"); + my $dir = "$base_dir/$online_dir"; + if (-d $dir) { + logger('DEBUG', "directory $dir exists"); + return $dir; + } + } + return; +} + +sub find_arch_dir { + my ($input_node) = @_; + my $dir = ""; + + my $bib_dir = $input_node->findvalue("fm:$arch_id_field"); + #logger('DEBUG', "bibdir: $bib_dir"); + if ($bib_dir) { + $dir = "$lib_arch_dir/$bib_dir"; + if (-d $dir) { + logger('DEBUG', "directory $dir exists"); + return $dir; + } + } + return; +} + + +sub convert_bib { + my ($input_node, $index_root, $index_doc) = @_; + my $cnt = 0; + my $type = ""; + my $type_path = ""; + + # process general stuff first + foreach my $n ($input_node->getChildNodes()) { + my $name = $n->nodeName(); + my $val = $n->textContent(); + #logger('DEBUG', " NODE: $name = '$val'"); + if (exists $gen_map{$name}) { + # is a general field + if ($name eq $lang_field) { + # language field -> convert to iso code + if (exists $lang_map{$val}) { + $val = $lang_map{$val}; + } else { + logger('ERROR', "unknown language: $val! skipping..."); + $errcnt++; + return 0; + } + } + create_element_path($gen_map{$name}, $index_root, $namespace) + ->appendTextNode($val); + $cnt++; + } elsif (exists $type_map{$name}) { + # is a type field + $type_path = $type_map{$name}; + $type = $val; + # check with known types + if (exists $subtype_map{$val}) { + my $indextype = $subtype_map{$val}->{'_name'}; + create_element_path("$type_path=$indextype", $index_root, $namespace); + $cnt++; + } else { + logger('ERROR', 'unknown bib type $val! skipping...'); + $errcnt++; + return 0; + } + } + } + # process sub type fields + if ($type) { + foreach my $n ($input_node->getChildNodes()) { + my $name = $n->nodeName(); + my $val = $n->textContent(); + #logger('DEBUG', " NODE: $name = '$val'"); + if (exists $subtype_map{$type}->{$name}) { + create_element_path($subtype_map{$type}->{$name}, $index_root, $namespace) + ->appendTextNode($val); + $cnt++; + } + } + } + return $cnt; +} + + + +sub process_all_fm_entries { + my ($input_root) = @_; + my $cnt = 0; + + foreach my $n ($input_root->findnodes('fm:ROW')) { + logger('INFO', "processing entry $cnt ..."); + process_fm_entry($n); + } +} + + +sub process_fm_entry { + my ($input_node) = @_; + my $index_doc = XML::LibXML::Document->createDocument('1.0', 'UTF-8'); + my $index_root = $index_doc->createElementNS($namespace, 'resource'); + $index_root->addChild($index_doc->createAttributeNS($namespace, 'version', '1.1')); + $index_root->addChild($index_doc->createAttributeNS($namespace, 'type', 'MPIWG')); + $index_doc->setDocumentElement($index_root); + + # try to find the document directory + my $doc_dir = ""; + if ($cw_mode) { + $doc_dir = find_cw_dir($input_node); + } else { + $doc_dir = find_arch_dir($input_node); + } + if (! $doc_dir) { + logger('ERROR', "document directory not found! skipping..."); + $errcnt++; + return; + } + + # add standard stuff to index.meta + my ($docname, $docpath) = split_file_path($doc_dir); + # name and date + create_text_path('name', $docname, $index_root, $namespace); + create_text_path('archive-path', $doc_dir, $index_root, $namespace); + create_text_path('archive-creation-date', stime(time), $index_root, $namespace); + create_text_path('creator', 'digigroup', $index_root, $namespace); + create_text_path('description', 'a scanned document', $index_root, $namespace); + # acquisition + create_text_path('meta/acquisition/date', stime(time), $index_root, $namespace); + create_text_path('meta/acquisition/provider/provider-id', 'digigroup', $index_root, $namespace); + create_text_path('meta/acquisition/provider/address', 'Max Planck Institute for the History of Science', $index_root, $namespace); + # media + create_text_path('media-type', 'image', $index_root, $namespace); + create_text_path('meta/content-type', 'scanned document', $index_root, $namespace); + + # convert bib entries + my $cnt = convert_bib($input_node, $index_root, $index_doc); + if ($cnt == 0) { + # error or nothing to convert + logger('ERROR', "no bibliographic metadata!"); + $errcnt++; + return; + } + + # write new index.meta file + write_xml($index_doc, "$doc_dir/index.meta"); + +} + + + + + +####################################################### +# Main +# + +# load filemaker xml dump +my ($input_doc, $input_root) = read_xml($infile); +# set namespace prefix +my $fm_namespace = $input_root->namespaceURI(); +$input_root->setNamespace($fm_namespace, 'fm', 1); + +process_all_fm_entries($input_root); + + +logger("INFO", "$warncnt warnings"); +logger("INFO", "$errcnt errors"); +if ($errcnt > 0) { + logger("ABORT", "there were errors!"); + exit 1; +} else { + logger("DONE", "done something successfully!"); +} +