Mercurial > hg > foxridge-archiver
changeset 21:c8e4e8cb31dd
new tool for creating index files for vlp documents
author | casties |
---|---|
date | Tue, 20 Sep 2005 19:32:06 +0200 |
parents | 79c6618e8dfa |
children | c3defe3e2780 |
files | makemeta-vlp.pl |
diffstat | 1 files changed, 420 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/local/bin/perl -w
#
# makemeta-vlp.pl -- create index.meta files for VLP documents.
#
# Reads a FileMaker XML dump (one fm:ROW element per bibliographic entry),
# locates the document directory belonging to each entry and writes an
# index.meta file with the converted bibliographic metadata into it.
#
# The helpers logger, parseargs, read_xml, write_xml, create_element_path,
# create_text_path, split_file_path, stime and sstrip are not defined in
# this file; presumably they are provided by MPIWGStor -- confirm.

use strict;
use warnings;
use XML::LibXML;

use lib '/usr/local/mpiwg/archive_devel';
use MPIWGStor;

# make output unbuffered
$|=1;

# program version
my $version = "0.2 (19.9.2005 ROC)";
my $help =
"use: makemeta-vlp [options] file.xml
options:
  -debug  show debugging info
  -dry-run  simulate, don't do anything
  -replace  replace existing index files
  -online-mode  mode for creating online/permanent files
  -archive-mode  mode for creating archive/data files
";
logger("INFO", "makemeta-vlp $version");

###########################################
# mappings
#
# Keys are element names of the FileMaker dump, values are element paths
# inside index.meta.

# generic mappings at top level
my %gen_map = (
    'Custom2_Language' => 'meta/lang'
    );
# sub type switch tag
my %type_map = (
    'ReferenceType' => 'meta/bib@type'
    );
# sub type mappings
# The special key '_name' holds the bib type name written to index.meta.
# NOTE(review): the secondary title field is spelled 'SecondaryTitle' for
# some types and 'Secondary_Title' for others -- verify both spellings
# against the field names of the FileMaker export.
my %subtype_map = (
    'Book' => {
        '_name' => 'book',
        'Author' => 'meta/bib/author',
        'Title' => 'meta/bib/title',
        'Year' => 'meta/bib/year',
        'Place_Published' => 'meta/bib/city',
        'Publisher' => 'meta/bib/publisher',
        'Edition' => 'meta/bib/edition',
        'Volume' => 'meta/bib/volume',
        'NumberOfVolumes' => 'meta/bib/number-of-volumes',
        'Pages' => 'meta/bib/number-of-pages'
    },
    'Book Section' => {
        '_name' => 'inbook',
        'Author' => 'meta/bib/author',
        'Title' => 'meta/bib/title',
        'Year' => 'meta/bib/year',
        'Secondary_Title' => 'meta/bib/book-title',
        'SecondaryAuthor' => 'meta/bib/editor',
        'Volume' => 'meta/bib/volume',
        'NumberOfVolumes' => 'meta/bib/number-of-volumes',
        'Pages' => 'meta/bib/pages'
    },
    'Edited Book' => {
        '_name' => 'edited-book',
        'Author' => 'meta/bib/editor',
        'Title' => 'meta/bib/title',
        'Year' => 'meta/bib/year',
        'Place_Published' => 'meta/bib/city',
        'Publisher' => 'meta/bib/publisher',
        'Edition' => 'meta/bib/edition',
        'Volume' => 'meta/bib/volume',
        'NumberOfVolumes' => 'meta/bib/number-of-volumes',
        'Pages' => 'meta/bib/number-of-pages'
    },
    'Journal Article' => {
        '_name' => 'journal-article',
        'Author' => 'meta/bib/author',
        'Title' => 'meta/bib/title',
        'Year' => 'meta/bib/year',
        'SecondaryTitle' => 'meta/bib/journal',
        'Volume' => 'meta/bib/volume',
        'Number_Issue' => 'meta/bib/issue',
        'Pages' => 'meta/bib/pages'
    },
    'Magazine Article' => {
        '_name' => 'magazine-article',
        'Author' => 'meta/bib/author',
        'Title' => 'meta/bib/title',
        'Year' => 'meta/bib/year',
        'Secondary_Title' => 'meta/bib/magazine',
        'Number_Issue' => 'meta/bib/issue-number',
        'Date' => 'meta/bib/issue-date',
        'Pages' => 'meta/bib/pages'
    },
    'Report' => {
        '_name' => 'report',
        'Author' => 'meta/bib/author',
        'Title' => 'meta/bib/title',
        'Year' => 'meta/bib/year',
        'Place_Published' => 'meta/bib/city',
        'Date' => 'meta/bib/date',
        'SecondaryTitle' => 'meta/bib/type',
        'Pages' => 'meta/bib/pages'
    },
    'Trade Catalogue' => {
        '_name' => 'report',
        'Author' => 'meta/bib/author',
        'Title' => 'meta/bib/title',
        'Year' => 'meta/bib/year',
        'Place_Published' => 'meta/bib/city',
        'Date' => 'meta/bib/date',
        'Volume' => 'meta/bib/volume',
        'NumberOfVolumes' => 'meta/bib/number-of-volumes',
        'ReferenceType' => 'meta/bib/type',
        'Pages' => 'meta/bib/pages'
    },
    'Thesis' => {
        '_name' => 'thesis',
        'Author' => 'meta/bib/author',
        'Title' => 'meta/bib/title',
        'Place_Published' => 'meta/bib/city',
        'Publisher' => 'meta/bib/university',
        'Date' => 'meta/bib/date',
        'TypeOfWork' => 'meta/bib/type',
        'Pages' => 'meta/bib/number-of-pages'
    },
    'Manuscript' => {
        '_name' => 'manuscript',
        'Author' => 'meta/bib/author',
        'Title' => 'meta/bib/title',
        'Year' => 'meta/bib/year',
        'Place_Published' => 'meta/bib/location',
        'Pages' => 'meta/bib/pages'
    }
    );
# language element
my $lang_field = 'Custom2_Language';
# languages to iso codes
my %lang_map = (
    'German' => 'de',
    'English' => 'en',
    'Italian' => 'it',
    'French' => 'fr',
    'Latin' => 'la',
    'Japanese' => 'ja',
    'Dutch' => 'nl',
    'Spanish' => 'es',
    'Swedish' => 'sv'
    );
# storage fields
my $arch_id_field = 'ID';

#######################################################
# internal parameters
#

# storage
my $lib_arch_dir = '/mpiwg/archive/data/vlp';
# NOTE(review): $lib_online_dir is never read; find_permanent_dir uses its
# own base directory '/mpiwg/online/permanent' (without '/vlp'). Confirm
# which of the two paths is the intended one.
my $lib_online_dir = '/mpiwg/online/permanent/vlp';

# read command line parameters
my $args = MPIWGStor::parseargs;
if (! scalar(%$args)) {
    print $help, "\n";
    exit 1;
}

# debug level
# $debug is not declared in this file; presumably it is a package variable
# exported by MPIWGStor (do not add "my" here) -- confirm.
$debug = (exists $args->{'debug'}) ? $args->{'debug'} : 0;

# simulate action only
my $dry_run = (exists $args->{'dry-run'}) ? $args->{'dry-run'} : 0;
logger('DEBUG', "dry-run: $dry_run");

# replace existing index files
my $do_replace = (exists $args->{'replace'}) ? $args->{'replace'} : 0;
logger('DEBUG', "replace: $do_replace");

# use online mode
my $online_mode = (exists $args->{'online-mode'}) ? $args->{'online-mode'} : 0;
logger('DEBUG', "online_mode: $online_mode");

# use archive mode
my $archive_mode = (exists $args->{'archive-mode'}) ? $args->{'archive-mode'} : 0;
logger('DEBUG', "archive_mode: $archive_mode");

# index.meta namespace (not really implemented!)
my $namespace = "";

# counters shared by all subroutines
my $xml_changed = 0;
my $errcnt = 0;
my $warncnt = 0;

#######################################################
# check parameters that were passed to the program
#
my $infile = $args->{'path'};
if (! $infile) {
    logger("ABORT", "no input file given!");
    exit 1;
}
# collapse every run of slashes into a single slash
# (needs /g, otherwise only the first occurrence would be replaced)
$infile =~ s{/{2,}}{/}g;
if (! -f $infile) {
    logger("ABORT", "input file \'$infile\' doesn't exist!");
    exit 1;
}


#######################################################
# subroutines
#

#
# $dir = find_arch_dir($input_node)
#
# Returns the archive directory "$lib_arch_dir/lit<ID>" for the entry
# $input_node, where <ID> is the value of its fm:ID child element.
# Returns nothing if the ID is empty or the directory does not exist.
#
sub find_arch_dir {
    my ($input_node) = @_;

    # strip the ID the same way find_permanent_dir does, so that both modes
    # derive the same directory name from the same entry
    my $bib_id = sstrip($input_node->findvalue("fm:$arch_id_field"));
    #logger('DEBUG', "bib_id: $bib_id");
    if (! $bib_id) {
        return;
    }

    my $dir = "$lib_arch_dir/lit$bib_id";
    if (-d $dir) {
        logger('DEBUG', "directory $dir exists");
        return $dir;
    }
    return;
}

#
# $dir = find_permanent_dir($input_node)
#
# Returns the online directory "/mpiwg/online/permanent/lit<ID>" for the
# entry $input_node, where <ID> is the stripped value of its fm:ID child
# element. Logs an error, increments $errcnt and returns nothing if the ID
# is empty. Unlike find_arch_dir the directory is not checked for existence.
#
sub find_permanent_dir {
    my ($input_node) = @_;
    # NOTE(review): the file-level $lib_online_dir ('.../permanent/vlp') is
    # not used here -- confirm that this base directory is intended.
    my $online_base = '/mpiwg/online/permanent';

    my $dest_id = sstrip($input_node->findvalue("fm:$arch_id_field"));
    if (! $dest_id) {
        logger('ERROR', "no ID field for online permanent entry");
        $errcnt++;
        return;
    }
    return "$online_base/lit$dest_id";
}


#
# $cnt = convert_bib($input_node, $index_root, $index_doc)
#
# Converts the child elements of the entry $input_node into elements below
# $index_root, using %gen_map, %type_map and %subtype_map. $index_doc is
# accepted for interface compatibility but not used.
#
# Returns the number of elements created. Returns 0 if the language or the
# bib type is unknown (an error is logged and $errcnt incremented) or if
# there was nothing to convert.
#
sub convert_bib {
    my ($input_node, $index_root, $index_doc) = @_;
    my $cnt = 0;
    my $type = "";
    my $type_path = "";

    # process general stuff first
    foreach my $n ($input_node->getChildNodes()) {
        my $name = $n->nodeName();
        my $val = $n->textContent();
        #logger('DEBUG', "  NODE: $name = '$val'");
        if (exists $gen_map{$name}) {
            # is a general field
            if ($name eq $lang_field) {
                # language field
                if (not $val) {
                    logger('WARNING', "no language tag");
                    $warncnt++;
                    next;
                }
                # convert to iso code
                if (exists $lang_map{$val}) {
                    $val = $lang_map{$val};
                } else {
                    logger('ERROR', "unknown language: $val! skipping...");
                    $errcnt++;
                    return 0;
                }
            }
            create_element_path($gen_map{$name}, $index_root, $namespace)
                ->appendTextNode($val);
            $cnt++;
        } elsif (exists $type_map{$name}) {
            # is a type field
            $type_path = $type_map{$name};
            $type = $val;
            # check with known types
            if (exists $subtype_map{$val}) {
                my $indextype = $subtype_map{$val}->{'_name'};
                create_element_path("$type_path=$indextype", $index_root, $namespace);
                $cnt++;
            } else {
                logger('ERROR', "unknown bib type $val! skipping...");
                $errcnt++;
                return 0;
            }
        }
    }

    # process sub type fields
    # (second pass, because the type is only known after the first one)
    if ($type) {
        my $field_map = $subtype_map{$type};
        foreach my $n ($input_node->getChildNodes()) {
            my $name = $n->nodeName();
            my $val = $n->textContent();
            #logger('DEBUG', "  NODE: $name = '$val'");
            if (exists $field_map->{$name}) {
                create_element_path($field_map->{$name}, $index_root, $namespace)
                    ->appendTextNode($val);
                $cnt++;
            }
        }
    }
    return $cnt;
}


#
# $cnt = process_all_fm_entries($input_root)
#
# Calls process_fm_entry for every fm:ROW child of $input_root.
# Returns the number of entries visited (including skipped or failed ones).
#
sub process_all_fm_entries {
    my ($input_root) = @_;
    my $cnt = 0;

    foreach my $n ($input_root->findnodes('fm:ROW')) {
        logger('INFO', "processing entry $cnt ...");
        process_fm_entry($n);
        $cnt++;
    }
    return $cnt;
}


#
# process_fm_entry($input_node)
#
# Creates the index.meta document for one entry and writes it to
# "<document directory>/index.meta". With -dry-run the document is only
# logged. An existing index.meta is left alone unless -replace is given.
# Errors are logged and counted in $errcnt; the entry is then skipped.
#
sub process_fm_entry {
    my ($input_node) = @_;

    # try to find the document directory
    # (online mode wins over archive mode and is also the default)
    my $doc_dir = "";
    if ($archive_mode && ! $online_mode) {
        $doc_dir = find_arch_dir($input_node);
    } else {
        $doc_dir = find_permanent_dir($input_node);
    }
    if (! $doc_dir) {
        logger('ERROR', "document directory not found! skipping...");
        $errcnt++;
        return;
    }

    # check if index.meta exists
    if ((-f "$doc_dir/index.meta") && (not $do_replace)) {
        logger('DEBUG', "index file in $doc_dir exists");
        return;
    }

    # create the empty index.meta document
    # (only now, so that skipped entries don't build a document for nothing)
    my $index_doc = XML::LibXML::Document->createDocument('1.0', 'UTF-8');
    my $index_root = $index_doc->createElementNS($namespace, 'resource');
    $index_root->addChild($index_doc->createAttributeNS($namespace, 'version', '1.1'));
    $index_root->addChild($index_doc->createAttributeNS($namespace, 'type', 'MPIWG'));
    $index_doc->setDocumentElement($index_root);

    # add standard stuff to index.meta
    my ($docname) = split_file_path($doc_dir);
    my $now = stime(time);
    # name and date
    create_text_path('name', $docname, $index_root, $namespace);
    create_text_path('archive-path', $doc_dir, $index_root, $namespace);
    create_text_path('archive-creation-date', $now, $index_root, $namespace);
    create_text_path('creator', 'vlp', $index_root, $namespace);
    create_text_path('description', 'a scanned document', $index_root, $namespace);
    if ($archive_mode) {
        # acquisition
        create_text_path('meta/acquisition/date', $now, $index_root, $namespace);
        create_text_path('meta/acquisition/provider/provider-id', 'vlp', $index_root, $namespace);
        create_text_path('meta/acquisition/provider/address', 'Max Planck Institute for the History of Science', $index_root, $namespace);
        # image acquisition
        create_text_path('meta/image-acquisition/device', 'Flatbed Scanner', $index_root, $namespace);
        create_text_path('meta/image-acquisition/image-type', 'Greyscale', $index_root, $namespace);
        create_text_path('meta/image-acquisition/production-comment', 'Raw scans in \'raw\' folder, cleaned pages in \'pages\' folder.', $index_root, $namespace);
    }
    # media
    create_text_path('media-type', 'image', $index_root, $namespace);
    create_text_path('meta/content-type', 'scanned document', $index_root, $namespace);

    # convert bib entries
    my $cnt = convert_bib($input_node, $index_root, $index_doc);
    if ($cnt == 0) {
        # error or nothing to convert
        # NOTE(review): when convert_bib already reported an error this
        # counts the same entry a second time in $errcnt.
        logger('ERROR', "no bibliographic metadata!");
        $errcnt++;
        return;
    }

    # write new index.meta file
    if ($dry_run) {
        logger('DEBUG', "would write $doc_dir/index.meta");
        logger('DEBUG', $index_doc->toString(1));
    } else {
        write_xml($index_doc, "$doc_dir/index.meta");
    }
    return;
}


#######################################################
# Main
#

# load filemaker xml dump
my ($input_doc, $input_root) = read_xml($infile);
# set namespace prefix
# (binds the prefix 'fm' to the namespace of the root element so that the
# XPath expressions 'fm:ROW' and 'fm:ID' used above can be resolved)
my $fm_namespace = $input_root->namespaceURI();
$input_root->setNamespace($fm_namespace, 'fm', 1);


process_all_fm_entries($input_root);


logger("INFO", "$warncnt warnings");
logger("INFO", "$errcnt errors");
if ($errcnt > 0) {
    logger("ABORT", "there were errors!");
    exit 1;
} else {
    logger("DONE", "done something successfully!");
}