Mercurial > hg > foxridge-archiver
changeset 60:5bee75ca9eb3 default tip
added old makemeta-quantum.pl that was not in CVS.
author | casties |
---|---|
date | Thu, 16 Mar 2017 18:29:58 +0100 |
parents | 975c4dbcd192 |
children | |
files | makemeta-quantum.pl |
diffstat | 1 files changed, 449 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/perl -w

#
# makemeta-quantum
#
# Reads a FileMaker XML dump and, for every fm:ROW entry in it, builds an
# index.meta document (root element 'resource') and writes it into the
# document directory named by the entry's Path_images field.
#
# NOTE(review): logger, sstrip, stime, split_file_path, read_xml, write_xml,
# create_element_path, create_text_path and parseargs are not defined in this
# file; they presumably come from MPIWGStor -- confirm their exact contracts
# there.
#

use strict;
use warnings;
use XML::LibXML;

use lib '/usr/local/mpiwg/archive';
use MPIWGStor;

# make output unbuffered
$| = 1;

# program version
my $version = "0.1.2 (24.7.2008 ROC)";
my $help =
"use: makemeta-quantum [options] file.xml
options:
 -debug show debugging info
 -dry-run simulate, dont'do anything
 -online-mode mode for creating online/permanent files
 -online-base=dir base directory for online ids (for online mode)
 -access=free adds free access tag (use access=mpiwg for restricted access)
";
logger("INFO", "makemeta-quantum $version");

###########################################
# mappings
#
# Keys are element names in the FileMaker dump, values are element paths
# handed to create_element_path() to build the index.meta tree.

# generic mappings at top level
my %gen_map = (
    'Language' => 'meta/lang',
);

# sub type switch tag
my %type_map = (
    'Index_Meta_Type' => 'meta/bib@type',
);

# sub type mappings
# (document types in the dump, with entry counts)
# Correspondence (748)
# Other Archival Material (4)
# Published (11)
# Published Document (10)
# Unpublished Writing (1373)
#
# NOTE(review): 'Other Archival Material' has no mapping below, so such
# entries are rejected by convert_bib() as an unknown bib type.

# Field mapping shared by every type that is written as bib type
# 'manuscript'.
my %manuscript_map = (
    '_name'                => 'manuscript',
    'Document_Subtype'     => 'meta/bib/type',
    'Author'               => 'meta/bib/author',
    'Title'                => 'meta/bib/title',
    'Date_range_from'      => 'meta/bib/date',
    'Date_range_to'        => 'meta/bib/date-range-end',
    'Date_original_form'   => 'meta/bib/date-original',
    'Place_From'           => 'meta/bib/location',
    'First_lines'          => 'meta/bib/incipit',
    'Call_number'          => 'meta/bib/signature',
    'Call_number_original' => 'meta/bib/call-number',
    'Holding_institution'  => 'meta/bib/holding-library',
    'Keywords'             => 'meta/bib/description',
);

# '_name' is the bib type written to index.meta; all other keys are field
# names of the dump.
my %subtype_map = (
    'Correspondence' => {
        '_name'                => 'correspondence',
        'Document_Subtype'     => 'meta/bib/type',
        'Author'               => 'meta/bib/author',
        'Person_to'            => 'meta/bib/recipient',
        'Title'                => 'meta/bib/title',
        'Date_range_from'      => 'meta/bib/date',
        'Date_range_to'        => 'meta/bib/date-range-end',
        'Date_original_form'   => 'meta/bib/date-original',
        'Place_From'           => 'meta/bib/place',
        'First_lines'          => 'meta/bib/incipit',
        'Call_number'          => 'meta/bib/signature',
        'Call_number_original' => 'meta/bib/call-number',
        'Holding_institution'  => 'meta/bib/holding-library',
        'Keywords'             => 'meta/bib/description',
    },
    # each type gets its own copy of the shared manuscript mapping
    'Published'           => { %manuscript_map },
    'Published Document'  => { %manuscript_map },
    'Unpublished Writing' => { %manuscript_map },
);

# language element
my $lang_field = 'Language';
# languages to iso codes
my %lang_map = (
    'German'   => 'de',
    'English'  => 'en',
    'Italian'  => 'it',
    'French'   => 'fr',
    'Latin'    => 'la',
    'Japanese' => 'ja',
    'Dutch'    => 'nl',
    'Spanish'  => 'es',
    'Swedish'  => 'sv',
);

# storage fields
my $online_url_field  = 'URL';
my $online_path_field = 'Path_images';
my $id_field          = 'ID';

#######################################################
# internal parameters
#

# storage
my $lib_online_dir   = '/mpiwg/online';
my $lib_digilib_path = 'permanent';

# read command line parameters
my $args = MPIWGStor::parseargs;
if (! scalar(%$args)) {
    print $help, "\n";
    exit 1;
}

# debug level
# NOTE(review): $debug is deliberately not declared with 'my' here; under
# 'use strict' it has to be a variable imported from MPIWGStor -- confirm.
$debug = (exists $$args{'debug'}) ? $$args{'debug'} : 0;

# simulate action only
my $dry_run = (exists $$args{'dry-run'}) ? $$args{'dry-run'} : 0;
logger('DEBUG', "dry-run: $dry_run");

# use online mode
my $online_mode = (exists $$args{'online-mode'}) ? $$args{'online-mode'} : 0;
logger('DEBUG', "online_mode: $online_mode");
# online base dir
my $online_base_dir = (exists $$args{'online-base'}) ? $$args{'online-base'} : "/mpiwg/online";
logger('DEBUG', "online_base_dir: $online_base_dir");
# create texttool tag
my $texttool = (exists $$args{'texttool'}) ? $$args{'texttool'} : 1;
logger('DEBUG', "texttool: $texttool");
# image dir for texttool
my $texttool_img_dir = "pageimg";
# fulltext directory pattern for texttool
my $texttool_fulltext_glob = "fulltext-*/*.xml";
# pagebreak tag for texttool
my $texttool_pb_tag = "pb";
# xslt for texttool
my $texttool_xslt = "/mpiwg/online/permanent/echo/quantum_project/hr-ms/schlick_correspondence.xsl";
# digiliburlprefix (only referenced from a commented-out line below)
my $texttool_dlurlprefix = "http://echo.mpiwg-berlin.mpg.de/zogilib?";

# access type
my $access_type = (exists $$args{'access'}) ? $$args{'access'} : "";

# index.meta namespace (not really implemented!)
my $namespace = "";


# counters reported at the end of the run
my $xml_changed = 0;
my $errcnt      = 0;
my $warncnt     = 0;
my $filecnt     = 0;

#######################################################
# check parameters that were passed to the program
#
my $infile = $$args{'path'};
if (! $infile) {
    logger("ABORT", "no input file given!");
    exit 1;
}
# strip double slashes
$infile = sstrip($infile, 1);
if (! -f $infile) {
    logger("ABORT", "input file \'$infile\' doesn't exist!");
    exit 1;
}


#######################################################
# subroutines
#


#
# $dir = find_online_path($input_node)
#
# Joins $online_base_dir with the value of the $online_path_field element
# of $input_node, removes a '/index.meta' component from the result and
# returns the directory if it exists.
# In every other case an ERROR is logged (with the entry's ID), $errcnt is
# incremented and nothing is returned.
#
sub find_online_path {
    my ($input_node) = @_;
    my $online_path = sstrip($input_node->findvalue("fm:$online_path_field"));
    my $id          = sstrip($input_node->findvalue("fm:$id_field"));

    # try online_base_dir + online_path first
    if ($online_base_dir && $online_path) {
        my $dir = sstrip("$online_base_dir/$online_path", 1);
        # the dot is escaped so that only a literal '/index.meta' is removed
        # (an unescaped '.' would also match e.g. '/index_meta')
        $dir =~ s{/index\.meta}{};
        return $dir if -d $dir;

        logger('ERROR', "online path '$dir' not found! ($id)");
        $errcnt++;
        return;
    }

    logger('ERROR', "online path not found! ($id)");
    $errcnt++;
    return;
}



#
# $dir = find_online_dir($input_node, $base_dir, $page_dir)
#
# Takes the path from the $online_url_field of the $input_node document
# and looks in the directory $base_dir for it. Strips $page_dir from the end.
# Returns the directory path sans $base_dir if it exists, nothing otherwise.
# $base_dir defaults to $lib_online_dir.
#
# Not called anywhere in this script.
#
sub find_online_dir {
    my ($input_node, $base_dir, $page_dir) = @_;
    $base_dir = $lib_online_dir unless ($base_dir);

    my $online_url = $input_node->findvalue("fm:$online_url_field");
    logger('DEBUG', "checking URL: $online_url");

    my $online_dir;
    if ($online_url =~ m{fn=permanent/(.+)}) {
        # new style digilib URL
        $online_dir = $1;
    }
    elsif ($online_url =~ m{\?([^\+]+)\+}) {
        # old style digilib URL
        $online_dir = $1;
    }
    #logger('DEBUG', "online_dir1: $online_dir");
    return unless $online_dir;

    $online_dir =~ s!/$!!;    # strip ending slashes
    if ($page_dir) {
        # strip page_dir; \Q..\E makes the directory name match literally
        # even if it contains regex metacharacters
        $online_dir =~ s!/\Q$page_dir\E$!!;
    }
    #logger("DEBUG", "dir: $base_dir/$online_dir");
    if (-d "$base_dir/$online_dir") {
        logger('DEBUG', "directory $base_dir/$online_dir exists");
        return $online_dir;
    }
    return;
}



#
# $cnt = convert_bib($input_node, $index_root, $index_doc)
#
# Converts the child elements of $input_node into elements below
# $index_root, using %gen_map, %type_map and %subtype_map.
# Returns the number of created elements. Returns 0 when the entry has an
# unknown bib type (an ERROR is logged and $errcnt incremented) or when
# nothing could be converted.
# $index_doc is accepted but not used.
#
sub convert_bib {
    my ($input_node, $index_root, $index_doc) = @_;
    my $cnt       = 0;
    my $type      = "";
    my $type_path = "";

    # process general stuff first
    foreach my $n ($input_node->getChildNodes()) {
        my $name = $n->nodeName();
        my $val  = $n->textContent();
        #logger('DEBUG', "  NODE: $name = '$val'");
        if (exists $gen_map{$name}) {
            # is a general field
            if ($name eq $lang_field) {
                # language field -> convert to iso code
                if (exists $lang_map{$val}) {
                    $val = $lang_map{$val};
                }
                else {
                    logger('WARNING', "unknown language: $val! ignoring...");
                    $warncnt++;
                    next;
                }
            }
            create_element_path($gen_map{$name}, $index_root, $namespace)
                ->appendTextNode($val);
            $cnt++;
        }
        elsif (exists $type_map{$name}) {
            # is a type field
            $type_path = $type_map{$name};
            $type      = $val;
            # check with known types
            if (exists $subtype_map{$val}) {
                my $indextype = $subtype_map{$val}->{'_name'};
                create_element_path("$type_path=$indextype", $index_root, $namespace);
                $cnt++;
            }
            else {
                # double quotes, so that the offending type really shows up
                # in the log (single quotes would print a literal '$val')
                logger('ERROR', "unknown bib type $val! skipping...");
                $errcnt++;
                return 0;
            }
        }
    }

    # process sub type fields (second pass, the type is known now)
    if ($type) {
        my $field_map = $subtype_map{$type};
        foreach my $n ($input_node->getChildNodes()) {
            my $name = $n->nodeName();
            my $val  = $n->textContent();
            #logger('DEBUG', "  NODE: $name = '$val'");
            if (exists $field_map->{$name}) {
                create_element_path($field_map->{$name}, $index_root, $namespace)
                    ->appendTextNode($val);
                $cnt++;
            }
        }
    }
    return $cnt;
}



#
# process_all_fm_entries($input_root)
#
# Calls process_fm_entry() for every 'fm:ROW' node found from $input_root
# and logs a running entry number (starting at 0).
#
sub process_all_fm_entries {
    my ($input_root) = @_;
    my $cnt = 0;

    foreach my $n ($input_root->findnodes('fm:ROW')) {
        logger('INFO', "processing entry $cnt ...");
        process_fm_entry($n);
        $cnt++;
    }
}


#
# process_fm_entry($input_node)
#
# Builds the index.meta document for one entry and writes it to
# "$doc_dir/index.meta" (in dry-run mode the document is only logged).
# The entry is skipped, with an ERROR logged and $errcnt incremented, when
# its document directory cannot be found or when it yields no bibliographic
# metadata. The document directory is only looked up in online mode.
#
sub process_fm_entry {
    my ($input_node) = @_;
    my $index_doc  = XML::LibXML::Document->createDocument('1.0', 'UTF-8');
    my $index_root = $index_doc->createElementNS($namespace, 'resource');
    $index_root->addChild($index_doc->createAttributeNS($namespace, 'version', '1.1'));
    $index_root->addChild($index_doc->createAttributeNS($namespace, 'type', 'MPIWG'));
    $index_doc->setDocumentElement($index_root);
    my $derived_from = "";

    # try to find the document directory
    my $doc_dir = "";
    if ($online_mode) {
        $doc_dir = find_online_path($input_node);
        #$derived_from = find_arch_dir($input_node);
    }
    if (! (($doc_dir) && (-d $doc_dir))) {
        logger('ERROR', "document directory not found! skipping...");
        $errcnt++;
        return;
    }

    # add standard stuff to index.meta
    # (only the first value returned by split_file_path is needed)
    my ($docname) = split_file_path($doc_dir);
    # name and date
    create_text_path('name', $docname, $index_root, $namespace);
    create_text_path('archive-path', $doc_dir, $index_root, $namespace);
    create_text_path('archive-creation-date', stime(time), $index_root, $namespace);
    create_text_path('creator', 'quantum-history group', $index_root, $namespace);
    create_text_path('description', 'a scanned document', $index_root, $namespace);
    # media
    create_text_path('media-type', 'image', $index_root, $namespace);
    create_text_path('meta/content-type', 'scanned document', $index_root, $namespace);
    # derived-from
    if ($derived_from) {
        create_text_path('derived-from/archive-path', $derived_from, $index_root, $namespace);
    }
    # access
    if ($access_type) {
        if ($access_type eq "free") {
            create_element_path('meta/access-conditions/access@type=free', $index_root, $namespace);
        }
        else {
            # any other value is written as the name of the institution
            my $acc_tag = create_element_path('meta/access-conditions/access@type=institution', $index_root, $namespace);
            create_text_path('name', $access_type, $acc_tag, $namespace);
        }
    }
    # texttool tag with image dir
    if ($texttool) {
        create_text_path('meta/texttool/display', 'yes', $index_root, $namespace);
        if ( -d "$doc_dir/$texttool_img_dir" ) {
            create_text_path('meta/texttool/image', $texttool_img_dir, $index_root, $namespace);
        }
        else {
            logger('WARNING', "page image directory missing!");
            $warncnt++;
        }
        # check for fulltext
        # NOTE(review): glob() returns paths that include $doc_dir, so the
        # 'text' element gets a full path while 'image' is relative --
        # confirm that this is intended.
        my @ftds = glob "$doc_dir/$texttool_fulltext_glob";
        if (@ftds) {
            # use the file that sorts last
            @ftds = sort @ftds;
            my $ftd = $ftds[-1];
            create_text_path('meta/texttool/text', $ftd, $index_root, $namespace);
            if (scalar @ftds > 1) {
                logger('WARNING', "more than one fulltext! Chose ${ftd}.");
                $warncnt++;
            }
            create_text_path('meta/texttool/pagebreak', $texttool_pb_tag, $index_root, $namespace);
            create_text_path('meta/texttool/xslt', $texttool_xslt, $index_root, $namespace);
            #create_text_path('meta/texttool/digiliburlprefix', $texttool_dlurlprefix, $index_root, $namespace);
        }
    }

    # convert bib entries
    my $cnt = convert_bib($input_node, $index_root, $index_doc);
    if ($cnt == 0) {
        # error or nothing to convert
        logger('ERROR', "no bibliographic metadata!");
        $errcnt++;
        return;
    }

    # write new index.meta file
    $filecnt++;
    if ($dry_run) {
        logger('DEBUG', "would write $doc_dir/index.meta");
        logger('DEBUG', $index_doc->toString(1));
    }
    else {
        write_xml($index_doc, "$doc_dir/index.meta");
    }

    return;
}





#######################################################
# Main
#

# load filemaker xml dump
my ($input_doc, $input_root) = read_xml($infile);
# set namespace prefix
# (binds the prefix 'fm' used in all XPath expressions above to the
# namespace of the dump's root element)
my $fm_namespace = $input_root->namespaceURI();
$input_root->setNamespace($fm_namespace, 'fm', 1);

process_all_fm_entries($input_root);


logger("INFO", "$filecnt files written");
logger("INFO", "$warncnt warnings");
logger("INFO", "$errcnt errors");
if ($errcnt > 0) {
    logger("ABORT", "there were errors!");
    exit 1;
}
else {
    logger("DONE", "done something successfully!");
}