changeset 60:5bee75ca9eb3 default tip

added old makemeta-quantum.pl that was not in CVS.
author casties
date Thu, 16 Mar 2017 18:29:58 +0100
parents 975c4dbcd192
children
files makemeta-quantum.pl
diffstat 1 files changed, 449 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/makemeta-quantum.pl	Thu Mar 16 18:29:58 2017 +0100
@@ -0,0 +1,449 @@
+#!/usr/bin/perl -w
+
+use strict;
+use XML::LibXML;
+
+use lib '/usr/local/mpiwg/archive';
+use MPIWGStor;
+
+# make output unbuffered
+$|=1;
+
+# program version
+my $version = "0.1.2 (24.7.2008 ROC)";
+my $help = 
+"use: makemeta-quantum [options] file.xml
+options:
+  -debug  show debugging info
+  -dry-run  simulate, dont'do anything
+  -online-mode  mode for creating online/permanent files
+  -online-base=dir  base directory for online ids (for online mode)
+  -access=free  adds free access tag (use access=mpiwg for restricted access)
+";
+logger("INFO", "makemeta-quantum $version");
+
+###########################################
+# mappings
+
+# generic mappings at top level
+my %gen_map = (
+    'Language' => 'meta/lang'
+    );
+# sub type switch tag
+my %type_map = (
+    'Index_Meta_Type' => 'meta/bib@type'
+    );
+# sub type mappings
+# Correspondence (748)
+# Other Archival Material (4)
+# Published (11)
+# Published Document (10)
+# Unpublished Writing (1373)
+my %subtype_map = (
+    'Correspondence' => {
+	'_name' => 'correspondence',
+        'Document_Subtype' => 'meta/bib/type',
+	'Author' => 'meta/bib/author',
+	'Person_to' => 'meta/bib/recipient',
+	'Title' => 'meta/bib/title',
+	'Date_range_from' => 'meta/bib/date',
+	'Date_range_to' => 'meta/bib/date-range-end',
+	'Date_original_form' => 'meta/bib/date-original',
+	'Place_From' => 'meta/bib/place',
+	'First_lines' => 'meta/bib/incipit',
+        'Call_number' => 'meta/bib/signature',
+        'Call_number_original' => 'meta/bib/call-number',
+        'Holding_institution' => 'meta/bib/holding-library',
+        'Keywords' => 'meta/bib/description'
+    },
+    'Published' => {
+	'_name' => 'manuscript',
+        'Document_Subtype' => 'meta/bib/type',
+	'Author' => 'meta/bib/author',
+	'Title' => 'meta/bib/title',
+	'Date_range_from' => 'meta/bib/date',
+	'Date_range_to' => 'meta/bib/date-range-end',
+	'Date_original_form' => 'meta/bib/date-original',
+	'Place_From' => 'meta/bib/location',
+	'First_lines' => 'meta/bib/incipit',
+        'Call_number' => 'meta/bib/signature',
+        'Call_number_original' => 'meta/bib/call-number',
+        'Holding_institution' => 'meta/bib/holding-library',
+        'Keywords' => 'meta/bib/description'
+    },
+    'Published Document' => {
+	'_name' => 'manuscript',
+        'Document_Subtype' => 'meta/bib/type',
+	'Author' => 'meta/bib/author',
+	'Title' => 'meta/bib/title',
+	'Date_range_from' => 'meta/bib/date',
+	'Date_range_to' => 'meta/bib/date-range-end',
+	'Date_original_form' => 'meta/bib/date-original',
+	'Place_From' => 'meta/bib/location',
+	'First_lines' => 'meta/bib/incipit',
+        'Call_number' => 'meta/bib/signature',
+        'Call_number_original' => 'meta/bib/call-number',
+        'Holding_institution' => 'meta/bib/holding-library',
+        'Keywords' => 'meta/bib/description'
+    },
+    'Unpublished Writing' => {
+	'_name' => 'manuscript',
+        'Document_Subtype' => 'meta/bib/type',
+	'Author' => 'meta/bib/author',
+	'Title' => 'meta/bib/title',
+	'Date_range_from' => 'meta/bib/date',
+	'Date_range_to' => 'meta/bib/date-range-end',
+	'Date_original_form' => 'meta/bib/date-original',
+	'Place_From' => 'meta/bib/location',
+	'First_lines' => 'meta/bib/incipit',
+        'Call_number' => 'meta/bib/signature',
+        'Call_number_original' => 'meta/bib/call-number',
+        'Holding_institution' => 'meta/bib/holding-library',
+        'Keywords' => 'meta/bib/description'
+    },
+    );
+# language element
+my $lang_field = 'Language';
+# languages to iso codes
+my %lang_map = (
+    'German' => 'de',
+    'English' => 'en',
+    'Italian' => 'it',
+    'French' => 'fr',
+    'Latin' => 'la',
+    'Japanese' => 'ja',
+    'Dutch' => 'nl',
+    'Spanish' => 'es',
+    'Swedish' => 'sv'
+    );
+# storage fields
+my $online_url_field = 'URL';
+my $online_path_field = 'Path_images';
+my $id_field = 'ID';
+
+#######################################################
+# internal parameters
+#
+
+# storage
+my $lib_online_dir = '/mpiwg/online';
+my $lib_digilib_path = 'permanent';
+
+# read command line parameters
+my $args = MPIWGStor::parseargs;
+if (! scalar(%$args)) {
+    print $help, "\n";
+    exit 1;
+}
+
+# debug level
+$debug = (exists $$args{'debug'}) ? $$args{'debug'} : 0;
+
+# simulate action only
+my $dry_run = (exists $$args{'dry-run'}) ? $$args{'dry-run'} : 0;
+logger('DEBUG', "dry-run: $dry_run");
+
+# use online mode
+my $online_mode = (exists $$args{'online-mode'}) ? $$args{'online-mode'} : 0;
+logger('DEBUG', "online_mode: $online_mode");
+# online base dir
+my $online_base_dir = (exists $$args{'online-base'}) ? $$args{'online-base'} : "/mpiwg/online";
+logger('DEBUG', "online_base_dir: $online_base_dir");
+# create texttool tag
+my $texttool = (exists $$args{'texttool'}) ? $$args{'texttool'} : 1;
+logger('DEBUG', "texttool: $texttool");
+# image dir for texttool
+my $texttool_img_dir = "pageimg";
+# fulltext directory pattern for texttool
+my $texttool_fulltext_glob = "fulltext-*/*.xml";
+# pagebreak tag for texttool
+my $texttool_pb_tag = "pb";
+# xslt for texttool
+my $texttool_xslt = "/mpiwg/online/permanent/echo/quantum_project/hr-ms/schlick_correspondence.xsl";
+# digiliburlprefix
+my $texttool_dlurlprefix = "http://echo.mpiwg-berlin.mpg.de/zogilib?";
+
+# access type
+my $access_type = (exists $$args{'access'}) ? $$args{'access'} : "";
+
+# index.meta namespace (not really implemented!)
+my $namespace = "";
+
+
+my $xml_changed = 0;
+my $errcnt = 0;
+my $warncnt = 0;
+my $filecnt = 0;
+
+#######################################################
+# check parameters that were passed to the program
+#
+my $infile = $$args{'path'};
+if (! $infile) {
+    logger("ABORT", "no input file given!");
+    exit 1;
+}
+# strip double slashes
+$infile = sstrip($infile, 1);
+if (! -f $infile) {
+    logger("ABORT", "input file \'$infile\' doesn't exist!");
+    exit 1;
+}
+
+
+#######################################################
+# subroutines
+#
+
+
+sub find_online_path {
+    my ($input_node) = @_;
+    my $online_path = sstrip($input_node->findvalue("fm:$online_path_field"));
+    my $id = sstrip($input_node->findvalue("fm:$id_field"));
+    # try online_base_dir + online_path first
+    if (($online_base_dir)&&($online_path)) {
+	my $dir = sstrip("$online_base_dir/$online_path", 1);
+        $dir =~ s/\/index.meta//;
+        if ( -d $dir ) {
+            return $dir;
+        } else {
+            logger('ERROR', "online path '$dir' not found! ($id)");
+            $errcnt++;
+            return;
+        }
+    }
+    logger('ERROR', "online path not found! ($id)");
+    $errcnt++;
+    return;
+}
+
+
+
+#
+# $dir = find_online_dir($input_node, $base_dir, $page_dir)
+#
+# Takes the path from the $online_url_field of the $input_node document
+# and looks in the directory $base_dir for it. Strips $page_dir from the end.
+# Returns the directory path sans $base_dir if it exists
+#
+sub find_online_dir {
+    my ($input_node, $base_dir, $page_dir) = @_;
+    $base_dir = $lib_online_dir unless ($base_dir);
+
+    my $online_url = $input_node->findvalue("fm:$online_url_field");
+    logger('DEBUG', "checking URL: $online_url");
+    my $online_dir;
+    if ($online_url =~ /fn=permanent\/(.+)/) {
+	# new style digilib URL
+	$online_dir = $1;
+    } elsif ($online_url =~ /\?([^\+]+)\+/) {
+	# old style digilib URL
+	$online_dir = $1;
+    }
+    #logger('DEBUG', "online_dir1: $online_dir");
+    if ($online_dir) {
+	$online_dir =~ s/\/$//; # strip ending slashes
+	if ($page_dir) {
+	    # strip page_dir
+	    $online_dir =~ s/\/${page_dir}$//;
+	}
+	#logger("DEBUG", "dir: $base_dir/$online_dir");
+	if (-d "$base_dir/$online_dir") {
+	    logger('DEBUG', "directory $base_dir/$online_dir exists");
+	    return $online_dir;
+	}
+    }
+    return;
+}
+
+
+
+sub convert_bib {
+    my ($input_node, $index_root, $index_doc) = @_;
+    my $cnt = 0;
+    my $type = "";
+    my $type_path = "";
+
+    # process general stuff first
+    foreach my $n ($input_node->getChildNodes()) {
+	my $name = $n->nodeName();
+	my $val = $n->textContent();
+	#logger('DEBUG', "  NODE: $name = '$val'");
+	if (exists $gen_map{$name}) {
+	    # is a general field
+	    if ($name eq $lang_field) {
+		# language field -> convert to iso code
+		if (exists $lang_map{$val}) {
+		    $val = $lang_map{$val};
+		} else {
+		    logger('WARNING', "unknown language: $val! ignoring...");
+		    $warncnt++;
+                    next;
+		}
+	    }
+	    create_element_path($gen_map{$name}, $index_root, $namespace)
+		->appendTextNode($val);
+	    $cnt++;
+	} elsif (exists $type_map{$name}) {
+	    # is a type field
+	    $type_path = $type_map{$name};
+	    $type = $val;
+	    # check with known types
+	    if (exists $subtype_map{$val}) {
+		my $indextype = $subtype_map{$val}->{'_name'};
+		create_element_path("$type_path=$indextype", $index_root, $namespace);
+		$cnt++;
+	    } else {
+		logger('ERROR', 'unknown bib type $val! skipping...');
+		$errcnt++;
+		return 0;
+	    }
+	}
+    }
+    # process sub type fields
+    if ($type) {
+	foreach my $n ($input_node->getChildNodes()) {
+	    my $name = $n->nodeName();
+	    my $val = $n->textContent();
+	    #logger('DEBUG', "  NODE: $name = '$val'");
+	    if (exists $subtype_map{$type}->{$name}) {
+		create_element_path($subtype_map{$type}->{$name}, $index_root, $namespace)
+		    ->appendTextNode($val);
+		$cnt++;
+	    }
+	}
+    }
+    return $cnt;
+}
+
+
+
+sub process_all_fm_entries {
+    my ($input_root) = @_;
+    my $cnt = 0;
+
+    foreach my $n ($input_root->findnodes('fm:ROW')) {
+	logger('INFO', "processing entry $cnt ...");
+	process_fm_entry($n);
+	$cnt++;
+    }
+}    
+
+
+sub process_fm_entry {
+    my ($input_node) = @_;
+    my $index_doc = XML::LibXML::Document->createDocument('1.0', 'UTF-8');
+    my $index_root = $index_doc->createElementNS($namespace, 'resource');
+    $index_root->addChild($index_doc->createAttributeNS($namespace, 'version', '1.1'));
+    $index_root->addChild($index_doc->createAttributeNS($namespace, 'type', 'MPIWG'));
+    $index_doc->setDocumentElement($index_root);
+    my $derived_from = "";
+
+    # try to find the document directory
+    my $doc_dir = "";
+    if ($online_mode) {
+	$doc_dir = find_online_path($input_node);
+	#$derived_from = find_arch_dir($input_node);
+    }
+    if (! (($doc_dir) && (-d $doc_dir))) {
+	logger('ERROR', "document directory not found! skipping...");
+	$errcnt++;
+	return;
+    }
+
+    # add standard stuff to index.meta
+    my ($docname, $docpath) = split_file_path($doc_dir);
+    # name and date
+    create_text_path('name', $docname, $index_root, $namespace);
+    create_text_path('archive-path', $doc_dir, $index_root, $namespace);
+    create_text_path('archive-creation-date', stime(time), $index_root, $namespace);
+    create_text_path('creator', 'quantum-history group', $index_root, $namespace);
+    create_text_path('description', 'a scanned document', $index_root, $namespace);
+    # media
+    create_text_path('media-type', 'image', $index_root, $namespace);
+    create_text_path('meta/content-type', 'scanned document', $index_root, $namespace);
+    # derived-from
+    if ($derived_from) {
+       create_text_path('derived-from/archive-path', $derived_from, $index_root, $namespace);
+    }
+    # access
+    if ($access_type) {
+	if ($access_type eq "free") {
+	    create_element_path('meta/access-conditions/access@type=free', $index_root, $namespace);
+	} else {
+	    my $acc_tag = create_element_path('meta/access-conditions/access@type=institution', $index_root, $namespace);
+	    create_text_path('name', $access_type, $acc_tag, $namespace);
+	}
+    }
+    # texttool tag with image dir
+    if ($texttool) {
+        create_text_path('meta/texttool/display', 'yes', $index_root, $namespace);
+	if ( -d "$doc_dir/$texttool_img_dir" ) {
+	    create_text_path('meta/texttool/image', $texttool_img_dir,$index_root, $namespace);
+	} else {
+            logger('WARNING', "page image directory missing!");
+            $warncnt++;
+        }
+        # check for fulltext
+        my @ftds = glob "$doc_dir/$texttool_fulltext_glob";
+        if (@ftds) {
+            @ftds = sort @ftds;
+            my $ftd = $ftds[$#ftds];
+	    create_text_path('meta/texttool/text', $ftd, $index_root, $namespace);
+            if (scalar @ftds > 1) {
+                logger('WARNING', "more than one fulltext! Chose ${ftd}.");
+                $warncnt++;
+            }
+            create_text_path('meta/texttool/pagebreak', $texttool_pb_tag, $index_root, $namespace);
+            create_text_path('meta/texttool/xslt', $texttool_xslt, $index_root, $namespace);
+            #create_text_path('meta/texttool/digiliburlprefix', $texttool_dlurlprefix, $index_root, $namespace);
+        }
+    }
+
+    # convert bib entries
+    my $cnt = convert_bib($input_node, $index_root, $index_doc);
+    if ($cnt == 0) {
+	# error or nothing to convert
+	logger('ERROR', "no bibliographic metadata!");
+	$errcnt++;
+	return;
+    }
+
+    # write new index.meta file
+    $filecnt++;
+    if ($dry_run) {
+	logger('DEBUG', "would write $doc_dir/index.meta");
+	logger('DEBUG', $index_doc->toString(1));
+    } else {
+	write_xml($index_doc, "$doc_dir/index.meta");
+    }
+
+}
+
+
+
+
+
+#######################################################
+# Main
+#
+
+# load filemaker xml dump
+my ($input_doc, $input_root) = read_xml($infile);
+# set namespace prefix
+my $fm_namespace = $input_root->namespaceURI();
+$input_root->setNamespace($fm_namespace, 'fm', 1);
+
+process_all_fm_entries($input_root);
+
+
+logger("INFO", "$filecnt files written");
+logger("INFO", "$warncnt warnings");
+logger("INFO", "$errcnt errors");
+if ($errcnt > 0) {
+    logger("ABORT", "there were errors!");
+    exit 1;
+} else {
+    logger("DONE", "done something successfully!");
+}
+