changeset 21:c8e4e8cb31dd

new tool for creating index files for vlp documents
author casties
date Tue, 20 Sep 2005 19:32:06 +0200
parents 79c6618e8dfa
children c3defe3e2780
files makemeta-vlp.pl
diffstat 1 files changed, 420 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/makemeta-vlp.pl	Tue Sep 20 19:32:06 2005 +0200
@@ -0,0 +1,420 @@
+#!/usr/local/bin/perl -w
+
+use strict;
+use XML::LibXML;
+
+use lib '/usr/local/mpiwg/archive_devel';
+use MPIWGStor;
+
+# make output unbuffered
+$| = 1;
+
+# program version (shown in the startup log line below)
+my $version = "0.2 (19.9.2005 ROC)";
+# usage text; printed when the script is started without any arguments
+my $help = "use: makemeta-vlp [options] file.xml
+options:
+  -debug  show debugging info
+  -dry-run  simulate, don't do anything
+  -replace  replace existing index files
+  -online-mode  mode for creating online/permanent files
+  -archive-mode  mode for creating archive/data files
+";
+logger("INFO", "makemeta-vlp $version");
+
+###########################################
+# mappings from field names of the input (FileMaker XML dump) to target paths in index.meta
+
+# generic fields, looked up for every entry before the type is known: field name => target path
+my %gen_map = (
+    'Custom2_Language' => 'meta/lang'
+    );
+# field whose value selects the entry of %subtype_map to use: field name => path the type name is appended to
+my %type_map = (
+    'ReferenceType' => 'meta/bib@type'
+    );
+# sub type mappings: value of the type field => { input field name => target path }; '_name' is not a field
+my %subtype_map = (
+    'Book' => {
+	'_name' => 'book',	# type name; convert_bib creates the path "<type path>=<_name>" from it
+	'Author' => 'meta/bib/author',
+	'Title' => 'meta/bib/title',
+	'Year' => 'meta/bib/year',
+	'Place_Published' => 'meta/bib/city',
+	'Publisher' => 'meta/bib/publisher',
+	'Edition' => 'meta/bib/edition',
+	'Volume' => 'meta/bib/volume',
+	'NumberOfVolumes' => 'meta/bib/number-of-volumes',
+	'Pages' => 'meta/bib/number-of-pages'
+    },
+    'Book Section' => {
+	'_name' => 'inbook',
+	'Author' => 'meta/bib/author',
+	'Title' => 'meta/bib/title',
+	'Year' => 'meta/bib/year',
+	'Secondary_Title' => 'meta/bib/book-title',	# NOTE(review): written 'SecondaryTitle' under 'Journal Article' and 'Report' - confirm the input field name
+	'SecondaryAuthor' => 'meta/bib/editor',
+	'Volume' => 'meta/bib/volume',
+	'NumberOfVolumes' => 'meta/bib/number-of-volumes',
+	'Pages' => 'meta/bib/pages'
+    },
+    'Edited Book' => {
+	'_name' => 'edited-book',
+	'Author' => 'meta/bib/editor',	# for this type the Author field is stored as editor
+	'Title' => 'meta/bib/title',
+	'Year' => 'meta/bib/year',
+	'Place_Published' => 'meta/bib/city',
+	'Publisher' => 'meta/bib/publisher',
+	'Edition' => 'meta/bib/edition',
+	'Volume' => 'meta/bib/volume',
+	'NumberOfVolumes' => 'meta/bib/number-of-volumes',
+	'Pages' => 'meta/bib/number-of-pages'
+    },
+    'Journal Article' => {
+	'_name' => 'journal-article',
+	'Author' => 'meta/bib/author',
+	'Title' => 'meta/bib/title',
+	'Year' => 'meta/bib/year',
+	'SecondaryTitle' => 'meta/bib/journal',	# NOTE(review): written 'Secondary_Title' under 'Book Section' and 'Magazine Article' - confirm the input field name
+	'Volume' => 'meta/bib/volume',
+	'Number_Issue' => 'meta/bib/issue',
+	'Pages' => 'meta/bib/pages'
+    },
+    'Magazine Article' => {
+	'_name' => 'magazine-article',
+	'Author' => 'meta/bib/author',
+	'Title' => 'meta/bib/title',
+	'Year' => 'meta/bib/year',
+	'Secondary_Title' => 'meta/bib/magazine',
+	'Number_Issue' => 'meta/bib/issue-number',
+	'Date' => 'meta/bib/issue-date',
+	'Pages' => 'meta/bib/pages'
+    },
+    'Report' => {
+	'_name' => 'report',
+	'Author' => 'meta/bib/author',
+	'Title' => 'meta/bib/title',
+	'Year' => 'meta/bib/year',
+	'Place_Published' => 'meta/bib/city',
+	'Date' => 'meta/bib/date',
+	'SecondaryTitle' => 'meta/bib/type',
+	'Pages' => 'meta/bib/pages'
+    },
+    'Trade Catalogue' => {
+	'_name' => 'report',	# NOTE(review): same _name as 'Report' - confirm this is intended
+	'Author' => 'meta/bib/author',
+	'Title' => 'meta/bib/title',
+	'Year' => 'meta/bib/year',
+	'Place_Published' => 'meta/bib/city',
+	'Date' => 'meta/bib/date',
+	'Volume' => 'meta/bib/volume',
+	'NumberOfVolumes' => 'meta/bib/number-of-volumes',
+	'ReferenceType' => 'meta/bib/type',	# the type field itself is copied as well for this type
+	'Pages' => 'meta/bib/pages'
+    },
+    'Thesis' => {
+	'_name' => 'thesis',
+	'Author' => 'meta/bib/author',
+	'Title' => 'meta/bib/title',
+	'Place_Published' => 'meta/bib/city',
+	'Publisher' => 'meta/bib/university',
+	'Date' => 'meta/bib/date',
+	'TypeOfWork' => 'meta/bib/type',
+	'Pages' => 'meta/bib/number-of-pages'
+    },
+    'Manuscript' => {
+	'_name' => 'manuscript',
+	'Author' => 'meta/bib/author',
+	'Title' => 'meta/bib/title',
+	'Year' => 'meta/bib/year',
+	'Place_Published' => 'meta/bib/location',
+	'Pages' => 'meta/bib/pages'
+    }
+    );
+# name of the input field that holds the language; convert_bib translates its value through %lang_map
+my $lang_field = 'Custom2_Language';
+# language names as they appear in the input => iso codes; a name missing here makes convert_bib skip the entry
+my %lang_map = (
+    'German' => 'de',
+    'English' => 'en',
+    'Italian' => 'it',
+    'French' => 'fr',
+    'Latin' => 'la',
+    'Japanese' => 'ja',
+    'Dutch' => 'nl',
+    'Spanish' => 'es',
+    'Swedish' => 'sv'
+    );
+# storage fields: input field whose value follows "lit" in the name of the document directory
+my $arch_id_field = 'ID';
+
+#######################################################
+# internal parameters
+#
+
+# base directories of the document storage
+my $lib_arch_dir = '/mpiwg/archive/data/vlp';
+my $lib_online_dir = '/mpiwg/online/permanent/vlp';   # NOTE(review): not referenced in the code visible here
+
+# parse the command line; without any arguments print the usage text and quit
+my $args = MPIWGStor::parseargs;
+unless (scalar(%$args)) {
+    print $help, "\n";
+    exit 1;
+}
+# value of a command line option, or 0 if the option was not given
+my $option = sub { my ($name) = @_; exists $args->{$name} ? $args->{$name} : 0 };
+
+# debug level; $debug is not declared in this script, presumably MPIWGStor provides it - confirm
+$debug = $option->('debug');
+
+# simulate action only
+my $dry_run = $option->('dry-run');
+logger('DEBUG', "dry-run: $dry_run");
+
+# replace existing index files
+my $do_replace = $option->('replace');
+logger('DEBUG', "replace: $do_replace");
+
+# use online mode
+my $online_mode = $option->('online-mode');
+logger('DEBUG', "online_mode: $online_mode");
+
+# use archive mode
+my $archive_mode = $option->('archive-mode');
+logger('DEBUG', "archive_mode: $archive_mode");
+
+# index.meta namespace (not really implemented!)
+my $namespace = "";
+
+my $xml_changed = 0;
+my $errcnt = 0;
+my $warncnt = 0;
+
+#######################################################
+# check parameters that were passed to the program
+#
+my $infile = $$args{'path'};   # the input file is taken from the 'path' entry of the parsed arguments
+if (! $infile) {
+    logger("ABORT", "no input file given!");
+    exit 1;
+}
+# collapse every run of slashes into a single one (all occurrences, not only the first)
+$infile =~ s{//+}{/}g;
+if (! -f $infile) {
+    logger("ABORT", "input file \'$infile\' doesn't exist!");
+    exit 1;
+}
+
+
+#######################################################
+# subroutines
+#
+
+
+# Returns the archive directory "$lib_arch_dir/lit<ID>" of an input entry, or
+# nothing if the entry has no ID or that directory does not exist.
+#   $input_node  node of one input entry; its fm:<$arch_id_field> value is read
+sub find_arch_dir {
+    my ($input_node) = @_;
+    # pass the ID through sstrip exactly as find_permanent_dir does, so that
+    # both lookups derive the directory name from the same normalised value
+    my $bib_id = sstrip($input_node->findvalue("fm:$arch_id_field"));
+    return unless $bib_id;
+
+    my $dir = "$lib_arch_dir/lit$bib_id";
+    return unless -d $dir;
+    logger('DEBUG', "directory $dir exists");
+    return $dir;
+}
+
+# Builds the online directory name "/mpiwg/online/permanent/lit<ID>" for an input entry; its existence
+# is not checked. Without an ID an error is logged, $errcnt is incremented and nothing is returned.
+# NOTE(review): the base path is hard-coded here rather than taken from $lib_online_dir - confirm.
+sub find_permanent_dir {
+    my ($input_node) = @_;
+    my $id = sstrip($input_node->findvalue("fm:$arch_id_field"));
+    my $base = '/mpiwg/online/permanent';
+    return "$base/lit$id" if $id;
+    logger('ERROR', "no ID field for online permanent entry");
+    $errcnt++;
+    return;
+}
+
+
+# Copies the bibliographic fields of one input entry into the index document.
+# Pass 1 handles the generic fields (%gen_map) and the type field (%type_map);
+# pass 2 copies the fields listed in %subtype_map for the type that was found.
+#   $input_node  entry whose child nodes are converted
+#   $index_root  handed to create_element_path as the root of the target paths
+#   $index_doc   part of the interface, not used in this sub
+# Returns the number of fields converted; returns 0 (after logging an error
+# and incrementing $errcnt) for an unknown language or bibliographic type.
+sub convert_bib {
+    my ($input_node, $index_root, $index_doc) = @_;
+    my $created = 0;
+    my $bib_type = "";
+
+    # pass 1: generic fields and the type switch
+    for my $child ($input_node->getChildNodes()) {
+        my $field = $child->nodeName();
+        my $text = $child->textContent();
+        if (exists $gen_map{$field}) {
+            if ($field eq $lang_field) {
+                # an empty language only earns a warning; the field is skipped
+                if (not $text) {
+                    logger('WARNING', "no language tag");
+                    $warncnt++;
+                    next;
+                }
+                # a language without an entry in %lang_map aborts the entry
+                unless (exists $lang_map{$text}) {
+                    logger('ERROR', "unknown language: $text! skipping...");
+                    $errcnt++;
+                    return 0;
+                }
+                $text = $lang_map{$text};
+            }
+            my $target = create_element_path($gen_map{$field}, $index_root, $namespace);
+            $target->appendTextNode($text);
+            $created++;
+        }
+        elsif (exists $type_map{$field}) {
+            $bib_type = $text;
+            # only types listed in %subtype_map can be converted
+            unless (exists $subtype_map{$text}) {
+                logger('ERROR', "unknown bib type $text! skipping...");
+                $errcnt++;
+                return 0;
+            }
+            my $index_type = $subtype_map{$text}->{'_name'};
+            create_element_path("$type_map{$field}=$index_type", $index_root, $namespace);
+            $created++;
+        }
+    }
+    return $created unless $bib_type;
+
+    # pass 2: fields that belong to the detected type
+    my $fields = $subtype_map{$bib_type};
+    for my $child ($input_node->getChildNodes()) {
+        my $field = $child->nodeName();
+        next unless exists $fields->{$field};
+        create_element_path($fields->{$field}, $index_root, $namespace)
+            ->appendTextNode($child->textContent());
+        $created++;
+    }
+    return $created;
+}
+
+
+
+# Calls process_fm_entry for every node that findnodes('fm:ROW') returns for
+# $input_root, announcing each one in the log with its zero-based position.
+sub process_all_fm_entries {
+    my ($input_root) = @_;
+    my @rows = $input_root->findnodes('fm:ROW');
+    for my $idx (0 .. $#rows) {
+        logger('INFO', "processing entry $idx ...");
+        process_fm_entry($rows[$idx]);
+    }
+}
+
+
+# Builds and writes the index.meta file for one input entry.
+# The entry is skipped when no document directory is found, when an
+# index.meta already exists and -replace is not set, or when convert_bib
+# reports nothing converted. With -dry-run the result is only logged.
+#   $input_node  the input entry to process
+sub process_fm_entry {
+    my ($input_node) = @_;
+    # new document with root element 'resource' carrying version and type
+    my $index_doc = XML::LibXML::Document->createDocument('1.0', 'UTF-8');
+    my $index_root = $index_doc->createElementNS($namespace, 'resource');
+    for my $attr (['version', '1.1'], ['type', 'MPIWG']) {
+        $index_root->addChild($index_doc->createAttributeNS($namespace, @$attr));
+    }
+    $index_doc->setDocumentElement($index_root);
+
+    # the archive directory is looked up only for -archive-mode without
+    # -online-mode; every other combination uses the online directory
+    my $doc_dir = ($archive_mode && !$online_mode)
+        ? find_arch_dir($input_node)
+        : find_permanent_dir($input_node);
+    if (! $doc_dir) {
+        logger('ERROR', "document directory not found! skipping...");
+        $errcnt++;
+        return;
+    }
+
+    # an existing index.meta is left alone unless -replace is set
+    if (-f "$doc_dir/index.meta" and not $do_replace) {
+        logger('DEBUG', "index file in $doc_dir exists");
+        return;
+    }
+
+    # standard fields as [path, text] pairs (acquisition data only in archive mode), created in this order
+    my ($docname) = split_file_path($doc_dir);
+    my @fields = (
+        ['name', $docname],
+        ['archive-path', $doc_dir],
+        ['archive-creation-date', stime(time)],
+        ['creator', 'vlp'],
+        ['description', 'a scanned document'],
+    );
+    if ($archive_mode) {
+        push @fields, (
+            ['meta/acquisition/date', stime(time)],
+            ['meta/acquisition/provider/provider-id', 'vlp'],
+            ['meta/acquisition/provider/address', 'Max Planck Institute for the History of Science'],
+            ['meta/image-acquisition/device', 'Flatbed Scanner'],
+            ['meta/image-acquisition/image-type', 'Greyscale'],
+            ['meta/image-acquisition/production-comment', 'Raw scans in \'raw\' folder, cleaned pages in \'pages\' folder.'],
+        );
+    }
+    push @fields, ['media-type', 'image'], ['meta/content-type', 'scanned document'];
+    create_text_path(@$_, $index_root, $namespace) for @fields;
+
+    # convert bib entries; a count of 0 means error or nothing to convert
+    my $converted = convert_bib($input_node, $index_root, $index_doc);
+    if ($converted == 0) {
+        logger('ERROR', "no bibliographic metadata!");
+        $errcnt++;
+        return;
+    }
+
+    # write the new index.meta file, or only show it on a dry run
+    my $index_file = "$doc_dir/index.meta";
+    if (not $dry_run) {
+        write_xml($index_doc, $index_file);
+    } else {
+        logger('DEBUG', "would write $index_file");
+        logger('DEBUG', $index_doc->toString(1));
+    }
+}
+
+
+
+
+
+#######################################################
+# Main
+#
+
+# load filemaker xml dump
+my ($input_doc, $input_root) = read_xml($infile);
+# give the root's own namespace the prefix 'fm', which the 'fm:...' lookups in the subroutines use
+my $fm_namespace = $input_root->namespaceURI();
+$input_root->setNamespace($fm_namespace, 'fm', 1);
+
+# handle every entry of the dump
+process_all_fm_entries($input_root);
+
+# summary; any error ends the run with exit status 1
+logger("INFO", "$warncnt warnings");
+logger("INFO", "$errcnt errors");
+if ($errcnt <= 0) {
+    logger("DONE", "done something successfully!");
+} else {
+    logger("ABORT", "there were errors!");
+    exit 1;
+}
+