changeset 12:620aad237f57

new tool makemeta-lib.pl to create index meta entries from the libraries online sources database
author casties
date Thu, 26 May 2005 16:31:28 +0200
parents b19df18aa19a
children f33fb7f1898e
files MPIWGStor.pm makemeta-lib.pl
diffstat 2 files changed, 424 insertions(+), 7 deletions(-) [+]
line wrap: on
line diff
--- a/MPIWGStor.pm	Wed Mar 23 13:12:08 2005 +0100
+++ b/MPIWGStor.pm	Thu May 26 16:31:28 2005 +0200
@@ -1,4 +1,3 @@
-#!/usr/local/bin/perl -w
 
 #
 # Library with definitions and common routines for MPIWG storage system
@@ -13,14 +12,15 @@
 	    $file_perm);
 use Exporter;
 
-$VERSION = 0.4; #  ROC 20.1.2005
+$VERSION = 0.6; #  ROC 26.5.2005
 
 @ISA = qw(Exporter);
 
 @EXPORT = qw($debug %junk_files %index_files $index_file_perm $archived_index_file_perm 
 	     $file_perm
 	     &parseargs &logger &stime  &s2stime &ymd_date &split_file_path &sstrip 
-	     &valid_file_name &valid_dir_name &park_file &unpark_file &read_xml &write_xml);
+	     &valid_file_name &valid_dir_name &park_file &unpark_file
+             &read_xml &write_xml &add_text_element &create_element_path &create_text_path);
 
 # debug level
 $debug = 0;
@@ -264,15 +264,15 @@
 	exit 1;
     }
 
-    logger("INFO", "index file: $file");
+    logger("DEBUG", "index file: $file");
     eval { $document = $parser->parse_file($file) };
     # catch parsing errors
     if ($@) {
-	logger("ABORT", "XML syntax error in index file!!");
+	logger("ABORT", "XML syntax error in file $file!!");
 	exit 1;
     }
     my $rootnode = $document->documentElement;
-    logger("INFO", "root element: \'" . $rootnode->nodeName . "\'");
+    logger("DEBUG", "root element: \'" . $rootnode->nodeName . "\'");
 
     return ($document, $rootnode);
 }
@@ -293,7 +293,8 @@
 	}
     }
     # write new file
-    if ($document->toFile($filename)) {
+    my ($fn, $dir) = split_file_path($filename);
+    if (-d $dir && $document->toFile($filename)) {
 	logger("INFO", "written new file $filename");
 	chmod $index_file_perm, $filename;
     } else {
@@ -302,6 +303,71 @@
     }
 }
 
+#
+# $elem = add_text_element($node, $name, $value, $namespace)
+#
+# creates an XML element with the name $name and the text content
+# $value attached to the node $node and returns it.
+#
+sub add_text_element {
+    my ($node, $name, $value, $namespace) = @_;
+
+    my $doc = $node->ownerDocument;
+    my $text = $doc->createTextNode($value);
+    my $newnode = $node->addNewChild($namespace, $name);
+    $newnode->addChild($text);
+
+    return $newnode;
+}
+
+#
+# $elem = create_element_path($path, $root, $namespace)
+#
+# creates and returns a DOM element at the given path from the 
+# given root. path is e.g. meta/bib@type=book. elements are separated
+# by /, an additional attribute can be specified after the @.
+#
+sub create_element_path {
+    my ($path, $root, $namespace) = @_;
+    my $attribute = "";
+    # get attribute
+    if ($path =~ /^(.+)@(.+)$/) {
+	$attribute = $2;
+	$path = $1;
+    }
+    my $point = $root;
+    for my $p (split /\//, $path) {
+	# check if the next path element exists
+	my $n = ($point->findnodes($p))[0];
+	if ($n) {
+	    $point = $n;
+	} else {
+	    # create if it doesn't exist
+	    $point = $point->addNewChild($namespace, $p);
+	}
+    }
+    # add the attribute
+    if ($attribute) {
+	my $dom = $root->getOwner();
+	my ($attkey, $attval) = split /=/, $attribute; #/ silly fontlock...
+	$point->addChild($dom->createAttributeNS($namespace, $attkey, $attval));
+    }
+    return $point;
+}
+
+#
+# $elem = create_text_path($path, $text, $root, $namespace)
+#
+# creates and returns a DOM text element with the given content at the
+# given path from the given root.
+#
+sub create_text_path {
+    my ($path, $text, $root, $namespace) = @_;
+    my $elem = create_element_path($path, $root, $namespace)->appendTextNode($text);
+    return $elem;
+}
+
+
 
 # module init
 return 1;
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/makemeta-lib.pl	Thu May 26 16:31:28 2005 +0200
@@ -0,0 +1,351 @@
+#!/usr/local/bin/perl -w
+
+use strict;
+use XML::LibXML;
+
+use lib '/usr/local/mpiwg/archive';
+use MPIWGStor;
+
+# make output unbuffered
+$|=1;
+
+#######################################################
+# internal parameters
+#
+
+# program version
+my $version = "0.1.0 (24.5.2005)";
+logger("INFO", "makemeta-lib $version");
+
+#
+# mappings
+#
+# generic mappings at top level
+my %gen_map = (
+    'Device' => 'meta/image-acquisition/device',
+    'Image_Type' => 'meta/image-acquisition/image-type',
+    'Production_Comment' => 'meta/image-acquisition/production-comment',
+    'Postproduction' => 'meta/image-acquisition/production-comment',
+    'Language' => 'meta/lang'
+    );
+# sub type switch tag
+my %type_map = (
+    'Reference_Type' => 'meta/bib@type'
+    );
+# sub type mappings
+my %subtype_map = (
+    'Book' => {
+	'_name' => 'book',
+	'Author' => 'meta/bib/author',
+	'Title' => 'meta/bib/title',
+	'Year' => 'meta/bib/year',
+	'Place_Published' => 'meta/bib/city',
+	'Publisher' => 'meta/bib/publisher',
+	'Edition' => 'meta/bib/edition'
+    },
+    'Journal Article' => {
+	'_name' => 'journal-article',
+	'Author' => 'meta/bib/author',
+	'Title' => 'meta/bib/title',
+	'Year' => 'meta/bib/year',
+	'Secondary_Title' => 'meta/bib/journal',
+	'Volume' => 'meta/bib/volume',
+	'Number' => 'meta/bib/issue',
+	'Pages' => 'meta/bib/pages'
+    },
+    'In Book' => {
+	'_name' => 'inbook',
+	'Author' => 'meta/bib/author',
+	'Title' => 'meta/bib/title',
+	'Year' => 'meta/bib/year',
+	'Secondary_Title' => 'meta/bib/book-title',
+	'Pages' => 'meta/bib/pages'
+    },
+    'Newspaper Article' => {
+	'_name' => 'newspaper-article',
+	'Author' => 'meta/bib/author',
+	'Title' => 'meta/bib/title',
+	'Year' => 'meta/bib/year',
+	'Secondary_Title' => 'meta/bib/newspaper',
+	'Place_Published' => 'meta/bib/city',
+	'Number' => 'meta/bib/issue-date',
+	'Pages' => 'meta/bib/pages'
+    },
+    'Edited Book' => {
+	'_name' => 'edited-book',
+	'Author' => 'meta/bib/editor',
+	'Title' => 'meta/bib/title',
+	'Year' => 'meta/bib/year',
+	'Place_Published' => 'meta/bib/city',
+	'Publisher' => 'meta/bib/publisher',
+	'Edition' => 'meta/bib/edition'
+    },
+    'Manuscript' => {
+	'_name' => 'manuscript',
+	'Author' => 'meta/bib/author',
+	'Title' => 'meta/bib/title',
+	'Year' => 'meta/bib/year',
+	'Place_Published' => 'meta/bib/location',
+    }
+    );
+# language element
+my $lang_field = 'Language';
+# languages to iso codes
+my %lang_map = (
+    'German' => 'de',
+    'English' => 'en',
+    'Italian' => 'it',
+    'French' => 'fr',
+    'Latin' => 'la'
+    );
+# storage fields
+my $arch_id_field = 'ID_Archive';
+my $online_url_field = 'URL';
+
+# more storage
+my $lib_arch_dir = '/mpiwg/archive/data/library';
+my $lib_online_dir = '/mpiwg/online/permanent';
+
+
+# read command line parameters
+my $args = MPIWGStor::parseargs;
+
+# debug level
+$debug = (exists $$args{'debug'}) ? $$args{'debug'} : 0;
+
+# use einstein-cw mode
+my $cw_mode = (exists $$args{'cw-mode'}) ? $$args{'cw-mode'} : 0;
+
+# index.meta namespace (not really implemented!)
+my $namespace = "";
+
+
+my $xml_changed = 0;
+my $errcnt = 0;
+my $warncnt = 0;
+
+#######################################################
+# check parameters that were passed to the program
+#
+my $infile = $$args{'path'};
+if (! $infile) {
+    logger("ABORT", "no input file given!");
+    exit 1;
+}
+# strip double slashes
+$infile =~ s/\/\//\//;
+if (! -f $infile) {
+    logger("ABORT", "input file \'$infile\' doesn't exist!");
+    exit 1;
+}
+
+
+#######################################################
+# subroutines
+#
+
+sub find_cw_dir {
+    my ($input_node) = @_;
+    my $src_dir = find_online_dir($input_node, '/mpiwg/archive/data/library/inbox/zwischen_backup');
+    my $dest_id = $input_node->findvalue("fm:$arch_id_field");
+    if (! $dest_id) {
+	logger('ERROR', "no ID field for einstein-cw entry");
+	$errcnt++;
+	return;
+    }
+    my $dir = "$lib_arch_dir/$dest_id";
+    logger('DEBUG', "moving $src_dir to $dir");
+    if (rename $src_dir, $dir) {
+	if (-d $dir) {
+	    logger('DEBUG', "directory $dir OK"); 
+	    return $dir;
+	}
+    } else {
+	logger('ABORT', "unable to rename directory $src_dir to $dir!");
+	exit 1;
+    }
+    return;
+}
+
+sub find_online_dir {
+    my ($input_node, $base_dir) = @_;
+    $base_dir = $lib_online_dir unless ($base_dir);
+
+    my $online_url = $input_node->findvalue("fm:$online_url_field");
+    if ($online_url =~ /fn=permanent\/(.+)\/pageimg/) {
+	my $online_dir = $1;
+	#logger("DEBUG", "dir: $base_dir/$online_dir");
+	my $dir = "$base_dir/$online_dir";
+	if (-d $dir) {
+	    logger('DEBUG', "directory $dir exists"); 
+	    return $dir;
+	}
+    }
+    return;
+}
+
+sub find_arch_dir {
+    my ($input_node) = @_;
+    my $dir = "";
+
+    my $bib_dir = $input_node->findvalue("fm:$arch_id_field");
+    #logger('DEBUG', "bibdir: $bib_dir");
+    if ($bib_dir) {
+	$dir = "$lib_arch_dir/$bib_dir";
+	if (-d $dir) {
+	    logger('DEBUG', "directory $dir exists"); 
+	    return $dir;
+	}
+    }
+    return;
+}
+
+
+sub convert_bib {
+    my ($input_node, $index_root, $index_doc) = @_;
+    my $cnt = 0;
+    my $type = "";
+    my $type_path = "";
+
+    # process general stuff first
+    foreach my $n ($input_node->getChildNodes()) {
+	my $name = $n->nodeName();
+	my $val = $n->textContent();
+	#logger('DEBUG', "  NODE: $name = '$val'");
+	if (exists $gen_map{$name}) {
+	    # is a general field
+	    if ($name eq $lang_field) {
+		# language field -> convert to iso code
+		if (exists $lang_map{$val}) {
+		    $val = $lang_map{$val};
+		} else {
+		    logger('ERROR', "unknown language: $val! skipping...");
+		    $errcnt++;
+		    return 0;
+		}
+	    }
+	    create_element_path($gen_map{$name}, $index_root, $namespace)
+		->appendTextNode($val);
+	    $cnt++;
+	} elsif (exists $type_map{$name}) {
+	    # is a type field
+	    $type_path = $type_map{$name};
+	    $type = $val;
+	    # check with known types
+	    if (exists $subtype_map{$val}) {
+		my $indextype = $subtype_map{$val}->{'_name'};
+		create_element_path("$type_path=$indextype", $index_root, $namespace);
+		$cnt++;
+	    } else {
+		logger('ERROR', 'unknown bib type $val! skipping...');
+		$errcnt++;
+		return 0;
+	    }
+	}
+    }
+    # process sub type fields
+    if ($type) {
+	foreach my $n ($input_node->getChildNodes()) {
+	    my $name = $n->nodeName();
+	    my $val = $n->textContent();
+	    #logger('DEBUG', "  NODE: $name = '$val'");
+	    if (exists $subtype_map{$type}->{$name}) {
+		create_element_path($subtype_map{$type}->{$name}, $index_root, $namespace)
+		    ->appendTextNode($val);
+		$cnt++;
+	    }
+	}
+    }
+    return $cnt;
+}
+
+
+
+sub process_all_fm_entries {
+    my ($input_root) = @_;
+    my $cnt = 0;
+
+    foreach my $n ($input_root->findnodes('fm:ROW')) {
+	logger('INFO', "processing entry $cnt ...");
+	process_fm_entry($n);
+    }
+}    
+
+
+sub process_fm_entry {
+    my ($input_node) = @_;
+    my $index_doc = XML::LibXML::Document->createDocument('1.0', 'UTF-8');
+    my $index_root = $index_doc->createElementNS($namespace, 'resource');
+    $index_root->addChild($index_doc->createAttributeNS($namespace, 'version', '1.1'));
+    $index_root->addChild($index_doc->createAttributeNS($namespace, 'type', 'MPIWG'));
+    $index_doc->setDocumentElement($index_root);
+
+    # try to find the document directory
+    my $doc_dir = "";
+    if ($cw_mode) {
+	$doc_dir = find_cw_dir($input_node);
+    } else {
+	$doc_dir = find_arch_dir($input_node);
+    }
+    if (! $doc_dir) {
+	logger('ERROR', "document directory not found! skipping...");
+	$errcnt++;
+	return;
+    }
+
+    # add standard stuff to index.meta
+    my ($docname, $docpath) = split_file_path($doc_dir);
+    # name and date
+    create_text_path('name', $docname, $index_root, $namespace);
+    create_text_path('archive-path', $doc_dir, $index_root, $namespace);
+    create_text_path('archive-creation-date', stime(time), $index_root, $namespace);
+    create_text_path('creator', 'digigroup', $index_root, $namespace);
+    create_text_path('description', 'a scanned document', $index_root, $namespace);
+    # acquisition
+    create_text_path('meta/acquisition/date', stime(time), $index_root, $namespace);
+    create_text_path('meta/acquisition/provider/provider-id', 'digigroup', $index_root, $namespace);
+    create_text_path('meta/acquisition/provider/address', 'Max Planck Institute for the History of Science', $index_root, $namespace);
+    # media
+    create_text_path('media-type', 'image', $index_root, $namespace);
+    create_text_path('meta/content-type', 'scanned document', $index_root, $namespace);
+
+    # convert bib entries
+    my $cnt = convert_bib($input_node, $index_root, $index_doc);
+    if ($cnt == 0) {
+	# error or nothing to convert
+	logger('ERROR', "no bibliographic metadata!");
+	$errcnt++;
+	return;
+    }
+
+    # write new index.meta file
+    write_xml($index_doc, "$doc_dir/index.meta");
+
+}
+
+
+
+
+
+#######################################################
+# Main
+#
+
+# load filemaker xml dump
+my ($input_doc, $input_root) = read_xml($infile);
+# set namespace prefix
+my $fm_namespace = $input_root->namespaceURI();
+$input_root->setNamespace($fm_namespace, 'fm', 1);
+
+process_all_fm_entries($input_root);
+
+
+logger("INFO", "$warncnt warnings");
+logger("INFO", "$errcnt errors");
+if ($errcnt > 0) {
+    logger("ABORT", "there were errors!");
+    exit 1;
+} else {
+    logger("DONE", "done something successfully!");
+}
+