#!/usr/local/bin/perl -w
use strict;
use XML::LibXML;
use lib '/usr/local/mpiwg/archive';
use MPIWGStor;
# make output unbuffered
$| = 1;

# program version (shown in the start-up log line)
my $version = "0.2.6 (11.12.2006 ROC)";

# usage text; printed when the script is started without any arguments
my $help =
"use: makemeta-lib [options] file.xml
options:
-debug show debugging info
-dry-run simulate, don't do anything
-online-mode mode for creating online/permanent files
-online-base=dir base directory for online ids (for online mode)
-cw-mode mode for copying einstein_cw archive documents
-digifiles-mode mode for copying files from digifiles
-map-file=mapfile.xml digilib mapping file (for digifiles mode)
-access=free adds free access tag (use access=mpiwg for restricted access)
";

# announce program name and version
logger("INFO", "makemeta-lib $version");
###########################################
# mappings

# Generic top-level fields: name of an input element => element path that
# convert_bib() hands to create_element_path().
my %gen_map = (
    Device             => 'meta/image-acquisition/device',
    Image_Type         => 'meta/image-acquisition/image-type',
    Production_Comment => 'meta/image-acquisition/production-comment',
    Postproduction     => 'meta/image-acquisition/production-comment',
    Language           => 'meta/lang',
);

# Input element whose value selects the sub type (a key of %subtype_map);
# the mapped path receives the sub type's '_name' as "path=name".
my %type_map = (
    Reference_Type => 'meta/bib@type',
);
# Per sub type field mappings. The outer key is the value of the type field
# in the input; '_name' is the type name written to the index.meta file, all
# other entries map an input element name to an element path.
my %subtype_map = (
    'Book' => {
        _name           => 'book',
        Author          => 'meta/bib/author',
        Title           => 'meta/bib/title',
        Year            => 'meta/bib/year',
        Place_Published => 'meta/bib/city',
        Publisher       => 'meta/bib/publisher',
        Edition         => 'meta/bib/edition',
    },
    'Journal Article' => {
        _name           => 'journal-article',
        Author          => 'meta/bib/author',
        Title           => 'meta/bib/title',
        Year            => 'meta/bib/year',
        Secondary_Title => 'meta/bib/journal',
        Volume          => 'meta/bib/volume',
        Number          => 'meta/bib/issue',
        Pages           => 'meta/bib/pages',
    },
    'In Book' => {
        _name           => 'inbook',
        Author          => 'meta/bib/author',
        Title           => 'meta/bib/title',
        Year            => 'meta/bib/year',
        Secondary_Title => 'meta/bib/book-title',
        Pages           => 'meta/bib/pages',
    },
    'Newspaper Article' => {
        _name           => 'newspaper-article',
        Author          => 'meta/bib/author',
        Title           => 'meta/bib/title',
        Year            => 'meta/bib/year',
        Secondary_Title => 'meta/bib/newspaper',
        Place_Published => 'meta/bib/city',
        Number          => 'meta/bib/issue-date',
        Pages           => 'meta/bib/pages',
    },
    'Edited Book' => {
        _name           => 'edited-book',
        # for edited books the input's Author field holds the editor
        Author          => 'meta/bib/editor',
        Title           => 'meta/bib/title',
        Year            => 'meta/bib/year',
        Place_Published => 'meta/bib/city',
        Publisher       => 'meta/bib/publisher',
        Edition         => 'meta/bib/edition',
    },
    'Manuscript' => {
        _name           => 'manuscript',
        Author          => 'meta/bib/author',
        Title           => 'meta/bib/title',
        Year            => 'meta/bib/year',
        Place_Published => 'meta/bib/location',
    },
);
# name of the input element that carries the language
my $lang_field = 'Language';

# language names as they appear in the input => ISO codes
my %lang_map = (
    German   => 'de',
    English  => 'en',
    Italian  => 'it',
    French   => 'fr',
    Latin    => 'la',
    Japanese => 'ja',
    Dutch    => 'nl',
    Spanish  => 'es',
    Swedish  => 'sv',
);

# names of the input elements that describe where a document is stored
my $arch_id_field    = 'ID_Archive';
my $online_url_field = 'URL';
my $online_id_field  = 'ID_OnlinePermanent';
#######################################################
# internal parameters
#

# storage locations
my $lib_arch_dir = '/mpiwg/archive/data/library';
my $lib_online_dir = '/mpiwg/online/permanent';
my $lib_digilib_path = 'permanent';
my $digifiles_dir = "/net/digifiles.mpiwg-berlin.mpg.de/Volumes/raid";

# read command line parameters; without any parameter just show the usage
my $args = MPIWGStor::parseargs;
if (! scalar(%$args)) {
    print $help, "\n";
    exit 1;
}

# debug level
# NOTE(review): $debug is not declared in this file; presumably it is
# exported by MPIWGStor -- confirm before adding a "my" here.
$debug = (exists $args->{'debug'}) ? $args->{'debug'} : 0;

# simulate action only
my $dry_run = (exists $args->{'dry-run'}) ? $args->{'dry-run'} : 0;
logger('DEBUG', "dry-run: $dry_run");

# use online mode
my $online_mode = (exists $args->{'online-mode'}) ? $args->{'online-mode'} : 0;
logger('DEBUG', "online_mode: $online_mode");

# online base dir
my $online_base_dir = (exists $args->{'online-base'}) ? $args->{'online-base'} : "";
logger('DEBUG', "online_base_dir: $online_base_dir");

# use einstein-cw mode
my $cw_mode = (exists $args->{'cw-mode'}) ? $args->{'cw-mode'} : 0;
logger('DEBUG', "cw_mode: $cw_mode");

# use digifiles mode
my $digifiles_mode = (exists $args->{'digifiles-mode'}) ? $args->{'digifiles-mode'} : 0;
logger('DEBUG', "digifiles_mode: $digifiles_mode");

# digilib mapping file; the document and its root element are created in
# the main section when digifiles mode is active
my $map_file_name = (exists $args->{'map-file'}) ? $args->{'map-file'} : "";
logger('DEBUG', "map_file_name: $map_file_name");
my $mapping_doc;
my $mapping_root;

# access type
my $access_type = (exists $args->{'access'}) ? $args->{'access'} : "";

# index.meta namespace (not really implemented!)
my $namespace = "";

# bookkeeping counters
my $xml_changed = 0;
my $errcnt = 0;
my $warncnt = 0;

#######################################################
# check parameters that were passed to the program
#
my $infile = $args->{'path'};
if (! $infile) {
    logger("ABORT", "no input file given!");
    exit 1;
}
# strip double slashes
$infile = sstrip($infile, 1);
if (! -f $infile) {
    logger("ABORT", "input file \'$infile\' doesn't exist!");
    exit 1;
}
# Digifiles mode records every moved directory in the mapping file and
# aborts when it cannot write it. Refuse to start without a mapping file so
# that the abort cannot happen after document files were already copied.
if ($digifiles_mode && ! $map_file_name) {
    logger("ABORT", "no mapping file given for digifiles mode (use -map-file)!");
    exit 1;
}
#######################################################
# subroutines
#
#
# add_digilib_mapping($src_dir, $dest_dir)
#
# Appends a <mapping link="$src_dir" dir="$dest_dir"/> element to the digilib
# mapping document and writes the whole document to $map_file_name.
# Aborts the program if no mapping file name is set.
#
sub add_digilib_mapping {
    my ($link, $target) = @_;
    my $mapping = $mapping_root->addNewChild($namespace, 'mapping');
    for my $attr (['link', $link], ['dir', $target]) {
        $mapping->addChild($mapping_doc->createAttributeNS($namespace, @$attr));
    }
    if (! $map_file_name) {
        logger('ABORT', "unable to write mapping file!");
        exit 1;
    }
    write_xml($mapping_doc, $map_file_name);
}
#
# @paths = _visible_entries($path)
#
# Returns the full paths of all entries of directory $path whose names do
# not start with a dot (the same set a shell "$path/*" would expand to).
# Returns an empty list if the directory cannot be read.
#
sub _visible_entries {
    my ($path) = @_;
    opendir(my $dh, $path) or return;
    my @names = sort grep { !/^\./ } readdir($dh);
    closedir($dh);
    return map { "$path/$_" } @names;
}

#
# $dir = find_digifiles_dir($input_node)
#
# Digifiles mode: locates the source directory of the entry below the
# digifiles base (via the entry's URL field), copies its contents to
# "$lib_online_dir/library/<online id>/pageimg", records the move in the
# digilib mapping file and removes the source directory.
#
# Returns the destination directory. Returns nothing (after logging and
# counting an error) if the source directory or the online ID is missing.
# Aborts the program if the copy fails. A failure to remove the source is
# only counted as an error; the destination directory is still returned.
#
# All external commands are run with the list form of system(), so paths
# containing whitespace or shell metacharacters are passed through verbatim
# instead of being re-parsed by a shell.
#
sub find_digifiles_dir {
    my ($input_node) = @_;
    my $digifiles_base = '/net/digifiles.mpiwg-berlin.mpg.de/Volumes/raid';
    my $src_dir = find_online_dir($input_node, $digifiles_base, '');
    if (! $src_dir) {
        logger('ERROR', "no online directory for digifiles entry");
        $errcnt++;
        return;
    }
    my $dest_id = sstrip($input_node->findvalue("fm:$online_id_field"));
    if (! $dest_id) {
        logger('ERROR', "no ID field for digifiles entry");
        $errcnt++;
        return;
    }
    my $src_path = "$digifiles_base/$src_dir";
    my $dir = "$lib_online_dir/library/$dest_id";
    my $map_dir = "$lib_digilib_path/library/$dest_id";
    if ($dry_run) {
        logger('DEBUG', "would move $src_path to $dir");
        # NOTE(review): the mapping is added (and the mapping file written)
        # even in dry-run mode -- confirm that this is intended.
        add_digilib_mapping($src_dir, "$map_dir/pageimg");
        return $dir;
    }
    logger('INFO', "moving $src_path to $dir");
    logger('DEBUG', "mkdir $dir/pageimg");
    my $img_dir = "$dir/pageimg";
    my $copied = 0;
    if (system('mkdir', '-p', $img_dir) == 0
        && system('chmod', '-R', '0775', $dir) == 0) {
        logger('DEBUG', "cp $src_path $img_dir");
        my @entries = _visible_entries($src_path);
        # an empty source directory counts as a failed copy
        if (@entries
            && system('cp', '-rp', @entries, "$img_dir/") == 0
            && -d $img_dir) {
            $copied = 1;
        }
    }
    if (! $copied) {
        logger('ABORT', "unable to copy directory $src_dir to $dir!");
        exit 1;
    }
    logger('DEBUG', "directory $dir OK");
    add_digilib_mapping($src_dir, "$map_dir/pageimg");
    # the copy is in place and mapped, now drop the original
    if (system('rm', '-rf', $src_path) == 0) {
        logger('DEBUG', "directory $src_path removed");
    } else {
        logger('ERROR', "unable to remove source directory $src_path!");
        $errcnt++;
    }
    return $dir;
}
#
# $dir = find_cw_dir($input_node)
#
# Einstein-cw mode: locates the source directory of the entry below the
# cw base directory (via the entry's URL field, with a trailing "pageimg"
# stripped) and renames it to "$lib_arch_dir/<archive id>".
#
# Returns the destination directory. Returns nothing (after logging and
# counting an error) if the source directory or the archive ID is missing,
# and also if the rename reported success but the destination is not a
# directory. Aborts the program if the rename fails.
#
sub find_cw_dir {
    my ($input_node) = @_;
    my $cw_base = '/mpiwg/archive/data/library/inbox/zwischen_backup';
    my $src_dir = find_online_dir($input_node, $cw_base, 'pageimg');
    # without a source directory the rename below would act on "$cw_base/"
    # itself, i.e. move the whole base directory away
    if (! $src_dir) {
        logger('ERROR', "no online directory for einstein-cw entry");
        $errcnt++;
        return;
    }
    my $dest_id = sstrip($input_node->findvalue("fm:$arch_id_field"));
    if (! $dest_id) {
        logger('ERROR', "no ID field for einstein-cw entry");
        $errcnt++;
        return;
    }
    my $dir = "$lib_arch_dir/$dest_id";
    if ($dry_run) {
        logger('DEBUG', "would move $cw_base/$src_dir to $dir");
        return $dir;
    }
    logger('DEBUG', "moving $cw_base/$src_dir to $dir");
    if (! rename("$cw_base/$src_dir", $dir)) {
        logger('ABORT', "unable to rename directory $cw_base/$src_dir to $dir!");
        exit 1;
    }
    if (-d $dir) {
        logger('DEBUG', "directory $dir OK");
        return $dir;
    }
    return;
}
#
# $dir = find_permanent_dir($input_node)
#
# Online mode: determines the online/permanent directory of the entry.
# If both $online_base_dir and the entry's online ID are set, the directory
# is "$online_base_dir/<online id>" (not checked for existence). Otherwise
# it is derived from the entry's URL field via find_online_dir().
# Returns nothing (after logging and counting an error) if neither works.
#
sub find_permanent_dir {
    my ($input_node) = @_;
    my $permanent_id = sstrip($input_node->findvalue("fm:$online_id_field"));

    # preferred: explicit base directory plus online ID
    if ($online_base_dir && $permanent_id) {
        my $id_dir = sstrip("$online_base_dir/$permanent_id", 1);
        return $id_dir;
    }

    # fallback: directory named in the online URL
    my $permanent_base = '/mpiwg/online/permanent';
    my $rel_dir = find_online_dir($input_node, $permanent_base, 'pageimg');
    unless ($rel_dir) {
        logger('ERROR', "no ID or URL for online permanent entry");
        $errcnt++;
        return;
    }
    my $url_dir = sstrip("$permanent_base/$rel_dir", 1);
    return $url_dir;
}
#
# $dir = find_online_dir($input_node, $base_dir, $page_dir)
#
# Extracts a directory path from the $online_url_field of $input_node,
# removes one trailing slash and (if $page_dir is given) a trailing
# "/$page_dir", and checks that the result exists as a directory below
# $base_dir ($lib_online_dir if $base_dir is empty).
# Returns the path relative to $base_dir, or nothing if the URL has no
# recognisable path or the directory does not exist.
#
sub find_online_dir {
    my ($input_node, $base_dir, $page_dir) = @_;
    $base_dir ||= $lib_online_dir;
    my $url = $input_node->findvalue("fm:$online_url_field");
    logger('DEBUG', "checking URL: $url");

    # new style digilib URL first, old style digilib URL as fallback
    my ($rel_dir) = $url =~ /fn=permanent\/(.+)/;
    ($rel_dir) = $url =~ /\?([^\+]+)\+/ unless defined $rel_dir;
    return unless $rel_dir;

    $rel_dir =~ s/\/$//; # strip ending slash
    $rel_dir =~ s/\/${page_dir}$// if $page_dir; # strip page_dir

    return unless -d "$base_dir/$rel_dir";
    logger('DEBUG', "directory $base_dir/$rel_dir exists");
    return $rel_dir;
}
#
# $dir = find_arch_dir($input_node)
#
# Default mode: returns "$lib_arch_dir/<archive id>" for the entry if that
# directory exists, nothing otherwise (also when the entry has no archive ID).
#
sub find_arch_dir {
    my ($input_node) = @_;
    my $arch_id = $input_node->findvalue("fm:$arch_id_field");
    return unless $arch_id;

    my $arch_dir = "$lib_arch_dir/$arch_id";
    return unless -d $arch_dir;

    logger('DEBUG', "directory $arch_dir exists");
    return $arch_dir;
}
#
# $cnt = convert_bib($input_node, $index_root, $index_doc)
#
# Converts the child elements of $input_node into index.meta elements below
# $index_root. A first pass handles the generic fields (%gen_map) and the
# type field (%type_map); a second pass handles the fields of the selected
# sub type (%subtype_map).
#
# Returns the number of converted fields. Returns 0 (after logging and
# counting an error) if the language or the bibliographic type is unknown.
# $index_doc is accepted but not used.
#
sub convert_bib {
    my ($input_node, $index_root, $index_doc) = @_;
    my $cnt = 0;
    my $type = "";
    my $type_path = "";
    # process general stuff first
    foreach my $n ($input_node->getChildNodes()) {
        my $name = $n->nodeName();
        my $val = $n->textContent();
        if (exists $gen_map{$name}) {
            # is a general field
            if ($name eq $lang_field) {
                # language field -> convert to iso code
                if (exists $lang_map{$val}) {
                    $val = $lang_map{$val};
                } else {
                    logger('ERROR', "unknown language: $val! skipping...");
                    $errcnt++;
                    return 0;
                }
            }
            create_element_path($gen_map{$name}, $index_root, $namespace)
                ->appendTextNode($val);
            $cnt++;
        } elsif (exists $type_map{$name}) {
            # is a type field
            $type_path = $type_map{$name};
            $type = $val;
            # check with known types
            if (exists $subtype_map{$val}) {
                my $indextype = $subtype_map{$val}->{'_name'};
                create_element_path("$type_path=$indextype", $index_root, $namespace);
                $cnt++;
            } else {
                # double quotes, so that the offending type shows up in the log
                logger('ERROR', "unknown bib type $val! skipping...");
                $errcnt++;
                return 0;
            }
        }
    }
    # process sub type fields
    if ($type) {
        foreach my $n ($input_node->getChildNodes()) {
            my $name = $n->nodeName();
            my $val = $n->textContent();
            if (exists $subtype_map{$type}->{$name}) {
                create_element_path($subtype_map{$type}->{$name}, $index_root, $namespace)
                    ->appendTextNode($val);
                $cnt++;
            }
        }
    }
    return $cnt;
}
#
# process_all_fm_entries($input_root)
#
# Runs process_fm_entry() on every fm:ROW child of $input_root, logging a
# running (zero-based) entry number before each one.
#
sub process_all_fm_entries {
    my ($input_root) = @_;
    my $entry_no = 0;
    for my $row ($input_root->findnodes('fm:ROW')) {
        logger('INFO', "processing entry $entry_no ...");
        process_fm_entry($row);
        $entry_no++;
    }
}
#
# process_fm_entry($input_node)
#
# Builds the index.meta document for one entry of the input and writes it
# to "<document directory>/index.meta" (in dry-run mode the document is
# only logged). The document directory is determined according to the
# active mode (online, einstein-cw, digifiles or plain archive).
# Logs and counts an error and returns early if the directory cannot be
# found or the entry has no convertible bibliographic data.
#
sub process_fm_entry {
    my ($input_node) = @_;

    # empty document with a <resource version="1.1" type="MPIWG"> root
    my $meta_doc = XML::LibXML::Document->createDocument('1.0', 'UTF-8');
    my $meta_root = $meta_doc->createElementNS($namespace, 'resource');
    for my $attr (['version', '1.1'], ['type', 'MPIWG']) {
        $meta_root->addChild($meta_doc->createAttributeNS($namespace, @$attr));
    }
    $meta_doc->setDocumentElement($meta_root);

    # try to find the document directory; only online mode also records
    # the archive directory the document was derived from
    my $source_dir = "";
    my $target_dir = "";
    if ($online_mode) {
        $target_dir = find_permanent_dir($input_node);
        $source_dir = find_arch_dir($input_node);
    } elsif ($cw_mode) {
        $target_dir = find_cw_dir($input_node);
    } elsif ($digifiles_mode) {
        $target_dir = find_digifiles_dir($input_node);
    } else {
        $target_dir = find_arch_dir($input_node);
    }
    unless ($target_dir) {
        logger('ERROR', "document directory not found! skipping...");
        $errcnt++;
        return;
    }

    # standard content of every index.meta: [element path, text ...]
    my ($doc_name) = split_file_path($target_dir);
    my @standard_fields = (
        # name and date
        ['name', $doc_name],
        ['archive-path', $target_dir],
        ['archive-creation-date', stime(time)],
        ['creator', 'digigroup'],
        ['description', 'a scanned document'],
        # acquisition
        ['meta/acquisition/date', stime(time)],
        ['meta/acquisition/provider/provider-id', 'digigroup'],
        ['meta/acquisition/provider/address', 'Max Planck Institute for the History of Science'],
        # media
        ['media-type', 'image'],
        ['meta/content-type', 'scanned document'],
    );
    # derived-from
    push @standard_fields, ['derived-from/archive-path', $source_dir] if $source_dir;
    foreach my $field (@standard_fields) {
        create_text_path(@$field, $meta_root, $namespace);
    }

    # access: "free", or an institution named by the access type
    if ($access_type && $access_type eq "free") {
        create_element_path('meta/access-conditions/access@type=free', $meta_root, $namespace);
    } elsif ($access_type) {
        my $access_elem = create_element_path('meta/access-conditions/access@type=institution', $meta_root, $namespace);
        create_text_path('name', $access_type, $access_elem, $namespace);
    }

    # convert bib entries; 0 means error or nothing to convert
    my $converted = convert_bib($input_node, $meta_root, $meta_doc);
    if ($converted == 0) {
        logger('ERROR', "no bibliographic metadata!");
        $errcnt++;
        return;
    }

    # write new index.meta file
    if (! $dry_run) {
        write_xml($meta_doc, "$target_dir/index.meta");
    } else {
        logger('DEBUG', "would write $target_dir/index.meta");
        logger('DEBUG', $meta_doc->toString(1));
    }
}
#######################################################
# Main
#

# parse the FileMaker XML dump given on the command line
my ($input_doc, $input_root) = read_xml($infile);

# register the prefix 'fm' for the dump's namespace; the XPath expressions
# in the subroutines ("fm:ROW", "fm:<field>") rely on it
my $fm_namespace = $input_root->namespaceURI();
$input_root->setNamespace($fm_namespace, 'fm', 1);

# digifiles mode collects its directory mappings in a fresh
# <digilib-aliases> document, with entries of the form
#<mapping link="exp1/archimedes_image_repository/archimedes_large/achil_propo_087_la_1545" dir="permanent/archimedes_repository/large/achil_propo_087_la_1545"/>
if ($digifiles_mode) {
    $mapping_doc = XML::LibXML::Document->createDocument('1.0', 'UTF-8');
    $mapping_root = $mapping_doc->createElementNS($namespace, 'digilib-aliases');
    $mapping_doc->setDocumentElement($mapping_root);
}

process_all_fm_entries($input_root);

# summary; any counted error makes the program exit with status 1
logger("INFO", "$warncnt warnings");
logger("INFO", "$errcnt errors");
if ($errcnt > 0) {
    logger("ABORT", "there were errors!");
    exit 1;
}
logger("DONE", "done something successfully!");
# FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>