view makemeta-lib.pl @ 18:fdf4ceb36db1

fixed problem with dir names in metacheck; new version of metacheck defaults to not change index file; new version of archiver uses new version of metacheck
author casties
date Tue, 20 Sep 2005 19:24:57 +0200
parents 6c5c7743acb1
children c3defe3e2780
line wrap: on
line source

#!/usr/local/bin/perl -w

use strict;
use XML::LibXML;

use lib '/usr/local/mpiwg/archive_devel';
use MPIWGStor;

# make output unbuffered
$| = 1;

# program version
my $version = "0.2.2 (31.8.2005 ROC)";

# usage text; printed when the program is called without any arguments
my $help = <<'END_HELP';
use: makemeta-lib [options] file.xml
options:
  -debug  show debugging info
  -dry-run  simulate, don't do anything
  -online-mode  mode for creating online/permanent files
  -cw-mode  mode for copying einstein_cw archive documents
  -digifiles-mode  mode for copying files from digifiles
  -map-file=mapfile.xml  digilib mapping file (for digifiles mode)
END_HELP
logger("INFO", "makemeta-lib $version");

###########################################
# mappings

# child elements of an input row that are copied to a fixed index.meta
# path, whatever the bibliographic type is
my %gen_map = (
    Device             => 'meta/image-acquisition/device',
    Image_Type         => 'meta/image-acquisition/image-type',
    Production_Comment => 'meta/image-acquisition/production-comment',
    Postproduction     => 'meta/image-acquisition/production-comment',
    Language           => 'meta/lang',
);

# element that selects the bibliographic sub type, and the path
# (element@attribute) that receives the sub type name
my %type_map = (
    Reference_Type => 'meta/bib@type',
);

# per sub type: '_name' is the type name written to index.meta, every
# other key is an input element name mapped to its index.meta path
my %subtype_map = (
    'Book' => {
        _name           => 'book',
        Author          => 'meta/bib/author',
        Title           => 'meta/bib/title',
        Year            => 'meta/bib/year',
        Place_Published => 'meta/bib/city',
        Publisher       => 'meta/bib/publisher',
        Edition         => 'meta/bib/edition',
    },
    'Journal Article' => {
        _name           => 'journal-article',
        Author          => 'meta/bib/author',
        Title           => 'meta/bib/title',
        Year            => 'meta/bib/year',
        Secondary_Title => 'meta/bib/journal',
        Volume          => 'meta/bib/volume',
        Number          => 'meta/bib/issue',
        Pages           => 'meta/bib/pages',
    },
    'In Book' => {
        _name           => 'inbook',
        Author          => 'meta/bib/author',
        Title           => 'meta/bib/title',
        Year            => 'meta/bib/year',
        Secondary_Title => 'meta/bib/book-title',
        Pages           => 'meta/bib/pages',
    },
    'Newspaper Article' => {
        _name           => 'newspaper-article',
        Author          => 'meta/bib/author',
        Title           => 'meta/bib/title',
        Year            => 'meta/bib/year',
        Secondary_Title => 'meta/bib/newspaper',
        Place_Published => 'meta/bib/city',
        Number          => 'meta/bib/issue-date',
        Pages           => 'meta/bib/pages',
    },
    'Edited Book' => {
        _name           => 'edited-book',
        Author          => 'meta/bib/editor',
        Title           => 'meta/bib/title',
        Year            => 'meta/bib/year',
        Place_Published => 'meta/bib/city',
        Publisher       => 'meta/bib/publisher',
        Edition         => 'meta/bib/edition',
    },
    'Manuscript' => {
        _name           => 'manuscript',
        Author          => 'meta/bib/author',
        Title           => 'meta/bib/title',
        Year            => 'meta/bib/year',
        Place_Published => 'meta/bib/location',
    },
);

# name of the element whose value is a language name
my $lang_field = 'Language';

# language names to iso codes
my %lang_map = (
    German   => 'de',
    English  => 'en',
    Italian  => 'it',
    French   => 'fr',
    Latin    => 'la',
    Japanese => 'ja',
    Dutch    => 'nl',
    Spanish  => 'es',
);

# storage fields
my $arch_id_field    = 'ID_Archive';
my $online_url_field = 'URL';
my $online_id_field  = 'ID_OnlinePermanent';

#######################################################
# internal parameters
#

# storage locations
my $lib_arch_dir     = '/mpiwg/archive/data/library';
my $lib_online_dir   = '/mpiwg/online/permanent';
my $lib_digilib_path = 'permanent';
my $digifiles_dir    = "/net/digifiles.mpiwg-berlin.mpg.de/Volumes/raid";

# read command line parameters; without any parameter show the help text
my $args = MPIWGStor::parseargs();
unless (%$args) {
    print $help, "\n";
    exit 1;
}

# debug level
$debug = exists $args->{'debug'} ? $args->{'debug'} : 0;

# simulate action only
my $dry_run = exists $args->{'dry-run'} ? $args->{'dry-run'} : 0;
logger('DEBUG', "dry-run: $dry_run");

# use online mode
my $online_mode = exists $args->{'online-mode'} ? $args->{'online-mode'} : 0;
logger('DEBUG', "online_mode: $online_mode");

# use einstein-cw mode
my $cw_mode = exists $args->{'cw-mode'} ? $args->{'cw-mode'} : 0;
logger('DEBUG', "cw_mode: $cw_mode");

# use digifiles mode
my $digifiles_mode = exists $args->{'digifiles-mode'} ? $args->{'digifiles-mode'} : 0;
logger('DEBUG', "digifiles_mode: $digifiles_mode");

# digilib mapping file; document and root element are created in the
# main section when digifiles mode is active
my $map_file_name = exists $args->{'map-file'} ? $args->{'map-file'} : "";
logger('DEBUG', "map_file_name: $map_file_name");
my $mapping_doc;
my $mapping_root;

# index.meta namespace (not really implemented!)
my $namespace = "";

# bookkeeping that is reported at the end of the run
my $xml_changed = 0;
my $errcnt      = 0;
my $warncnt     = 0;

#######################################################
# check parameters that were passed to the program
#

# the input file is taken from the 'path' entry of the parsed arguments
my $infile = $$args{'path'};
if (! $infile) {
    logger("ABORT", "no input file given!");
    exit 1;
}
# collapse every run of slashes (anywhere in the path) into a single slash
$infile =~ s{/{2,}}{/}g;
if (! -f $infile) {
    logger("ABORT", "input file \'$infile\' doesn't exist!");
    exit 1;
}


#######################################################
# subroutines
#


#
# add_digilib_mapping($src_dir, $dest_dir)
#
# Appends a 'mapping' element with the attributes link=$src_dir and
# dir=$dest_dir to the digilib mapping document and writes the whole
# document to $map_file_name. Logs an ABORT message and exits with
# status 1 if no mapping file name was given.
#
sub add_digilib_mapping {
    my ($src_dir, $dest_dir) = @_;

    my $mapping = $mapping_root->addNewChild($namespace, 'mapping');
    foreach my $attr (['link', $src_dir], ['dir', $dest_dir]) {
        my ($attr_name, $attr_value) = @$attr;
        $mapping->addChild($mapping_doc->createAttributeNS($namespace, $attr_name, $attr_value));
    }

    unless ($map_file_name) {
        logger('ABORT', "unable to write mapping file!");
        exit 1;
    }
    return write_xml($mapping_doc, $map_file_name);
}

#
# $dir = find_digifiles_dir($input_node)
#
# Digifiles mode: looks up the source directory of the entry below the
# digifiles base directory (via the URL field) and moves its contents to
# "$lib_online_dir/library/<online id>/pageimg". A digilib mapping from the
# old to the new location is recorded with add_digilib_mapping().
#
# Returns the new document directory, or nothing (after logging an ERROR
# and counting it) if the source directory or the ID is missing.
# Logs ABORT and exits with status 1 if the directory cannot be copied.
# In dry-run mode nothing is copied or removed, but the mapping is still
# added (add_digilib_mapping writes the mapping file).
#
# All external commands are run with the list form of system() so that
# directory names taken from the input file never pass through a shell.
#
sub find_digifiles_dir {
    my ($input_node) = @_;
    my $digifiles_base = '/net/digifiles.mpiwg-berlin.mpg.de/Volumes/raid';

    my $src_dir = find_online_dir($input_node, $digifiles_base, '');
    if (! $src_dir) {
        logger('ERROR', "no online directory for digifiles entry");
        $errcnt++;
        return;
    }
    my $dest_id = sstrip($input_node->findvalue("fm:$online_id_field"));
    if (! $dest_id) {
        logger('ERROR', "no ID field for digifiles entry");
        $errcnt++;
        return;
    }

    my $src_path = "$digifiles_base/$src_dir";
    my $dir      = "$lib_online_dir/library/$dest_id";
    my $img_dir  = "$dir/pageimg";
    my $map_dir  = "$lib_digilib_path/library/$dest_id";

    if ($dry_run) {
        logger('DEBUG', "would move $src_path to $dir");
        add_digilib_mapping($src_dir, "$map_dir/pageimg");
        return $dir;
    }

    logger('INFO', "moving $src_path to $dir");
    logger('DEBUG', "mkdir $img_dir");
    my $copied = 0;
    if (system('mkdir', '-p', $img_dir) == 0
        && system('chmod', '-R', '0775', $dir) == 0) {
        # collect what the shell pattern "$src_path/*" would match:
        # every entry whose name does not start with a dot
        my @entries;
        if (opendir(my $dh, $src_path)) {
            @entries = map { "$src_path/$_" } grep { !/^\./ } sort readdir($dh);
            closedir($dh);
        }
        logger('DEBUG', "cp $src_path $img_dir");
        # an empty source directory counts as a failed copy
        if (@entries
            && system('cp', '-rp', @entries, "$img_dir/") == 0
            && -d $img_dir) {
            $copied = 1;
        }
    }
    if (! $copied) {
        logger('ABORT', "unable to copy directory $src_dir to $dir!");
        exit 1;
    }

    logger('DEBUG', "directory $dir OK");
    add_digilib_mapping($src_dir, "$map_dir/pageimg");
    # the copy succeeded, so a failing cleanup is only an error, not an abort
    if (system('rm', '-rf', $src_path) == 0) {
        logger('DEBUG', "directory $src_path removed");
    } else {
        logger('ERROR', "unable to remove source directory $src_path!");
        $errcnt++;
    }
    return $dir;
}

#
# $dir = find_cw_dir($input_node)
#
# Einstein-cw mode: looks up the source directory of the entry below
# $cw_base (via the URL field, with a trailing 'pageimg' removed) and
# renames it to "$lib_arch_dir/<archive id>".
#
# Returns the new document directory, or nothing if the source directory
# or the ID is missing (an ERROR is logged and counted) or if the target
# is not a directory after the rename.
# Logs ABORT and exits with status 1 if the rename fails.
# In dry-run mode nothing is renamed and the target path is returned.
#
sub find_cw_dir {
    my ($input_node) = @_;
    my $cw_base = '/mpiwg/archive/data/library/inbox/zwischen_backup';

    my $src_dir = find_online_dir($input_node, $cw_base, 'pageimg');
    # without this check an unresolved URL would rename "$cw_base/" itself
    if (! $src_dir) {
        logger('ERROR', "no online directory for einstein-cw entry");
        $errcnt++;
        return;
    }
    my $dest_id = sstrip($input_node->findvalue("fm:$arch_id_field"));
    if (! $dest_id) {
        logger('ERROR', "no ID field for einstein-cw entry");
        $errcnt++;
        return;
    }

    my $dir = "$lib_arch_dir/$dest_id";
    if ($dry_run) {
        logger('DEBUG', "would move $cw_base/$src_dir to $dir");
        return $dir;
    }

    logger('DEBUG', "moving $cw_base/$src_dir to $dir");
    if (! rename("$cw_base/$src_dir", $dir)) {
        logger('ABORT', "unable to rename directory $cw_base/$src_dir to $dir!");
        exit 1;
    }
    if (-d $dir) {
        logger('DEBUG', "directory $dir OK");
        return $dir;
    }
    return;
}

#
# $dir = find_permanent_dir($input_node)
#
# Online mode: looks up the directory of the entry below $online_base
# (via the URL field, with a trailing 'pageimg' removed).
#
# Returns the full path of the directory, or nothing if the directory
# cannot be found or the entry has no online ID (an ERROR is logged and
# counted in both cases).
#
sub find_permanent_dir {
    my ($input_node) = @_;
    my $online_base = '/mpiwg/online/permanent';

    my $src_dir = find_online_dir($input_node, $online_base, 'pageimg');
    # without this check an unresolved URL would yield "$online_base/"
    if (! $src_dir) {
        logger('ERROR', "no online directory for online permanent entry");
        $errcnt++;
        return;
    }
    # the ID itself is not part of the path, but entries without one are rejected
    my $dest_id = sstrip($input_node->findvalue("fm:$online_id_field"));
    if (! $dest_id) {
        logger('ERROR', "no ID field for online permanent entry");
        $errcnt++;
        return;
    }
    return "$online_base/$src_dir";
}

#
# $dir = find_online_dir($input_node, $base_dir, $page_dir)
#
# Takes the path from the $online_url_field of the $input_node document
# and looks in the directory $base_dir for it ($base_dir defaults to
# $lib_online_dir). Strips trailing slashes and then $page_dir from the end.
# Returns the directory path sans $base_dir if it exists, nothing otherwise.
#
# Two URL forms are recognised:
#   new style: ...fn=permanent/<path>   (path ends at the next '&')
#   old style: ...?<path>+...
#
sub find_online_dir {
    my ($input_node, $base_dir, $page_dir) = @_;
    $base_dir = $lib_online_dir unless ($base_dir);

    my $online_url = $input_node->findvalue("fm:$online_url_field");
    logger('DEBUG', "checking URL: $online_url");
    my $online_dir;
    if ($online_url =~ m{fn=permanent/([^&\n]+)}) {
        # new style digilib URL
        # NOTE(review): the path is cut at '&' on the assumption that further
        # query parameters may follow the fn parameter -- confirm
        $online_dir = $1;
    } elsif ($online_url =~ /\?([^\+]+)\+/) {
        # old style digilib URL
        $online_dir = $1;
    }
    return unless ($online_dir);

    # strip all ending slashes
    $online_dir =~ s{/+$}{};
    if ($page_dir) {
        # \Q..\E: $page_dir is a literal directory name, not a pattern
        $online_dir =~ s{/\Q$page_dir\E$}{};
    }
    if (-d "$base_dir/$online_dir") {
        logger('DEBUG', "directory $base_dir/$online_dir exists");
        return $online_dir;
    }
    return;
}

#
# $dir = find_arch_dir($input_node)
#
# Default (archive) mode: builds "$lib_arch_dir/<archive id>" from the
# $arch_id_field of the $input_node document.
# Returns the path if it is an existing directory, nothing otherwise.
#
sub find_arch_dir {
    my ($input_node) = @_;

    # the ID is cleaned with sstrip() like the same field in find_cw_dir()
    my $bib_dir = sstrip($input_node->findvalue("fm:$arch_id_field"));
    return unless ($bib_dir);

    my $dir = "$lib_arch_dir/$bib_dir";
    if (-d $dir) {
        logger('DEBUG', "directory $dir exists");
        return $dir;
    }
    return;
}


#
# $cnt = convert_bib($input_node, $index_root, $index_doc)
#
# Copies the bibliographic fields of one input row into the index.meta
# tree below $index_root.
#
# First pass over the child nodes of $input_node:
#   - fields listed in %gen_map are written to their mapped path; the
#     value of the language field is translated with %lang_map first
#   - the field listed in %type_map selects the sub type; its '_name'
#     from %subtype_map is written as "<type path>=<name>"
# Second pass (only if a sub type was found): fields listed for that sub
# type in %subtype_map are written to their mapped paths.
#
# Returns the number of fields written. Returns 0 after logging and
# counting an ERROR if the language or the bib type is unknown.
# $index_doc is accepted but not used.
#
sub convert_bib {
    my ($input_node, $index_root, $index_doc) = @_;
    my $cnt = 0;
    my $type = "";
    my $type_path = "";

    # process general stuff first
    foreach my $n ($input_node->getChildNodes()) {
        my $name = $n->nodeName();
        my $val = $n->textContent();
        #logger('DEBUG', "  NODE: $name = '$val'");
        if (exists $gen_map{$name}) {
            # is a general field
            if ($name eq $lang_field) {
                # language field -> convert to iso code
                if (exists $lang_map{$val}) {
                    $val = $lang_map{$val};
                } else {
                    logger('ERROR', "unknown language: $val! skipping...");
                    $errcnt++;
                    return 0;
                }
            }
            create_element_path($gen_map{$name}, $index_root, $namespace)
                ->appendTextNode($val);
            $cnt++;
        } elsif (exists $type_map{$name}) {
            # is a type field
            $type_path = $type_map{$name};
            $type = $val;
            # check with known types
            if (exists $subtype_map{$val}) {
                my $indextype = $subtype_map{$val}->{'_name'};
                create_element_path("$type_path=$indextype", $index_root, $namespace);
                $cnt++;
            } else {
                # double quotes, so that the offending type shows up in the log
                logger('ERROR', "unknown bib type $val! skipping...");
                $errcnt++;
                return 0;
            }
        }
    }
    # process sub type fields
    if ($type) {
        foreach my $n ($input_node->getChildNodes()) {
            my $name = $n->nodeName();
            my $val = $n->textContent();
            #logger('DEBUG', "  NODE: $name = '$val'");
            if (exists $subtype_map{$type}->{$name}) {
                create_element_path($subtype_map{$type}->{$name}, $index_root, $namespace)
                    ->appendTextNode($val);
                $cnt++;
            }
        }
    }
    return $cnt;
}



#
# process_all_fm_entries($input_root)
#
# Calls process_fm_entry() for every fm:ROW node found from $input_root,
# logging the (zero-based) number of each entry first.
#
sub process_all_fm_entries {
    my ($input_root) = @_;

    my @rows = $input_root->findnodes('fm:ROW');
    for my $idx (0 .. $#rows) {
        logger('INFO', "processing entry $idx ...");
        process_fm_entry($rows[$idx]);
    }
}


#
# process_fm_entry($input_node)
#
# Creates the index.meta document for one input row:
#   1. builds an empty 'resource' document (version 1.1, type MPIWG)
#   2. locates the document directory with the finder of the active mode
#   3. adds the standard entries and the converted bibliographic fields
#   4. writes "<document directory>/index.meta" (only logs it in dry-run mode)
# Logs and counts an ERROR and returns early if the directory is not
# found or convert_bib() reports no fields.
#
sub process_fm_entry {
    my ($input_node) = @_;

    my $index_doc = XML::LibXML::Document->createDocument('1.0', 'UTF-8');
    my $index_root = $index_doc->createElementNS($namespace, 'resource');
    foreach my $attr (['version', '1.1'], ['type', 'MPIWG']) {
        my ($attr_name, $attr_value) = @$attr;
        $index_root->addChild($index_doc->createAttributeNS($namespace, $attr_name, $attr_value));
    }
    $index_doc->setDocumentElement($index_root);

    # try to find the document directory
    my $doc_dir = $online_mode    ? find_permanent_dir($input_node)
                : $cw_mode        ? find_cw_dir($input_node)
                : $digifiles_mode ? find_digifiles_dir($input_node)
                :                   find_arch_dir($input_node);
    unless ($doc_dir) {
        logger('ERROR', "document directory not found! skipping...");
        $errcnt++;
        return;
    }

    # standard stuff for index.meta as [path, text] pairs, in output order
    my ($docname) = split_file_path($doc_dir);
    my @standard_entries = (
        # name and date
        ['name', $docname],
        ['archive-path', $doc_dir],
        ['archive-creation-date', stime(time)],
        ['creator', 'digigroup'],
        ['description', 'a scanned document'],
        # acquisition
        ['meta/acquisition/date', stime(time)],
        ['meta/acquisition/provider/provider-id', 'digigroup'],
        ['meta/acquisition/provider/address', 'Max Planck Institute for the History of Science'],
        # media
        ['media-type', 'image'],
        ['meta/content-type', 'scanned document'],
    );
    foreach my $entry (@standard_entries) {
        my ($path, $text) = @$entry;
        create_text_path($path, $text, $index_root, $namespace);
    }

    # convert bib entries; a count of 0 means error or nothing to convert
    my $cnt = convert_bib($input_node, $index_root, $index_doc);
    if ($cnt == 0) {
        logger('ERROR', "no bibliographic metadata!");
        $errcnt++;
        return;
    }

    # write new index.meta file
    my $index_file = "$doc_dir/index.meta";
    if (! $dry_run) {
        write_xml($index_doc, $index_file);
    } else {
        logger('DEBUG', "would write $index_file");
        logger('DEBUG', $index_doc->toString(1));
    }

}





#######################################################
# Main
#

# load filemaker xml dump
my ($input_doc, $input_root) = read_xml($infile);
# set namespace prefix: the namespace of the root element is used as 'fm'
my $fm_namespace = $input_root->namespaceURI();
$input_root->setNamespace($fm_namespace, 'fm', 1);

# create digilib mapping file for digifiles mode;
# add_digilib_mapping() later adds entries of this form to it:
#<mapping link="exp1/archimedes_image_repository/archimedes_large/achil_propo_087_la_1545" dir="permanent/archimedes_repository/large/achil_propo_087_la_1545"/>
if ($digifiles_mode) {
    $mapping_doc = XML::LibXML::Document->createDocument('1.0', 'UTF-8');
    $mapping_root = $mapping_doc->createElementNS($namespace, 'digilib-aliases');
    $mapping_doc->setDocumentElement($mapping_root);
}

process_all_fm_entries($input_root);

# summary; any counted error turns into exit status 1
logger("INFO", "$warncnt warnings");
logger("INFO", "$errcnt errors");
if ($errcnt <= 0) {
    logger("DONE", "done something successfully!");
} else {
    logger("ABORT", "there were errors!");
    exit 1;
}