File:  [Repository] / foxridge-archiver / makemeta-vlp.pl
Revision 1.10: download - view: text, annotated - select for diffs - revision graph
Thu Mar 16 17:00:43 2017 UTC (7 years, 1 month ago) by casties
Branches: MAIN
CVS tags: HEAD
updated to Ubuntu Perl paths.

#!/usr/bin/perl -w

use strict;
use XML::LibXML;

use lib '/usr/local/mpiwg/archive';
use MPIWGStor;

# make output unbuffered
$| = 1;

# program version (shown in the start-up log line below)
my $version = "0.2.7 (27.8.2010 ROC)";
# usage text; printed when the program is called without any arguments
my $help = 
"use: makemeta-vlp [options] file.xml
options:
  -debug  show debugging info
  -dry-run  simulate, don't do anything
  -replace  replace existing index files
  -online-mode  mode for creating online/permanent files
  -archive-mode  mode for creating archive/data files
  -access=free  adds free access tag for online-mode
  -texttool adds texttool tag for online-mode
";
# announce program name and version before anything else is done
logger("INFO", "makemeta-vlp $version");

###########################################
# mappings
#
# These tables drive convert_bib(): the keys are compared with the node
# names of the child elements of an input entry, the values are the paths
# that are handed to create_element_path() to build the index.meta tree.

# generic mappings at top level
# (input element name => path of the element that receives the text content;
# the entry for the language field is additionally translated through
# %lang_map before it is written)
my %gen_map = (
    'Custom2_Language' => 'meta/lang',
    'productionComment' => 'meta/image-acquisition/production-comment',
    'derivedFrom' => 'derived-from/archive-path'
    );
# sub type switch tag
# (input element name => attribute path; convert_bib() appends "=<type name>"
# to this path, using the '_name' entry of the matching %subtype_map record)
my %type_map = (
    'ReferenceType' => 'meta/bib@type'
    );
# sub type mappings
#
# The outer keys are the text values of the type switch field (see %type_map).
# Each record contains:
#   '_name'      the type name written into the type attribute
#   '#<text>'    a constant: <text> (without the '#') is written as text
#                to the given path
#   other keys   input element name => path of the element that receives
#                the text content of that input element
# A type value that is not listed here makes convert_bib() log an error and
# give up on the entry.
my %subtype_map = (
    'Book' => {
	'_name' => 'book',
	'Author' => 'meta/bib/author',
	'Title' => 'meta/bib/title',
	'Year' => 'meta/bib/year',
	'Place_Published' => 'meta/bib/city',
	'Publisher' => 'meta/bib/publisher',
	'Edition' => 'meta/bib/edition',
	'Volume' => 'meta/bib/volume',
	'NumberOfVolumes' => 'meta/bib/number-of-volumes',
	'Pages' => 'meta/bib/number-of-pages'
    },
    '(Book)' => {
	'_name' => 'book',
	'Author' => 'meta/bib/author',
	'Title' => 'meta/bib/title',
	'Year' => 'meta/bib/year',
	'Place_Published' => 'meta/bib/city',
	'Publisher' => 'meta/bib/publisher',
	'Edition' => 'meta/bib/edition',
	'Volume' => 'meta/bib/volume',
	'NumberOfVolumes' => 'meta/bib/number-of-volumes',
	'Pages' => 'meta/bib/number-of-pages',
	'#Cover pages only, book sections have been extracted' => 'meta/bib/comment'
    },
    'Book Section' => {
	'_name' => 'inbook',
	'Author' => 'meta/bib/author',
	'Title' => 'meta/bib/title',
	'Year' => 'meta/bib/year',
	'SecondaryTitle' => 'meta/bib/book-title',
	'SecondaryAuthor' => 'meta/bib/editor',
	'Volume' => 'meta/bib/volume',
	'NumberOfVolumes' => 'meta/bib/number-of-volumes',
	'Pages' => 'meta/bib/pages'
    },
    # NOTE(review): for books the constant "cover pages only" comment sits on
    # the parenthesized type '(Book)'; for edited books it sits on the
    # un-parenthesized 'Edited Book' while '(Edited Book)' has none --
    # confirm that this is intended and the two records are not swapped.
    'Edited Book' => {
	'_name' => 'edited-book',
	'Author' => 'meta/bib/editor',
	'Title' => 'meta/bib/title',
	'Year' => 'meta/bib/year',
	'Place_Published' => 'meta/bib/city',
	'Publisher' => 'meta/bib/publisher',
	'Edition' => 'meta/bib/edition',
	'Volume' => 'meta/bib/volume',
	'NumberOfVolumes' => 'meta/bib/number-of-volumes',
	'Pages' => 'meta/bib/number-of-pages',
	'#Cover pages only, book sections have been extracted' => 'meta/bib/comment'
    },
    '(Edited Book)' => {
	'_name' => 'edited-book',
	'Author' => 'meta/bib/editor',
	'Title' => 'meta/bib/title',
	'Year' => 'meta/bib/year',
	'Place_Published' => 'meta/bib/city',
	'Publisher' => 'meta/bib/publisher',
	'Edition' => 'meta/bib/edition',
	'Volume' => 'meta/bib/volume',
	'NumberOfVolumes' => 'meta/bib/number-of-volumes',
	'Pages' => 'meta/bib/number-of-pages'
    },
    'Journal Article' => {
	'_name' => 'journal-article',
	'Author' => 'meta/bib/author',
	'Title' => 'meta/bib/title',
	'Year' => 'meta/bib/year',
	'SecondaryTitle' => 'meta/bib/journal',
	'Volume' => 'meta/bib/volume',
	'Number_Issue' => 'meta/bib/issue',
	'Pages' => 'meta/bib/pages'
    },
    '(JournalVolume)' => {
	'_name' => 'journal-volume',
	'SecondaryTitle' => 'meta/bib/title',
	'SecondaryAuthor' => 'meta/bib/editor',
	'Publisher' => 'meta/bib/publisher',
	'Place_Published' => 'meta/bib/city',
	'Year' => 'meta/bib/year',
	'Volume' => 'meta/bib/volume',
	'Pages' => 'meta/bib/number-of-pages',
	'#Cover pages only, articles have been extracted' => 'meta/bib/comment'
    },
    # NOTE(review): 'Journal' is written with the type name 'report' (the same
    # '_name' as 'Report' and 'Trade Catalogue') -- confirm this is intended.
    'Journal' => {
	'_name' => 'report',
	'Title' => 'meta/bib/title',
	'SecondaryTitle' => 'meta/bib/institution',
	'Author' => 'meta/bib/author',
	'Place_Published' => 'meta/bib/city',
	'Year' => 'meta/bib/year',
	'Date' => 'meta/bib/date',
	'Pages' => 'meta/bib/pages',
    },
    # NOTE(review): this record and 'Newspaper Article' use the key
    # 'Secondary_Title', every other record uses 'SecondaryTitle' -- verify
    # against the element names in the input dump.
    'Magazine Article' => {
	'_name' => 'magazine-article',
	'Author' => 'meta/bib/author',
	'Title' => 'meta/bib/title',
	'Year' => 'meta/bib/year',
	'Secondary_Title' => 'meta/bib/magazine',
	'Number_Issue' => 'meta/bib/issue-number',
	'Date' => 'meta/bib/issue-date',
	'Pages' => 'meta/bib/pages'
    },
    'Newspaper Article' => {
	'_name' => 'newspaper-article',
	'Author' => 'meta/bib/author',
	'Title' => 'meta/bib/title',
	'Year' => 'meta/bib/year',
	'Secondary_Title' => 'meta/bib/newspaper',
	'Date' => 'meta/bib/issue-date',
	'Pages' => 'meta/bib/pages'
    },
    'Report' => {
	'_name' => 'report',
	'Author' => 'meta/bib/author',
	'Title' => 'meta/bib/title',
	'Year' => 'meta/bib/year',
	'Place_Published' => 'meta/bib/city',
	'Date' => 'meta/bib/date',
	'SecondaryTitle' => 'meta/bib/type',
	'Pages' => 'meta/bib/pages'
    },
    # here the type switch field itself is also copied into the record
    # (as meta/bib/type)
    'Trade Catalogue' => {
	'_name' => 'report',
	'Author' => 'meta/bib/author',
	'Title' => 'meta/bib/title',
	'Year' => 'meta/bib/year',
	'Place_Published' => 'meta/bib/city',
	'Date' => 'meta/bib/date',
	'Volume' => 'meta/bib/volume',
	'NumberOfVolumes' => 'meta/bib/number-of-volumes',
	'ReferenceType' => 'meta/bib/type',
	'Pages' => 'meta/bib/pages'
    },
    'Thesis' => {
	'_name' => 'thesis',
	'Author' => 'meta/bib/author',
	'Title' => 'meta/bib/title',
	'Place_Published' => 'meta/bib/city',
	'Publisher' => 'meta/bib/university',
	'Date' => 'meta/bib/date',
	'TypeOfWork' => 'meta/bib/type',
	'Pages' => 'meta/bib/number-of-pages'
    },
    'Manuscript' => {
	'_name' => 'manuscript',
	'Author' => 'meta/bib/author',
	'Title' => 'meta/bib/title',
	'Year' => 'meta/bib/year',
	'Place_Published' => 'meta/bib/location',
	'Pages' => 'meta/bib/pages'
    }
    );
# language element
# (name of the input element whose value is translated through %lang_map;
# it is also a key of %gen_map)
my $lang_field = 'Custom2_Language';
# languages to iso codes
# (a language name that is not listed here makes convert_bib() log an error
# and give up on the entry)
my %lang_map = (
    'German' => 'de',
    'English' => 'en',
    'Italian' => 'it',
    'French' => 'fr',
    'Latin' => 'la',
    'Japanese' => 'ja',
    'Dutch' => 'nl',
    'Spanish' => 'es',
    'Swedish' => 'sv',
    'Russian' => 'ru',
    'Polish' => 'pl',
    'Greek' => 'el'
    );
# storage fields
# name of the input element holding the document id; the document directory
# is looked up as "lit<id>" below the archive or online base directory
my $arch_id_field = 'ID';
# name of the input element that is read in online mode (when no access
# option was given) to decide between free and institutional access
my $access_free_field = 'online';

#######################################################
# internal parameters
#

# base directories below which the "lit<ID>" document directories are searched
my $lib_arch_dir = '/mpiwg/archive/data/vlp';
my $lib_online_dir = '/mpiwg/online/permanent/vlp';

# parsed command line parameters; without any parameter only the usage is shown
my $args = MPIWGStor::parseargs();
if (not %{$args}) {
    print "$help\n";
    exit 1;
}

# debug level
# ($debug is not declared in this file; presumably it is provided by
# MPIWGStor -- TODO confirm)
$debug = exists $args->{'debug'} ? $args->{'debug'} : 0;

# simulate action only
my $dry_run = exists $args->{'dry-run'} ? $args->{'dry-run'} : 0;
logger('DEBUG', "dry-run: $dry_run");

# replace existing index files
my $do_replace = exists $args->{'replace'} ? $args->{'replace'} : 0;
logger('DEBUG', "replace: $do_replace");

# use online mode
my $online_mode = exists $args->{'online-mode'} ? $args->{'online-mode'} : 0;
logger('DEBUG', "online_mode: $online_mode");

# use archive mode
my $archive_mode = exists $args->{'archive-mode'} ? $args->{'archive-mode'} : 0;
logger('DEBUG', "archive_mode: $archive_mode");

# create texttool tag (online mode only); note that this one defaults to on
my $texttool = exists $args->{'texttool'} ? $args->{'texttool'} : 1;
logger('DEBUG', "texttool: $texttool");
# image dir for texttool (relative to the document directory)
my $texttool_img_dir = "pages";

# access type ("free", an institution name, or empty when not given)
my $access_type = exists $args->{'access'} ? $args->{'access'} : "";

# index.meta namespace (not really implemented!)
my $namespace = "";


# counters; $errcnt and $warncnt are reported at the end of the run
my $xml_changed = 0;
my $errcnt = 0;
my $warncnt = 0;

#######################################################
# check parameters that were passed to the program
#
# the input file is taken from the 'path' entry of the parsed parameters
my $infile = $$args{'path'};
if (! $infile) {
    logger("ABORT", "no input file given!");
    exit 1;
}
# strip double slashes: collapse every run of slashes in the path to a
# single one (without /g only the first "//" would be repaired)
$infile =~ s{/{2,}}{/}g;
if (! -f $infile) {
    logger("ABORT", "input file \'$infile\' doesn't exist!");
    exit 1;
}


#######################################################
# subroutines
#


# Look up the archive directory of one input entry.
#
# The id is read from the entry's id field and passed through sstrip(), the
# same way find_permanent_dir() treats it, so that both lookups work on the
# same id string. The directory is "<archive base>/lit<id>".
#
# Parameter: $input_node - the entry node (must support findvalue()).
# Returns the directory path if it exists, nothing otherwise (no id or
# no such directory).
sub find_arch_dir {
    my ($input_node) = @_;

    my $bib_id = sstrip($input_node->findvalue("fm:$arch_id_field"));
    #logger('DEBUG', "bib id: $bib_id");
    return if not $bib_id;

    my $dir = "$lib_arch_dir/lit$bib_id";
    if (-d $dir) {
        logger('DEBUG', "directory $dir exists");
        return $dir;
    }
    return;
}

# Look up the online (permanent) directory of one input entry.
#
# Parameter: $input_node - the entry node (must support findvalue()).
# Returns the path "<online base>/lit<id>" if that directory exists.
# Returns nothing if the entry has no id (this is logged and counted as an
# error) or if the directory does not exist.
sub find_permanent_dir {
    my ($input_node) = @_;

    my $id = sstrip($input_node->findvalue("fm:$arch_id_field"));
    unless ($id) {
        logger('ERROR', "no ID field for online permanent entry");
        $errcnt++;
        return;
    }

    my $dir = "$lib_online_dir/lit$id";
    return unless -d $dir;

    logger('DEBUG', "directory $dir exists");
    return $dir;
}


# Copy the bibliographic fields of one input entry into the index tree.
#
# Parameters:
#   $input_node - the entry node whose child nodes are the fields
#   $index_root - root element below which the mapped paths are created
#   $index_doc  - the index document (accepted but not used here)
#
# Returns the number of generic, type and sub type fields that were written.
# Returns 0 as soon as an unknown language or an unknown type is found (the
# error is logged and counted before returning).
sub convert_bib {
    my ($input_node, $index_root, $index_doc) = @_;
    my $converted = 0;
    my $type = "";
    my $type_path = "";

    # first pass: generic fields and the type switch field
    for my $child ($input_node->getChildNodes()) {
        my $field = $child->nodeName();
        my $val = $child->textContent();

        if (exists $gen_map{$field}) {
            if ($field eq $lang_field) {
                # an empty language is only worth a warning ...
                unless ($val) {
                    logger('WARNING', "no language tag");
                    $warncnt++;
                    next;
                }
                # ... but a language without iso code aborts the entry
                unless (exists $lang_map{$val}) {
                    logger('ERROR', "unknown language: $val! skipping...");
                    $errcnt++;
                    return 0;
                }
                $val = $lang_map{$val};
            }
            my $target = create_element_path($gen_map{$field}, $index_root, $namespace);
            $target->appendTextNode($val);
            $converted++;
            next;
        }

        next unless exists $type_map{$field};

        # type switch field: remember the type for the second pass
        $type_path = $type_map{$field};
        $type = $val;
        unless (exists $subtype_map{$val}) {
            logger('ERROR', "unknown bib type $val! skipping...");
            $errcnt++;
            return 0;
        }
        my $indextype = $subtype_map{$val}->{'_name'};
        create_element_path("$type_path=$indextype", $index_root, $namespace);
        $converted++;
    }

    # without a type there are no sub type fields to process
    return $converted unless $type;

    # second pass: fields that depend on the type
    my $fields = $subtype_map{$type};
    for my $child ($input_node->getChildNodes()) {
        my $field = $child->nodeName();
        my $val = $child->textContent();
        next unless exists $fields->{$field};

        my $target = create_element_path($fields->{$field}, $index_root, $namespace);
        $target->appendTextNode($val);
        $converted++;
    }

    # constant fields: the key (without the leading #) is the text to write;
    # these are not counted
    for my $key (keys %{$fields}) {
        next unless $key =~ /^\#(.*)/;
        my $text = $1;
        create_text_path($fields->{$key}, $text, $index_root, $namespace);
    }

    return $converted;
}



# Run process_fm_entry() on every fm:ROW node found below the given root,
# in the order findnodes() returns them. The entries are numbered from 0 in
# the log output.
#
# Parameter: $input_root - node that is searched for fm:ROW nodes.
sub process_all_fm_entries {
    my ($input_root) = @_;
    my @rows = $input_root->findnodes('fm:ROW');

    for my $cnt (0 .. $#rows) {
        logger('INFO', "processing entry $cnt ...");
        process_fm_entry($rows[$cnt]);
    }
}


# Pick the document directory for an entry: the archive directory when only
# archive mode is selected, the online (permanent) directory in every other
# case (online mode, or no mode given).
sub _find_doc_dir {
    my ($input_node) = @_;
    if ($archive_mode and not $online_mode) {
        return find_arch_dir($input_node);
    }
    return find_permanent_dir($input_node);
}

# Add the access-conditions tag to the index tree. An explicit access option
# wins; otherwise, in online mode, the entry's "online" field decides between
# free access and access for the institution 'mpiwg'. Without option and
# outside online mode nothing is added.
sub _add_access_conditions {
    my ($input_node, $index_root) = @_;
    my $institution;

    if ($access_type) {
        $institution = $access_type if $access_type ne "free";
    } elsif ($online_mode) {
        # read access conditions from "online" field in DB dump
        my $online = sstrip($input_node->findvalue("fm:$access_free_field"));
        $institution = 'mpiwg' if not $online;
    } else {
        return;
    }

    if (defined $institution) {
        my $acc_tag = create_element_path('meta/access-conditions/access@type=institution', $index_root, $namespace);
        create_text_path('name', $institution, $acc_tag, $namespace);
    } else {
        create_element_path('meta/access-conditions/access@type=free', $index_root, $namespace);
    }
    return;
}

# Build the index.meta document for one input entry and write it into the
# entry's document directory.
#
# Parameter: $input_node - the entry node.
#
# The entry is skipped (nothing is written) when
#   - the document directory cannot be found (counted as error),
#   - an index.meta already exists and replacing was not requested,
#   - convert_bib() reports that no field was converted (counted as error).
# In dry-run mode the document is only logged instead of written.
sub process_fm_entry {
    my ($input_node) = @_;

    # try to find the document directory
    my $doc_dir = _find_doc_dir($input_node);
    if (! $doc_dir) {
        logger('ERROR', "document directory not found! skipping...");
        $errcnt++;
        return;
    }

    # check if index.meta exists
    if (-f "$doc_dir/index.meta" and not $do_replace) {
        logger('DEBUG', "index file in $doc_dir exists");
        return;
    }

    # the document is only created once it is clear that it is needed
    my $index_doc = XML::LibXML::Document->createDocument('1.0', 'UTF-8');
    my $index_root = $index_doc->createElementNS($namespace, 'resource');
    $index_root->addChild($index_doc->createAttributeNS($namespace, 'version', '1.1'));
    $index_root->addChild($index_doc->createAttributeNS($namespace, 'type', 'MPIWG'));
    $index_doc->setDocumentElement($index_root);

    # add standard stuff to index.meta
    my ($docname) = split_file_path($doc_dir);
    # one timestamp for all dates, so they cannot differ within one file
    my $now = stime(time);
    # name and date
    create_text_path('name', $docname, $index_root, $namespace);
    create_text_path('archive-path', $doc_dir, $index_root, $namespace);
    create_text_path('archive-creation-date', $now, $index_root, $namespace);
    create_text_path('creator', 'vlp', $index_root, $namespace);
    create_text_path('description', 'a scanned document', $index_root, $namespace);
    if ($archive_mode) {
        # acquisition
        create_text_path('meta/acquisition/date', $now, $index_root, $namespace);
        create_text_path('meta/acquisition/provider/provider-id', 'vlp', $index_root, $namespace);
        create_text_path('meta/acquisition/provider/address', 'Max Planck Institute for the History of Science', $index_root, $namespace);
    }
    # media
    create_text_path('media-type', 'image', $index_root, $namespace);
    create_text_path('meta/content-type', 'scanned document', $index_root, $namespace);
    # access
    _add_access_conditions($input_node, $index_root);

    # texttool tag with image dir
    if ($online_mode && $texttool) {
        if (-d "$doc_dir/$texttool_img_dir") {
            create_text_path('meta/texttool/image', $texttool_img_dir, $index_root, $namespace);
        } else {
            logger('WARNING', "page image directory missing!");
            $warncnt++;
        }
    }

    # convert bib entries
    my $cnt = convert_bib($input_node, $index_root, $index_doc);
    if ($cnt == 0) {
        # error or nothing to convert
        logger('ERROR', "no bibliographic metadata!");
        $errcnt++;
        return;
    }

    # write new index.meta file
    if ($dry_run) {
        logger('DEBUG', "would write $doc_dir/index.meta");
        logger('DEBUG', $index_doc->toString(1));
    } else {
        write_xml($index_doc, "$doc_dir/index.meta");
    }

}





#######################################################
# Main
#

# load filemaker xml dump
my ($input_doc, $input_root) = read_xml($infile);

# bind the prefix 'fm' to the namespace of the root element; the XPath
# expressions in the subroutines above use this prefix
my $fm_namespace = $input_root->namespaceURI();
$input_root->setNamespace($fm_namespace, 'fm', 1);

# create an index.meta for every entry of the dump
process_all_fm_entries($input_root);

# summary; any error makes the program end with a non-zero exit code
logger("INFO", "$warncnt warnings");
logger("INFO", "$errcnt errors");
if ($errcnt > 0) {
    logger("ABORT", "there were errors!");
    exit 1;
}
logger("DONE", "done something successfully!");


FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>