File:  [Repository] / foxridge-archiver / makemeta-vlp.pl
Revision 1.2: download - view: text, annotated - select for diffs - revision graph
Tue Sep 20 17:44:48 2005 UTC (18 years, 9 months ago) by casties
Branches: MAIN
CVS tags: HEAD
corrected perl lib path

    1: #!/usr/local/bin/perl -w
    2: 
    3: use strict;
    4: use XML::LibXML;
    5: 
    6: use lib '/usr/local/mpiwg/archive';
    7: use MPIWGStor;
    8: 
    # Turn off output buffering so log lines show up immediately.
    $| = 1;

    # Program version, reported in the startup log line below.
    my $version = "0.2 (19.9.2005 ROC)";

    # Usage text, printed when the script is started without any arguments.
    # Built line by line; the trailing empty element supplies the final newline.
    # (The -dry-run description used to read "dont'do anything".)
    my $help = join("\n",
        "use: makemeta-vlp [options] file.xml",
        "options:",
        "  -debug  show debugging info",
        "  -dry-run  simulate, don't do anything",
        "  -replace  replace existing index files",
        "  -online-mode  mode for creating online/permanent files",
        "  -archive-mode  mode for creating archive/data files",
        "",
    );

    # logger() is not defined in this file -- presumably exported by MPIWGStor.
    logger("INFO", "makemeta-vlp $version");
   24: 
   ###########################################
   # mappings
   #
   # Every table maps the name of a child element of an input row to a
   # slash-separated element path below the index.meta root.

   # Fields that are copied for every entry, whatever its type.
   my %gen_map = (
       'Custom2_Language' => 'meta/lang',
   );

   # Field that selects the bibliographic sub type. Its value is looked up in
   # %subtype_map and the '_name' found there is written to the path given here.
   my %type_map = (
       'ReferenceType' => 'meta/bib@type',
   );

   # Per-type field mappings, keyed by the value of the type field.
   # '_name' is the type name written to index.meta; all other keys are input
   # field names.
   # NOTE(review): both 'Secondary_Title' and 'SecondaryTitle' occur as keys
   # below -- confirm against the real input field names.
   my %subtype_map = (
       'Book' => {
           '_name'           => 'book',
           'Author'          => 'meta/bib/author',
           'Title'           => 'meta/bib/title',
           'Year'            => 'meta/bib/year',
           'Place_Published' => 'meta/bib/city',
           'Publisher'       => 'meta/bib/publisher',
           'Edition'         => 'meta/bib/edition',
           'Volume'          => 'meta/bib/volume',
           'NumberOfVolumes' => 'meta/bib/number-of-volumes',
           'Pages'           => 'meta/bib/number-of-pages',
       },
       'Book Section' => {
           '_name'           => 'inbook',
           'Author'          => 'meta/bib/author',
           'Title'           => 'meta/bib/title',
           'Year'            => 'meta/bib/year',
           'Secondary_Title' => 'meta/bib/book-title',
           'SecondaryAuthor' => 'meta/bib/editor',
           'Volume'          => 'meta/bib/volume',
           'NumberOfVolumes' => 'meta/bib/number-of-volumes',
           'Pages'           => 'meta/bib/pages',
       },
       'Edited Book' => {
           '_name'           => 'edited-book',
           'Author'          => 'meta/bib/editor',
           'Title'           => 'meta/bib/title',
           'Year'            => 'meta/bib/year',
           'Place_Published' => 'meta/bib/city',
           'Publisher'       => 'meta/bib/publisher',
           'Edition'         => 'meta/bib/edition',
           'Volume'          => 'meta/bib/volume',
           'NumberOfVolumes' => 'meta/bib/number-of-volumes',
           'Pages'           => 'meta/bib/number-of-pages',
       },
       'Journal Article' => {
           '_name'          => 'journal-article',
           'Author'         => 'meta/bib/author',
           'Title'          => 'meta/bib/title',
           'Year'           => 'meta/bib/year',
           'SecondaryTitle' => 'meta/bib/journal',
           'Volume'         => 'meta/bib/volume',
           'Number_Issue'   => 'meta/bib/issue',
           'Pages'          => 'meta/bib/pages',
       },
       'Magazine Article' => {
           '_name'           => 'magazine-article',
           'Author'          => 'meta/bib/author',
           'Title'           => 'meta/bib/title',
           'Year'            => 'meta/bib/year',
           'Secondary_Title' => 'meta/bib/magazine',
           'Number_Issue'    => 'meta/bib/issue-number',
           'Date'            => 'meta/bib/issue-date',
           'Pages'           => 'meta/bib/pages',
       },
       'Report' => {
           '_name'           => 'report',
           'Author'          => 'meta/bib/author',
           'Title'           => 'meta/bib/title',
           'Year'            => 'meta/bib/year',
           'Place_Published' => 'meta/bib/city',
           'Date'            => 'meta/bib/date',
           'SecondaryTitle'  => 'meta/bib/type',
           'Pages'           => 'meta/bib/pages',
       },
       # NOTE(review): written to index.meta as type 'report', same as the
       # 'Report' entry above -- confirm this is intended.
       'Trade Catalogue' => {
           '_name'           => 'report',
           'Author'          => 'meta/bib/author',
           'Title'           => 'meta/bib/title',
           'Year'            => 'meta/bib/year',
           'Place_Published' => 'meta/bib/city',
           'Date'            => 'meta/bib/date',
           'Volume'          => 'meta/bib/volume',
           'NumberOfVolumes' => 'meta/bib/number-of-volumes',
           'ReferenceType'   => 'meta/bib/type',
           'Pages'           => 'meta/bib/pages',
       },
       'Thesis' => {
           '_name'           => 'thesis',
           'Author'          => 'meta/bib/author',
           'Title'           => 'meta/bib/title',
           'Place_Published' => 'meta/bib/city',
           'Publisher'       => 'meta/bib/university',
           'Date'            => 'meta/bib/date',
           'TypeOfWork'      => 'meta/bib/type',
           'Pages'           => 'meta/bib/number-of-pages',
       },
       'Manuscript' => {
           '_name'           => 'manuscript',
           'Author'          => 'meta/bib/author',
           'Title'           => 'meta/bib/title',
           'Year'            => 'meta/bib/year',
           'Place_Published' => 'meta/bib/location',
           'Pages'           => 'meta/bib/pages',
       },
   );

   # Name of the general field that carries the language; its value is
   # translated through %lang_map before it is written.
   my $lang_field = 'Custom2_Language';

   # Language names as found in the input, mapped to iso codes.
   my %lang_map = (
       'German'   => 'de',
       'English'  => 'en',
       'Italian'  => 'it',
       'French'   => 'fr',
       'Latin'    => 'la',
       'Japanese' => 'ja',
       'Dutch'    => 'nl',
       'Spanish'  => 'es',
       'Swedish'  => 'sv',
   );

   # Input field holding the ID used to build the storage directory name.
   my $arch_id_field = 'ID';
  150: 
  #######################################################
  # internal parameters
  #

  # storage base directories
  # NOTE(review): $lib_online_dir is not referenced anywhere in the code
  # visible here; find_permanent_dir uses its own hard-coded base.
  my $lib_arch_dir = '/mpiwg/archive/data/vlp';
  my $lib_online_dir = '/mpiwg/online/permanent/vlp';

  # read command line parameters; without any, print the usage text and quit
  my $args = MPIWGStor::parseargs;
  unless (%$args) {
      print $help, "\n";
      exit 1;
  }

  # debug level
  # ($debug is not declared in this file -- presumably exported by MPIWGStor)
  $debug = 0;
  $debug = $args->{'debug'} if exists $args->{'debug'};

  # simulate action only
  my $dry_run = 0;
  $dry_run = $args->{'dry-run'} if exists $args->{'dry-run'};
  logger('DEBUG', "dry-run: $dry_run");

  # replace existing index files
  my $do_replace = 0;
  $do_replace = $args->{'replace'} if exists $args->{'replace'};
  logger('DEBUG', "replace: $do_replace");

  # use online mode
  my $online_mode = 0;
  $online_mode = $args->{'online-mode'} if exists $args->{'online-mode'};
  logger('DEBUG', "online_mode: $online_mode");

  # use archive mode
  my $archive_mode = 0;
  $archive_mode = $args->{'archive-mode'} if exists $args->{'archive-mode'};
  logger('DEBUG', "archive_mode: $archive_mode");

  # index.meta namespace (not really implemented!)
  my $namespace = "";


  # bookkeeping; $errcnt and $warncnt are reported at the end of the run
  my $xml_changed = 0;
  my $errcnt = 0;
  my $warncnt = 0;
  #######################################################
  # check parameters that were passed to the program
  #

  # the input file is taken from the 'path' entry of the parsed arguments
  my $infile = $args->{'path'};
  if (! $infile) {
      logger("ABORT", "no input file given!");
      exit 1;
  }
  # collapse every run of two or more slashes into a single slash
  # (the former substitution had no /g and only fixed the first occurrence)
  $infile =~ s{/{2,}}{/}g;
  if (! -f $infile) {
      logger("ABORT", "input file '$infile' doesn't exist!");
      exit 1;
  }
  207: 
  208: 
  209: #######################################################
  210: # subroutines
  211: #
  212: 
  213: 
  # Find the existing archive directory that belongs to one input row.
  #
  # Parameters:
  #   $input_node - node of the row; its "fm:$arch_id_field" value is the ID
  #
  # Returns "$lib_arch_dir/lit<ID>" if that directory exists. Returns nothing
  # (undef in scalar context) when the row has no ID or the directory is
  # missing; reporting that is left to the caller.
  sub find_arch_dir {
      my ($input_node) = @_;

      # Clean the ID with sstrip, exactly like find_permanent_dir does, so that
      # whitespace around the value cannot end up in the directory name.
      my $bib_id = sstrip($input_node->findvalue("fm:$arch_id_field"));
      return unless $bib_id;

      my $dir = "$lib_arch_dir/lit$bib_id";
      return unless -d $dir;

      logger('DEBUG', "directory $dir exists");
      return $dir;
  }
  229: 
  # Build the online/permanent directory name for one input row.
  #
  # Parameters:
  #   $input_node - node of the row; its "fm:$arch_id_field" value is the ID
  #
  # Returns "/mpiwg/online/permanent/lit<ID>"; the directory is not checked for
  # existence. Without an ID an error is logged, $errcnt is incremented and
  # nothing is returned.
  # NOTE(review): the base used here differs from the file-level
  # $lib_online_dir ('/mpiwg/online/permanent/vlp') -- confirm which is meant.
  sub find_permanent_dir {
      my ($input_node) = @_;
      my $online_base = '/mpiwg/online/permanent';
      my $dest_id = sstrip($input_node->findvalue("fm:$arch_id_field"));

      if ($dest_id) {
          return "$online_base/lit$dest_id";
      }

      logger('ERROR', "no ID field for online permanent entry");
      $errcnt++;
      return;
  }
  242: 
  243: 
  # Copy the bibliographic fields of one input row into the index.meta tree.
  #
  # Parameters:
  #   $input_node - node whose child nodes carry the source fields
  #   $index_root - root element of the index.meta document being built
  #   $index_doc  - the index.meta document (accepted, but not used here)
  #
  # Returns the number of elements that were created. Returns 0 as soon as an
  # unknown language or an unknown bibliographic type is met; that case is also
  # logged and counted in $errcnt.
  sub convert_bib {
      my ($input_node, $index_root, $index_doc) = @_;
      my $created = 0;
      my $bib_type = "";

      # first pass: general fields and the type switch field
      for my $child ($input_node->getChildNodes()) {
          my $field = $child->nodeName();
          my $text = $child->textContent();

          if (exists $gen_map{$field}) {
              if ($field eq $lang_field) {
                  # an empty language only counts as a warning, the field is dropped
                  unless ($text) {
                      logger('WARNING', "no language tag");
                      $warncnt++;
                      next;
                  }
                  # the language name has to translate into an iso code
                  unless (exists $lang_map{$text}) {
                      logger('ERROR', "unknown language: $text! skipping...");
                      $errcnt++;
                      return 0;
                  }
                  $text = $lang_map{$text};
              }
              my $target = create_element_path($gen_map{$field}, $index_root, $namespace);
              $target->appendTextNode($text);
              $created++;
              next;
          }

          next unless exists $type_map{$field};

          # type field: remember the type and write its index.meta name
          $bib_type = $text;
          unless (exists $subtype_map{$text}) {
              logger('ERROR', "unknown bib type $text! skipping...");
              $errcnt++;
              return 0;
          }
          my $type_spec = $type_map{$field} . "=" . $subtype_map{$text}->{'_name'};
          create_element_path($type_spec, $index_root, $namespace);
          $created++;
      }

      # without a type there are no type specific fields to copy
      return $created unless $bib_type;

      # second pass: fields that belong to the detected type
      my $fields = $subtype_map{$bib_type};
      for my $child ($input_node->getChildNodes()) {
          my $field = $child->nodeName();
          next unless exists $fields->{$field};
          my $target = create_element_path($fields->{$field}, $index_root, $namespace);
          $target->appendTextNode($child->textContent());
          $created++;
      }
      return $created;
  }
  307: 
  308: 
  309: 
  # Run process_fm_entry on every 'fm:ROW' node found below the given node.
  #
  # Parameters:
  #   $input_root - node that is searched for 'fm:ROW' nodes
  #
  # Each row is announced in the log with its zero-based position.
  sub process_all_fm_entries {
      my ($input_root) = @_;
      my @rows = $input_root->findnodes('fm:ROW');

      for my $cnt (0 .. $#rows) {
          logger('INFO', "processing entry $cnt ...");
          process_fm_entry($rows[$cnt]);
      }
  }
  320: 
  321: 
  # Build the index.meta document for one input row and write it to the
  # document directory of that row.
  #
  # Parameters:
  #   $input_node - node of the row to process
  #
  # The row is skipped when no document directory is found (error), when an
  # index.meta already exists and -replace was not given, or when convert_bib
  # produced nothing (error). With -dry-run the document is only logged.
  sub process_fm_entry {
      my ($input_node) = @_;

      # new document with a <resource version="1.1" type="MPIWG"> root
      my $index_doc = XML::LibXML::Document->createDocument('1.0', 'UTF-8');
      my $index_root = $index_doc->createElementNS($namespace, 'resource');
      for my $attr (['version', '1.1'], ['type', 'MPIWG']) {
          $index_root->addChild($index_doc->createAttributeNS($namespace, @$attr));
      }
      $index_doc->setDocumentElement($index_root);

      # try to find the document directory; the archive lookup is only used
      # when archive mode is on and online mode is off
      my $doc_dir = ($archive_mode && !$online_mode)
          ? find_arch_dir($input_node)
          : find_permanent_dir($input_node);
      unless ($doc_dir) {
          logger('ERROR', "document directory not found! skipping...");
          $errcnt++;
          return;
      }

      # leave an existing index.meta alone unless replacing was requested
      if ((-f "$doc_dir/index.meta") && !$do_replace) {
          logger('DEBUG', "index file in $doc_dir exists");
          return;
      }

      # shorthand: add a text element at the given path below the root
      my $add_text = sub { return create_text_path(@_, $index_root, $namespace); };

      # add standard stuff to index.meta
      my ($docname) = split_file_path($doc_dir);
      # name and date
      $add_text->('name', $docname);
      $add_text->('archive-path', $doc_dir);
      $add_text->('archive-creation-date', stime(time));
      $add_text->('creator', 'vlp');
      $add_text->('description', 'a scanned document');
      if ($archive_mode) {
          # acquisition
          $add_text->('meta/acquisition/date', stime(time));
          $add_text->('meta/acquisition/provider/provider-id', 'vlp');
          $add_text->('meta/acquisition/provider/address',
                      'Max Planck Institute for the History of Science');
          # image acquisition
          $add_text->('meta/image-acquisition/device', 'Flatbed Scanner');
          $add_text->('meta/image-acquisition/image-type', 'Greyscale');
          $add_text->('meta/image-acquisition/production-comment',
                      "Raw scans in 'raw' folder, cleaned pages in 'pages' folder.");
      }
      # media
      $add_text->('media-type', 'image');
      $add_text->('meta/content-type', 'scanned document');

      # convert bib entries; 0 means an error or nothing to convert
      my $converted = convert_bib($input_node, $index_root, $index_doc);
      if ($converted == 0) {
          logger('ERROR', "no bibliographic metadata!");
          $errcnt++;
          return;
      }

      # write new index.meta file, or just show it in a dry run
      if (!$dry_run) {
          write_xml($index_doc, "$doc_dir/index.meta");
      }
      else {
          logger('DEBUG', "would write $doc_dir/index.meta");
          logger('DEBUG', $index_doc->toString(1));
      }

  }
  393: 
  394: 
  395: 
  396: 
  397: 
  #######################################################
  # Main
  #

  # load filemaker xml dump
  my ($input_doc, $input_root) = read_xml($infile);

  # set the namespace of the root element again, this time with the prefix
  # 'fm', which is the prefix used in the XPath expressions of the subs above
  my $fm_namespace = $input_root->namespaceURI();
  $input_root->setNamespace($fm_namespace, 'fm', 1);

  # convert every row of the dump
  process_all_fm_entries($input_root);

  # summary; any error makes the script exit with status 1
  logger("INFO", "$warncnt warnings");
  logger("INFO", "$errcnt errors");
  if ($errcnt > 0) {
      logger("ABORT", "there were errors!");
      exit 1;
  }
  logger("DONE", "done something successfully!");
  420: 

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>