File:  [Repository] / foxridge-archiver / makemeta-vlp.pl
Revision 1.3: download - view: text, annotated - select for diffs - revision graph
Tue Jun 13 15:04:27 2006 UTC (18 years ago) by casties
Branches: MAIN
CVS tags: HEAD
added document type (Book)

    1: #!/usr/local/bin/perl -w
    2: 
    3: use strict;
    4: use XML::LibXML;
    5: 
    6: use lib '/usr/local/mpiwg/archive';
    7: use MPIWGStor;
    8: 
    # make output unbuffered
    $| = 1;

    # program version
    my $version = "0.2.1 (12.6.2006 ROC)";
    # usage text; the option lines inside the string are literal output
    # and therefore must stay unindented
    my $help =
    "use: makemeta-vlp [options] file.xml
options:
  -debug  show debugging info
  -dry-run  simulate, don't do anything
  -replace  replace existing index files
  -online-mode  mode for creating online/permanent files
  -archive-mode  mode for creating archive/data files
";
    # announce program name and version in the log
    logger("INFO", "makemeta-vlp $version");
   24: 
   25: ###########################################
   26: # mappings
   27: 
   # fields handled at top level, whatever the document type:
   # input field name => index.meta path
   my %gen_map = ('Custom2_Language' => 'meta/lang');
   # field whose value selects the sub type:
   # input field name => index.meta path that receives the mapped type name
   my %type_map = ('ReferenceType' => 'meta/bib@type');
   # sub type mappings: value of the type field => { input field name => index.meta path }
   #   '_name'   type name written to index.meta for this sub type
   #   '#text'   a key starting with '#' is not an input field; the text after
   #             the '#' is written as a constant to the given path

   # fields shared by the book-like types
   my %book_fields = (
       'Author'          => 'meta/bib/author',
       'Title'           => 'meta/bib/title',
       'Year'            => 'meta/bib/year',
       'Place_Published' => 'meta/bib/city',
       'Publisher'       => 'meta/bib/publisher',
       'Edition'         => 'meta/bib/edition',
       'Volume'          => 'meta/bib/volume',
       'NumberOfVolumes' => 'meta/bib/number-of-volumes',
       'Pages'           => 'meta/bib/number-of-pages',
   );

   my %subtype_map = (
       'Book' => {
           '_name' => 'book',
           %book_fields,
       },
       '(Book)' => {
           '_name' => 'book',
           %book_fields,
           '#Cover pages only, book sections have been extracted' => 'meta/bib/comment',
       },
       'Book Section' => {
           '_name'           => 'inbook',
           'Author'          => 'meta/bib/author',
           'Title'           => 'meta/bib/title',
           'Year'            => 'meta/bib/year',
           'SecondaryTitle'  => 'meta/bib/book-title',
           'SecondaryAuthor' => 'meta/bib/editor',
           'Volume'          => 'meta/bib/volume',
           'NumberOfVolumes' => 'meta/bib/number-of-volumes',
           'Pages'           => 'meta/bib/pages',
       },
       'Edited Book' => {
           # same fields as a book, but the "Author" is the editor
           %book_fields,
           '_name'  => 'edited-book',
           'Author' => 'meta/bib/editor',
       },
       'Journal Article' => {
           '_name'          => 'journal-article',
           'Author'         => 'meta/bib/author',
           'Title'          => 'meta/bib/title',
           'Year'           => 'meta/bib/year',
           'SecondaryTitle' => 'meta/bib/journal',
           'Volume'         => 'meta/bib/volume',
           'Number_Issue'   => 'meta/bib/issue',
           'Pages'          => 'meta/bib/pages',
       },
       'Magazine Article' => {
           '_name'           => 'magazine-article',
           'Author'          => 'meta/bib/author',
           'Title'           => 'meta/bib/title',
           'Year'            => 'meta/bib/year',
           # NOTE(review): every other type spells this field 'SecondaryTitle';
           # confirm against the input that 'Secondary_Title' is intended
           'Secondary_Title' => 'meta/bib/magazine',
           'Number_Issue'    => 'meta/bib/issue-number',
           'Date'            => 'meta/bib/issue-date',
           'Pages'           => 'meta/bib/pages',
       },
       'Report' => {
           '_name'           => 'report',
           'Author'          => 'meta/bib/author',
           'Title'           => 'meta/bib/title',
           'Year'            => 'meta/bib/year',
           'Place_Published' => 'meta/bib/city',
           'Date'            => 'meta/bib/date',
           'SecondaryTitle'  => 'meta/bib/type',
           'Pages'           => 'meta/bib/pages',
       },
       'Trade Catalogue' => {
           '_name'           => 'report',
           'Author'          => 'meta/bib/author',
           'Title'           => 'meta/bib/title',
           'Year'            => 'meta/bib/year',
           'Place_Published' => 'meta/bib/city',
           'Date'            => 'meta/bib/date',
           'Volume'          => 'meta/bib/volume',
           'NumberOfVolumes' => 'meta/bib/number-of-volumes',
           'ReferenceType'   => 'meta/bib/type',
           'Pages'           => 'meta/bib/pages',
       },
       'Thesis' => {
           '_name'           => 'thesis',
           'Author'          => 'meta/bib/author',
           'Title'           => 'meta/bib/title',
           'Place_Published' => 'meta/bib/city',
           'Publisher'       => 'meta/bib/university',
           'Date'            => 'meta/bib/date',
           'TypeOfWork'      => 'meta/bib/type',
           'Pages'           => 'meta/bib/number-of-pages',
       },
       'Manuscript' => {
           '_name'           => 'manuscript',
           'Author'          => 'meta/bib/author',
           'Title'           => 'meta/bib/title',
           'Year'            => 'meta/bib/year',
           'Place_Published' => 'meta/bib/location',
           'Pages'           => 'meta/bib/pages',
       },
   );
  # name of the input field that holds the language
  my $lang_field = 'Custom2_Language';
  # language names as they appear in the input => iso codes
  my %lang_map = (
      German   => 'de',
      English  => 'en',
      Italian  => 'it',
      French   => 'fr',
      Latin    => 'la',
      Japanese => 'ja',
      Dutch    => 'nl',
      Spanish  => 'es',
      Swedish  => 'sv',
  );
  # name of the input field that holds the storage ID
  my $arch_id_field = 'ID';
  163: 
  #######################################################
  # internal parameters
  #

  # storage base directories
  my $lib_arch_dir = '/mpiwg/archive/data/vlp';
  my $lib_online_dir = '/mpiwg/online/permanent/vlp';

  # read command line parameters; without any, show the usage text and stop
  my $args = MPIWGStor::parseargs;
  unless (scalar(%$args)) {
      print $help, "\n";
      exit 1;
  }

  # debug level
  # ($debug is not declared in this file -- presumably exported by MPIWGStor)
  $debug = (exists $args->{'debug'}) ? $args->{'debug'} : 0;

  # simulate action only
  my $dry_run = (exists $args->{'dry-run'}) ? $args->{'dry-run'} : 0;
  logger('DEBUG', "dry-run: $dry_run");

  # replace existing index files
  my $do_replace = (exists $args->{'replace'}) ? $args->{'replace'} : 0;
  logger('DEBUG', "replace: $do_replace");

  # use online mode
  my $online_mode = (exists $args->{'online-mode'}) ? $args->{'online-mode'} : 0;
  logger('DEBUG', "online_mode: $online_mode");

  # use archive mode
  my $archive_mode = (exists $args->{'archive-mode'}) ? $args->{'archive-mode'} : 0;
  logger('DEBUG', "archive_mode: $archive_mode");

  # index.meta namespace (not really implemented!)
  my $namespace = "";


  # run-wide state: change flag and counters reported at the end of the run
  my $xml_changed = 0;
  my $errcnt = 0;
  my $warncnt = 0;
  205: 
  #######################################################
  # check parameters that were passed to the program
  #

  # the input file is taken from the 'path' argument
  my $infile = $$args{'path'};
  if (! $infile) {
      logger("ABORT", "no input file given!");
      exit 1;
  }
  # collapse every run of slashes, anywhere in the path, into a single slash
  $infile =~ s{//+}{/}g;
  if (! -f $infile) {
      logger("ABORT", "input file \'$infile\' doesn't exist!");
      exit 1;
  }
  220: 
  221: 
  222: #######################################################
  223: # subroutines
  224: #
  225: 
  226: 
  # Find the archive directory that belongs to an input entry.
  #
  # $input_node: entry node; the value of its fm:<$arch_id_field> child is used.
  # Returns "$lib_arch_dir/lit<ID>" if that directory exists; returns nothing
  # (undef in scalar context) if the entry has no ID or the directory is missing.
  sub find_arch_dir {
      my ($input_node) = @_;

      # clean the ID with sstrip, the same way find_permanent_dir does, so both
      # lookups build the directory name from the same value
      my $bib_id = sstrip($input_node->findvalue("fm:$arch_id_field"));
      if (! $bib_id) {
          logger('DEBUG', "no ID field for archive entry");
          return;
      }

      my $dir = "$lib_arch_dir/lit$bib_id";
      if (! -d $dir) {
          logger('DEBUG', "directory $dir doesn't exist");
          return;
      }
      logger('DEBUG', "directory $dir exists");
      return $dir;
  }
  242: 
  # Build the online/permanent directory name for an input entry.
  #
  # $input_node: entry node; the value of its fm:<$arch_id_field> child is used.
  # Returns "/mpiwg/online/permanent/lit<ID>" without checking that it exists.
  # Without an ID an error is logged and counted and nothing is returned.
  sub find_permanent_dir {
      my ($input_node) = @_;
      # NOTE(review): the file-level $lib_online_dir (".../permanent/vlp") is
      # not used here -- confirm which base directory is intended
      my $online_base = '/mpiwg/online/permanent';
      my $dest_id = sstrip($input_node->findvalue("fm:$arch_id_field"));

      return "$online_base/lit$dest_id" if $dest_id;

      logger('ERROR', "no ID field for online permanent entry");
      $errcnt++;
      return;
  }
  255: 
  256: 
  # Convert the bibliographic fields of one input entry into index.meta elements.
  #
  # $input_node: entry node whose child nodes are the input fields
  # $index_root: root element of the index.meta document being built
  # $index_doc:  the index.meta document (not used in this sub)
  #
  # Returns the number of elements created from input fields. Returns 0 after
  # logging and counting an error if the language or the bib type is unknown.
  sub convert_bib {
      my ($input_node, $index_root, $index_doc) = @_;
      my $converted = 0;
      my $bib_type = "";

      # first pass: general fields and the field that selects the sub type
      FIELD:
      for my $child ($input_node->getChildNodes()) {
          my $field = $child->nodeName();
          my $text = $child->textContent();

          if (exists $gen_map{$field}) {
              if ($field eq $lang_field) {
                  # an empty language only counts as a warning
                  unless ($text) {
                      logger('WARNING', "no language tag");
                      $warncnt++;
                      next FIELD;
                  }
                  # the language name has to be convertible to its iso code
                  unless (exists $lang_map{$text}) {
                      logger('ERROR', "unknown language: $text! skipping...");
                      $errcnt++;
                      return 0;
                  }
                  $text = $lang_map{$text};
              }
              my $target = create_element_path($gen_map{$field}, $index_root, $namespace);
              $target->appendTextNode($text);
              $converted++;
              next FIELD;
          }

          next FIELD unless exists $type_map{$field};

          # type field: remember the type and check it against the known ones
          $bib_type = $text;
          unless (exists $subtype_map{$text}) {
              logger('ERROR', "unknown bib type $text! skipping...");
              $errcnt++;
              return 0;
          }
          my $type_spec = $type_map{$field} . '=' . $subtype_map{$text}->{'_name'};
          create_element_path($type_spec, $index_root, $namespace);
          $converted++;
      }

      # without a type there are no sub type fields to process
      return $converted unless $bib_type;

      # second pass: fields that belong to the selected sub type
      my $type_fields = $subtype_map{$bib_type};
      for my $child ($input_node->getChildNodes()) {
          my $field = $child->nodeName();
          my $text = $child->textContent();
          next unless exists $type_fields->{$field};
          my $target = create_element_path($type_fields->{$field}, $index_root, $namespace);
          $target->appendTextNode($text);
          $converted++;
      }

      # constant fields: the key is '#' plus the text to write (these are not counted)
      for my $key (keys %{$type_fields}) {
          next unless $key =~ /^\#(.*)/;
          my $constant = $1;
          create_text_path($type_fields->{$key}, $constant, $index_root, $namespace);
      }

      return $converted;
  }
  327: 
  328: 
  329: 
  # Run process_fm_entry on every fm:ROW node found below $input_root,
  # logging the running entry number (starting at 0) before each one.
  sub process_all_fm_entries {
      my ($input_root) = @_;
      my $entry_no = 0;

      for my $row ($input_root->findnodes('fm:ROW')) {
          logger('INFO', "processing entry $entry_no ...");
          process_fm_entry($row);
          $entry_no += 1;
      }
  }
  340: 
  341: 
  # Create the index.meta file for one input entry.
  #
  # $input_node: entry node (an fm:ROW) holding the ID and the bib fields.
  #
  # The entry is skipped (error logged and counted) if its document directory
  # cannot be determined or if no bibliographic metadata could be converted.
  # An existing index.meta is left alone unless replacing was requested.
  # In dry-run mode the resulting XML is only logged, not written.
  # Returns nothing.
  sub process_fm_entry {
      my ($input_node) = @_;

      # the archive directory is only used in archive mode without online mode;
      # online/permanent is also the default when no mode was given
      my $doc_dir = ($archive_mode && ! $online_mode)
          ? find_arch_dir($input_node)
          : find_permanent_dir($input_node);
      if (! $doc_dir) {
          logger('ERROR', "document directory not found! skipping...");
          $errcnt++;
          return;
      }

      # keep an existing index.meta unless it should be replaced
      if ((-f "$doc_dir/index.meta") && ! $do_replace) {
          logger('DEBUG', "index file in $doc_dir exists");
          return;
      }

      # set up the new document only after the skip checks above
      my $index_doc = XML::LibXML::Document->createDocument('1.0', 'UTF-8');
      my $index_root = $index_doc->createElementNS($namespace, 'resource');
      $index_root->addChild($index_doc->createAttributeNS($namespace, 'version', '1.1'));
      $index_root->addChild($index_doc->createAttributeNS($namespace, 'type', 'MPIWG'));
      $index_doc->setDocumentElement($index_root);

      # only the name part of the directory is needed
      my ($docname) = split_file_path($doc_dir);
      # one timestamp for all date fields of this entry
      my $now = stime(time);

      # standard fields as [path, text] pairs, in output order
      my @std_fields = (
          ['name', $docname],
          ['archive-path', $doc_dir],
          ['archive-creation-date', $now],
          ['creator', 'vlp'],
          ['description', 'a scanned document'],
      );
      if ($archive_mode) {
          push @std_fields, (
              # acquisition
              ['meta/acquisition/date', $now],
              ['meta/acquisition/provider/provider-id', 'vlp'],
              ['meta/acquisition/provider/address', 'Max Planck Institute for the History of Science'],
              # image acquisition
              ['meta/image-acquisition/device', 'Flatbed Scanner'],
              ['meta/image-acquisition/image-type', 'Greyscale'],
              ['meta/image-acquisition/production-comment', 'Raw scans in \'raw\' folder, cleaned pages in \'pages\' folder.'],
          );
      }
      push @std_fields, (
          # media
          ['media-type', 'image'],
          ['meta/content-type', 'scanned document'],
      );
      for my $std_field (@std_fields) {
          my ($path, $text) = @{$std_field};
          create_text_path($path, $text, $index_root, $namespace);
      }

      # convert bib entries; a count of 0 means error or nothing to convert
      my $cnt = convert_bib($input_node, $index_root, $index_doc);
      if ($cnt == 0) {
          logger('ERROR', "no bibliographic metadata!");
          $errcnt++;
          return;
      }

      # write new index.meta file
      if ($dry_run) {
          logger('DEBUG', "would write $doc_dir/index.meta");
          logger('DEBUG', $index_doc->toString(1));
      } else {
          write_xml($index_doc, "$doc_dir/index.meta");
      }
      return;
  }
  413: 
  414: 
  415: 
  416: 
  417: 
  #######################################################
  # Main
  #

  # load filemaker xml dump
  my ($input_doc, $input_root) = read_xml($infile);

  # bind the prefix 'fm' to the namespace of the root element;
  # the subs above look up their nodes as fm:...
  my $fm_namespace = $input_root->namespaceURI();
  $input_root->setNamespace($fm_namespace, 'fm', 1);

  # convert all entries of the dump
  process_all_fm_entries($input_root);

  # summary; any error makes the program end with exit status 1
  logger("INFO", "$warncnt warnings");
  logger("INFO", "$errcnt errors");
  if ($errcnt > 0) {
      logger("ABORT", "there were errors!");
      exit 1;
  }
  logger("DONE", "done something successfully!");
  440: 

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>