File:  [Repository] / foxridge-archiver / makemeta-vlp.pl
Revision 1.10: download - view: text, annotated - select for diffs - revision graph
Thu Mar 16 17:00:43 2017 UTC (7 years, 3 months ago) by casties
Branches: MAIN
CVS tags: HEAD
updated to Ubuntu Perl paths.

    1: #!/usr/bin/perl -w
    2: 
    3: use strict;
    4: use XML::LibXML;
    5: 
    6: use lib '/usr/local/mpiwg/archive';
    7: use MPIWGStor;
    8: 
    9: # make output unbuffered
   10: $|=1;
   11: 
   12: # program version
   13: my $version = "0.2.7 (27.8.2010 ROC)";
   14: my $help = 
   15: "use: makemeta-vlp [options] file.xml
   16: options:
   17:   -debug  show debugging info
   18:   -dry-run  simulate, dont'do anything
   19:   -replace  replace existing index files
   20:   -online-mode  mode for creating online/permanent files
   21:   -archive-mode  mode for creating archive/data files
   22:   -access=free  adds free access tag for online-mode
   23:   -texttool adds texttool tag for online-mode
   24: ";
   25: logger("INFO", "makemeta-vlp $version");
   26: 
   27: ###########################################
   28: # mappings
   29: 
   30: # generic mappings at top level
   31: my %gen_map = (
   32:     'Custom2_Language' => 'meta/lang',
   33:     'productionComment' => 'meta/image-acquisition/production-comment',
   34:     'derivedFrom' => 'derived-from/archive-path'
   35:     );
   36: # sub type switch tag
   37: my %type_map = (
   38:     'ReferenceType' => 'meta/bib@type'
   39:     );
   40: # sub type mappings
   41: my %subtype_map = (
   42:     'Book' => {
   43: 	'_name' => 'book',
   44: 	'Author' => 'meta/bib/author',
   45: 	'Title' => 'meta/bib/title',
   46: 	'Year' => 'meta/bib/year',
   47: 	'Place_Published' => 'meta/bib/city',
   48: 	'Publisher' => 'meta/bib/publisher',
   49: 	'Edition' => 'meta/bib/edition',
   50: 	'Volume' => 'meta/bib/volume',
   51: 	'NumberOfVolumes' => 'meta/bib/number-of-volumes',
   52: 	'Pages' => 'meta/bib/number-of-pages'
   53:     },
   54:     '(Book)' => {
   55: 	'_name' => 'book',
   56: 	'Author' => 'meta/bib/author',
   57: 	'Title' => 'meta/bib/title',
   58: 	'Year' => 'meta/bib/year',
   59: 	'Place_Published' => 'meta/bib/city',
   60: 	'Publisher' => 'meta/bib/publisher',
   61: 	'Edition' => 'meta/bib/edition',
   62: 	'Volume' => 'meta/bib/volume',
   63: 	'NumberOfVolumes' => 'meta/bib/number-of-volumes',
   64: 	'Pages' => 'meta/bib/number-of-pages',
   65: 	'#Cover pages only, book sections have been extracted' => 'meta/bib/comment'
   66:     },
   67:     'Book Section' => {
   68: 	'_name' => 'inbook',
   69: 	'Author' => 'meta/bib/author',
   70: 	'Title' => 'meta/bib/title',
   71: 	'Year' => 'meta/bib/year',
   72: 	'SecondaryTitle' => 'meta/bib/book-title',
   73: 	'SecondaryAuthor' => 'meta/bib/editor',
   74: 	'Volume' => 'meta/bib/volume',
   75: 	'NumberOfVolumes' => 'meta/bib/number-of-volumes',
   76: 	'Pages' => 'meta/bib/pages'
   77:     },
   78:     'Edited Book' => {
   79: 	'_name' => 'edited-book',
   80: 	'Author' => 'meta/bib/editor',
   81: 	'Title' => 'meta/bib/title',
   82: 	'Year' => 'meta/bib/year',
   83: 	'Place_Published' => 'meta/bib/city',
   84: 	'Publisher' => 'meta/bib/publisher',
   85: 	'Edition' => 'meta/bib/edition',
   86: 	'Volume' => 'meta/bib/volume',
   87: 	'NumberOfVolumes' => 'meta/bib/number-of-volumes',
   88: 	'Pages' => 'meta/bib/number-of-pages',
   89: 	'#Cover pages only, book sections have been extracted' => 'meta/bib/comment'
   90:     },
   91:     '(Edited Book)' => {
   92: 	'_name' => 'edited-book',
   93: 	'Author' => 'meta/bib/editor',
   94: 	'Title' => 'meta/bib/title',
   95: 	'Year' => 'meta/bib/year',
   96: 	'Place_Published' => 'meta/bib/city',
   97: 	'Publisher' => 'meta/bib/publisher',
   98: 	'Edition' => 'meta/bib/edition',
   99: 	'Volume' => 'meta/bib/volume',
  100: 	'NumberOfVolumes' => 'meta/bib/number-of-volumes',
  101: 	'Pages' => 'meta/bib/number-of-pages'
  102:     },
  103:     'Journal Article' => {
  104: 	'_name' => 'journal-article',
  105: 	'Author' => 'meta/bib/author',
  106: 	'Title' => 'meta/bib/title',
  107: 	'Year' => 'meta/bib/year',
  108: 	'SecondaryTitle' => 'meta/bib/journal',
  109: 	'Volume' => 'meta/bib/volume',
  110: 	'Number_Issue' => 'meta/bib/issue',
  111: 	'Pages' => 'meta/bib/pages'
  112:     },
  113:     '(JournalVolume)' => {
  114: 	'_name' => 'journal-volume',
  115: 	'SecondaryTitle' => 'meta/bib/title',
  116: 	'SecondaryAuthor' => 'meta/bib/editor',
  117: 	'Publisher' => 'meta/bib/publisher',
  118: 	'Place_Published' => 'meta/bib/city',
  119: 	'Year' => 'meta/bib/year',
  120: 	'Volume' => 'meta/bib/volume',
  121: 	'Pages' => 'meta/bib/number-of-pages',
  122: 	'#Cover pages only, articles have been extracted' => 'meta/bib/comment'
  123:     },
  124:     'Journal' => {
  125: 	'_name' => 'report',
  126: 	'Title' => 'meta/bib/title',
  127: 	'SecondaryTitle' => 'meta/bib/institution',
  128: 	'Author' => 'meta/bib/author',
  129: 	'Place_Published' => 'meta/bib/city',
  130: 	'Year' => 'meta/bib/year',
  131: 	'Date' => 'meta/bib/date',
  132: 	'Pages' => 'meta/bib/pages',
  133:     },
  134:     'Magazine Article' => {
  135: 	'_name' => 'magazine-article',
  136: 	'Author' => 'meta/bib/author',
  137: 	'Title' => 'meta/bib/title',
  138: 	'Year' => 'meta/bib/year',
  139: 	'Secondary_Title' => 'meta/bib/magazine',
  140: 	'Number_Issue' => 'meta/bib/issue-number',
  141: 	'Date' => 'meta/bib/issue-date',
  142: 	'Pages' => 'meta/bib/pages'
  143:     },
  144:     'Newspaper Article' => {
  145: 	'_name' => 'newspaper-article',
  146: 	'Author' => 'meta/bib/author',
  147: 	'Title' => 'meta/bib/title',
  148: 	'Year' => 'meta/bib/year',
  149: 	'Secondary_Title' => 'meta/bib/newspaper',
  150: 	'Date' => 'meta/bib/issue-date',
  151: 	'Pages' => 'meta/bib/pages'
  152:     },
  153:     'Report' => {
  154: 	'_name' => 'report',
  155: 	'Author' => 'meta/bib/author',
  156: 	'Title' => 'meta/bib/title',
  157: 	'Year' => 'meta/bib/year',
  158: 	'Place_Published' => 'meta/bib/city',
  159: 	'Date' => 'meta/bib/date',
  160: 	'SecondaryTitle' => 'meta/bib/type',
  161: 	'Pages' => 'meta/bib/pages'
  162:     },
  163:     'Trade Catalogue' => {
  164: 	'_name' => 'report',
  165: 	'Author' => 'meta/bib/author',
  166: 	'Title' => 'meta/bib/title',
  167: 	'Year' => 'meta/bib/year',
  168: 	'Place_Published' => 'meta/bib/city',
  169: 	'Date' => 'meta/bib/date',
  170: 	'Volume' => 'meta/bib/volume',
  171: 	'NumberOfVolumes' => 'meta/bib/number-of-volumes',
  172: 	'ReferenceType' => 'meta/bib/type',
  173: 	'Pages' => 'meta/bib/pages'
  174:     },
  175:     'Thesis' => {
  176: 	'_name' => 'thesis',
  177: 	'Author' => 'meta/bib/author',
  178: 	'Title' => 'meta/bib/title',
  179: 	'Place_Published' => 'meta/bib/city',
  180: 	'Publisher' => 'meta/bib/university',
  181: 	'Date' => 'meta/bib/date',
  182: 	'TypeOfWork' => 'meta/bib/type',
  183: 	'Pages' => 'meta/bib/number-of-pages'
  184:     },
  185:     'Manuscript' => {
  186: 	'_name' => 'manuscript',
  187: 	'Author' => 'meta/bib/author',
  188: 	'Title' => 'meta/bib/title',
  189: 	'Year' => 'meta/bib/year',
  190: 	'Place_Published' => 'meta/bib/location',
  191: 	'Pages' => 'meta/bib/pages'
  192:     }
  193:     );
  194: # language element
  195: my $lang_field = 'Custom2_Language';
  196: # languages to iso codes
  197: my %lang_map = (
  198:     'German' => 'de',
  199:     'English' => 'en',
  200:     'Italian' => 'it',
  201:     'French' => 'fr',
  202:     'Latin' => 'la',
  203:     'Japanese' => 'ja',
  204:     'Dutch' => 'nl',
  205:     'Spanish' => 'es',
  206:     'Swedish' => 'sv',
  207:     'Russian' => 'ru',
  208:     'Polish' => 'pl',
  209:     'Greek' => 'el'
  210:     );
  211: # storage fields
  212: my $arch_id_field = 'ID';
  213: my $access_free_field = 'online';
  214: 
  215: #######################################################
  216: # internal parameters
  217: #
  218: 
  219: # storage
  220: my $lib_arch_dir = '/mpiwg/archive/data/vlp';
  221: my $lib_online_dir = '/mpiwg/online/permanent/vlp';
  222: 
  223: # read command line parameters
  224: my $args = MPIWGStor::parseargs;
  225: if (! scalar(%$args)) {
  226:     print $help, "\n";
  227:     exit 1;
  228: }
  229: 
  230: # debug level
  231: $debug = (exists $$args{'debug'}) ? $$args{'debug'} : 0;
  232: 
  233: # simulate action only
  234: my $dry_run = (exists $$args{'dry-run'}) ? $$args{'dry-run'} : 0;
  235: logger('DEBUG', "dry-run: $dry_run");
  236: 
  237: # replace existing index files
  238: my $do_replace = (exists $$args{'replace'}) ? $$args{'replace'} : 0;
  239: logger('DEBUG', "replace: $do_replace");
  240: 
  241: # use online mode
  242: my $online_mode = (exists $$args{'online-mode'}) ? $$args{'online-mode'} : 0;
  243: logger('DEBUG', "online_mode: $online_mode");
  244: 
  245: # use archive mode
  246: my $archive_mode = (exists $$args{'archive-mode'}) ? $$args{'archive-mode'} : 0;
  247: logger('DEBUG', "archive_mode: $archive_mode");
  248: 
  249: # create texttool tag (online mode only)
  250: my $texttool = (exists $$args{'texttool'}) ? $$args{'texttool'} : 1;
  251: logger('DEBUG', "texttool: $texttool");
  252: # image dir for texttool
  253: my $texttool_img_dir = "pages";
  254: 
  255: # access type
  256: my $access_type = (exists $$args{'access'}) ? $$args{'access'} : "";
  257: 
  258: # index.meta namespace (not really implemented!)
  259: my $namespace = "";
  260: 
  261: 
  262: my $xml_changed = 0;
  263: my $errcnt = 0;
  264: my $warncnt = 0;
  265: 
  266: #######################################################
  267: # check parameters that were passed to the program
  268: #
  269: my $infile = $$args{'path'};
  270: if (! $infile) {
  271:     logger("ABORT", "no input file given!");
  272:     exit 1;
  273: }
  274: # strip double slashes
  275: $infile =~ s/\/\//\//;
  276: if (! -f $infile) {
  277:     logger("ABORT", "input file \'$infile\' doesn't exist!");
  278:     exit 1;
  279: }
  280: 
  281: 
  282: #######################################################
  283: # subroutines
  284: #
  285: 
  286: 
  287: sub find_arch_dir {
  288:     my ($input_node) = @_;
  289:     my $dir = "";
  290: 
  291:     my $bib_id = $input_node->findvalue("fm:$arch_id_field");
  292:     #logger('DEBUG', "bibdir: $bib_dir");
  293:     if ($bib_id) {
  294: 	$dir = "$lib_arch_dir/lit$bib_id";
  295: 	if (-d $dir) {
  296: 	    logger('DEBUG', "directory $dir exists"); 
  297: 	    return $dir;
  298: 	}
  299:     }
  300:     return;
  301: }
  302: 
  303: sub find_permanent_dir {
  304:     my ($input_node) = @_;
  305:     my $online_base = $lib_online_dir;
  306:     my $dest_id = sstrip($input_node->findvalue("fm:$arch_id_field"));
  307:     if (! $dest_id) {
  308: 	logger('ERROR', "no ID field for online permanent entry");
  309: 	$errcnt++;
  310: 	return;
  311:     }
  312:     my $dir = "$online_base/lit$dest_id";
  313:     if (-d $dir) {
  314:         logger('DEBUG', "directory $dir exists"); 
  315:         return $dir;
  316:     }
  317:     return;
  318: }
  319: 
  320: 
  321: sub convert_bib {
  322:     my ($input_node, $index_root, $index_doc) = @_;
  323:     my $cnt = 0;
  324:     my $type = "";
  325:     my $type_path = "";
  326: 
  327:     # process general stuff first
  328:     foreach my $n ($input_node->getChildNodes()) {
  329: 	my $name = $n->nodeName();
  330: 	my $val = $n->textContent();
  331: 	#logger('DEBUG', "  NODE: $name = '$val'");
  332: 	if (exists $gen_map{$name}) {
  333: 	    # is a general field
  334: 	    if ($name eq $lang_field) {
  335: 		# language field
  336: 		if (not $val) {
  337: 		    logger('WARNING', "no language tag");
  338: 		    $warncnt++;
  339: 		    next;
  340: 		}
  341: 		# convert to iso code
  342: 		if (exists $lang_map{$val}) {
  343: 		    $val = $lang_map{$val};
  344: 		} else {
  345: 		    logger('ERROR', "unknown language: $val! skipping...");
  346: 		    $errcnt++;
  347: 		    return 0;
  348: 		}
  349: 	    }
  350: 	    create_element_path($gen_map{$name}, $index_root, $namespace)
  351: 		->appendTextNode($val);
  352: 	    $cnt++;
  353: 	} elsif (exists $type_map{$name}) {
  354: 	    # is a type field
  355: 	    $type_path = $type_map{$name};
  356: 	    $type = $val;
  357: 	    # check with known types
  358: 	    if (exists $subtype_map{$val}) {
  359: 		my $indextype = $subtype_map{$val}->{'_name'};
  360: 		create_element_path("$type_path=$indextype", $index_root, $namespace);
  361: 		$cnt++;
  362: 	    } else {
  363: 		logger('ERROR', "unknown bib type $val! skipping...");
  364: 		$errcnt++;
  365: 		return 0;
  366: 	    }
  367: 	}
  368:     }
  369:     # process sub type fields
  370:     if ($type) {
  371: 	foreach my $n ($input_node->getChildNodes()) {
  372: 	    my $name = $n->nodeName();
  373: 	    my $val = $n->textContent();
  374: 	    #logger('DEBUG', "  NODE: $name = '$val'");
  375: 	    if (exists $subtype_map{$type}->{$name}) {
  376: 		create_element_path($subtype_map{$type}->{$name}, $index_root, $namespace)
  377: 		    ->appendTextNode($val);
  378: 		$cnt++;
  379: 	    }
  380: 	}
  381: 	# append additional constant fields (beginning with #)
  382: 	foreach my $k (keys %{$subtype_map{$type}}) {
  383: 	    if ($k =~ /^\#(.*)/) {
  384: 		my $val = $1;
  385: 		create_text_path($subtype_map{$type}->{$k}, $val, $index_root, $namespace);
  386: 	    }
  387: 	}
  388:     }
  389:     return $cnt;
  390: }
  391: 
  392: 
  393: 
  394: sub process_all_fm_entries {
  395:     my ($input_root) = @_;
  396:     my $cnt = 0;
  397: 
  398:     foreach my $n ($input_root->findnodes('fm:ROW')) {
  399: 	logger('INFO', "processing entry $cnt ...");
  400: 	process_fm_entry($n);
  401: 	$cnt++;
  402:     }
  403: }    
  404: 
  405: 
  406: sub process_fm_entry {
  407:     my ($input_node) = @_;
  408:     my $index_doc = XML::LibXML::Document->createDocument('1.0', 'UTF-8');
  409:     my $index_root = $index_doc->createElementNS($namespace, 'resource');
  410:     $index_root->addChild($index_doc->createAttributeNS($namespace, 'version', '1.1'));
  411:     $index_root->addChild($index_doc->createAttributeNS($namespace, 'type', 'MPIWG'));
  412:     $index_doc->setDocumentElement($index_root);
  413: 
  414:     # try to find the document directory
  415:     my $doc_dir = "";
  416:     if ($online_mode) {
  417: 	$doc_dir = find_permanent_dir($input_node);
  418:     } elsif ($archive_mode) {
  419: 	$doc_dir = find_arch_dir($input_node);
  420:     } else {
  421: 	$doc_dir = find_permanent_dir($input_node);
  422:     }
  423:     if (! $doc_dir) {
  424: 	logger('ERROR', "document directory not found! skipping...");
  425: 	$errcnt++;
  426: 	return;
  427:     }
  428: 
  429:     # check if index.meta exists
  430:     if ( -f "$doc_dir/index.meta") {
  431: 	if (not $do_replace) {
  432: 	    logger('DEBUG', "index file in $doc_dir exists");
  433: 	    return;
  434: 	}
  435:     }
  436: 
  437:     # add standard stuff to index.meta
  438:     my ($docname, $docpath) = split_file_path($doc_dir);
  439:     # name and date
  440:     create_text_path('name', $docname, $index_root, $namespace);
  441:     create_text_path('archive-path', $doc_dir, $index_root, $namespace);
  442:     create_text_path('archive-creation-date', stime(time), $index_root, $namespace);
  443:     create_text_path('creator', 'vlp', $index_root, $namespace);
  444:     create_text_path('description', 'a scanned document', $index_root, $namespace);
  445:     if ($archive_mode) {
  446:       # acquisition
  447:       create_text_path('meta/acquisition/date', stime(time), $index_root, $namespace);
  448:       create_text_path('meta/acquisition/provider/provider-id', 'vlp', $index_root, $namespace);
  449:       create_text_path('meta/acquisition/provider/address', 'Max Planck Institute for the History of Science', $index_root, $namespace);
  450:     }
  451:     # media
  452:     create_text_path('media-type', 'image', $index_root, $namespace);
  453:     create_text_path('meta/content-type', 'scanned document', $index_root, $namespace);
  454:     # access
  455:     if ($access_type) {
  456: 	if ($access_type eq "free") {
  457: 	    create_element_path('meta/access-conditions/access@type=free', $index_root, $namespace);
  458: 	} else {
  459: 	    my $acc_tag = create_element_path('meta/access-conditions/access@type=institution', $index_root, $namespace);
  460: 	    create_text_path('name', $access_type, $acc_tag, $namespace);
  461: 	}
  462:     } elsif ($online_mode) {
  463:         # read access conditions from "online" field in DB dump
  464:         my $online = sstrip($input_node->findvalue("fm:$access_free_field"));
  465:         if ($online) {
  466: 	    create_element_path('meta/access-conditions/access@type=free', $index_root, $namespace);
  467: 	} else {
  468: 	    my $acc_tag = create_element_path('meta/access-conditions/access@type=institution', $index_root, $namespace);
  469: 	    create_text_path('name', 'mpiwg', $acc_tag, $namespace);
  470: 	}
  471:     }
  472: 
  473:     # texttool tag with image dir
  474:     if ($online_mode && $texttool) {
  475: 	if ( -d "$doc_dir/$texttool_img_dir" ) {
  476: 	    create_text_path('meta/texttool/image', $texttool_img_dir,$index_root, $namespace);
  477: 	} else {
  478:             logger('WARNING', "page image directory missing!");
  479:             $warncnt++;
  480:         }
  481:     }
  482: 
  483:     # convert bib entries
  484:     my $cnt = convert_bib($input_node, $index_root, $index_doc);
  485:     if ($cnt == 0) {
  486: 	# error or nothing to convert
  487: 	logger('ERROR', "no bibliographic metadata!");
  488: 	$errcnt++;
  489: 	return;
  490:     }
  491: 
  492:     # write new index.meta file
  493:     if ($dry_run) {
  494: 	logger('DEBUG', "would write $doc_dir/index.meta");
  495: 	logger('DEBUG', $index_doc->toString(1));
  496:     } else {
  497: 	write_xml($index_doc, "$doc_dir/index.meta");
  498:     }
  499: 
  500: }
  501: 
  502: 
  503: 
  504: 
  505: 
  506: #######################################################
  507: # Main
  508: #
  509: 
  510: # load filemaker xml dump
  511: my ($input_doc, $input_root) = read_xml($infile);
  512: # set namespace prefix
  513: my $fm_namespace = $input_root->namespaceURI();
  514: $input_root->setNamespace($fm_namespace, 'fm', 1);
  515: 
  516: 
  517: process_all_fm_entries($input_root);
  518: 
  519: 
  520: logger("INFO", "$warncnt warnings");
  521: logger("INFO", "$errcnt errors");
  522: if ($errcnt > 0) {
  523:     logger("ABORT", "there were errors!");
  524:     exit 1;
  525: } else {
  526:     logger("DONE", "done something successfully!");
  527: }
  528: 

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>