File:  [Repository] / foxridge-archiver / makemeta-lib.pl
Revision 1.4: download - view: text, annotated - select for diffs - revision graph
Mon Jun 20 15:21:30 2005 UTC (19 years ago) by casties
Branches: MAIN
CVS tags: HEAD
new helper script for calling archiver on many directories

    1: #!/usr/local/bin/perl -w
    2: 
    3: use strict;
    4: use XML::LibXML;
    5: 
    6: use lib '/usr/local/mpiwg/archive';
    7: use MPIWGStor;
    8: 
    9: # make output on the currently selected filehandle (STDOUT by default) unbuffered
   10: $|=1;
   11: 
   12: # program version; reported once at startup through logger()
   13: my $version = "0.1.1 (1.6.2005)";
   14: logger("INFO", "makemeta-lib $version");
   15: 
   16: ###########################################
   17: # mappings from input field names (child element names of a row) to index.meta element paths
   18: 
   19: # generic mappings at top level: convert_bib applies these to every row, whatever its reference type
   20: my %gen_map = (
   21:     'Device' => 'meta/image-acquisition/device',
   22:     'Image_Type' => 'meta/image-acquisition/image-type',
   23:     'Production_Comment' => 'meta/image-acquisition/production-comment',
   24:     'Postproduction' => 'meta/image-acquisition/production-comment',    # NOTE(review): same target path as Production_Comment -- confirm this is intended
   25:     'Language' => 'meta/lang'    # the value is translated through %lang_map before it is stored
   26:     );
   27: # sub type switch tag: the field whose value selects one of the tables in %subtype_map
   28: my %type_map = (
   29:     'Reference_Type' => 'meta/bib@type'    # convert_bib appends "=<type name>" to this path
   30:     );
   31: # sub type mappings: one table per known value of the type field; each maps field names to element paths
   32: my %subtype_map = (
   33:     'Book' => {
   34: 	'_name' => 'book',    # '_name' is not a field: it is the type name convert_bib writes into the type path
   35: 	'Author' => 'meta/bib/author',
   36: 	'Title' => 'meta/bib/title',
   37: 	'Year' => 'meta/bib/year',
   38: 	'Place_Published' => 'meta/bib/city',
   39: 	'Publisher' => 'meta/bib/publisher',
   40: 	'Edition' => 'meta/bib/edition'
   41:     },
   42:     'Journal Article' => {
   43: 	'_name' => 'journal-article',
   44: 	'Author' => 'meta/bib/author',
   45: 	'Title' => 'meta/bib/title',
   46: 	'Year' => 'meta/bib/year',
   47: 	'Secondary_Title' => 'meta/bib/journal',    # Secondary_Title means something different for each type
   48: 	'Volume' => 'meta/bib/volume',
   49: 	'Number' => 'meta/bib/issue',
   50: 	'Pages' => 'meta/bib/pages'
   51:     },
   52:     'In Book' => {
   53: 	'_name' => 'inbook',
   54: 	'Author' => 'meta/bib/author',
   55: 	'Title' => 'meta/bib/title',
   56: 	'Year' => 'meta/bib/year',
   57: 	'Secondary_Title' => 'meta/bib/book-title',
   58: 	'Pages' => 'meta/bib/pages'
   59:     },
   60:     'Newspaper Article' => {
   61: 	'_name' => 'newspaper-article',
   62: 	'Author' => 'meta/bib/author',
   63: 	'Title' => 'meta/bib/title',
   64: 	'Year' => 'meta/bib/year',
   65: 	'Secondary_Title' => 'meta/bib/newspaper',
   66: 	'Place_Published' => 'meta/bib/city',
   67: 	'Number' => 'meta/bib/issue-date',    # unlike 'Journal Article', Number is stored as issue-date here
   68: 	'Pages' => 'meta/bib/pages'
   69:     },
   70:     'Edited Book' => {
   71: 	'_name' => 'edited-book',
   72: 	'Author' => 'meta/bib/editor',    # for this type the Author field is stored as editor
   73: 	'Title' => 'meta/bib/title',
   74: 	'Year' => 'meta/bib/year',
   75: 	'Place_Published' => 'meta/bib/city',
   76: 	'Publisher' => 'meta/bib/publisher',
   77: 	'Edition' => 'meta/bib/edition'
   78:     },
   79:     'Manuscript' => {
   80: 	'_name' => 'manuscript',
   81: 	'Author' => 'meta/bib/author',
   82: 	'Title' => 'meta/bib/title',
   83: 	'Year' => 'meta/bib/year',
   84: 	'Place_Published' => 'meta/bib/location',    # stored as location, not city, for this type
   85:     }
   86:     );
   87: # language element: the field of %gen_map whose value is translated through %lang_map
   88: my $lang_field = 'Language';
   89: # languages to iso codes; convert_bib rejects a row whose language is not listed here
   90: my %lang_map = (
   91:     'German' => 'de',
   92:     'English' => 'en',
   93:     'Italian' => 'it',
   94:     'French' => 'fr',
   95:     'Latin' => 'la',
   96:     'Japanese' => 'ja',
   97:     'Dutch' => 'nl',
   98:     'Spanish' => 'es'
   99:     );
  100: # storage fields: the archive ID names the directory below $lib_arch_dir,
  101: my $arch_id_field = 'ID_Archive';
  102: my $online_url_field = 'URL';    # ... and the URL field is parsed by find_online_dir for a directory name
  103: 
  104: #######################################################
  105: # internal parameters
  106: #
  107: 
  108: # storage: base directories that the find_*_dir subroutines look in
  109: my $lib_arch_dir = '/mpiwg/archive/data/library';
  110: my $lib_online_dir = '/mpiwg/online/permanent';
  111: 
  112: # read command line parameters; $args is used as a hash reference below
  113: my $args = MPIWGStor::parseargs;
  114: 
  115: # debug level, 0 if not given (NOTE(review): $debug is not declared here; presumably exported by MPIWGStor -- verify)
  116: $debug = (exists $$args{'debug'}) ? $$args{'debug'} : 0;
  117: 
  118: # use einstein-cw mode: process_fm_entry then calls find_cw_dir instead of find_arch_dir
  119: my $cw_mode = (exists $$args{'cw-mode'}) ? $$args{'cw-mode'} : 0;
  120: 
  121: # index.meta namespace (not really implemented!)
  122: my $namespace = "";
  123: 
  124: # run state; $xml_changed and $warncnt are never modified in this file, only $errcnt is incremented
  125: my $xml_changed = 0;
  126: my $errcnt = 0;
  127: my $warncnt = 0;
  128: 
  129: #######################################################
  130: # check parameters that were passed to the program
  131: #
  132: my $infile = $$args{'path'};    # the 'path' argument names the input file
  133: if (! $infile) {
  134:     logger("ABORT", "no input file given!");
  135:     exit 1;
  136: }
  137: # collapse every run of slashes into a single one (the old pattern only fixed the first "//")
  138: $infile =~ s{/+}{/}g;
  139: if (! -f $infile) {
  140:     logger("ABORT", "input file \'$infile\' doesn't exist!");
  141:     exit 1;
  142: }
  143: 
  144: 
  145: #######################################################
  146: # subroutines
  147: #
  148: # find_cw_dir: (einstein-cw mode) move an entry's directory from the inbox to $lib_arch_dir/<ID>.
  149: # Returns the new directory; returns nothing if the ID is missing or the target is no directory.
  150: sub find_cw_dir {
  151:     my ($input_node) = @_;
  152:     my $src_dir = find_online_dir($input_node, '/mpiwg/archive/data/library/inbox/zwischen_backup');
  153:     my $dest_id = $input_node->findvalue("fm:$arch_id_field");
  154:     if (! $dest_id) {
  155: 	logger('ERROR', "no ID field for einstein-cw entry");
  156: 	$errcnt++;
  157: 	return;
  158:     }
  159:     my $dir = "$lib_arch_dir/$dest_id";
  160:     if (! $src_dir) {    # find_online_dir found nothing: abort clearly instead of renaming undef
  161: 	logger('ABORT', "no source directory found for einstein-cw entry $dest_id!");
  162: 	exit 1;
  163:     }
  164:     logger('DEBUG', "moving $src_dir to $dir");
  165:     rename($src_dir, $dir)    # a failed move is fatal for the whole run
  166: 	or do { logger('ABORT', "unable to rename directory $src_dir to $dir!"); exit 1; };
  167:     return unless (-d $dir);    # moved, but the target is not a directory
  168:     logger('DEBUG', "directory $dir OK");
  169:     return $dir;
  170: }
  171: # find_online_dir: derive a directory name from the entry's URL field and look it up below
  172: # $base_dir (default: $lib_online_dir). Returns the directory if it exists, otherwise nothing.
  173: sub find_online_dir {
  174:     my ($input_node, $base_dir) = @_;
  175:     $base_dir ||= $lib_online_dir;
  176: 
  177:     # the directory name is the part of the URL between "fn=permanent/" and "/pageimg"
  178:     my $online_url = $input_node->findvalue("fm:$online_url_field");
  179:     my ($online_dir) = ($online_url =~ /fn=permanent\/(.+)\/pageimg/);
  180:     return unless (defined $online_dir);
  181: 
  182:     # only an existing directory counts as found
  183:     my $dir = "$base_dir/$online_dir";
  184:     return unless (-d $dir);
  185:     logger('DEBUG', "directory $dir exists");
  186:     return $dir;
  187: }
  188: # find_arch_dir: look up the entry's archive directory $lib_arch_dir/<ID>.
  189: # Returns the directory if the ID field is set and the directory exists, otherwise nothing.
  190: sub find_arch_dir {
  191:     my ($input_node) = @_;
  192: 
  193:     # the value of the archive ID field is used as the directory name
  194:     my $bib_dir = $input_node->findvalue("fm:$arch_id_field");
  195:     return unless ($bib_dir);
  196: 
  197:     # only an existing directory counts as found
  198:     my $dir = "$lib_arch_dir/$bib_dir";
  199:     return unless (-d $dir);
  200: 
  201:     logger('DEBUG', "directory $dir exists");
  202:     return $dir;
  203: }
  204: # convert_bib: copy the mapped fields of one input row into the index.meta tree below $index_root.
  205: # Returns the number of elements created; 0 if nothing was mapped or the language/bib type is unknown.
  206: sub convert_bib {
  207:     my ($input_node, $index_root, $index_doc) = @_;    # $index_doc is accepted but not used here
  208:     my $cnt = 0;
  209:     my $type = "";
  210:     my $type_path = "";
  211: 
  212:     # process general stuff first
  213:     foreach my $n ($input_node->getChildNodes()) {
  214: 	my $name = $n->nodeName();
  215: 	my $val = $n->textContent();
  216: 	#logger('DEBUG', "  NODE: $name = '$val'");
  217: 	if (exists $gen_map{$name}) {
  218: 	    # is a general field
  219: 	    if ($name eq $lang_field) {
  220: 		# language field -> convert to iso code
  221: 		if (exists $lang_map{$val}) {
  222: 		    $val = $lang_map{$val};
  223: 		} else {
  224: 		    logger('ERROR', "unknown language: $val! skipping...");
  225: 		    $errcnt++;
  226: 		    return 0;
  227: 		}
  228: 	    }
  229: 	    create_element_path($gen_map{$name}, $index_root, $namespace)
  230: 		->appendTextNode($val);
  231: 	    $cnt++;
  232: 	} elsif (exists $type_map{$name}) {
  233: 	    # is a type field: remember the type so its own table is applied in the second pass
  234: 	    $type_path = $type_map{$name};
  235: 	    $type = $val;
  236: 	    # check with known types
  237: 	    if (exists $subtype_map{$val}) {
  238: 		my $indextype = $subtype_map{$val}->{'_name'};
  239: 		create_element_path("$type_path=$indextype", $index_root, $namespace);
  240: 		$cnt++;
  241: 	    } else {
  242: 		logger('ERROR', "unknown bib type $val! skipping...");    # double quotes: the old single-quoted string logged a literal "$val"
  243: 		$errcnt++;
  244: 		return 0;
  245: 	    }
  246: 	}
  247:     }
  248:     # process sub type fields (second pass, because the type field may come after the fields it governs)
  249:     if ($type) {
  250: 	foreach my $n ($input_node->getChildNodes()) {
  251: 	    my $name = $n->nodeName();
  252: 	    my $val = $n->textContent();
  253: 	    #logger('DEBUG', "  NODE: $name = '$val'");
  254: 	    if (exists $subtype_map{$type}->{$name}) {
  255: 		create_element_path($subtype_map{$type}->{$name}, $index_root, $namespace)
  256: 		    ->appendTextNode($val);
  257: 		$cnt++;
  258: 	    }
  259: 	}
  260:     }
  261:     return $cnt;
  262: }
  263: 
  264: # process_all_fm_entries: run process_fm_entry on every fm:ROW found from $input_root; returns the row count.
  265: sub process_all_fm_entries {
  266:     my ($input_root) = @_;
  267:     my $cnt = 0;
  268:     foreach my $n ($input_root->findnodes('fm:ROW')) {
  269: 	logger('INFO', "processing entry $cnt ...");
  270: 	process_fm_entry($n);
  271: 	$cnt++;    # was never advanced before, so every row was logged as "entry 0"
  272:     }
  273:     return $cnt;
  274: }
  275: 
  276: # process_fm_entry: build an index.meta document for one input row and write it to
  277: # <document directory>/index.meta. Problems are logged and counted in $errcnt; the row is then skipped.
  278: sub process_fm_entry {
  279:     my ($input_node) = @_;
  280: 
  281:     # new document with a 'resource' root element carrying the attributes version=1.1 and type=MPIWG
  282:     my $index_doc = XML::LibXML::Document->createDocument('1.0', 'UTF-8');
  283:     my $index_root = $index_doc->createElementNS($namespace, 'resource');
  284:     foreach my $attr (['version', '1.1'], ['type', 'MPIWG']) {
  285: 	$index_root->addChild($index_doc->createAttributeNS($namespace, @$attr));
  286:     }
  287:     $index_doc->setDocumentElement($index_root);
  288: 
  289:     # try to find the document directory (in einstein-cw mode it is moved into place first)
  290:     my $doc_dir = $cw_mode ? find_cw_dir($input_node) : find_arch_dir($input_node);
  291:     unless ($doc_dir) {
  292: 	logger('ERROR', "document directory not found! skipping...");
  293: 	$errcnt++;
  294: 	return;
  295:     }
  296: 
  297:     # shorthand: add one text element at the given path below the root
  298:     my $put = sub { create_text_path($_[0], $_[1], $index_root, $namespace) };
  299:     my ($docname) = split_file_path($doc_dir);
  300: 
  301:     # name, location, date and other standard stuff
  302:     $put->('name', $docname);
  303:     $put->('archive-path', $doc_dir);
  304:     $put->('archive-creation-date', stime(time));
  305:     $put->('creator', 'digigroup');
  306:     $put->('description', 'a scanned document');
  307:     # acquisition
  308:     $put->('meta/acquisition/date', stime(time));
  309:     $put->('meta/acquisition/provider/provider-id', 'digigroup');
  310:     $put->('meta/acquisition/provider/address', 'Max Planck Institute for the History of Science');
  311:     # media
  312:     $put->('media-type', 'image');
  313:     $put->('meta/content-type', 'scanned document');
  314: 
  315:     # convert bib entries; a count of 0 means an error or nothing to convert
  316:     my $cnt = convert_bib($input_node, $index_root, $index_doc);
  317:     if ($cnt == 0) {
  318: 	logger('ERROR', "no bibliographic metadata!");
  319: 	$errcnt++;
  320: 	return;
  321:     }
  322: 
  323:     # write new index.meta file into the document directory
  324:     write_xml($index_doc, "$doc_dir/index.meta");
  325: 
  326: }
  327: 
  328: 
  329: 
  330: 
  331: 
  332: #######################################################
  333: # Main
  334: #
  335: 
  336: # load the filemaker xml dump named on the command line
  337: my ($input_doc, $input_root) = read_xml($infile);
  338: 
  339: # bind the root's own namespace to the prefix 'fm' (presumably so the "fm:..." lookups resolve)
  340: my $fm_namespace = $input_root->namespaceURI();
  341: $input_root->setNamespace($fm_namespace, 'fm', 1);
  342: 
  343: process_all_fm_entries($input_root);
  344: 
  345: # summary: any counted error makes the whole run end with exit status 1
  346: logger("INFO", "$warncnt warnings");
  347: logger("INFO", "$errcnt errors");
  348: if ($errcnt > 0) {
  349:     logger("ABORT", "there were errors!");
  350:     exit 1;
  351: }
  352: logger("DONE", "done something successfully!");
  353: 

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>