File:  [Repository] / foxridge-archiver / makemeta-lib.pl
Revision 1.7: download - view: text, annotated - select for diffs - revision graph
Tue Sep 20 17:44:48 2005 UTC (18 years, 8 months ago) by casties
Branches: MAIN
CVS tags: HEAD
corrected perl lib path

    1: #!/usr/local/bin/perl -w
    2: 
    3: use strict;
    4: use XML::LibXML;
    5: 
    6: use lib '/usr/local/mpiwg/archive';
    7: use MPIWGStor;
    8: 
    9: # make output unbuffered
   10: $|=1;
   11: 
   12: # program version
   13: my $version = "0.2.2 (31.8.2005 ROC)";
   14: my $help = 
   15: "use: makemeta-lib [options] file.xml
   16: options:
   17:   -debug  show debugging info
   18:   -dry-run  simulate, dont'do anything
   19:   -online-mode  mode for creating online/permanent files
   20:   -cw-mode  mode for copying einstein_cw archive documents
   21:   -digifiles-mode  mode for copying files from digifiles
   22:   -map-file=mapfile.xml  digilib mapping file (for digifiles mode)
   23: ";
   24: logger("INFO", "makemeta-lib $version");
   25: 
   26: ###########################################
   27: # mappings
   28: 
   29: # generic mappings at top level
   30: my %gen_map = (
   31:     'Device' => 'meta/image-acquisition/device',
   32:     'Image_Type' => 'meta/image-acquisition/image-type',
   33:     'Production_Comment' => 'meta/image-acquisition/production-comment',
   34:     'Postproduction' => 'meta/image-acquisition/production-comment',
   35:     'Language' => 'meta/lang'
   36:     );
   37: # sub type switch tag
   38: my %type_map = (
   39:     'Reference_Type' => 'meta/bib@type'
   40:     );
   41: # sub type mappings
   42: my %subtype_map = (
   43:     'Book' => {
   44: 	'_name' => 'book',
   45: 	'Author' => 'meta/bib/author',
   46: 	'Title' => 'meta/bib/title',
   47: 	'Year' => 'meta/bib/year',
   48: 	'Place_Published' => 'meta/bib/city',
   49: 	'Publisher' => 'meta/bib/publisher',
   50: 	'Edition' => 'meta/bib/edition'
   51:     },
   52:     'Journal Article' => {
   53: 	'_name' => 'journal-article',
   54: 	'Author' => 'meta/bib/author',
   55: 	'Title' => 'meta/bib/title',
   56: 	'Year' => 'meta/bib/year',
   57: 	'Secondary_Title' => 'meta/bib/journal',
   58: 	'Volume' => 'meta/bib/volume',
   59: 	'Number' => 'meta/bib/issue',
   60: 	'Pages' => 'meta/bib/pages'
   61:     },
   62:     'In Book' => {
   63: 	'_name' => 'inbook',
   64: 	'Author' => 'meta/bib/author',
   65: 	'Title' => 'meta/bib/title',
   66: 	'Year' => 'meta/bib/year',
   67: 	'Secondary_Title' => 'meta/bib/book-title',
   68: 	'Pages' => 'meta/bib/pages'
   69:     },
   70:     'Newspaper Article' => {
   71: 	'_name' => 'newspaper-article',
   72: 	'Author' => 'meta/bib/author',
   73: 	'Title' => 'meta/bib/title',
   74: 	'Year' => 'meta/bib/year',
   75: 	'Secondary_Title' => 'meta/bib/newspaper',
   76: 	'Place_Published' => 'meta/bib/city',
   77: 	'Number' => 'meta/bib/issue-date',
   78: 	'Pages' => 'meta/bib/pages'
   79:     },
   80:     'Edited Book' => {
   81: 	'_name' => 'edited-book',
   82: 	'Author' => 'meta/bib/editor',
   83: 	'Title' => 'meta/bib/title',
   84: 	'Year' => 'meta/bib/year',
   85: 	'Place_Published' => 'meta/bib/city',
   86: 	'Publisher' => 'meta/bib/publisher',
   87: 	'Edition' => 'meta/bib/edition'
   88:     },
   89:     'Manuscript' => {
   90: 	'_name' => 'manuscript',
   91: 	'Author' => 'meta/bib/author',
   92: 	'Title' => 'meta/bib/title',
   93: 	'Year' => 'meta/bib/year',
   94: 	'Place_Published' => 'meta/bib/location',
   95:     }
   96:     );
   97: # language element
   98: my $lang_field = 'Language';
   99: # languages to iso codes
  100: my %lang_map = (
  101:     'German' => 'de',
  102:     'English' => 'en',
  103:     'Italian' => 'it',
  104:     'French' => 'fr',
  105:     'Latin' => 'la',
  106:     'Japanese' => 'ja',
  107:     'Dutch' => 'nl',
  108:     'Spanish' => 'es'
  109:     );
  110: # storage fields
  111: my $arch_id_field = 'ID_Archive';
  112: my $online_url_field = 'URL';
  113: my $online_id_field = 'ID_OnlinePermanent';
  114: 
  115: #######################################################
  116: # internal parameters
  117: #
  118: 
  119: # storage
  120: my $lib_arch_dir = '/mpiwg/archive/data/library';
  121: my $lib_online_dir = '/mpiwg/online/permanent';
  122: my $lib_digilib_path = 'permanent';
  123: my $digifiles_dir = "/net/digifiles.mpiwg-berlin.mpg.de/Volumes/raid";
  124: 
  125: # read command line parameters
  126: my $args = MPIWGStor::parseargs;
  127: if (! scalar(%$args)) {
  128:     print $help, "\n";
  129:     exit 1;
  130: }
  131: 
  132: # debug level
  133: $debug = (exists $$args{'debug'}) ? $$args{'debug'} : 0;
  134: 
  135: # simulate action only
  136: my $dry_run = (exists $$args{'dry-run'}) ? $$args{'dry-run'} : 0;
  137: logger('DEBUG', "dry-run: $dry_run");
  138: 
  139: # use online mode
  140: my $online_mode = (exists $$args{'online-mode'}) ? $$args{'online-mode'} : 0;
  141: logger('DEBUG', "online_mode: $online_mode");
  142: 
  143: # use einstein-cw mode
  144: my $cw_mode = (exists $$args{'cw-mode'}) ? $$args{'cw-mode'} : 0;
  145: logger('DEBUG', "cw_mode: $cw_mode");
  146: 
  147: # use digifiles mode
  148: my $digifiles_mode = (exists $$args{'digifiles-mode'}) ? $$args{'digifiles-mode'} : 0;
  149: logger('DEBUG', "digifiles_mode: $digifiles_mode");
  150: # digilib mapping file
  151: my $map_file_name = (exists $$args{'map-file'}) ? $$args{'map-file'} : "";
  152: logger('DEBUG', "map_file_name: $map_file_name");
  153: my $mapping_doc;
  154: my $mapping_root;
  155: 
  156: # index.meta namespace (not really implemented!)
  157: my $namespace = "";
  158: 
  159: 
  160: my $xml_changed = 0;
  161: my $errcnt = 0;
  162: my $warncnt = 0;
  163: 
  164: #######################################################
  165: # check parameters that were passed to the program
  166: #
  167: my $infile = $$args{'path'};
  168: if (! $infile) {
  169:     logger("ABORT", "no input file given!");
  170:     exit 1;
  171: }
  172: # strip double slashes
  173: $infile =~ s/\/\//\//;
  174: if (! -f $infile) {
  175:     logger("ABORT", "input file \'$infile\' doesn't exist!");
  176:     exit 1;
  177: }
  178: 
  179: 
  180: #######################################################
  181: # subroutines
  182: #
  183: 
  184: 
  185: sub add_digilib_mapping {
  186:     my ($src_dir, $dest_dir) = @_;
  187:     my $elem = $mapping_root->addNewChild($namespace, 'mapping');
  188:     $elem->addChild($mapping_doc->createAttributeNS($namespace, 'link', $src_dir));
  189:     $elem->addChild($mapping_doc->createAttributeNS($namespace, 'dir', $dest_dir));
  190:     if ($map_file_name) {
  191: 	write_xml($mapping_doc, $map_file_name);
  192:     } else {
  193: 	logger('ABORT', "unable to write mapping file!");
  194: 	exit 1;
  195:     }
  196: }
  197: 
  198: sub find_digifiles_dir {
  199:     my ($input_node) = @_;
  200:     my $digifiles_base = '/net/digifiles.mpiwg-berlin.mpg.de/Volumes/raid';
  201:     my $src_dir = find_online_dir($input_node, $digifiles_base, '');
  202:     if (! $src_dir) {
  203: 	logger('ERROR', "no online directory for digifiles entry");
  204: 	$errcnt++;
  205: 	return;
  206:     }
  207:     my $dest_id = sstrip($input_node->findvalue("fm:$online_id_field"));
  208:     if (! $dest_id) {
  209: 	logger('ERROR', "no ID field for digifiles entry");
  210: 	$errcnt++;
  211: 	return;
  212:     }
  213:     my $dir = "$lib_online_dir/library/$dest_id";
  214:     my $map_dir = "$lib_digilib_path/library/$dest_id";
  215:     if ($dry_run) {
  216: 	logger('DEBUG', "would move $digifiles_base/$src_dir to $dir");
  217: 	add_digilib_mapping($src_dir, "$map_dir/pageimg");
  218: 	return $dir;
  219:     } else {
  220: 	logger('INFO', "moving $digifiles_base/$src_dir to $dir");
  221: 	logger('DEBUG', "mkdir $dir/pageimg"); 
  222: 	if (system("mkdir -p $dir/pageimg && chmod -R 0775 $dir") == 0) {
  223: 	    logger('DEBUG', "cp $digifiles_base/$src_dir $dir/pageimg"); 
  224: 	    if (system("cp -rp $digifiles_base/$src_dir/* $dir/pageimg/") == 0) {
  225: 		if (-d "$dir/pageimg") {
  226: 		    logger('DEBUG', "directory $dir OK"); 
  227: 		    add_digilib_mapping($src_dir, "$map_dir/pageimg");
  228: 		    if (system("rm -rf $digifiles_base/$src_dir/* && rm -rf $digifiles_base/$src_dir") == 0) {
  229: 			logger('DEBUG', "directory $digifiles_base/$src_dir removed"); 
  230: 			return $dir;
  231: 		    } else {
  232: 			logger('ERROR', "unable to remove source directory $digifiles_base/$src_dir!");
  233: 			$errcnt++;
  234: 			return $dir;
  235: 		    }
  236: 		}
  237: 	    }
  238: 	}
  239: 	logger('ABORT', "unable to copy directory $src_dir to $dir!");
  240: 	exit 1;
  241:     }
  242:     return;
  243: }
  244: 
  245: sub find_cw_dir {
  246:     my ($input_node) = @_;
  247:     my $cw_base = '/mpiwg/archive/data/library/inbox/zwischen_backup';
  248:     my $src_dir = find_online_dir($input_node, $cw_base, 'pageimg');
  249:     my $dest_id = sstrip($input_node->findvalue("fm:$arch_id_field"));
  250:     if (! $dest_id) {
  251: 	logger('ERROR', "no ID field for einstein-cw entry");
  252: 	$errcnt++;
  253: 	return;
  254:     }
  255:     my $dir = "$lib_arch_dir/$dest_id";
  256:     if ($dry_run) {
  257: 	logger('DEBUG', "would move $cw_base/$src_dir to $dir");
  258: 	return $dir;
  259:     } else {
  260: 	logger('DEBUG', "moving $cw_base/$src_dir to $dir");
  261: 	if (rename "$cw_base/$src_dir", $dir) {
  262: 	    if (-d $dir) {
  263: 		logger('DEBUG', "directory $dir OK"); 
  264: 		return $dir;
  265: 	    }
  266: 	} else {
  267: 	    logger('ABORT', "unable to rename directory $cw_base/$src_dir to $dir!");
  268: 	    exit 1;
  269: 	}
  270:     }
  271:     return;
  272: }
  273: 
  274: sub find_permanent_dir {
  275:     my ($input_node) = @_;
  276:     my $online_base = '/mpiwg/online/permanent';
  277:     my $src_dir = find_online_dir($input_node, $online_base, 'pageimg');
  278:     my $dest_id = sstrip($input_node->findvalue("fm:$online_id_field"));
  279:     if (! $dest_id) {
  280: 	logger('ERROR', "no ID field for online permanent entry");
  281: 	$errcnt++;
  282: 	return;
  283:     }
  284:     my $dir = "$online_base/$src_dir";
  285:     return $dir;
  286: }
  287: 
  288: #
  289: # $dir = find_online_dir($input_node, $base_dir, $page_dir)
  290: #
  291: # Takes the path from the $online_url_field of the $input_node document
  292: # and looks in the directory $base_dir for it. Strips $page_dir from the end.
  293: # Returns the directory path sans $base_dir if it exists
  294: #
  295: sub find_online_dir {
  296:     my ($input_node, $base_dir, $page_dir) = @_;
  297:     $base_dir = $lib_online_dir unless ($base_dir);
  298: 
  299:     my $online_url = $input_node->findvalue("fm:$online_url_field");
  300:     logger('DEBUG', "checking URL: $online_url");
  301:     my $online_dir;
  302:     if ($online_url =~ /fn=permanent\/(.+)/) {
  303: 	# new style digilib URL
  304: 	$online_dir = $1;
  305:     } elsif ($online_url =~ /\?([^\+]+)\+/) {
  306: 	# old style digilib URL
  307: 	$online_dir = $1;
  308:     }
  309:     #logger('DEBUG', "online_dir1: $online_dir");
  310:     if ($online_dir) {
  311: 	$online_dir =~ s/\/$//; # strip ending slashes
  312: 	if ($page_dir) {
  313: 	  $online_dir =~ s/\/${page_dir}$//;
  314: 	}
  315: 	#logger("DEBUG", "dir: $base_dir/$online_dir");
  316: 	if (-d "$base_dir/$online_dir") {
  317: 	    logger('DEBUG', "directory $base_dir/$online_dir exists");
  318: 	    return $online_dir;
  319: 	}
  320:     }
  321:     return;
  322: }
  323: 
  324: sub find_arch_dir {
  325:     my ($input_node) = @_;
  326:     my $dir = "";
  327: 
  328:     my $bib_dir = $input_node->findvalue("fm:$arch_id_field");
  329:     #logger('DEBUG', "bibdir: $bib_dir");
  330:     if ($bib_dir) {
  331: 	$dir = "$lib_arch_dir/$bib_dir";
  332: 	if (-d $dir) {
  333: 	    logger('DEBUG', "directory $dir exists"); 
  334: 	    return $dir;
  335: 	}
  336:     }
  337:     return;
  338: }
  339: 
  340: 
  341: sub convert_bib {
  342:     my ($input_node, $index_root, $index_doc) = @_;
  343:     my $cnt = 0;
  344:     my $type = "";
  345:     my $type_path = "";
  346: 
  347:     # process general stuff first
  348:     foreach my $n ($input_node->getChildNodes()) {
  349: 	my $name = $n->nodeName();
  350: 	my $val = $n->textContent();
  351: 	#logger('DEBUG', "  NODE: $name = '$val'");
  352: 	if (exists $gen_map{$name}) {
  353: 	    # is a general field
  354: 	    if ($name eq $lang_field) {
  355: 		# language field -> convert to iso code
  356: 		if (exists $lang_map{$val}) {
  357: 		    $val = $lang_map{$val};
  358: 		} else {
  359: 		    logger('ERROR', "unknown language: $val! skipping...");
  360: 		    $errcnt++;
  361: 		    return 0;
  362: 		}
  363: 	    }
  364: 	    create_element_path($gen_map{$name}, $index_root, $namespace)
  365: 		->appendTextNode($val);
  366: 	    $cnt++;
  367: 	} elsif (exists $type_map{$name}) {
  368: 	    # is a type field
  369: 	    $type_path = $type_map{$name};
  370: 	    $type = $val;
  371: 	    # check with known types
  372: 	    if (exists $subtype_map{$val}) {
  373: 		my $indextype = $subtype_map{$val}->{'_name'};
  374: 		create_element_path("$type_path=$indextype", $index_root, $namespace);
  375: 		$cnt++;
  376: 	    } else {
  377: 		logger('ERROR', 'unknown bib type $val! skipping...');
  378: 		$errcnt++;
  379: 		return 0;
  380: 	    }
  381: 	}
  382:     }
  383:     # process sub type fields
  384:     if ($type) {
  385: 	foreach my $n ($input_node->getChildNodes()) {
  386: 	    my $name = $n->nodeName();
  387: 	    my $val = $n->textContent();
  388: 	    #logger('DEBUG', "  NODE: $name = '$val'");
  389: 	    if (exists $subtype_map{$type}->{$name}) {
  390: 		create_element_path($subtype_map{$type}->{$name}, $index_root, $namespace)
  391: 		    ->appendTextNode($val);
  392: 		$cnt++;
  393: 	    }
  394: 	}
  395:     }
  396:     return $cnt;
  397: }
  398: 
  399: 
  400: 
  401: sub process_all_fm_entries {
  402:     my ($input_root) = @_;
  403:     my $cnt = 0;
  404: 
  405:     foreach my $n ($input_root->findnodes('fm:ROW')) {
  406: 	logger('INFO', "processing entry $cnt ...");
  407: 	process_fm_entry($n);
  408: 	$cnt++;
  409:     }
  410: }    
  411: 
  412: 
  413: sub process_fm_entry {
  414:     my ($input_node) = @_;
  415:     my $index_doc = XML::LibXML::Document->createDocument('1.0', 'UTF-8');
  416:     my $index_root = $index_doc->createElementNS($namespace, 'resource');
  417:     $index_root->addChild($index_doc->createAttributeNS($namespace, 'version', '1.1'));
  418:     $index_root->addChild($index_doc->createAttributeNS($namespace, 'type', 'MPIWG'));
  419:     $index_doc->setDocumentElement($index_root);
  420: 
  421:     # try to find the document directory
  422:     my $doc_dir = "";
  423:     if ($online_mode) {
  424: 	$doc_dir = find_permanent_dir($input_node);
  425:     } elsif ($cw_mode) {
  426: 	$doc_dir = find_cw_dir($input_node);
  427:     } elsif ($digifiles_mode) {
  428: 	$doc_dir = find_digifiles_dir($input_node);
  429:     } else {
  430: 	$doc_dir = find_arch_dir($input_node);
  431:     }
  432:     if (! $doc_dir) {
  433: 	logger('ERROR', "document directory not found! skipping...");
  434: 	$errcnt++;
  435: 	return;
  436:     }
  437: 
  438:     # add standard stuff to index.meta
  439:     my ($docname, $docpath) = split_file_path($doc_dir);
  440:     # name and date
  441:     create_text_path('name', $docname, $index_root, $namespace);
  442:     create_text_path('archive-path', $doc_dir, $index_root, $namespace);
  443:     create_text_path('archive-creation-date', stime(time), $index_root, $namespace);
  444:     create_text_path('creator', 'digigroup', $index_root, $namespace);
  445:     create_text_path('description', 'a scanned document', $index_root, $namespace);
  446:     # acquisition
  447:     create_text_path('meta/acquisition/date', stime(time), $index_root, $namespace);
  448:     create_text_path('meta/acquisition/provider/provider-id', 'digigroup', $index_root, $namespace);
  449:     create_text_path('meta/acquisition/provider/address', 'Max Planck Institute for the History of Science', $index_root, $namespace);
  450:     # media
  451:     create_text_path('media-type', 'image', $index_root, $namespace);
  452:     create_text_path('meta/content-type', 'scanned document', $index_root, $namespace);
  453: 
  454:     # convert bib entries
  455:     my $cnt = convert_bib($input_node, $index_root, $index_doc);
  456:     if ($cnt == 0) {
  457: 	# error or nothing to convert
  458: 	logger('ERROR', "no bibliographic metadata!");
  459: 	$errcnt++;
  460: 	return;
  461:     }
  462: 
  463:     # write new index.meta file
  464:     if ($dry_run) {
  465: 	logger('DEBUG', "would write $doc_dir/index.meta");
  466: 	logger('DEBUG', $index_doc->toString(1));
  467:     } else {
  468: 	write_xml($index_doc, "$doc_dir/index.meta");
  469:     }
  470: 
  471: }
  472: 
  473: 
  474: 
  475: 
  476: 
  477: #######################################################
  478: # Main
  479: #
  480: 
  481: # load filemaker xml dump
  482: my ($input_doc, $input_root) = read_xml($infile);
  483: # set namespace prefix
  484: my $fm_namespace = $input_root->namespaceURI();
  485: $input_root->setNamespace($fm_namespace, 'fm', 1);
  486: 
  487: # create digilib mapping file for digifiles mode
  488: if ($digifiles_mode) {
  489:     $mapping_doc = XML::LibXML::Document->createDocument('1.0', 'UTF-8');
  490:     $mapping_root = $mapping_doc->createElementNS($namespace, 'digilib-aliases');
  491:     $mapping_doc->setDocumentElement($mapping_root);
  492: #<mapping link="exp1/archimedes_image_repository/archimedes_large/achil_propo_087_la_1545" dir="permanent/archimedes_repository/large/achil_propo_087_la_1545"/>
  493: 
  494: }
  495: 
  496: process_all_fm_entries($input_root);
  497: 
  498: 
  499: logger("INFO", "$warncnt warnings");
  500: logger("INFO", "$errcnt errors");
  501: if ($errcnt > 0) {
  502:     logger("ABORT", "there were errors!");
  503:     exit 1;
  504: } else {
  505:     logger("DONE", "done something successfully!");
  506: }
  507: 

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>