File:  [Repository] / foxridge-archiver / makemeta-lib.pl
Revision 1.5: download - view: text, annotated - select for diffs - revision graph
Tue Aug 23 15:01:08 2005 UTC (18 years, 10 months ago) by casties
Branches: MAIN
CVS tags: HEAD
weiter arbeit am digifiles modus

    1: #!/usr/local/bin/perl -w
    2: 
    3: use strict;
    4: use XML::LibXML;
    5: 
    6: use lib '/usr/local/mpiwg/archive_devel';
    7: use MPIWGStor;
    8: 
    # Flush STDOUT after every write so log lines show up immediately.
    $| = 1;

    # Program version, reported in the start-up log line below.
    my $version = "0.2.1 (15.8.2005)";

    # Usage text; printed when the script is started without any arguments.
    # It is assembled line by line so that its content does not depend on
    # how this source file is indented.
    my $help = join("\n",
        'use: makemeta-lib [options] file.xml',
        'options:',
        '  -debug  show debugging info',
        "  -dry-run  simulate, don't do anything",
        '  -cw-mode  mode for copying einstein_cq documents',
        '  -digifiles-mode  mode for copying files from digifiles',
        '  -map-file=mapfile.xml  digilib mapping file (for digifiles mode)',
        '');    # the empty last item yields the trailing newline
    logger("INFO", "makemeta-lib $version");
   24: 
   ###########################################
   # mappings
   #
   # Each table maps a field name of the input document to an element path
   # of the generated index.meta document.

   # generic mappings at top level
   my %gen_map = (
       Device             => 'meta/image-acquisition/device',
       Image_Type         => 'meta/image-acquisition/image-type',
       Production_Comment => 'meta/image-acquisition/production-comment',
       Postproduction     => 'meta/image-acquisition/production-comment',
       Language           => 'meta/lang',
   );

   # sub type switch tag
   my %type_map = (
       Reference_Type => 'meta/bib@type',
   );

   # sub type mappings: one table per reference type; the '_name' entry
   # holds the type name that is written to the index.meta document
   my %subtype_map = do {
       # fields that are mapped the same way for (almost) every type;
       # a later entry with the same key overrides the shared one
       my %core = (
           Author => 'meta/bib/author',
           Title  => 'meta/bib/title',
           Year   => 'meta/bib/year',
       );
       (
           'Book' => {
               %core,
               _name           => 'book',
               Place_Published => 'meta/bib/city',
               Publisher       => 'meta/bib/publisher',
               Edition         => 'meta/bib/edition',
           },
           'Journal Article' => {
               %core,
               _name           => 'journal-article',
               Secondary_Title => 'meta/bib/journal',
               Volume          => 'meta/bib/volume',
               Number          => 'meta/bib/issue',
               Pages           => 'meta/bib/pages',
           },
           'In Book' => {
               %core,
               _name           => 'inbook',
               Secondary_Title => 'meta/bib/book-title',
               Pages           => 'meta/bib/pages',
           },
           'Newspaper Article' => {
               %core,
               _name           => 'newspaper-article',
               Secondary_Title => 'meta/bib/newspaper',
               Place_Published => 'meta/bib/city',
               Number          => 'meta/bib/issue-date',
               Pages           => 'meta/bib/pages',
           },
           'Edited Book' => {
               %core,
               _name           => 'edited-book',
               Author          => 'meta/bib/editor',    # the "author" of an edited book is its editor
               Place_Published => 'meta/bib/city',
               Publisher       => 'meta/bib/publisher',
               Edition         => 'meta/bib/edition',
           },
           'Manuscript' => {
               %core,
               _name           => 'manuscript',
               Place_Published => 'meta/bib/location',
           },
       );
   };

   # language element
   my $lang_field = 'Language';

   # languages to iso codes (name/code pairs)
   my %lang_map = qw(
       German   de
       English  en
       Italian  it
       French   fr
       Latin    la
       Japanese ja
       Dutch    nl
       Spanish  es
   );

   # storage fields
   my $arch_id_field    = 'ID_Archive';
   my $online_url_field = 'URL';
   my $online_id_field  = 'ID_OnlinePermanent';
  113: 
  #######################################################
  # internal parameters
  #

  # storage locations
  my $lib_arch_dir     = '/mpiwg/archive/data/library';
  my $lib_online_dir   = '/mpiwg/online/permanent';
  my $lib_digilib_path = 'permanent';
  my $digifiles_dir    = "/net/digifiles.mpiwg-berlin.mpg.de/Volumes/raid";

  # read command line parameters; without any parameter just print the
  # usage text and give up
  my $args = MPIWGStor::parseargs();
  if (! %$args) {
      print $help, "\n";
      exit 1;
  }

  # debug level (0 unless -debug was given)
  $debug = exists $args->{'debug'} ? $args->{'debug'} : 0;

  # simulate action only
  my $dry_run = 0;
  $dry_run = $args->{'dry-run'} if exists $args->{'dry-run'};
  logger('DEBUG', "dry-run: $dry_run");

  # use einstein-cw mode
  my $cw_mode = 0;
  $cw_mode = $args->{'cw-mode'} if exists $args->{'cw-mode'};
  logger('DEBUG', "cw_mode: $cw_mode");

  # use digifiles mode
  my $digifiles_mode = 0;
  $digifiles_mode = $args->{'digifiles-mode'} if exists $args->{'digifiles-mode'};
  logger('DEBUG', "digifiles_mode: $digifiles_mode");

  # digilib mapping file; document and root element are created by the
  # main program when digifiles mode is active
  my $map_file_name = "";
  $map_file_name = $args->{'map-file'} if exists $args->{'map-file'};
  logger('DEBUG', "map_file_name: $map_file_name");
  my $mapping_doc;
  my $mapping_root;

  # index.meta namespace (not really implemented!)
  my $namespace = "";

  # bookkeeping for the summary printed at the end of the run
  my $xml_changed = 0;
  my $errcnt = 0;
  my $warncnt = 0;
  #######################################################
  # check parameters that were passed to the program
  #

  # the input file is the 'path' entry of the parsed arguments
  my $infile = $$args{'path'};
  if (! $infile) {
      logger("ABORT", "no input file given!");
      exit 1;
  }
  # collapse every run of repeated slashes into a single slash
  # (without /g only the first double slash would be replaced)
  $infile =~ s{/+}{/}g;
  if (! -f $infile) {
      logger("ABORT", "input file \'$infile\' doesn't exist!");
      exit 1;
  }
  173: 
  174: 
  175: #######################################################
  176: # subroutines
  177: #
  178: 
  179: 
  #
  # add_digilib_mapping($src_dir, $dest_dir)
  #
  # Appends a 'mapping' element with the attributes link=$src_dir and
  # dir=$dest_dir to the digilib mapping document and writes the document
  # to $map_file_name. Aborts the program if no mapping file name is set.
  #
  sub add_digilib_mapping {
      my ($link, $target) = @_;

      my $mapping = $mapping_root->addNewChild($namespace, 'mapping');
      for my $attr ([link => $link], [dir => $target]) {
          $mapping->addChild($mapping_doc->createAttributeNS($namespace, @$attr));
      }

      unless ($map_file_name) {
          logger('ABORT', "unable to write mapping file!");
          exit 1;
      }
      # the file is rewritten after every added mapping
      write_xml($mapping_doc, $map_file_name);
  }
  192: 
  #
  # $dir = find_digifiles_dir($input_node)
  #
  # Digifiles mode: looks up the source directory of the entry below the
  # digifiles volume (via find_online_dir), copies its content to
  # "$lib_online_dir/library/<ID>/pageimg", records a digilib mapping for
  # it and finally removes the source directory.
  #
  # Returns the new document directory, or nothing if the entry has no
  # usable source directory or no ID (an error is logged and counted).
  # Aborts the program if the files can not be copied. In dry-run mode
  # nothing is copied or removed, but the mapping entry is still added
  # through add_digilib_mapping.
  #
  sub find_digifiles_dir {
      my ($input_node) = @_;
      my $digifiles_base = '/net/digifiles.mpiwg-berlin.mpg.de/Volumes/raid';

      my $src_dir = find_online_dir($input_node, $digifiles_base, '');
      if (! $src_dir) {
          logger('ERROR', "no online directory for digifiles entry");
          $errcnt++;
          return;
      }
      my $dest_id = sstrip($input_node->findvalue("fm:$online_id_field"));
      if (! $dest_id) {
          logger('ERROR', "no ID field for digifiles entry");
          $errcnt++;
          return;
      }

      my $src_path = "$digifiles_base/$src_dir";
      my $dir = "$lib_online_dir/library/$dest_id";
      my $map_dir = "$lib_digilib_path/library/$dest_id";

      if ($dry_run) {
          logger('DEBUG', "would move $src_path to $dir");
          add_digilib_mapping($src_dir, "$map_dir/pageimg");
          return $dir;
      }

      # Quote a path for /bin/sh: the paths come from fields of the input
      # document, so blanks or shell metacharacters in them must not be
      # able to change the commands that are run below.
      my $shq = sub {
          my ($path) = @_;
          $path =~ s/'/'\\''/g;
          return "'$path'";
      };
      my $q_src = $shq->($src_path);
      my $q_dir = $shq->($dir);
      my $q_img = $shq->("$dir/pageimg");

      logger('INFO', "moving $src_path to $dir");
      logger('DEBUG', "mkdir $dir/pageimg");
      my $copied = 0;
      if (system("mkdir -p $q_img && chmod -R 0775 $q_dir") == 0) {
          logger('DEBUG', "cp $src_path $dir/pageimg");
          # the glob stays outside the quotes so the shell still expands it
          if (system("cp -rp $q_src/* $q_img/") == 0 && -d "$dir/pageimg") {
              $copied = 1;
          }
      }
      if (! $copied) {
          logger('ABORT', "unable to copy directory $src_dir to $dir!");
          exit 1;
      }

      logger('DEBUG', "directory $dir OK");
      add_digilib_mapping($src_dir, "$map_dir/pageimg");
      # a failed clean-up is only an error; the copy itself succeeded
      if (system("rm -rf $q_src/* && rm -rf $q_src") == 0) {
          logger('DEBUG', "directory $src_path removed");
      } else {
          logger('ERROR', "unable to remove source directory $src_path!");
          $errcnt++;
      }
      return $dir;
  }
  239: 
  #
  # $dir = find_cw_dir($input_node)
  #
  # Einstein-cw mode: looks up the source directory of the entry below the
  # inbox directory (via find_online_dir, with the trailing 'pageimg'
  # stripped) and renames it to "$lib_arch_dir/<ID>".
  #
  # Returns the new document directory, or nothing if the entry has no
  # source directory or no ID (an error is logged and counted) or if the
  # renamed directory can not be found afterwards. Aborts the program if
  # the rename fails. In dry-run mode nothing is renamed.
  #
  sub find_cw_dir {
      my ($input_node) = @_;
      my $cw_base = '/mpiwg/archive/data/library/inbox/zwischen_backup';

      my $src_dir = find_online_dir($input_node, $cw_base, 'pageimg');
      # without this check an entry without source directory would make
      # the rename below move the whole $cw_base directory
      if (! $src_dir) {
          logger('ERROR', "no online directory for einstein-cw entry");
          $errcnt++;
          return;
      }
      my $dest_id = sstrip($input_node->findvalue("fm:$arch_id_field"));
      if (! $dest_id) {
          logger('ERROR', "no ID field for einstein-cw entry");
          $errcnt++;
          return;
      }

      my $dir = "$lib_arch_dir/$dest_id";
      if ($dry_run) {
          logger('DEBUG', "would move $cw_base/$src_dir to $dir");
          return $dir;
      }

      logger('DEBUG', "moving $cw_base/$src_dir to $dir");
      if (! rename "$cw_base/$src_dir", $dir) {
          logger('ABORT', "unable to rename directory $cw_base/$src_dir to $dir!");
          exit 1;
      }
      # rename reported success; make sure the target really is a directory
      return unless -d $dir;
      logger('DEBUG', "directory $dir OK");
      return $dir;
  }
  268: 
  #
  # $dir = find_online_dir($input_node, $base_dir, $page_dir)
  #
  # Reads the URL from the $online_url_field of $input_node and extracts
  # the directory part from it: everything after "fn=permanent/" for new
  # style digilib URLs, or the text between "?" and the first "+" for old
  # style URLs. If $page_dir is given, a trailing "/$page_dir" is removed.
  #
  # $base_dir defaults to $lib_online_dir. Returns the directory path
  # relative to $base_dir if that directory exists below $base_dir;
  # returns nothing otherwise.
  #
  sub find_online_dir {
      my ($input_node, $base_dir, $page_dir) = @_;
      $base_dir = $lib_online_dir unless ($base_dir);

      my $online_url = $input_node->findvalue("fm:$online_url_field");
      logger('DEBUG', "checking URL: $online_url");

      my $online_dir;
      if ($online_url =~ m{fn=permanent/(.+)}) {
          # new style digilib URL
          $online_dir = $1;
      } elsif ($online_url =~ m{\?([^+]+)\+}) {
          # old style digilib URL
          $online_dir = $1;
      }
      return unless $online_dir;

      if ($page_dir) {
          # \Q...\E: $page_dir is a literal directory name, not a pattern
          $online_dir =~ s{/\Q$page_dir\E$}{};
      }
      return unless -d "$base_dir/$online_dir";

      logger('DEBUG', "directory $base_dir/$online_dir exists");
      return $online_dir;
  }
  303: 
  #
  # $dir = find_arch_dir($input_node)
  #
  # Default mode: builds the document directory from $lib_arch_dir and the
  # value of the $arch_id_field of $input_node. Returns the directory if
  # it exists; returns nothing if the field is empty or the directory is
  # missing.
  #
  sub find_arch_dir {
      my ($input_node) = @_;

      my $archive_id = $input_node->findvalue("fm:$arch_id_field");
      return unless $archive_id;

      my $path = "$lib_arch_dir/$archive_id";
      return unless -d $path;

      logger('DEBUG', "directory $path exists");
      return $path;
  }
  319: 
  320: 
  #
  # $cnt = convert_bib($input_node, $index_root, $index_doc)
  #
  # Converts the child elements of $input_node into index.meta elements
  # below $index_root, using the mapping tables %gen_map, %type_map and
  # %subtype_map. The children are scanned twice: the first pass handles
  # the generic fields and determines the reference type, the second pass
  # handles the fields that depend on that type.
  #
  # Returns the number of converted fields. Returns 0 (after logging and
  # counting an error) when the language or the reference type is unknown.
  # $index_doc is accepted but not used here.
  #
  sub convert_bib {
      my ($input_node, $index_root, $index_doc) = @_;
      my $cnt = 0;
      my $type = "";
      my $type_path = "";

      # process general stuff first
      foreach my $n ($input_node->getChildNodes()) {
          my $name = $n->nodeName();
          my $val = $n->textContent();
          if (exists $gen_map{$name}) {
              # is a general field
              if ($name eq $lang_field) {
                  # language field -> convert to iso code
                  if (! exists $lang_map{$val}) {
                      logger('ERROR', "unknown language: $val! skipping...");
                      $errcnt++;
                      return 0;
                  }
                  $val = $lang_map{$val};
              }
              create_element_path($gen_map{$name}, $index_root, $namespace)
                  ->appendTextNode($val);
              $cnt++;
          } elsif (exists $type_map{$name}) {
              # is a type field
              $type_path = $type_map{$name};
              $type = $val;
              # check with known types
              if (! exists $subtype_map{$val}) {
                  # double quotes, so that the offending type shows up in the log
                  logger('ERROR', "unknown bib type $val! skipping...");
                  $errcnt++;
                  return 0;
              }
              my $indextype = $subtype_map{$val}->{'_name'};
              create_element_path("$type_path=$indextype", $index_root, $namespace);
              $cnt++;
          }
      }

      # process sub type fields
      if ($type) {
          foreach my $n ($input_node->getChildNodes()) {
              my $name = $n->nodeName();
              my $val = $n->textContent();
              if (exists $subtype_map{$type}->{$name}) {
                  create_element_path($subtype_map{$type}->{$name}, $index_root, $namespace)
                      ->appendTextNode($val);
                  $cnt++;
              }
          }
      }
      return $cnt;
  }
  378: 
  379: 
  380: 
  #
  # process_all_fm_entries($input_root)
  #
  # Runs process_fm_entry on every fm:ROW element found below $input_root,
  # logging the (zero based) number of each entry first.
  #
  sub process_all_fm_entries {
      my ($input_root) = @_;
      my @rows = $input_root->findnodes('fm:ROW');

      for my $idx (0 .. $#rows) {
          logger('INFO', "processing entry $idx ...");
          process_fm_entry($rows[$idx]);
      }
  }
  391: 
  392: 
  #
  # process_fm_entry($input_node)
  #
  # Creates the index.meta document for one entry: locates the document
  # directory (depending on the mode the program runs in), adds the
  # standard elements, converts the bibliographic fields with convert_bib
  # and writes the result to "<document directory>/index.meta". In
  # dry-run mode the document is only logged. If the directory is not
  # found or no bibliographic field was converted, an error is logged and
  # counted and nothing is written.
  #
  sub process_fm_entry {
      my ($input_node) = @_;

      # empty index.meta document: <resource version="1.1" type="MPIWG">
      my $index_doc = XML::LibXML::Document->createDocument('1.0', 'UTF-8');
      my $index_root = $index_doc->createElementNS($namespace, 'resource');
      for my $attr ([version => '1.1'], [type => 'MPIWG']) {
          $index_root->addChild($index_doc->createAttributeNS($namespace, @$attr));
      }
      $index_doc->setDocumentElement($index_root);

      # try to find the document directory
      my $doc_dir = $cw_mode        ? find_cw_dir($input_node)
                  : $digifiles_mode ? find_digifiles_dir($input_node)
                  :                   find_arch_dir($input_node);
      unless ($doc_dir) {
          logger('ERROR', "document directory not found! skipping...");
          $errcnt++;
          return;
      }

      # shorthand: add a text element at the given path below the root
      my $put = sub {
          my ($path, $text) = @_;
          create_text_path($path, $text, $index_root, $namespace);
      };

      # add standard stuff to index.meta
      my ($docname) = split_file_path($doc_dir);
      # name and date
      $put->('name', $docname);
      $put->('archive-path', $doc_dir);
      $put->('archive-creation-date', stime(time));
      $put->('creator', 'digigroup');
      $put->('description', 'a scanned document');
      # acquisition
      $put->('meta/acquisition/date', stime(time));
      $put->('meta/acquisition/provider/provider-id', 'digigroup');
      $put->('meta/acquisition/provider/address', 'Max Planck Institute for the History of Science');
      # media
      $put->('media-type', 'image');
      $put->('meta/content-type', 'scanned document');

      # convert bib entries; a count of 0 means error or nothing to convert
      my $converted = convert_bib($input_node, $index_root, $index_doc);
      if ($converted == 0) {
          logger('ERROR', "no bibliographic metadata!");
          $errcnt++;
          return;
      }

      # write new index.meta file
      if ($dry_run) {
          logger('DEBUG', "would write $doc_dir/index.meta");
          logger('DEBUG', $index_doc->toString(1));
      } else {
          write_xml($index_doc, "$doc_dir/index.meta");
      }
  }
  450: 
  451: 
  452: 
  453: 
  454: 
  #######################################################
  # Main
  #

  # load filemaker xml dump
  my ($input_doc, $input_root) = read_xml($infile);
  # bind the namespace of the root element to the prefix 'fm', which is
  # the prefix used by the XPath expressions in the subroutines
  my $fm_namespace = $input_root->namespaceURI();
  $input_root->setNamespace($fm_namespace, 'fm', 1);

  # create digilib mapping file for digifiles mode; the entries are added
  # by add_digilib_mapping and look like this:
  #<mapping link="exp1/archimedes_image_repository/archimedes_large/achil_propo_087_la_1545" dir="permanent/archimedes_repository/large/achil_propo_087_la_1545"/>
  if ($digifiles_mode) {
      $mapping_doc = XML::LibXML::Document->createDocument('1.0', 'UTF-8');
      $mapping_root = $mapping_doc->createElementNS($namespace, 'digilib-aliases');
      $mapping_doc->setDocumentElement($mapping_root);
  }

  process_all_fm_entries($input_root);

  # summary; any counted error makes the program exit with status 1
  logger("INFO", "$warncnt warnings");
  logger("INFO", "$errcnt errors");
  if ($errcnt > 0) {
      logger("ABORT", "there were errors!");
      exit 1;
  }
  logger("DONE", "done something successfully!");
  485: 

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>