File:  [Repository] / foxridge-archiver / makemeta-lib.pl
Revision 1.12: download - view: text, annotated - select for diffs - revision graph
Mon Dec 11 18:07:43 2006 UTC (17 years, 6 months ago) by casties
Branches: MAIN
CVS tags: HEAD
fixed wrong placement of access-conditions tag

    1: #!/usr/local/bin/perl -w
    2: 
    3: use strict;
    4: use XML::LibXML;
    5: 
    6: use lib '/usr/local/mpiwg/archive';
    7: use MPIWGStor;
    8: 
    9: # make output unbuffered
   10: $|=1;
   11: 
   12: # program version
   13: my $version = "0.2.6 (11.12.2006 ROC)";
   14: my $help = 
   15: "use: makemeta-lib [options] file.xml
   16: options:
   17:   -debug  show debugging info
   18:   -dry-run  simulate, dont'do anything
   19:   -online-mode  mode for creating online/permanent files
   20:   -online-base=dir  base directory for online ids (for online mode)
   21:   -cw-mode  mode for copying einstein_cw archive documents
   22:   -digifiles-mode  mode for copying files from digifiles
   23:   -map-file=mapfile.xml  digilib mapping file (for digifiles mode)
   24:   -access=free  adds free access tag (use access=mpiwg for restricted access)
   25: ";
   26: logger("INFO", "makemeta-lib $version");
   27: 
   28: ###########################################
   29: # mappings
   30: 
   31: # generic mappings at top level
   32: my %gen_map = (
   33:     'Device' => 'meta/image-acquisition/device',
   34:     'Image_Type' => 'meta/image-acquisition/image-type',
   35:     'Production_Comment' => 'meta/image-acquisition/production-comment',
   36:     'Postproduction' => 'meta/image-acquisition/production-comment',
   37:     'Language' => 'meta/lang'
   38:     );
   39: # sub type switch tag
   40: my %type_map = (
   41:     'Reference_Type' => 'meta/bib@type'
   42:     );
   43: # sub type mappings
   44: my %subtype_map = (
   45:     'Book' => {
   46: 	'_name' => 'book',
   47: 	'Author' => 'meta/bib/author',
   48: 	'Title' => 'meta/bib/title',
   49: 	'Year' => 'meta/bib/year',
   50: 	'Place_Published' => 'meta/bib/city',
   51: 	'Publisher' => 'meta/bib/publisher',
   52: 	'Edition' => 'meta/bib/edition'
   53:     },
   54:     'Journal Article' => {
   55: 	'_name' => 'journal-article',
   56: 	'Author' => 'meta/bib/author',
   57: 	'Title' => 'meta/bib/title',
   58: 	'Year' => 'meta/bib/year',
   59: 	'Secondary_Title' => 'meta/bib/journal',
   60: 	'Volume' => 'meta/bib/volume',
   61: 	'Number' => 'meta/bib/issue',
   62: 	'Pages' => 'meta/bib/pages'
   63:     },
   64:     'In Book' => {
   65: 	'_name' => 'inbook',
   66: 	'Author' => 'meta/bib/author',
   67: 	'Title' => 'meta/bib/title',
   68: 	'Year' => 'meta/bib/year',
   69: 	'Secondary_Title' => 'meta/bib/book-title',
   70: 	'Pages' => 'meta/bib/pages'
   71:     },
   72:     'Newspaper Article' => {
   73: 	'_name' => 'newspaper-article',
   74: 	'Author' => 'meta/bib/author',
   75: 	'Title' => 'meta/bib/title',
   76: 	'Year' => 'meta/bib/year',
   77: 	'Secondary_Title' => 'meta/bib/newspaper',
   78: 	'Place_Published' => 'meta/bib/city',
   79: 	'Number' => 'meta/bib/issue-date',
   80: 	'Pages' => 'meta/bib/pages'
   81:     },
   82:     'Edited Book' => {
   83: 	'_name' => 'edited-book',
   84: 	'Author' => 'meta/bib/editor',
   85: 	'Title' => 'meta/bib/title',
   86: 	'Year' => 'meta/bib/year',
   87: 	'Place_Published' => 'meta/bib/city',
   88: 	'Publisher' => 'meta/bib/publisher',
   89: 	'Edition' => 'meta/bib/edition'
   90:     },
   91:     'Manuscript' => {
   92: 	'_name' => 'manuscript',
   93: 	'Author' => 'meta/bib/author',
   94: 	'Title' => 'meta/bib/title',
   95: 	'Year' => 'meta/bib/year',
   96: 	'Place_Published' => 'meta/bib/location',
   97:     }
   98:     );
   99: # language element
  100: my $lang_field = 'Language';
  101: # languages to iso codes
  102: my %lang_map = (
  103:     'German' => 'de',
  104:     'English' => 'en',
  105:     'Italian' => 'it',
  106:     'French' => 'fr',
  107:     'Latin' => 'la',
  108:     'Japanese' => 'ja',
  109:     'Dutch' => 'nl',
  110:     'Spanish' => 'es',
  111:     'Swedish' => 'sv'
  112:     );
  113: # storage fields
  114: my $arch_id_field = 'ID_Archive';
  115: my $online_url_field = 'URL';
  116: my $online_id_field = 'ID_OnlinePermanent';
  117: 
  118: #######################################################
  119: # internal parameters
  120: #
  121: 
  122: # storage
  123: my $lib_arch_dir = '/mpiwg/archive/data/library';
  124: my $lib_online_dir = '/mpiwg/online/permanent';
  125: my $lib_digilib_path = 'permanent';
  126: my $digifiles_dir = "/net/digifiles.mpiwg-berlin.mpg.de/Volumes/raid";
  127: 
  128: # read command line parameters
  129: my $args = MPIWGStor::parseargs;
  130: if (! scalar(%$args)) {
  131:     print $help, "\n";
  132:     exit 1;
  133: }
  134: 
  135: # debug level
  136: $debug = (exists $$args{'debug'}) ? $$args{'debug'} : 0;
  137: 
  138: # simulate action only
  139: my $dry_run = (exists $$args{'dry-run'}) ? $$args{'dry-run'} : 0;
  140: logger('DEBUG', "dry-run: $dry_run");
  141: 
  142: # use online mode
  143: my $online_mode = (exists $$args{'online-mode'}) ? $$args{'online-mode'} : 0;
  144: logger('DEBUG', "online_mode: $online_mode");
  145: # online base dir
  146: my $online_base_dir = (exists $$args{'online-base'}) ? $$args{'online-base'} : "";
  147: logger('DEBUG', "online_base_dir: $online_base_dir");
  148: 
  149: # use einstein-cw mode
  150: my $cw_mode = (exists $$args{'cw-mode'}) ? $$args{'cw-mode'} : 0;
  151: logger('DEBUG', "cw_mode: $cw_mode");
  152: 
  153: # use digifiles mode
  154: my $digifiles_mode = (exists $$args{'digifiles-mode'}) ? $$args{'digifiles-mode'} : 0;
  155: logger('DEBUG', "digifiles_mode: $digifiles_mode");
  156: # digilib mapping file
  157: my $map_file_name = (exists $$args{'map-file'}) ? $$args{'map-file'} : "";
  158: logger('DEBUG', "map_file_name: $map_file_name");
  159: my $mapping_doc;
  160: my $mapping_root;
  161: 
  162: # access type
  163: my $access_type = (exists $$args{'access'}) ? $$args{'access'} : "";
  164: 
  165: # index.meta namespace (not really implemented!)
  166: my $namespace = "";
  167: 
  168: 
  169: my $xml_changed = 0;
  170: my $errcnt = 0;
  171: my $warncnt = 0;
  172: 
  173: #######################################################
  174: # check parameters that were passed to the program
  175: #
  176: my $infile = $$args{'path'};
  177: if (! $infile) {
  178:     logger("ABORT", "no input file given!");
  179:     exit 1;
  180: }
  181: # strip double slashes
  182: $infile = sstrip($infile, 1);
  183: if (! -f $infile) {
  184:     logger("ABORT", "input file \'$infile\' doesn't exist!");
  185:     exit 1;
  186: }
  187: 
  188: 
  189: #######################################################
  190: # subroutines
  191: #
  192: 
  193: 
  194: sub add_digilib_mapping {
  195:     my ($src_dir, $dest_dir) = @_;
  196:     my $elem = $mapping_root->addNewChild($namespace, 'mapping');
  197:     $elem->addChild($mapping_doc->createAttributeNS($namespace, 'link', $src_dir));
  198:     $elem->addChild($mapping_doc->createAttributeNS($namespace, 'dir', $dest_dir));
  199:     if ($map_file_name) {
  200: 	write_xml($mapping_doc, $map_file_name);
  201:     } else {
  202: 	logger('ABORT', "unable to write mapping file!");
  203: 	exit 1;
  204:     }
  205: }
  206: 
  207: sub find_digifiles_dir {
  208:     my ($input_node) = @_;
  209:     my $digifiles_base = '/net/digifiles.mpiwg-berlin.mpg.de/Volumes/raid';
  210:     my $src_dir = find_online_dir($input_node, $digifiles_base, '');
  211:     if (! $src_dir) {
  212: 	logger('ERROR', "no online directory for digifiles entry");
  213: 	$errcnt++;
  214: 	return;
  215:     }
  216:     my $dest_id = sstrip($input_node->findvalue("fm:$online_id_field"));
  217:     if (! $dest_id) {
  218: 	logger('ERROR', "no ID field for digifiles entry");
  219: 	$errcnt++;
  220: 	return;
  221:     }
  222:     my $dir = "$lib_online_dir/library/$dest_id";
  223:     my $map_dir = "$lib_digilib_path/library/$dest_id";
  224:     if ($dry_run) {
  225: 	logger('DEBUG', "would move $digifiles_base/$src_dir to $dir");
  226: 	add_digilib_mapping($src_dir, "$map_dir/pageimg");
  227: 	return $dir;
  228:     } else {
  229: 	logger('INFO', "moving $digifiles_base/$src_dir to $dir");
  230: 	logger('DEBUG', "mkdir $dir/pageimg"); 
  231: 	if (system("mkdir -p $dir/pageimg && chmod -R 0775 $dir") == 0) {
  232: 	    logger('DEBUG', "cp $digifiles_base/$src_dir $dir/pageimg"); 
  233: 	    if (system("cp -rp $digifiles_base/$src_dir/* $dir/pageimg/") == 0) {
  234: 		if (-d "$dir/pageimg") {
  235: 		    logger('DEBUG', "directory $dir OK"); 
  236: 		    add_digilib_mapping($src_dir, "$map_dir/pageimg");
  237: 		    if (system("rm -rf $digifiles_base/$src_dir/* && rm -rf $digifiles_base/$src_dir") == 0) {
  238: 			logger('DEBUG', "directory $digifiles_base/$src_dir removed"); 
  239: 			return $dir;
  240: 		    } else {
  241: 			logger('ERROR', "unable to remove source directory $digifiles_base/$src_dir!");
  242: 			$errcnt++;
  243: 			return $dir;
  244: 		    }
  245: 		}
  246: 	    }
  247: 	}
  248: 	logger('ABORT', "unable to copy directory $src_dir to $dir!");
  249: 	exit 1;
  250:     }
  251:     return;
  252: }
  253: 
  254: sub find_cw_dir {
  255:     my ($input_node) = @_;
  256:     my $cw_base = '/mpiwg/archive/data/library/inbox/zwischen_backup';
  257:     my $src_dir = find_online_dir($input_node, $cw_base, 'pageimg');
  258:     my $dest_id = sstrip($input_node->findvalue("fm:$arch_id_field"));
  259:     if (! $dest_id) {
  260: 	logger('ERROR', "no ID field for einstein-cw entry");
  261: 	$errcnt++;
  262: 	return;
  263:     }
  264:     my $dir = "$lib_arch_dir/$dest_id";
  265:     if ($dry_run) {
  266: 	logger('DEBUG', "would move $cw_base/$src_dir to $dir");
  267: 	return $dir;
  268:     } else {
  269: 	logger('DEBUG', "moving $cw_base/$src_dir to $dir");
  270: 	if (rename "$cw_base/$src_dir", $dir) {
  271: 	    if (-d $dir) {
  272: 		logger('DEBUG', "directory $dir OK"); 
  273: 		return $dir;
  274: 	    }
  275: 	} else {
  276: 	    logger('ABORT', "unable to rename directory $cw_base/$src_dir to $dir!");
  277: 	    exit 1;
  278: 	}
  279:     }
  280:     return;
  281: }
  282: 
  283: sub find_permanent_dir {
  284:     my ($input_node) = @_;
  285:     my $online_id = sstrip($input_node->findvalue("fm:$online_id_field"));
  286:     # try online_base_dir + online_id first
  287:     if (($online_base_dir)&&($online_id)) {
  288: 	my $dir = sstrip("$online_base_dir/$online_id", 1);
  289: 	return $dir;
  290:     }
  291:     # then online_url
  292:     my $online_base = '/mpiwg/online/permanent';
  293:     my $online_dir = find_online_dir($input_node, $online_base, 'pageimg');
  294:     if ((! $online_dir)) {
  295: 	logger('ERROR', "no ID or URL for online permanent entry");
  296: 	$errcnt++;
  297: 	return;
  298:     }
  299:     my $dir = sstrip("$online_base/$online_dir", 1);
  300:     return $dir;
  301: }
  302: 
  303: #
  304: # $dir = find_online_dir($input_node, $base_dir, $page_dir)
  305: #
  306: # Takes the path from the $online_url_field of the $input_node document
  307: # and looks in the directory $base_dir for it. Strips $page_dir from the end.
  308: # Returns the directory path sans $base_dir if it exists
  309: #
  310: sub find_online_dir {
  311:     my ($input_node, $base_dir, $page_dir) = @_;
  312:     $base_dir = $lib_online_dir unless ($base_dir);
  313: 
  314:     my $online_url = $input_node->findvalue("fm:$online_url_field");
  315:     logger('DEBUG', "checking URL: $online_url");
  316:     my $online_dir;
  317:     if ($online_url =~ /fn=permanent\/(.+)/) {
  318: 	# new style digilib URL
  319: 	$online_dir = $1;
  320:     } elsif ($online_url =~ /\?([^\+]+)\+/) {
  321: 	# old style digilib URL
  322: 	$online_dir = $1;
  323:     }
  324:     #logger('DEBUG', "online_dir1: $online_dir");
  325:     if ($online_dir) {
  326: 	$online_dir =~ s/\/$//; # strip ending slashes
  327: 	if ($page_dir) {
  328: 	    # strip page_dir
  329: 	    $online_dir =~ s/\/${page_dir}$//;
  330: 	}
  331: 	#logger("DEBUG", "dir: $base_dir/$online_dir");
  332: 	if (-d "$base_dir/$online_dir") {
  333: 	    logger('DEBUG', "directory $base_dir/$online_dir exists");
  334: 	    return $online_dir;
  335: 	}
  336:     }
  337:     return;
  338: }
  339: 
  340: sub find_arch_dir {
  341:     my ($input_node) = @_;
  342:     my $dir = "";
  343: 
  344:     my $bib_dir = $input_node->findvalue("fm:$arch_id_field");
  345:     #logger('DEBUG', "bibdir: $bib_dir");
  346:     if ($bib_dir) {
  347: 	$dir = "$lib_arch_dir/$bib_dir";
  348: 	if (-d $dir) {
  349: 	    logger('DEBUG', "directory $dir exists"); 
  350: 	    return $dir;
  351: 	}
  352:     }
  353:     return;
  354: }
  355: 
  356: 
  357: sub convert_bib {
  358:     my ($input_node, $index_root, $index_doc) = @_;
  359:     my $cnt = 0;
  360:     my $type = "";
  361:     my $type_path = "";
  362: 
  363:     # process general stuff first
  364:     foreach my $n ($input_node->getChildNodes()) {
  365: 	my $name = $n->nodeName();
  366: 	my $val = $n->textContent();
  367: 	#logger('DEBUG', "  NODE: $name = '$val'");
  368: 	if (exists $gen_map{$name}) {
  369: 	    # is a general field
  370: 	    if ($name eq $lang_field) {
  371: 		# language field -> convert to iso code
  372: 		if (exists $lang_map{$val}) {
  373: 		    $val = $lang_map{$val};
  374: 		} else {
  375: 		    logger('ERROR', "unknown language: $val! skipping...");
  376: 		    $errcnt++;
  377: 		    return 0;
  378: 		}
  379: 	    }
  380: 	    create_element_path($gen_map{$name}, $index_root, $namespace)
  381: 		->appendTextNode($val);
  382: 	    $cnt++;
  383: 	} elsif (exists $type_map{$name}) {
  384: 	    # is a type field
  385: 	    $type_path = $type_map{$name};
  386: 	    $type = $val;
  387: 	    # check with known types
  388: 	    if (exists $subtype_map{$val}) {
  389: 		my $indextype = $subtype_map{$val}->{'_name'};
  390: 		create_element_path("$type_path=$indextype", $index_root, $namespace);
  391: 		$cnt++;
  392: 	    } else {
  393: 		logger('ERROR', 'unknown bib type $val! skipping...');
  394: 		$errcnt++;
  395: 		return 0;
  396: 	    }
  397: 	}
  398:     }
  399:     # process sub type fields
  400:     if ($type) {
  401: 	foreach my $n ($input_node->getChildNodes()) {
  402: 	    my $name = $n->nodeName();
  403: 	    my $val = $n->textContent();
  404: 	    #logger('DEBUG', "  NODE: $name = '$val'");
  405: 	    if (exists $subtype_map{$type}->{$name}) {
  406: 		create_element_path($subtype_map{$type}->{$name}, $index_root, $namespace)
  407: 		    ->appendTextNode($val);
  408: 		$cnt++;
  409: 	    }
  410: 	}
  411:     }
  412:     return $cnt;
  413: }
  414: 
  415: 
  416: 
  417: sub process_all_fm_entries {
  418:     my ($input_root) = @_;
  419:     my $cnt = 0;
  420: 
  421:     foreach my $n ($input_root->findnodes('fm:ROW')) {
  422: 	logger('INFO', "processing entry $cnt ...");
  423: 	process_fm_entry($n);
  424: 	$cnt++;
  425:     }
  426: }    
  427: 
  428: 
  429: sub process_fm_entry {
  430:     my ($input_node) = @_;
  431:     my $index_doc = XML::LibXML::Document->createDocument('1.0', 'UTF-8');
  432:     my $index_root = $index_doc->createElementNS($namespace, 'resource');
  433:     $index_root->addChild($index_doc->createAttributeNS($namespace, 'version', '1.1'));
  434:     $index_root->addChild($index_doc->createAttributeNS($namespace, 'type', 'MPIWG'));
  435:     $index_doc->setDocumentElement($index_root);
  436:     my $derived_from = "";
  437: 
  438:     # try to find the document directory
  439:     my $doc_dir = "";
  440:     if ($online_mode) {
  441: 	$doc_dir = find_permanent_dir($input_node);
  442: 	$derived_from = find_arch_dir($input_node);
  443:     } elsif ($cw_mode) {
  444: 	$doc_dir = find_cw_dir($input_node);
  445:     } elsif ($digifiles_mode) {
  446: 	$doc_dir = find_digifiles_dir($input_node);
  447:     } else {
  448: 	$doc_dir = find_arch_dir($input_node);
  449:     }
  450:     if (! $doc_dir) {
  451: 	logger('ERROR', "document directory not found! skipping...");
  452: 	$errcnt++;
  453: 	return;
  454:     }
  455: 
  456:     # add standard stuff to index.meta
  457:     my ($docname, $docpath) = split_file_path($doc_dir);
  458:     # name and date
  459:     create_text_path('name', $docname, $index_root, $namespace);
  460:     create_text_path('archive-path', $doc_dir, $index_root, $namespace);
  461:     create_text_path('archive-creation-date', stime(time), $index_root, $namespace);
  462:     create_text_path('creator', 'digigroup', $index_root, $namespace);
  463:     create_text_path('description', 'a scanned document', $index_root, $namespace);
  464:     # acquisition
  465:     create_text_path('meta/acquisition/date', stime(time), $index_root, $namespace);
  466:     create_text_path('meta/acquisition/provider/provider-id', 'digigroup', $index_root, $namespace);
  467:     create_text_path('meta/acquisition/provider/address', 'Max Planck Institute for the History of Science', $index_root, $namespace);
  468:     # media
  469:     create_text_path('media-type', 'image', $index_root, $namespace);
  470:     create_text_path('meta/content-type', 'scanned document', $index_root, $namespace);
  471:     # derived-from
  472:     if ($derived_from) {
  473:        create_text_path('derived-from/archive-path', $derived_from, $index_root, $namespace);
  474:     }
  475:     # access
  476:     if ($access_type) {
  477: 	if ($access_type eq "free") {
  478: 	    create_element_path('meta/access-conditions/access@type=free', $index_root, $namespace);
  479: 	} else {
  480: 	    my $acc_tag = create_element_path('meta/access-conditions/access@type=institution', $index_root, $namespace);
  481: 	    create_text_path('name', $access_type, $acc_tag, $namespace);
  482: 	}
  483:     }
  484: 
  485:     # convert bib entries
  486:     my $cnt = convert_bib($input_node, $index_root, $index_doc);
  487:     if ($cnt == 0) {
  488: 	# error or nothing to convert
  489: 	logger('ERROR', "no bibliographic metadata!");
  490: 	$errcnt++;
  491: 	return;
  492:     }
  493: 
  494:     # write new index.meta file
  495:     if ($dry_run) {
  496: 	logger('DEBUG', "would write $doc_dir/index.meta");
  497: 	logger('DEBUG', $index_doc->toString(1));
  498:     } else {
  499: 	write_xml($index_doc, "$doc_dir/index.meta");
  500:     }
  501: 
  502: }
  503: 
  504: 
  505: 
  506: 
  507: 
  508: #######################################################
  509: # Main
  510: #
  511: 
  512: # load filemaker xml dump
  513: my ($input_doc, $input_root) = read_xml($infile);
  514: # set namespace prefix
  515: my $fm_namespace = $input_root->namespaceURI();
  516: $input_root->setNamespace($fm_namespace, 'fm', 1);
  517: 
  518: # create digilib mapping file for digifiles mode
  519: if ($digifiles_mode) {
  520:     $mapping_doc = XML::LibXML::Document->createDocument('1.0', 'UTF-8');
  521:     $mapping_root = $mapping_doc->createElementNS($namespace, 'digilib-aliases');
  522:     $mapping_doc->setDocumentElement($mapping_root);
  523: #<mapping link="exp1/archimedes_image_repository/archimedes_large/achil_propo_087_la_1545" dir="permanent/archimedes_repository/large/achil_propo_087_la_1545"/>
  524: 
  525: }
  526: 
  527: process_all_fm_entries($input_root);
  528: 
  529: 
  530: logger("INFO", "$warncnt warnings");
  531: logger("INFO", "$errcnt errors");
  532: if ($errcnt > 0) {
  533:     logger("ABORT", "there were errors!");
  534:     exit 1;
  535: } else {
  536:     logger("DONE", "done something successfully!");
  537: }
  538: 

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>