Mercurial > hg > foxridge-archiver
view makemeta-quantum.pl @ 60:5bee75ca9eb3 default tip
added old makemeta-quantum.pl that was not in CVS.
author | casties |
---|---|
date | Thu, 16 Mar 2017 18:29:58 +0100 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/perl -w use strict; use XML::LibXML; use lib '/usr/local/mpiwg/archive'; use MPIWGStor; # make output unbuffered $|=1; # program version my $version = "0.1.2 (24.7.2008 ROC)"; my $help = "use: makemeta-quantum [options] file.xml options: -debug show debugging info -dry-run simulate, dont'do anything -online-mode mode for creating online/permanent files -online-base=dir base directory for online ids (for online mode) -access=free adds free access tag (use access=mpiwg for restricted access) "; logger("INFO", "makemeta-quantum $version"); ########################################### # mappings # generic mappings at top level my %gen_map = ( 'Language' => 'meta/lang' ); # sub type switch tag my %type_map = ( 'Index_Meta_Type' => 'meta/bib@type' ); # sub type mappings # Correspondence (748) # Other Archival Material (4) # Published (11) # Published Document (10) # Unpublished Writing (1373) my %subtype_map = ( 'Correspondence' => { '_name' => 'correspondence', 'Document_Subtype' => 'meta/bib/type', 'Author' => 'meta/bib/author', 'Person_to' => 'meta/bib/recipient', 'Title' => 'meta/bib/title', 'Date_range_from' => 'meta/bib/date', 'Date_range_to' => 'meta/bib/date-range-end', 'Date_original_form' => 'meta/bib/date-original', 'Place_From' => 'meta/bib/place', 'First_lines' => 'meta/bib/incipit', 'Call_number' => 'meta/bib/signature', 'Call_number_original' => 'meta/bib/call-number', 'Holding_institution' => 'meta/bib/holding-library', 'Keywords' => 'meta/bib/description' }, 'Published' => { '_name' => 'manuscript', 'Document_Subtype' => 'meta/bib/type', 'Author' => 'meta/bib/author', 'Title' => 'meta/bib/title', 'Date_range_from' => 'meta/bib/date', 'Date_range_to' => 'meta/bib/date-range-end', 'Date_original_form' => 'meta/bib/date-original', 'Place_From' => 'meta/bib/location', 'First_lines' => 'meta/bib/incipit', 'Call_number' => 'meta/bib/signature', 'Call_number_original' => 'meta/bib/call-number', 'Holding_institution' => 'meta/bib/holding-library', 'Keywords' => 'meta/bib/description' }, 'Published Document' => { '_name' => 'manuscript', 'Document_Subtype' => 'meta/bib/type', 'Author' => 'meta/bib/author', 'Title' => 'meta/bib/title', 'Date_range_from' => 'meta/bib/date', 'Date_range_to' => 'meta/bib/date-range-end', 'Date_original_form' => 'meta/bib/date-original', 'Place_From' => 'meta/bib/location', 'First_lines' => 'meta/bib/incipit', 'Call_number' => 'meta/bib/signature', 'Call_number_original' => 'meta/bib/call-number', 'Holding_institution' => 'meta/bib/holding-library', 'Keywords' => 'meta/bib/description' }, 'Unpublished Writing' => { '_name' => 'manuscript', 'Document_Subtype' => 'meta/bib/type', 'Author' => 'meta/bib/author', 'Title' => 'meta/bib/title', 'Date_range_from' => 'meta/bib/date', 'Date_range_to' => 'meta/bib/date-range-end', 'Date_original_form' => 'meta/bib/date-original', 'Place_From' => 'meta/bib/location', 'First_lines' => 'meta/bib/incipit', 'Call_number' => 'meta/bib/signature', 'Call_number_original' => 'meta/bib/call-number', 'Holding_institution' => 'meta/bib/holding-library', 'Keywords' => 'meta/bib/description' }, ); # language element my $lang_field = 'Language'; # languages to iso codes my %lang_map = ( 'German' => 'de', 'English' => 'en', 'Italian' => 'it', 'French' => 'fr', 'Latin' => 'la', 'Japanese' => 'ja', 'Dutch' => 'nl', 'Spanish' => 'es', 'Swedish' => 'sv' ); # storage fields my $online_url_field = 'URL'; my $online_path_field = 'Path_images'; my $id_field = 'ID'; ####################################################### # internal parameters # # storage my $lib_online_dir = '/mpiwg/online'; my $lib_digilib_path = 'permanent'; # read command line parameters my $args = MPIWGStor::parseargs; if (! scalar(%$args)) { print $help, "\n"; exit 1; } # debug level $debug = (exists $$args{'debug'}) ? $$args{'debug'} : 0; # simulate action only my $dry_run = (exists $$args{'dry-run'}) ? $$args{'dry-run'} : 0; logger('DEBUG', "dry-run: $dry_run"); # use online mode my $online_mode = (exists $$args{'online-mode'}) ? $$args{'online-mode'} : 0; logger('DEBUG', "online_mode: $online_mode"); # online base dir my $online_base_dir = (exists $$args{'online-base'}) ? $$args{'online-base'} : "/mpiwg/online"; logger('DEBUG', "online_base_dir: $online_base_dir"); # create texttool tag my $texttool = (exists $$args{'texttool'}) ? $$args{'texttool'} : 1; logger('DEBUG', "texttool: $texttool"); # image dir for texttool my $texttool_img_dir = "pageimg"; # fulltext directory pattern for texttool my $texttool_fulltext_glob = "fulltext-*/*.xml"; # pagebreak tag for texttool my $texttool_pb_tag = "pb"; # xslt for texttool my $texttool_xslt = "/mpiwg/online/permanent/echo/quantum_project/hr-ms/schlick_correspondence.xsl"; # digiliburlprefix my $texttool_dlurlprefix = "http://echo.mpiwg-berlin.mpg.de/zogilib?"; # access type my $access_type = (exists $$args{'access'}) ? $$args{'access'} : ""; # index.meta namespace (not really implemented!) my $namespace = ""; my $xml_changed = 0; my $errcnt = 0; my $warncnt = 0; my $filecnt = 0; ####################################################### # check parameters that were passed to the program # my $infile = $$args{'path'}; if (! $infile) { logger("ABORT", "no input file given!"); exit 1; } # strip double slashes $infile = sstrip($infile, 1); if (! -f $infile) { logger("ABORT", "input file \'$infile\' doesn't exist!"); exit 1; } ####################################################### # subroutines # sub find_online_path { my ($input_node) = @_; my $online_path = sstrip($input_node->findvalue("fm:$online_path_field")); my $id = sstrip($input_node->findvalue("fm:$id_field")); # try online_base_dir + online_path first if (($online_base_dir)&&($online_path)) { my $dir = sstrip("$online_base_dir/$online_path", 1); $dir =~ s/\/index.meta//; if ( -d $dir ) { return $dir; } else { logger('ERROR', "online path '$dir' not found! ($id)"); $errcnt++; return; } } logger('ERROR', "online path not found! ($id)"); $errcnt++; return; } # # $dir = find_online_dir($input_node, $base_dir, $page_dir) # # Takes the path from the $online_url_field of the $input_node document # and looks in the directory $base_dir for it. Strips $page_dir from the end. # Returns the directory path sans $base_dir if it exists # sub find_online_dir { my ($input_node, $base_dir, $page_dir) = @_; $base_dir = $lib_online_dir unless ($base_dir); my $online_url = $input_node->findvalue("fm:$online_url_field"); logger('DEBUG', "checking URL: $online_url"); my $online_dir; if ($online_url =~ /fn=permanent\/(.+)/) { # new style digilib URL $online_dir = $1; } elsif ($online_url =~ /\?([^\+]+)\+/) { # old style digilib URL $online_dir = $1; } #logger('DEBUG', "online_dir1: $online_dir"); if ($online_dir) { $online_dir =~ s/\/$//; # strip ending slashes if ($page_dir) { # strip page_dir $online_dir =~ s/\/${page_dir}$//; } #logger("DEBUG", "dir: $base_dir/$online_dir"); if (-d "$base_dir/$online_dir") { logger('DEBUG', "directory $base_dir/$online_dir exists"); return $online_dir; } } return; } sub convert_bib { my ($input_node, $index_root, $index_doc) = @_; my $cnt = 0; my $type = ""; my $type_path = ""; # process general stuff first foreach my $n ($input_node->getChildNodes()) { my $name = $n->nodeName(); my $val = $n->textContent(); #logger('DEBUG', " NODE: $name = '$val'"); if (exists $gen_map{$name}) { # is a general field if ($name eq $lang_field) { # language field -> convert to iso code if (exists $lang_map{$val}) { $val = $lang_map{$val}; } else { logger('WARNING', "unknown language: $val! ignoring..."); $warncnt++; next; } } create_element_path($gen_map{$name}, $index_root, $namespace) ->appendTextNode($val); $cnt++; } elsif (exists $type_map{$name}) { # is a type field $type_path = $type_map{$name}; $type = $val; # check with known types if (exists $subtype_map{$val}) { my $indextype = $subtype_map{$val}->{'_name'}; create_element_path("$type_path=$indextype", $index_root, $namespace); $cnt++; } else { logger('ERROR', 'unknown bib type $val! skipping...'); $errcnt++; return 0; } } } # process sub type fields if ($type) { foreach my $n ($input_node->getChildNodes()) { my $name = $n->nodeName(); my $val = $n->textContent(); #logger('DEBUG', " NODE: $name = '$val'"); if (exists $subtype_map{$type}->{$name}) { create_element_path($subtype_map{$type}->{$name}, $index_root, $namespace) ->appendTextNode($val); $cnt++; } } } return $cnt; } sub process_all_fm_entries { my ($input_root) = @_; my $cnt = 0; foreach my $n ($input_root->findnodes('fm:ROW')) { logger('INFO', "processing entry $cnt ..."); process_fm_entry($n); $cnt++; } } sub process_fm_entry { my ($input_node) = @_; my $index_doc = XML::LibXML::Document->createDocument('1.0', 'UTF-8'); my $index_root = $index_doc->createElementNS($namespace, 'resource'); $index_root->addChild($index_doc->createAttributeNS($namespace, 'version', '1.1')); $index_root->addChild($index_doc->createAttributeNS($namespace, 'type', 'MPIWG')); $index_doc->setDocumentElement($index_root); my $derived_from = ""; # try to find the document directory my $doc_dir = ""; if ($online_mode) { $doc_dir = find_online_path($input_node); #$derived_from = find_arch_dir($input_node); } if (! (($doc_dir) && (-d $doc_dir))) { logger('ERROR', "document directory not found! skipping..."); $errcnt++; return; } # add standard stuff to index.meta my ($docname, $docpath) = split_file_path($doc_dir); # name and date create_text_path('name', $docname, $index_root, $namespace); create_text_path('archive-path', $doc_dir, $index_root, $namespace); create_text_path('archive-creation-date', stime(time), $index_root, $namespace); create_text_path('creator', 'quantum-history group', $index_root, $namespace); create_text_path('description', 'a scanned document', $index_root, $namespace); # media create_text_path('media-type', 'image', $index_root, $namespace); create_text_path('meta/content-type', 'scanned document', $index_root, $namespace); # derived-from if ($derived_from) { create_text_path('derived-from/archive-path', $derived_from, $index_root, $namespace); } # access if ($access_type) { if ($access_type eq "free") { create_element_path('meta/access-conditions/access@type=free', $index_root, $namespace); } else { my $acc_tag = create_element_path('meta/access-conditions/access@type=institution', $index_root, $namespace); create_text_path('name', $access_type, $acc_tag, $namespace); } } # texttool tag with image dir if ($texttool) { create_text_path('meta/texttool/display', 'yes', $index_root, $namespace); if ( -d "$doc_dir/$texttool_img_dir" ) { create_text_path('meta/texttool/image', $texttool_img_dir,$index_root, $namespace); } else { logger('WARNING', "page image directory missing!"); $warncnt++; } # check for fulltext my @ftds = glob "$doc_dir/$texttool_fulltext_glob"; if (@ftds) { @ftds = sort @ftds; my $ftd = $ftds[$#ftds]; create_text_path('meta/texttool/text', $ftd, $index_root, $namespace); if (scalar @ftds > 1) { logger('WARNING', "more than one fulltext! Chose ${ftd}."); $warncnt++; } create_text_path('meta/texttool/pagebreak', $texttool_pb_tag, $index_root, $namespace); create_text_path('meta/texttool/xslt', $texttool_xslt, $index_root, $namespace); #create_text_path('meta/texttool/digiliburlprefix', $texttool_dlurlprefix, $index_root, $namespace); } } # convert bib entries my $cnt = convert_bib($input_node, $index_root, $index_doc); if ($cnt == 0) { # error or nothing to convert logger('ERROR', "no bibliographic metadata!"); $errcnt++; return; } # write new index.meta file $filecnt++; if ($dry_run) { logger('DEBUG', "would write $doc_dir/index.meta"); logger('DEBUG', $index_doc->toString(1)); } else { write_xml($index_doc, "$doc_dir/index.meta"); } } ####################################################### # Main # # load filemaker xml dump my ($input_doc, $input_root) = read_xml($infile); # set namespace prefix my $fm_namespace = $input_root->namespaceURI(); $input_root->setNamespace($fm_namespace, 'fm', 1); process_all_fm_entries($input_root); logger("INFO", "$filecnt files written"); logger("INFO", "$warncnt warnings"); logger("INFO", "$errcnt errors"); if ($errcnt > 0) { logger("ABORT", "there were errors!"); exit 1; } else { logger("DONE", "done something successfully!"); }