--- foxridge-archiver/makemeta-vlp.pl 2005/09/20 17:44:48 1.2 +++ foxridge-archiver/makemeta-vlp.pl 2017/03/16 17:00:43 1.10 @@ -1,4 +1,4 @@ -#!/usr/local/bin/perl -w +#!/usr/bin/perl -w use strict; use XML::LibXML; @@ -10,7 +10,7 @@ use MPIWGStor; $|=1; # program version -my $version = "0.2 (19.9.2005 ROC)"; +my $version = "0.2.7 (27.8.2010 ROC)"; my $help = "use: makemeta-vlp [options] file.xml options: @@ -19,6 +19,8 @@ options: -replace replace existing index files -online-mode mode for creating online/permanent files -archive-mode mode for creating archive/data files + -access=free adds free access tag for online-mode + -texttool adds texttool tag for online-mode "; logger("INFO", "makemeta-vlp $version"); @@ -27,7 +29,9 @@ logger("INFO", "makemeta-vlp $version"); # generic mappings at top level my %gen_map = ( - 'Custom2_Language' => 'meta/lang' + 'Custom2_Language' => 'meta/lang', + 'productionComment' => 'meta/image-acquisition/production-comment', + 'derivedFrom' => 'derived-from/archive-path' ); # sub type switch tag my %type_map = ( @@ -47,12 +51,25 @@ my %subtype_map = ( 'NumberOfVolumes' => 'meta/bib/number-of-volumes', 'Pages' => 'meta/bib/number-of-pages' }, + '(Book)' => { + '_name' => 'book', + 'Author' => 'meta/bib/author', + 'Title' => 'meta/bib/title', + 'Year' => 'meta/bib/year', + 'Place_Published' => 'meta/bib/city', + 'Publisher' => 'meta/bib/publisher', + 'Edition' => 'meta/bib/edition', + 'Volume' => 'meta/bib/volume', + 'NumberOfVolumes' => 'meta/bib/number-of-volumes', + 'Pages' => 'meta/bib/number-of-pages', + '#Cover pages only, book sections have been extracted' => 'meta/bib/comment' + }, 'Book Section' => { '_name' => 'inbook', 'Author' => 'meta/bib/author', 'Title' => 'meta/bib/title', 'Year' => 'meta/bib/year', - 'Secondary_Title' => 'meta/bib/book-title', + 'SecondaryTitle' => 'meta/bib/book-title', 'SecondaryAuthor' => 'meta/bib/editor', 'Volume' => 'meta/bib/volume', 'NumberOfVolumes' => 'meta/bib/number-of-volumes', @@ -68,6 +85,19 @@ my %subtype_map = ( 'Edition' => 'meta/bib/edition', 'Volume' => 'meta/bib/volume', 'NumberOfVolumes' => 'meta/bib/number-of-volumes', + 'Pages' => 'meta/bib/number-of-pages', + '#Cover pages only, book sections have been extracted' => 'meta/bib/comment' + }, + '(Edited Book)' => { + '_name' => 'edited-book', + 'Author' => 'meta/bib/editor', + 'Title' => 'meta/bib/title', + 'Year' => 'meta/bib/year', + 'Place_Published' => 'meta/bib/city', + 'Publisher' => 'meta/bib/publisher', + 'Edition' => 'meta/bib/edition', + 'Volume' => 'meta/bib/volume', + 'NumberOfVolumes' => 'meta/bib/number-of-volumes', 'Pages' => 'meta/bib/number-of-pages' }, 'Journal Article' => { @@ -80,6 +110,27 @@ my %subtype_map = ( 'Number_Issue' => 'meta/bib/issue', 'Pages' => 'meta/bib/pages' }, + '(JournalVolume)' => { + '_name' => 'journal-volume', + 'SecondaryTitle' => 'meta/bib/title', + 'SecondaryAuthor' => 'meta/bib/editor', + 'Publisher' => 'meta/bib/publisher', + 'Place_Published' => 'meta/bib/city', + 'Year' => 'meta/bib/year', + 'Volume' => 'meta/bib/volume', + 'Pages' => 'meta/bib/number-of-pages', + '#Cover pages only, articles have been extracted' => 'meta/bib/comment' + }, + 'Journal' => { + '_name' => 'report', + 'Title' => 'meta/bib/title', + 'SecondaryTitle' => 'meta/bib/institution', + 'Author' => 'meta/bib/author', + 'Place_Published' => 'meta/bib/city', + 'Year' => 'meta/bib/year', + 'Date' => 'meta/bib/date', + 'Pages' => 'meta/bib/pages', + }, 'Magazine Article' => { '_name' => 'magazine-article', 'Author' => 'meta/bib/author', @@ -90,6 +141,15 @@ my %subtype_map = ( 'Date' => 'meta/bib/issue-date', 'Pages' => 'meta/bib/pages' }, + 'Newspaper Article' => { + '_name' => 'newspaper-article', + 'Author' => 'meta/bib/author', + 'Title' => 'meta/bib/title', + 'Year' => 'meta/bib/year', + 'Secondary_Title' => 'meta/bib/newspaper', + 'Date' => 'meta/bib/issue-date', + 'Pages' => 'meta/bib/pages' + }, 'Report' => { '_name' => 'report', 'Author' => 'meta/bib/author', @@ -143,10 +203,14 @@ my %lang_map = ( 'Japanese' => 'ja', 'Dutch' => 'nl', 'Spanish' => 'es', - 'Swedish' => 'sv' + 'Swedish' => 'sv', + 'Russian' => 'ru', + 'Polish' => 'pl', + 'Greek' => 'el' ); # storage fields my $arch_id_field = 'ID'; +my $access_free_field = 'online'; ####################################################### # internal parameters @@ -182,6 +246,15 @@ logger('DEBUG', "online_mode: $online_mo my $archive_mode = (exists $$args{'archive-mode'}) ? $$args{'archive-mode'} : 0; logger('DEBUG', "archive_mode: $archive_mode"); +# create texttool tag (online mode only) +my $texttool = (exists $$args{'texttool'}) ? $$args{'texttool'} : 1; +logger('DEBUG', "texttool: $texttool"); +# image dir for texttool +my $texttool_img_dir = "pages"; + +# access type +my $access_type = (exists $$args{'access'}) ? $$args{'access'} : ""; + # index.meta namespace (not really implemented!) my $namespace = ""; @@ -229,7 +302,7 @@ sub find_arch_dir { sub find_permanent_dir { my ($input_node) = @_; - my $online_base = '/mpiwg/online/permanent'; + my $online_base = $lib_online_dir; my $dest_id = sstrip($input_node->findvalue("fm:$arch_id_field")); if (! $dest_id) { logger('ERROR', "no ID field for online permanent entry"); @@ -237,7 +310,11 @@ sub find_permanent_dir { return; } my $dir = "$online_base/lit$dest_id"; - return $dir; + if (-d $dir) { + logger('DEBUG', "directory $dir exists"); + return $dir; + } + return; } @@ -301,6 +378,13 @@ sub convert_bib { $cnt++; } } + # append additional constant fields (beginning with #) + foreach my $k (keys %{$subtype_map{$type}}) { + if ($k =~ /^\#(.*)/) { + my $val = $1; + create_text_path($subtype_map{$type}->{$k}, $val, $index_root, $namespace); + } + } } return $cnt; } @@ -363,14 +447,38 @@ sub process_fm_entry { create_text_path('meta/acquisition/date', stime(time), $index_root, $namespace); create_text_path('meta/acquisition/provider/provider-id', 'vlp', $index_root, $namespace); create_text_path('meta/acquisition/provider/address', 'Max Planck Institute for the History of Science', $index_root, $namespace); - # image acquisition - create_text_path('meta/image-acquisition/device', 'Flatbed Scanner' , $index_root, $namespace); - create_text_path('meta/image-acquisition/image-type', 'Greyscale' , $index_root, $namespace); - create_text_path('meta/image-acquisition/production-comment', 'Raw scans in \'raw\' folder, cleaned pages in \'pages\' folder.' , $index_root, $namespace); } # media create_text_path('media-type', 'image', $index_root, $namespace); create_text_path('meta/content-type', 'scanned document', $index_root, $namespace); + # access + if ($access_type) { + if ($access_type eq "free") { + create_element_path('meta/access-conditions/access@type=free', $index_root, $namespace); + } else { + my $acc_tag = create_element_path('meta/access-conditions/access@type=institution', $index_root, $namespace); + create_text_path('name', $access_type, $acc_tag, $namespace); + } + } elsif ($online_mode) { + # read access conditions from "online" field in DB dump + my $online = sstrip($input_node->findvalue("fm:$access_free_field")); + if ($online) { + create_element_path('meta/access-conditions/access@type=free', $index_root, $namespace); + } else { + my $acc_tag = create_element_path('meta/access-conditions/access@type=institution', $index_root, $namespace); + create_text_path('name', 'mpiwg', $acc_tag, $namespace); + } + } + + # texttool tag with image dir + if ($online_mode && $texttool) { + if ( -d "$doc_dir/$texttool_img_dir" ) { + create_text_path('meta/texttool/image', $texttool_img_dir,$index_root, $namespace); + } else { + logger('WARNING', "page image directory missing!"); + $warncnt++; + } + } # convert bib entries my $cnt = convert_bib($input_node, $index_root, $index_doc);