version 1.1, 2005/09/20 17:32:06
|
version 1.10, 2017/03/16 17:00:43
|
Line 1
|
Line 1
|
#!/usr/local/bin/perl -w |
#!/usr/bin/perl -w |
|
|
use strict; |
use strict; |
use XML::LibXML; |
use XML::LibXML; |
|
|
use lib '/usr/local/mpiwg/archive_devel'; |
use lib '/usr/local/mpiwg/archive'; |
use MPIWGStor; |
use MPIWGStor; |
|
|
# make output unbuffered |
# make output unbuffered |
$|=1; |
$|=1; |
|
|
# program version |
# program version |
my $version = "0.2 (19.9.2005 ROC)"; |
my $version = "0.2.7 (27.8.2010 ROC)"; |
my $help = |
my $help = |
"use: makemeta-vlp [options] file.xml |
"use: makemeta-vlp [options] file.xml |
options: |
options: |
Line 19 options:
|
Line 19 options:
|
-replace replace existing index files |
-replace replace existing index files |
-online-mode mode for creating online/permanent files |
-online-mode mode for creating online/permanent files |
-archive-mode mode for creating archive/data files |
-archive-mode mode for creating archive/data files |
|
-access=free adds free access tag for online-mode |
|
-texttool adds texttool tag for online-mode |
"; |
"; |
logger("INFO", "makemeta-vlp $version"); |
logger("INFO", "makemeta-vlp $version"); |
|
|
Line 27 logger("INFO", "makemeta-vlp $version");
|
Line 29 logger("INFO", "makemeta-vlp $version");
|
|
|
# generic mappings at top level |
# generic mappings at top level |
my %gen_map = ( |
my %gen_map = ( |
'Custom2_Language' => 'meta/lang' |
'Custom2_Language' => 'meta/lang', |
|
'productionComment' => 'meta/image-acquisition/production-comment', |
|
'derivedFrom' => 'derived-from/archive-path' |
); |
); |
# sub type switch tag |
# sub type switch tag |
my %type_map = ( |
my %type_map = ( |
Line 47 my %subtype_map = (
|
Line 51 my %subtype_map = (
|
'NumberOfVolumes' => 'meta/bib/number-of-volumes', |
'NumberOfVolumes' => 'meta/bib/number-of-volumes', |
'Pages' => 'meta/bib/number-of-pages' |
'Pages' => 'meta/bib/number-of-pages' |
}, |
}, |
|
'(Book)' => { |
|
'_name' => 'book', |
|
'Author' => 'meta/bib/author', |
|
'Title' => 'meta/bib/title', |
|
'Year' => 'meta/bib/year', |
|
'Place_Published' => 'meta/bib/city', |
|
'Publisher' => 'meta/bib/publisher', |
|
'Edition' => 'meta/bib/edition', |
|
'Volume' => 'meta/bib/volume', |
|
'NumberOfVolumes' => 'meta/bib/number-of-volumes', |
|
'Pages' => 'meta/bib/number-of-pages', |
|
'#Cover pages only, book sections have been extracted' => 'meta/bib/comment' |
|
}, |
'Book Section' => { |
'Book Section' => { |
'_name' => 'inbook', |
'_name' => 'inbook', |
'Author' => 'meta/bib/author', |
'Author' => 'meta/bib/author', |
'Title' => 'meta/bib/title', |
'Title' => 'meta/bib/title', |
'Year' => 'meta/bib/year', |
'Year' => 'meta/bib/year', |
'Secondary_Title' => 'meta/bib/book-title', |
'SecondaryTitle' => 'meta/bib/book-title', |
'SecondaryAuthor' => 'meta/bib/editor', |
'SecondaryAuthor' => 'meta/bib/editor', |
'Volume' => 'meta/bib/volume', |
'Volume' => 'meta/bib/volume', |
'NumberOfVolumes' => 'meta/bib/number-of-volumes', |
'NumberOfVolumes' => 'meta/bib/number-of-volumes', |
Line 68 my %subtype_map = (
|
Line 85 my %subtype_map = (
|
'Edition' => 'meta/bib/edition', |
'Edition' => 'meta/bib/edition', |
'Volume' => 'meta/bib/volume', |
'Volume' => 'meta/bib/volume', |
'NumberOfVolumes' => 'meta/bib/number-of-volumes', |
'NumberOfVolumes' => 'meta/bib/number-of-volumes', |
|
'Pages' => 'meta/bib/number-of-pages', |
|
'#Cover pages only, book sections have been extracted' => 'meta/bib/comment' |
|
}, |
|
'(Edited Book)' => { |
|
'_name' => 'edited-book', |
|
'Author' => 'meta/bib/editor', |
|
'Title' => 'meta/bib/title', |
|
'Year' => 'meta/bib/year', |
|
'Place_Published' => 'meta/bib/city', |
|
'Publisher' => 'meta/bib/publisher', |
|
'Edition' => 'meta/bib/edition', |
|
'Volume' => 'meta/bib/volume', |
|
'NumberOfVolumes' => 'meta/bib/number-of-volumes', |
'Pages' => 'meta/bib/number-of-pages' |
'Pages' => 'meta/bib/number-of-pages' |
}, |
}, |
'Journal Article' => { |
'Journal Article' => { |
Line 80 my %subtype_map = (
|
Line 110 my %subtype_map = (
|
'Number_Issue' => 'meta/bib/issue', |
'Number_Issue' => 'meta/bib/issue', |
'Pages' => 'meta/bib/pages' |
'Pages' => 'meta/bib/pages' |
}, |
}, |
|
'(JournalVolume)' => { |
|
'_name' => 'journal-volume', |
|
'SecondaryTitle' => 'meta/bib/title', |
|
'SecondaryAuthor' => 'meta/bib/editor', |
|
'Publisher' => 'meta/bib/publisher', |
|
'Place_Published' => 'meta/bib/city', |
|
'Year' => 'meta/bib/year', |
|
'Volume' => 'meta/bib/volume', |
|
'Pages' => 'meta/bib/number-of-pages', |
|
'#Cover pages only, articles have been extracted' => 'meta/bib/comment' |
|
}, |
|
'Journal' => { |
|
'_name' => 'report', |
|
'Title' => 'meta/bib/title', |
|
'SecondaryTitle' => 'meta/bib/institution', |
|
'Author' => 'meta/bib/author', |
|
'Place_Published' => 'meta/bib/city', |
|
'Year' => 'meta/bib/year', |
|
'Date' => 'meta/bib/date', |
|
'Pages' => 'meta/bib/pages', |
|
}, |
'Magazine Article' => { |
'Magazine Article' => { |
'_name' => 'magazine-article', |
'_name' => 'magazine-article', |
'Author' => 'meta/bib/author', |
'Author' => 'meta/bib/author', |
Line 90 my %subtype_map = (
|
Line 141 my %subtype_map = (
|
'Date' => 'meta/bib/issue-date', |
'Date' => 'meta/bib/issue-date', |
'Pages' => 'meta/bib/pages' |
'Pages' => 'meta/bib/pages' |
}, |
}, |
|
'Newspaper Article' => { |
|
'_name' => 'newspaper-article', |
|
'Author' => 'meta/bib/author', |
|
'Title' => 'meta/bib/title', |
|
'Year' => 'meta/bib/year', |
|
'Secondary_Title' => 'meta/bib/newspaper', |
|
'Date' => 'meta/bib/issue-date', |
|
'Pages' => 'meta/bib/pages' |
|
}, |
'Report' => { |
'Report' => { |
'_name' => 'report', |
'_name' => 'report', |
'Author' => 'meta/bib/author', |
'Author' => 'meta/bib/author', |
Line 143 my %lang_map = (
|
Line 203 my %lang_map = (
|
'Japanese' => 'ja', |
'Japanese' => 'ja', |
'Dutch' => 'nl', |
'Dutch' => 'nl', |
'Spanish' => 'es', |
'Spanish' => 'es', |
'Swedish' => 'sv' |
'Swedish' => 'sv', |
|
'Russian' => 'ru', |
|
'Polish' => 'pl', |
|
'Greek' => 'el' |
); |
); |
# storage fields |
# storage fields |
my $arch_id_field = 'ID'; |
my $arch_id_field = 'ID'; |
|
my $access_free_field = 'online'; |
|
|
####################################################### |
####################################################### |
# internal parameters |
# internal parameters |
Line 182 logger('DEBUG', "online_mode: $online_mo
|
Line 246 logger('DEBUG', "online_mode: $online_mo
|
my $archive_mode = (exists $$args{'archive-mode'}) ? $$args{'archive-mode'} : 0; |
my $archive_mode = (exists $$args{'archive-mode'}) ? $$args{'archive-mode'} : 0; |
logger('DEBUG', "archive_mode: $archive_mode"); |
logger('DEBUG', "archive_mode: $archive_mode"); |
|
|
|
# create texttool tag (online mode only) |
|
my $texttool = (exists $$args{'texttool'}) ? $$args{'texttool'} : 1; |
|
logger('DEBUG', "texttool: $texttool"); |
|
# image dir for texttool |
|
my $texttool_img_dir = "pages"; |
|
|
|
# access type |
|
my $access_type = (exists $$args{'access'}) ? $$args{'access'} : ""; |
|
|
# index.meta namespace (not really implemented!) |
# index.meta namespace (not really implemented!) |
my $namespace = ""; |
my $namespace = ""; |
|
|
Line 229 sub find_arch_dir {
|
Line 302 sub find_arch_dir {
|
|
|
sub find_permanent_dir { |
sub find_permanent_dir { |
my ($input_node) = @_; |
my ($input_node) = @_; |
my $online_base = '/mpiwg/online/permanent'; |
my $online_base = $lib_online_dir; |
my $dest_id = sstrip($input_node->findvalue("fm:$arch_id_field")); |
my $dest_id = sstrip($input_node->findvalue("fm:$arch_id_field")); |
if (! $dest_id) { |
if (! $dest_id) { |
logger('ERROR', "no ID field for online permanent entry"); |
logger('ERROR', "no ID field for online permanent entry"); |
Line 237 sub find_permanent_dir {
|
Line 310 sub find_permanent_dir {
|
return; |
return; |
} |
} |
my $dir = "$online_base/lit$dest_id"; |
my $dir = "$online_base/lit$dest_id"; |
|
if (-d $dir) { |
|
logger('DEBUG', "directory $dir exists"); |
return $dir; |
return $dir; |
} |
} |
|
return; |
|
} |
|
|
|
|
sub convert_bib { |
sub convert_bib { |
Line 301 sub convert_bib {
|
Line 378 sub convert_bib {
|
$cnt++; |
$cnt++; |
} |
} |
} |
} |
|
# append additional constant fields (beginning with #) |
|
foreach my $k (keys %{$subtype_map{$type}}) { |
|
if ($k =~ /^\#(.*)/) { |
|
my $val = $1; |
|
create_text_path($subtype_map{$type}->{$k}, $val, $index_root, $namespace); |
|
} |
|
} |
} |
} |
return $cnt; |
return $cnt; |
} |
} |
Line 363 sub process_fm_entry {
|
Line 447 sub process_fm_entry {
|
create_text_path('meta/acquisition/date', stime(time), $index_root, $namespace); |
create_text_path('meta/acquisition/date', stime(time), $index_root, $namespace); |
create_text_path('meta/acquisition/provider/provider-id', 'vlp', $index_root, $namespace); |
create_text_path('meta/acquisition/provider/provider-id', 'vlp', $index_root, $namespace); |
create_text_path('meta/acquisition/provider/address', 'Max Planck Institute for the History of Science', $index_root, $namespace); |
create_text_path('meta/acquisition/provider/address', 'Max Planck Institute for the History of Science', $index_root, $namespace); |
# image acquisition |
|
create_text_path('meta/image-acquisition/device', 'Flatbed Scanner' , $index_root, $namespace); |
|
create_text_path('meta/image-acquisition/image-type', 'Greyscale' , $index_root, $namespace); |
|
create_text_path('meta/image-acquisition/production-comment', 'Raw scans in \'raw\' folder, cleaned pages in \'pages\' folder.' , $index_root, $namespace); |
|
} |
} |
# media |
# media |
create_text_path('media-type', 'image', $index_root, $namespace); |
create_text_path('media-type', 'image', $index_root, $namespace); |
create_text_path('meta/content-type', 'scanned document', $index_root, $namespace); |
create_text_path('meta/content-type', 'scanned document', $index_root, $namespace); |
|
# access |
|
if ($access_type) { |
|
if ($access_type eq "free") { |
|
create_element_path('meta/access-conditions/access@type=free', $index_root, $namespace); |
|
} else { |
|
my $acc_tag = create_element_path('meta/access-conditions/access@type=institution', $index_root, $namespace); |
|
create_text_path('name', $access_type, $acc_tag, $namespace); |
|
} |
|
} elsif ($online_mode) { |
|
# read access conditions from "online" field in DB dump |
|
my $online = sstrip($input_node->findvalue("fm:$access_free_field")); |
|
if ($online) { |
|
create_element_path('meta/access-conditions/access@type=free', $index_root, $namespace); |
|
} else { |
|
my $acc_tag = create_element_path('meta/access-conditions/access@type=institution', $index_root, $namespace); |
|
create_text_path('name', 'mpiwg', $acc_tag, $namespace); |
|
} |
|
} |
|
|
|
# texttool tag with image dir |
|
if ($online_mode && $texttool) { |
|
if ( -d "$doc_dir/$texttool_img_dir" ) { |
|
create_text_path('meta/texttool/image', $texttool_img_dir,$index_root, $namespace); |
|
} else { |
|
logger('WARNING', "page image directory missing!"); |
|
$warncnt++; |
|
} |
|
} |
|
|
# convert bib entries |
# convert bib entries |
my $cnt = convert_bib($input_node, $index_root, $index_doc); |
my $cnt = convert_bib($input_node, $index_root, $index_doc); |