version 1.1, 2005/05/26 14:31:28
|
version 1.8, 2006/05/16 18:58:21
|
Line 9 use MPIWGStor;
|
Line 9 use MPIWGStor;
|
# make output unbuffered |
# make output unbuffered |
$|=1; |
$|=1; |
|
|
####################################################### |
|
# internal parameters |
|
# |
|
|
|
# program version |
# program version |
my $version = "0.1.0 (24.5.2005)"; |
my $version = "0.2.3 (16.5.2006 ROC)"; |
|
my $help = |
|
"use: makemeta-lib [options] file.xml |
|
options: |
|
-debug show debugging info |
|
-dry-run simulate, dont'do anything |
|
-online-mode mode for creating online/permanent files |
|
-cw-mode mode for copying einstein_cw archive documents |
|
-digifiles-mode mode for copying files from digifiles |
|
-map-file=mapfile.xml digilib mapping file (for digifiles mode) |
|
"; |
logger("INFO", "makemeta-lib $version"); |
logger("INFO", "makemeta-lib $version"); |
|
|
# |
########################################### |
# mappings |
# mappings |
# |
|
# generic mappings at top level |
# generic mappings at top level |
my %gen_map = ( |
my %gen_map = ( |
'Device' => 'meta/image-acquisition/device', |
'Device' => 'meta/image-acquisition/device', |
Line 96 my %lang_map = (
|
Line 102 my %lang_map = (
|
'English' => 'en', |
'English' => 'en', |
'Italian' => 'it', |
'Italian' => 'it', |
'French' => 'fr', |
'French' => 'fr', |
'Latin' => 'la' |
'Latin' => 'la', |
|
'Japanese' => 'ja', |
|
'Dutch' => 'nl', |
|
'Spanish' => 'es' |
); |
); |
# storage fields |
# storage fields |
my $arch_id_field = 'ID_Archive'; |
my $arch_id_field = 'ID_Archive'; |
my $online_url_field = 'URL'; |
my $online_url_field = 'URL'; |
|
my $online_id_field = 'ID_OnlinePermanent'; |
|
|
|
####################################################### |
|
# internal parameters |
|
# |
|
|
# more storage |
# storage |
my $lib_arch_dir = '/mpiwg/archive/data/library'; |
my $lib_arch_dir = '/mpiwg/archive/data/library'; |
my $lib_online_dir = '/mpiwg/online/permanent'; |
my $lib_online_dir = '/mpiwg/online/permanent'; |
|
my $lib_digilib_path = 'permanent'; |
|
my $digifiles_dir = "/net/digifiles.mpiwg-berlin.mpg.de/Volumes/raid"; |
|
|
# read command line parameters |
# read command line parameters |
my $args = MPIWGStor::parseargs; |
my $args = MPIWGStor::parseargs; |
|
if (! scalar(%$args)) { |
|
print $help, "\n"; |
|
exit 1; |
|
} |
|
|
# debug level |
# debug level |
$debug = (exists $$args{'debug'}) ? $$args{'debug'} : 0; |
$debug = (exists $$args{'debug'}) ? $$args{'debug'} : 0; |
|
|
|
# simulate action only |
|
my $dry_run = (exists $$args{'dry-run'}) ? $$args{'dry-run'} : 0; |
|
logger('DEBUG', "dry-run: $dry_run"); |
|
|
|
# use online mode |
|
my $online_mode = (exists $$args{'online-mode'}) ? $$args{'online-mode'} : 0; |
|
logger('DEBUG', "online_mode: $online_mode"); |
|
|
# use einstein-cw mode |
# use einstein-cw mode |
my $cw_mode = (exists $$args{'cw-mode'}) ? $$args{'cw-mode'} : 0; |
my $cw_mode = (exists $$args{'cw-mode'}) ? $$args{'cw-mode'} : 0; |
|
logger('DEBUG', "cw_mode: $cw_mode"); |
|
|
|
# use digifiles mode |
|
my $digifiles_mode = (exists $$args{'digifiles-mode'}) ? $$args{'digifiles-mode'} : 0; |
|
logger('DEBUG', "digifiles_mode: $digifiles_mode"); |
|
# digilib mapping file |
|
my $map_file_name = (exists $$args{'map-file'}) ? $$args{'map-file'} : ""; |
|
logger('DEBUG', "map_file_name: $map_file_name"); |
|
my $mapping_doc; |
|
my $mapping_root; |
|
|
# index.meta namespace (not really implemented!) |
# index.meta namespace (not really implemented!) |
my $namespace = ""; |
my $namespace = ""; |
Line 144 if (! -f $infile) {
|
Line 181 if (! -f $infile) {
|
# subroutines |
# subroutines |
# |
# |
|
|
|
|
|
#
# add_digilib_mapping($src_dir, $dest_dir)
#
# Appends a <mapping link="$src_dir" dir="$dest_dir"/> element to the
# digilib mapping document ($mapping_doc / $mapping_root) and rewrites the
# mapping file $map_file_name, so the file is up to date after every entry.
#
# Aborts the program (exit 1) if no mapping file name was configured or if
# the mapping document has not been created.
# Returns whatever write_xml() returns.
#
sub add_digilib_mapping {
    my ($src_dir, $dest_dir) = @_;

    # check all preconditions before touching the document
    if (! $map_file_name) {
        logger('ABORT', "unable to write mapping file!");
        exit 1;
    }
    if (! (defined $mapping_doc && defined $mapping_root)) {
        # the mapping document is only set up by the main program
        # when digifiles mode is active
        logger('ABORT', "unable to write mapping file!");
        exit 1;
    }

    my $elem = $mapping_root->addNewChild($namespace, 'mapping');
    $elem->addChild($mapping_doc->createAttributeNS($namespace, 'link', $src_dir));
    $elem->addChild($mapping_doc->createAttributeNS($namespace, 'dir', $dest_dir));

    return write_xml($mapping_doc, $map_file_name);
}
|
|
|
#
# $dir = find_digifiles_dir($input_node)
#
# Moves the page images of a digifiles entry into the online/permanent
# library tree and registers a digilib mapping for them.
#
# The source directory is looked up below $digifiles_dir with
# find_online_dir(); the destination is
# "$lib_online_dir/library/<ID>/pageimg", where <ID> is the value of the
# $online_id_field of $input_node.
#
# Returns the destination document directory, or nothing (after logging
# an ERROR and incrementing $errcnt) if the source directory or the ID is
# missing. Aborts the program (exit 1) if the files cannot be copied.
# If only the final removal of the source fails, an ERROR is logged and the
# destination directory is still returned.
#
sub find_digifiles_dir {
    my ($input_node) = @_;

    # source directory relative to the digifiles volume
    my $src_dir = find_online_dir($input_node, $digifiles_dir, '');
    if (! $src_dir) {
        logger('ERROR', "no online directory for digifiles entry");
        $errcnt++;
        return;
    }

    my $dest_id = sstrip($input_node->findvalue("fm:$online_id_field"));
    if (! $dest_id) {
        logger('ERROR', "no ID field for digifiles entry");
        $errcnt++;
        return;
    }

    my $src_path = "$digifiles_dir/$src_dir";
    my $dir      = "$lib_online_dir/library/$dest_id";
    my $img_dir  = "$dir/pageimg";
    my $map_dir  = "$lib_digilib_path/library/$dest_id/pageimg";

    if ($dry_run) {
        logger('DEBUG', "would move $src_path to $dir");
        # NOTE(review): the mapping entry is added (and the mapping file
        # written) even in dry-run mode -- confirm that this is intended.
        add_digilib_mapping($src_dir, $map_dir);
        return $dir;
    }

    logger('INFO', "moving $src_path to $dir");
    logger('DEBUG', "mkdir $img_dir");

    # All external commands use the list form of system() so that paths
    # taken from the input document never pass through a shell.
    my $copied = system('mkdir', '-p', $img_dir) == 0
        && system('chmod', '-R', '0775', $dir) == 0;

    if ($copied) {
        logger('DEBUG', "cp $src_path $img_dir");
        # same set of files a shell "*" would select: no dot-files
        my @entries;
        if (opendir(my $dh, $src_path)) {
            @entries = map { "$src_path/$_" } grep { !/^\./ } readdir($dh);
            closedir($dh);
        }
        $copied = @entries
            && system('cp', '-rp', @entries, "$img_dir/") == 0
            && -d $img_dir;
    }

    if (! $copied) {
        logger('ABORT', "unable to copy directory $src_dir to $dir!");
        exit 1;
    }

    logger('DEBUG', "directory $dir OK");
    add_digilib_mapping($src_dir, $map_dir);

    # the copy succeeded, so the source can go; a failure here is not fatal
    if (system('rm', '-rf', $src_path) == 0) {
        logger('DEBUG', "directory $src_path removed");
    } else {
        logger('ERROR', "unable to remove source directory $src_path!");
        $errcnt++;
    }
    return $dir;
}
|
|
#
# $dir = find_cw_dir($input_node)
#
# Moves the directory of an einstein-cw entry from the inbox
# ($cw_base) into the library archive as "$lib_arch_dir/<ID>", where <ID>
# is the value of the $arch_id_field of $input_node.
#
# Returns the new archive directory. Returns nothing (after logging an
# ERROR and incrementing $errcnt) if the source directory or the ID cannot
# be determined, or if the directory is missing after the move.
# Aborts the program (exit 1) if the rename itself fails.
# In dry-run mode nothing is moved and the target directory is returned.
#
sub find_cw_dir {
    my ($input_node) = @_;
    my $cw_base = '/mpiwg/archive/data/library/inbox/zwischen_backup';

    # find_online_dir returns nothing if the directory does not exist;
    # without this check the rename below would act on $cw_base itself
    my $src_dir = find_online_dir($input_node, $cw_base, 'pageimg');
    if (! $src_dir) {
        logger('ERROR', "no online directory for einstein-cw entry");
        $errcnt++;
        return;
    }

    my $dest_id = sstrip($input_node->findvalue("fm:$arch_id_field"));
    if (! $dest_id) {
        logger('ERROR', "no ID field for einstein-cw entry");
        $errcnt++;
        return;
    }

    my $dir = "$lib_arch_dir/$dest_id";
    if ($dry_run) {
        logger('DEBUG', "would move $cw_base/$src_dir to $dir");
        return $dir;
    }

    logger('DEBUG', "moving $cw_base/$src_dir to $dir");
    if (! rename("$cw_base/$src_dir", $dir)) {
        logger('ABORT', "unable to rename directory $cw_base/$src_dir to $dir!");
        exit 1;
    }

    if (-d $dir) {
        logger('DEBUG', "directory $dir OK");
        return $dir;
    }

    # rename reported success but the target is not a directory
    logger('ERROR', "directory $dir missing after rename");
    $errcnt++;
    return;
}
|
|
|
#
# $dir = find_permanent_dir($input_node)
#
# Locates the existing online/permanent directory of an entry: the path is
# taken from the entry's URL field via find_online_dir() (with the trailing
# "pageimg" stripped) and looked up below $lib_online_dir.
#
# Returns the full directory path. Returns nothing (after logging an ERROR
# and incrementing $errcnt) if the directory cannot be found or if the
# entry has no value in $online_id_field.
#
sub find_permanent_dir {
    my ($input_node) = @_;

    # find_online_dir returns nothing if the directory does not exist;
    # without this check the bare base directory would be returned
    my $src_dir = find_online_dir($input_node, $lib_online_dir, 'pageimg');
    if (! $src_dir) {
        logger('ERROR', "no online directory for online permanent entry");
        $errcnt++;
        return;
    }

    # the ID is not part of the path, but an entry without one is invalid
    my $dest_id = sstrip($input_node->findvalue("fm:$online_id_field"));
    if (! $dest_id) {
        logger('ERROR', "no ID field for online permanent entry");
        $errcnt++;
        return;
    }

    return "$lib_online_dir/$src_dir";
}
|
|
|
# |
|
# $dir = find_online_dir($input_node, $base_dir, $page_dir) |
|
# |
|
# Takes the path from the $online_url_field of the $input_node document |
|
# and looks in the directory $base_dir for it. Strips $page_dir from the end. |
|
# Returns the directory path sans $base_dir if it exists |
|
# |
sub find_online_dir { |
sub find_online_dir { |
my ($input_node, $base_dir) = @_; |
my ($input_node, $base_dir, $page_dir) = @_; |
$base_dir = $lib_online_dir unless ($base_dir); |
$base_dir = $lib_online_dir unless ($base_dir); |
|
|
my $online_url = $input_node->findvalue("fm:$online_url_field"); |
my $online_url = $input_node->findvalue("fm:$online_url_field"); |
if ($online_url =~ /fn=permanent\/(.+)\/pageimg/) { |
logger('DEBUG', "checking URL: $online_url"); |
my $online_dir = $1; |
my $online_dir; |
|
if ($online_url =~ /fn=permanent\/(.+)/) { |
|
# new style digilib URL |
|
$online_dir = $1; |
|
} elsif ($online_url =~ /\?([^\+]+)\+/) { |
|
# old style digilib URL |
|
$online_dir = $1; |
|
} |
|
#logger('DEBUG', "online_dir1: $online_dir"); |
|
if ($online_dir) { |
|
$online_dir =~ s/\/$//; # strip ending slashes |
|
if ($page_dir) { |
|
$online_dir =~ s/\/${page_dir}$//; |
|
} |
#logger("DEBUG", "dir: $base_dir/$online_dir"); |
#logger("DEBUG", "dir: $base_dir/$online_dir"); |
my $dir = "$base_dir/$online_dir"; |
if (-d "$base_dir/$online_dir") { |
if (-d $dir) { |
logger('DEBUG', "directory $base_dir/$online_dir exists"); |
logger('DEBUG', "directory $dir exists"); |
return $online_dir; |
return $dir; |
|
} |
} |
} |
} |
return; |
return; |
Line 268 sub process_all_fm_entries {
|
Line 405 sub process_all_fm_entries {
|
foreach my $n ($input_root->findnodes('fm:ROW')) { |
foreach my $n ($input_root->findnodes('fm:ROW')) { |
logger('INFO', "processing entry $cnt ..."); |
logger('INFO', "processing entry $cnt ..."); |
process_fm_entry($n); |
process_fm_entry($n); |
|
$cnt++; |
} |
} |
} |
} |
|
|
Line 279 sub process_fm_entry {
|
Line 417 sub process_fm_entry {
|
$index_root->addChild($index_doc->createAttributeNS($namespace, 'version', '1.1')); |
$index_root->addChild($index_doc->createAttributeNS($namespace, 'version', '1.1')); |
$index_root->addChild($index_doc->createAttributeNS($namespace, 'type', 'MPIWG')); |
$index_root->addChild($index_doc->createAttributeNS($namespace, 'type', 'MPIWG')); |
$index_doc->setDocumentElement($index_root); |
$index_doc->setDocumentElement($index_root); |
|
my $derived_from = ""; |
|
|
# try to find the document directory |
# try to find the document directory |
my $doc_dir = ""; |
my $doc_dir = ""; |
if ($cw_mode) { |
if ($online_mode) { |
|
$doc_dir = find_permanent_dir($input_node); |
|
$derived_from = find_arch_dir($input_node); |
|
} elsif ($cw_mode) { |
$doc_dir = find_cw_dir($input_node); |
$doc_dir = find_cw_dir($input_node); |
|
} elsif ($digifiles_mode) { |
|
$doc_dir = find_digifiles_dir($input_node); |
} else { |
} else { |
$doc_dir = find_arch_dir($input_node); |
$doc_dir = find_arch_dir($input_node); |
} |
} |
Line 308 sub process_fm_entry {
|
Line 452 sub process_fm_entry {
|
# media |
# media |
create_text_path('media-type', 'image', $index_root, $namespace); |
create_text_path('media-type', 'image', $index_root, $namespace); |
create_text_path('meta/content-type', 'scanned document', $index_root, $namespace); |
create_text_path('meta/content-type', 'scanned document', $index_root, $namespace); |
|
# derived-from |
|
if ($derived_from) { |
|
create_text_path('derived-from/archive-path', $derived_from, $index_root, $namespace); |
|
} |
|
|
# convert bib entries |
# convert bib entries |
my $cnt = convert_bib($input_node, $index_root, $index_doc); |
my $cnt = convert_bib($input_node, $index_root, $index_doc); |
Line 319 sub process_fm_entry {
|
Line 467 sub process_fm_entry {
|
} |
} |
|
|
# write new index.meta file |
# write new index.meta file |
|
if ($dry_run) { |
|
logger('DEBUG', "would write $doc_dir/index.meta"); |
|
logger('DEBUG', $index_doc->toString(1)); |
|
} else { |
write_xml($index_doc, "$doc_dir/index.meta"); |
write_xml($index_doc, "$doc_dir/index.meta"); |
|
} |
|
|
} |
} |
|
|
Line 337 my ($input_doc, $input_root) = read_xml(
|
Line 490 my ($input_doc, $input_root) = read_xml(
|
my $fm_namespace = $input_root->namespaceURI(); |
my $fm_namespace = $input_root->namespaceURI(); |
$input_root->setNamespace($fm_namespace, 'fm', 1); |
$input_root->setNamespace($fm_namespace, 'fm', 1); |
|
|
|
# create digilib mapping file for digifiles mode |
|
if ($digifiles_mode) { |
|
$mapping_doc = XML::LibXML::Document->createDocument('1.0', 'UTF-8'); |
|
$mapping_root = $mapping_doc->createElementNS($namespace, 'digilib-aliases'); |
|
$mapping_doc->setDocumentElement($mapping_root); |
|
#<mapping link="exp1/archimedes_image_repository/archimedes_large/achil_propo_087_la_1545" dir="permanent/archimedes_repository/large/achil_propo_087_la_1545"/> |
|
|
|
} |
|
|
process_all_fm_entries($input_root); |
process_all_fm_entries($input_root); |
|
|
|
|