|
|
| version 1.4, 2006/06/20 16:23:26 | version 1.10, 2017/03/16 17:00:43 |
|---|---|
| Line 1 | Line 1 |
| #!/usr/local/bin/perl -w | #!/usr/bin/perl -w |
| use strict; | use strict; |
| use XML::LibXML; | use XML::LibXML; |
| Line 10 use MPIWGStor; | Line 10 use MPIWGStor; |
| $|=1; | $|=1; |
| # program version | # program version |
| my $version = "0.2.2 (20.6.2006 ROC)"; | my $version = "0.2.7 (27.8.2010 ROC)"; |
| my $help = | my $help = |
| "use: makemeta-vlp [options] file.xml | "use: makemeta-vlp [options] file.xml |
| options: | options: |
| Line 19 options: | Line 19 options: |
| -replace replace existing index files | -replace replace existing index files |
| -online-mode mode for creating online/permanent files | -online-mode mode for creating online/permanent files |
| -archive-mode mode for creating archive/data files | -archive-mode mode for creating archive/data files |
| -access=free adds free access tag for online-mode | |
| -texttool adds texttool tag for online-mode | |
| "; | "; |
| logger("INFO", "makemeta-vlp $version"); | logger("INFO", "makemeta-vlp $version"); |
| Line 27 logger("INFO", "makemeta-vlp $version"); | Line 29 logger("INFO", "makemeta-vlp $version"); |
| # generic mappings at top level | # generic mappings at top level |
| my %gen_map = ( | my %gen_map = ( |
| 'Custom2_Language' => 'meta/lang' | 'Custom2_Language' => 'meta/lang', |
| 'productionComment' => 'meta/image-acquisition/production-comment', | |
| 'derivedFrom' => 'derived-from/archive-path' | |
| ); | ); |
| # sub type switch tag | # sub type switch tag |
| my %type_map = ( | my %type_map = ( |
| Line 106 my %subtype_map = ( | Line 110 my %subtype_map = ( |
| 'Number_Issue' => 'meta/bib/issue', | 'Number_Issue' => 'meta/bib/issue', |
| 'Pages' => 'meta/bib/pages' | 'Pages' => 'meta/bib/pages' |
| }, | }, |
| '(JournalVolume)' => { | |
| '_name' => 'journal-volume', | |
| 'SecondaryTitle' => 'meta/bib/title', | |
| 'SecondaryAuthor' => 'meta/bib/editor', | |
| 'Publisher' => 'meta/bib/publisher', | |
| 'Place_Published' => 'meta/bib/city', | |
| 'Year' => 'meta/bib/year', | |
| 'Volume' => 'meta/bib/volume', | |
| 'Pages' => 'meta/bib/number-of-pages', | |
| '#Cover pages only, articles have been extracted' => 'meta/bib/comment' | |
| }, | |
| 'Journal' => { | |
| '_name' => 'report', | |
| 'Title' => 'meta/bib/title', | |
| 'SecondaryTitle' => 'meta/bib/institution', | |
| 'Author' => 'meta/bib/author', | |
| 'Place_Published' => 'meta/bib/city', | |
| 'Year' => 'meta/bib/year', | |
| 'Date' => 'meta/bib/date', | |
| 'Pages' => 'meta/bib/pages', | |
| }, | |
| 'Magazine Article' => { | 'Magazine Article' => { |
| '_name' => 'magazine-article', | '_name' => 'magazine-article', |
| 'Author' => 'meta/bib/author', | 'Author' => 'meta/bib/author', |
| Line 116 my %subtype_map = ( | Line 141 my %subtype_map = ( |
| 'Date' => 'meta/bib/issue-date', | 'Date' => 'meta/bib/issue-date', |
| 'Pages' => 'meta/bib/pages' | 'Pages' => 'meta/bib/pages' |
| }, | }, |
| 'Newspaper Article' => { | |
| '_name' => 'newspaper-article', | |
| 'Author' => 'meta/bib/author', | |
| 'Title' => 'meta/bib/title', | |
| 'Year' => 'meta/bib/year', | |
| 'Secondary_Title' => 'meta/bib/newspaper', | |
| 'Date' => 'meta/bib/issue-date', | |
| 'Pages' => 'meta/bib/pages' | |
| }, | |
| 'Report' => { | 'Report' => { |
| '_name' => 'report', | '_name' => 'report', |
| 'Author' => 'meta/bib/author', | 'Author' => 'meta/bib/author', |
| Line 169 my %lang_map = ( | Line 203 my %lang_map = ( |
| 'Japanese' => 'ja', | 'Japanese' => 'ja', |
| 'Dutch' => 'nl', | 'Dutch' => 'nl', |
| 'Spanish' => 'es', | 'Spanish' => 'es', |
| 'Swedish' => 'sv' | 'Swedish' => 'sv', |
| 'Russian' => 'ru', | |
| 'Polish' => 'pl', | |
| 'Greek' => 'el' | |
| ); | ); |
| # storage fields | # storage fields |
| my $arch_id_field = 'ID'; | my $arch_id_field = 'ID'; |
| my $access_free_field = 'online'; | |
| ####################################################### | ####################################################### |
| # internal parameters | # internal parameters |
| Line 208 logger('DEBUG', "online_mode: $online_mo | Line 246 logger('DEBUG', "online_mode: $online_mo |
| my $archive_mode = (exists $$args{'archive-mode'}) ? $$args{'archive-mode'} : 0; | my $archive_mode = (exists $$args{'archive-mode'}) ? $$args{'archive-mode'} : 0; |
| logger('DEBUG', "archive_mode: $archive_mode"); | logger('DEBUG', "archive_mode: $archive_mode"); |
| # create texttool tag (online mode only) | |
| my $texttool = (exists $$args{'texttool'}) ? $$args{'texttool'} : 1; | |
| logger('DEBUG', "texttool: $texttool"); | |
| # image dir for texttool | |
| my $texttool_img_dir = "pages"; | |
| # access type | |
| my $access_type = (exists $$args{'access'}) ? $$args{'access'} : ""; | |
| # index.meta namespace (not really implemented!) | # index.meta namespace (not really implemented!) |
| my $namespace = ""; | my $namespace = ""; |
| Line 255 sub find_arch_dir { | Line 302 sub find_arch_dir { |
| sub find_permanent_dir { | sub find_permanent_dir { |
| my ($input_node) = @_; | my ($input_node) = @_; |
| my $online_base = '/mpiwg/online/permanent'; | my $online_base = $lib_online_dir; |
| my $dest_id = sstrip($input_node->findvalue("fm:$arch_id_field")); | my $dest_id = sstrip($input_node->findvalue("fm:$arch_id_field")); |
| if (! $dest_id) { | if (! $dest_id) { |
| logger('ERROR', "no ID field for online permanent entry"); | logger('ERROR', "no ID field for online permanent entry"); |
| Line 263 sub find_permanent_dir { | Line 310 sub find_permanent_dir { |
| return; | return; |
| } | } |
| my $dir = "$online_base/lit$dest_id"; | my $dir = "$online_base/lit$dest_id"; |
| if (-d $dir) { | |
| logger('DEBUG', "directory $dir exists"); | |
| return $dir; | return $dir; |
| } | } |
| return; | |
| } | |
| sub convert_bib { | sub convert_bib { |
| Line 396 sub process_fm_entry { | Line 447 sub process_fm_entry { |
| create_text_path('meta/acquisition/date', stime(time), $index_root, $namespace); | create_text_path('meta/acquisition/date', stime(time), $index_root, $namespace); |
| create_text_path('meta/acquisition/provider/provider-id', 'vlp', $index_root, $namespace); | create_text_path('meta/acquisition/provider/provider-id', 'vlp', $index_root, $namespace); |
| create_text_path('meta/acquisition/provider/address', 'Max Planck Institute for the History of Science', $index_root, $namespace); | create_text_path('meta/acquisition/provider/address', 'Max Planck Institute for the History of Science', $index_root, $namespace); |
| # image acquisition | |
| create_text_path('meta/image-acquisition/device', 'Flatbed Scanner' , $index_root, $namespace); | |
| create_text_path('meta/image-acquisition/image-type', 'Greyscale' , $index_root, $namespace); | |
| create_text_path('meta/image-acquisition/production-comment', 'Raw scans in \'raw\' folder, cleaned pages in \'pages\' folder.' , $index_root, $namespace); | |
| } | } |
| # media | # media |
| create_text_path('media-type', 'image', $index_root, $namespace); | create_text_path('media-type', 'image', $index_root, $namespace); |
| create_text_path('meta/content-type', 'scanned document', $index_root, $namespace); | create_text_path('meta/content-type', 'scanned document', $index_root, $namespace); |
| # access | |
| if ($access_type) { | |
| if ($access_type eq "free") { | |
| create_element_path('meta/access-conditions/access@type=free', $index_root, $namespace); | |
| } else { | |
| my $acc_tag = create_element_path('meta/access-conditions/access@type=institution', $index_root, $namespace); | |
| create_text_path('name', $access_type, $acc_tag, $namespace); | |
| } | |
| } elsif ($online_mode) { | |
| # read access conditions from "online" field in DB dump | |
| my $online = sstrip($input_node->findvalue("fm:$access_free_field")); | |
| if ($online) { | |
| create_element_path('meta/access-conditions/access@type=free', $index_root, $namespace); | |
| } else { | |
| my $acc_tag = create_element_path('meta/access-conditions/access@type=institution', $index_root, $namespace); | |
| create_text_path('name', 'mpiwg', $acc_tag, $namespace); | |
| } | |
| } | |
| # texttool tag with image dir | |
| if ($online_mode && $texttool) { | |
| if ( -d "$doc_dir/$texttool_img_dir" ) { | |
| create_text_path('meta/texttool/image', $texttool_img_dir,$index_root, $namespace); | |
| } else { | |
| logger('WARNING', "page image directory missing!"); | |
| $warncnt++; | |
| } | |
| } | |
| # convert bib entries | # convert bib entries |
| my $cnt = convert_bib($input_node, $index_root, $index_doc); | my $cnt = convert_bib($input_node, $index_root, $index_doc); |