![]() ![]() | ![]() |
Added more bib formats; improved online_mode.
#!/usr/local/bin/perl -w

use strict;
use XML::LibXML;

use lib '/usr/local/mpiwg/archive';
use MPIWGStor;

# make output unbuffered
$|=1;

# program version
my $version = "0.2.7 (27.8.2010 ROC)";
# NOTE(review): $texttool defaults to 1 further down, so "-texttool" is
# effectively always on unless explicitly switched off -- confirm how
# MPIWGStor::parseargs handles "-texttool=0" and adjust the help text.
my $help =
"use:  makemeta-vlp [options] file.xml
options:
  -debug  show debugging info
  -dry-run  simulate, don't do anything
  -replace  replace existing index files
  -online-mode  mode for creating online/permanent files
  -archive-mode  mode for creating archive/data files
  -access=free  adds free access tag for online-mode
  -texttool  adds texttool tag for online-mode
";
logger("INFO", "makemeta-vlp $version");

###########################################
# mappings
#
# Keys are element names found in the FileMaker XML dump, values are
# element paths inside the generated index.meta document.

# generic mappings at top level
my %gen_map = (
    'Custom2_Language' => 'meta/lang',
    'productionComment' => 'meta/image-acquisition/production-comment',
    'derivedFrom' => 'derived-from/archive-path'
    );
# sub type switch tag
my %type_map = (
    'ReferenceType' => 'meta/bib@type'
    );
# sub type mappings
#
# Each entry is keyed by the value of the type field. '_name' is the value
# written into the bib type attribute. Keys starting with '#' are not input
# fields: the text after the '#' is written as constant content to the
# mapped path (see convert_bib).
my %subtype_map = (
    'Book' => {
        '_name' => 'book',
        'Author' => 'meta/bib/author',
        'Title' => 'meta/bib/title',
        'Year' => 'meta/bib/year',
        'Place_Published' => 'meta/bib/city',
        'Publisher' => 'meta/bib/publisher',
        'Edition' => 'meta/bib/edition',
        'Volume' => 'meta/bib/volume',
        'NumberOfVolumes' => 'meta/bib/number-of-volumes',
        'Pages' => 'meta/bib/number-of-pages'
    },
    '(Book)' => {
        '_name' => 'book',
        'Author' => 'meta/bib/author',
        'Title' => 'meta/bib/title',
        'Year' => 'meta/bib/year',
        'Place_Published' => 'meta/bib/city',
        'Publisher' => 'meta/bib/publisher',
        'Edition' => 'meta/bib/edition',
        'Volume' => 'meta/bib/volume',
        'NumberOfVolumes' => 'meta/bib/number-of-volumes',
        'Pages' => 'meta/bib/number-of-pages',
        '#Cover pages only, book sections have been extracted' => 'meta/bib/comment'
    },
    'Book Section' => {
        '_name' => 'inbook',
        'Author' => 'meta/bib/author',
        'Title' => 'meta/bib/title',
        'Year' => 'meta/bib/year',
        'SecondaryTitle' => 'meta/bib/book-title',
        'SecondaryAuthor' => 'meta/bib/editor',
        'Volume' => 'meta/bib/volume',
        'NumberOfVolumes' => 'meta/bib/number-of-volumes',
        'Pages' => 'meta/bib/pages'
    },
    # NOTE(review): the "cover pages only" comment sits on 'Edited Book' but
    # not on '(Edited Book)', the opposite of 'Book' / '(Book)' -- possibly
    # swapped; confirm before changing.
    'Edited Book' => {
        '_name' => 'edited-book',
        'Author' => 'meta/bib/editor',
        'Title' => 'meta/bib/title',
        'Year' => 'meta/bib/year',
        'Place_Published' => 'meta/bib/city',
        'Publisher' => 'meta/bib/publisher',
        'Edition' => 'meta/bib/edition',
        'Volume' => 'meta/bib/volume',
        'NumberOfVolumes' => 'meta/bib/number-of-volumes',
        'Pages' => 'meta/bib/number-of-pages',
        '#Cover pages only, book sections have been extracted' => 'meta/bib/comment'
    },
    '(Edited Book)' => {
        '_name' => 'edited-book',
        'Author' => 'meta/bib/editor',
        'Title' => 'meta/bib/title',
        'Year' => 'meta/bib/year',
        'Place_Published' => 'meta/bib/city',
        'Publisher' => 'meta/bib/publisher',
        'Edition' => 'meta/bib/edition',
        'Volume' => 'meta/bib/volume',
        'NumberOfVolumes' => 'meta/bib/number-of-volumes',
        'Pages' => 'meta/bib/number-of-pages'
    },
    'Journal Article' => {
        '_name' => 'journal-article',
        'Author' => 'meta/bib/author',
        'Title' => 'meta/bib/title',
        'Year' => 'meta/bib/year',
        'SecondaryTitle' => 'meta/bib/journal',
        'Volume' => 'meta/bib/volume',
        'Number_Issue' => 'meta/bib/issue',
        'Pages' => 'meta/bib/pages'
    },
    '(JournalVolume)' => {
        '_name' => 'journal-volume',
        'SecondaryTitle' => 'meta/bib/title',
        'SecondaryAuthor' => 'meta/bib/editor',
        'Publisher' => 'meta/bib/publisher',
        'Place_Published' => 'meta/bib/city',
        'Year' => 'meta/bib/year',
        'Volume' => 'meta/bib/volume',
        'Pages' => 'meta/bib/number-of-pages',
        '#Cover pages only, articles have been extracted' => 'meta/bib/comment'
    },
    # NOTE(review): 'Journal' is emitted as type 'report' with an
    # 'institution' field -- looks copied from a report mapping; confirm.
    'Journal' => {
        '_name' => 'report',
        'Title' => 'meta/bib/title',
        'SecondaryTitle' => 'meta/bib/institution',
        'Author' => 'meta/bib/author',
        'Place_Published' => 'meta/bib/city',
        'Year' => 'meta/bib/year',
        'Date' => 'meta/bib/date',
        'Pages' => 'meta/bib/pages',
    },
    # NOTE(review): the two article types below use 'Secondary_Title' while
    # every other mapping uses 'SecondaryTitle' -- verify the field name
    # against the FileMaker export.
    'Magazine Article' => {
        '_name' => 'magazine-article',
        'Author' => 'meta/bib/author',
        'Title' => 'meta/bib/title',
        'Year' => 'meta/bib/year',
        'Secondary_Title' => 'meta/bib/magazine',
        'Number_Issue' => 'meta/bib/issue-number',
        'Date' => 'meta/bib/issue-date',
        'Pages' => 'meta/bib/pages'
    },
    'Newspaper Article' => {
        '_name' => 'newspaper-article',
        'Author' => 'meta/bib/author',
        'Title' => 'meta/bib/title',
        'Year' => 'meta/bib/year',
        'Secondary_Title' => 'meta/bib/newspaper',
        'Date' => 'meta/bib/issue-date',
        'Pages' => 'meta/bib/pages'
    },
    'Report' => {
        '_name' => 'report',
        'Author' => 'meta/bib/author',
        'Title' => 'meta/bib/title',
        'Year' => 'meta/bib/year',
        'Place_Published' => 'meta/bib/city',
        'Date' => 'meta/bib/date',
        'SecondaryTitle' => 'meta/bib/type',
        'Pages' => 'meta/bib/pages'
    },
    'Trade Catalogue' => {
        '_name' => 'report',
        'Author' => 'meta/bib/author',
        'Title' => 'meta/bib/title',
        'Year' => 'meta/bib/year',
        'Place_Published' => 'meta/bib/city',
        'Date' => 'meta/bib/date',
        'Volume' => 'meta/bib/volume',
        'NumberOfVolumes' => 'meta/bib/number-of-volumes',
        'ReferenceType' => 'meta/bib/type',
        'Pages' => 'meta/bib/pages'
    },
    'Thesis' => {
        '_name' => 'thesis',
        'Author' => 'meta/bib/author',
        'Title' => 'meta/bib/title',
        'Place_Published' => 'meta/bib/city',
        'Publisher' => 'meta/bib/university',
        'Date' => 'meta/bib/date',
        'TypeOfWork' => 'meta/bib/type',
        'Pages' => 'meta/bib/number-of-pages'
    },
    'Manuscript' => {
        '_name' => 'manuscript',
        'Author' => 'meta/bib/author',
        'Title' => 'meta/bib/title',
        'Year' => 'meta/bib/year',
        'Place_Published' => 'meta/bib/location',
        'Pages' => 'meta/bib/pages'
    }
    );
# language element
my $lang_field = 'Custom2_Language';
# languages to iso codes
my %lang_map = (
    'German' => 'de',
    'English' => 'en',
    'Italian' => 'it',
    'French' => 'fr',
    'Latin' => 'la',
    'Japanese' => 'ja',
    'Dutch' => 'nl',
    'Spanish' => 'es',
    'Swedish' => 'sv',
    'Russian' => 'ru',
    'Polish' => 'pl',
    'Greek' => 'el'
    );
# storage fields
my $arch_id_field = 'ID';
my $access_free_field = 'online';

#######################################################
# internal parameters
#

# storage
my $lib_arch_dir = '/mpiwg/archive/data/vlp';
my $lib_online_dir = '/mpiwg/online/permanent/vlp';

# read command line parameters
my $args = MPIWGStor::parseargs;
if (! scalar(%$args)) {
    print $help, "\n";
    exit 1;
}

# debug level
# ($debug is not declared in this file; presumably exported by MPIWGStor
# and read by logger -- confirm)
$debug = (exists $$args{'debug'}) ? $$args{'debug'} : 0;

# simulate action only
my $dry_run = (exists $$args{'dry-run'}) ? $$args{'dry-run'} : 0;
logger('DEBUG', "dry-run: $dry_run");

# replace existing index files
my $do_replace = (exists $$args{'replace'}) ? $$args{'replace'} : 0;
logger('DEBUG', "replace: $do_replace");

# use online mode
my $online_mode = (exists $$args{'online-mode'}) ? $$args{'online-mode'} : 0;
logger('DEBUG', "online_mode: $online_mode");

# use archive mode
my $archive_mode = (exists $$args{'archive-mode'}) ? $$args{'archive-mode'} : 0;
logger('DEBUG', "archive_mode: $archive_mode");

# create texttool tag (online mode only); on unless explicitly given
my $texttool = (exists $$args{'texttool'}) ? $$args{'texttool'} : 1;
logger('DEBUG', "texttool: $texttool");
# image dir for texttool
my $texttool_img_dir = "pages";

# access type
my $access_type = (exists $$args{'access'}) ? $$args{'access'} : "";

# index.meta namespace (not really implemented!)
my $namespace = "";


# NOTE: $xml_changed is never read or written again in this file
my $xml_changed = 0;
# global error and warning counters, reported at the end of the run
my $errcnt = 0;
my $warncnt = 0;

#######################################################
# check parameters that were passed to the program
#
my $infile = $$args{'path'};
if (! $infile) {
    logger("ABORT", "no input file given!");
    exit 1;
}
# collapse every run of slashes into a single slash (the previous pattern
# lacked /g and only fixed the first "//" in the path)
$infile =~ s{/{2,}}{/}g;
if (! -f $infile) {
    logger("ABORT", "input file \'$infile\' doesn't exist!");
    exit 1;
}


#######################################################
# subroutines
#


#
# $dir = find_arch_dir($input_node)
#
# Reads the ID field of the given entry node and returns the directory
# "$lib_arch_dir/lit<ID>" if it exists on disk. Returns nothing (undef in
# scalar context) if the ID is empty or the directory is missing.
#
sub find_arch_dir {
    my ($input_node) = @_;

    # strip whitespace around the ID, the same way find_permanent_dir does,
    # so a padded ID value does not produce a non-existing path
    my $bib_id = sstrip($input_node->findvalue("fm:$arch_id_field"));
    if ($bib_id) {
        my $dir = "$lib_arch_dir/lit$bib_id";
        if (-d $dir) {
            logger('DEBUG', "directory $dir exists");
            return $dir;
        }
    }
    return;
}

#
# $dir = find_permanent_dir($input_node)
#
# Reads the ID field of the given entry node and returns the directory
# "$lib_online_dir/lit<ID>" if it exists on disk. A missing ID is logged
# as an error and counted in $errcnt. Returns nothing if the ID is empty
# or the directory is missing.
#
sub find_permanent_dir {
    my ($input_node) = @_;
    my $online_base = $lib_online_dir;
    my $dest_id = sstrip($input_node->findvalue("fm:$arch_id_field"));
    if (! $dest_id) {
        logger('ERROR', "no ID field for online permanent entry");
        $errcnt++;
        return;
    }
    my $dir = "$online_base/lit$dest_id";
    if (-d $dir) {
        logger('DEBUG', "directory $dir exists");
        return $dir;
    }
    return;
}


#
# $cnt = convert_bib($input_node, $index_root, $index_doc)
#
# Copies the bibliographic fields of one entry node into the index.meta
# tree below $index_root, using %gen_map, %type_map and %subtype_map.
# Returns the number of fields written, or 0 if the language or the bib
# type is unknown (both are logged and counted in $errcnt). $index_doc is
# accepted but not used.
#
sub convert_bib {
    my ($input_node, $index_root, $index_doc) = @_;
    my $cnt = 0;
    my $type = "";
    my $type_path = "";

    # process general stuff first
    foreach my $n ($input_node->getChildNodes()) {
        my $name = $n->nodeName();
        my $val = $n->textContent();
        #logger('DEBUG', "  NODE: $name = '$val'");
        if (exists $gen_map{$name}) {
            # is a general field
            if ($name eq $lang_field) {
                # language field
                if (not $val) {
                    logger('WARNING', "no language tag");
                    $warncnt++;
                    next;
                }
                # convert to iso code
                if (exists $lang_map{$val}) {
                    $val = $lang_map{$val};
                } else {
                    logger('ERROR', "unknown language: $val! skipping...");
                    $errcnt++;
                    return 0;
                }
            }
            create_element_path($gen_map{$name}, $index_root, $namespace)
                ->appendTextNode($val);
            $cnt++;
        } elsif (exists $type_map{$name}) {
            # is a type field
            $type_path = $type_map{$name};
            $type = $val;
            # check with known types
            if (exists $subtype_map{$val}) {
                my $indextype = $subtype_map{$val}->{'_name'};
                create_element_path("$type_path=$indextype", $index_root, $namespace);
                $cnt++;
            } else {
                logger('ERROR', "unknown bib type $val! skipping...");
                $errcnt++;
                return 0;
            }
        }
    }
    # process sub type fields
    if ($type) {
        foreach my $n ($input_node->getChildNodes()) {
            my $name = $n->nodeName();
            my $val = $n->textContent();
            #logger('DEBUG', "  NODE: $name = '$val'");
            if (exists $subtype_map{$type}->{$name}) {
                create_element_path($subtype_map{$type}->{$name}, $index_root, $namespace)
                    ->appendTextNode($val);
                $cnt++;
            }
        }
        # append additional constant fields (beginning with #);
        # these are not counted in $cnt
        foreach my $k (keys %{$subtype_map{$type}}) {
            if ($k =~ /^\#(.*)/) {
                my $val = $1;
                create_text_path($subtype_map{$type}->{$k}, $val, $index_root, $namespace);
            }
        }
    }
    return $cnt;
}



#
# process_all_fm_entries($input_root)
#
# Calls process_fm_entry for every fm:ROW node found below $input_root.
#
sub process_all_fm_entries {
    my ($input_root) = @_;
    my $cnt = 0;

    foreach my $n ($input_root->findnodes('fm:ROW')) {
        logger('INFO', "processing entry $cnt ...");
        process_fm_entry($n);
        $cnt++;
    }
}


#
# process_fm_entry($input_node)
#
# Builds a new index.meta document for one entry node and writes it to
# "<document directory>/index.meta". The entry is skipped if the document
# directory cannot be found, if an index.meta already exists and -replace
# was not given, or if convert_bib wrote no fields. With -dry-run the
# document is only logged, not written.
#
sub process_fm_entry {
    my ($input_node) = @_;
    my $index_doc = XML::LibXML::Document->createDocument('1.0', 'UTF-8');
    my $index_root = $index_doc->createElementNS($namespace, 'resource');
    $index_root->addChild($index_doc->createAttributeNS($namespace, 'version', '1.1'));
    $index_root->addChild($index_doc->createAttributeNS($namespace, 'type', 'MPIWG'));
    $index_doc->setDocumentElement($index_root);

    # try to find the document directory
    # (the permanent/online directory is also the default when no mode is given)
    my $doc_dir = "";
    if ($online_mode) {
        $doc_dir = find_permanent_dir($input_node);
    } elsif ($archive_mode) {
        $doc_dir = find_arch_dir($input_node);
    } else {
        $doc_dir = find_permanent_dir($input_node);
    }
    if (! $doc_dir) {
        logger('ERROR', "document directory not found! skipping...");
        $errcnt++;
        return;
    }

    # check if index.meta exists
    if ( -f "$doc_dir/index.meta") {
        if (not $do_replace) {
            logger('DEBUG', "index file in $doc_dir exists");
            return;
        }
    }

    # add standard stuff to index.meta
    # (only the first value returned by split_file_path is used)
    my ($docname) = split_file_path($doc_dir);
    # name and date
    create_text_path('name', $docname, $index_root, $namespace);
    create_text_path('archive-path', $doc_dir, $index_root, $namespace);
    create_text_path('archive-creation-date', stime(time), $index_root, $namespace);
    create_text_path('creator', 'vlp', $index_root, $namespace);
    create_text_path('description', 'a scanned document', $index_root, $namespace);
    if ($archive_mode) {
        # acquisition
        create_text_path('meta/acquisition/date', stime(time), $index_root, $namespace);
        create_text_path('meta/acquisition/provider/provider-id', 'vlp', $index_root, $namespace);
        create_text_path('meta/acquisition/provider/address', 'Max Planck Institute for the History of Science', $index_root, $namespace);
    }
    # media
    create_text_path('media-type', 'image', $index_root, $namespace);
    create_text_path('meta/content-type', 'scanned document', $index_root, $namespace);
    # access: an explicit -access option wins over the "online" DB field
    if ($access_type) {
        if ($access_type eq "free") {
            create_element_path('meta/access-conditions/access@type=free', $index_root, $namespace);
        } else {
            # any other value is used as the name of the institution
            my $acc_tag = create_element_path('meta/access-conditions/access@type=institution', $index_root, $namespace);
            create_text_path('name', $access_type, $acc_tag, $namespace);
        }
    } elsif ($online_mode) {
        # read access conditions from "online" field in DB dump
        my $online = sstrip($input_node->findvalue("fm:$access_free_field"));
        if ($online) {
            create_element_path('meta/access-conditions/access@type=free', $index_root, $namespace);
        } else {
            my $acc_tag = create_element_path('meta/access-conditions/access@type=institution', $index_root, $namespace);
            create_text_path('name', 'mpiwg', $acc_tag, $namespace);
        }
    }

    # texttool tag with image dir
    if ($online_mode && $texttool) {
        if ( -d "$doc_dir/$texttool_img_dir" ) {
            create_text_path('meta/texttool/image', $texttool_img_dir,$index_root, $namespace);
        } else {
            logger('WARNING', "page image directory missing!");
            $warncnt++;
        }
    }

    # convert bib entries
    my $cnt = convert_bib($input_node, $index_root, $index_doc);
    if ($cnt == 0) {
        # error or nothing to convert
        logger('ERROR', "no bibliographic metadata!");
        $errcnt++;
        return;
    }

    # write new index.meta file
    if ($dry_run) {
        logger('DEBUG', "would write $doc_dir/index.meta");
        logger('DEBUG', $index_doc->toString(1));
    } else {
        write_xml($index_doc, "$doc_dir/index.meta");
    }

}





#######################################################
# Main
#

# load filemaker xml dump
my ($input_doc, $input_root) = read_xml($infile);
# set namespace prefix
# (binds the root element's own namespace to the prefix "fm" so the
# "fm:..." XPath expressions used in the subroutines can resolve)
my $fm_namespace = $input_root->namespaceURI();
$input_root->setNamespace($fm_namespace, 'fm', 1);


process_all_fm_entries($input_root);


logger("INFO", "$warncnt warnings");
logger("INFO", "$errcnt errors");
if ($errcnt > 0) {
    logger("ABORT", "there were errors!");
    exit 1;
} else {
    logger("DONE", "done something successfully!");
}