Annotation of foxridge-archiver/makemeta-vlp.pl, revision 1.1
1.1 ! casties 1: #!/usr/local/bin/perl -w
! 2:
! 3: use strict;
! 4: use XML::LibXML;
! 5:
! 6: use lib '/usr/local/mpiwg/archive_devel';
! 7: use MPIWGStor;
! 8:
# Disable output buffering so that log lines show up immediately.
$| = 1;

# Program version, reported in the startup log line below.
my $version = "0.2 (19.9.2005 ROC)";

# Usage text; printed further down when no command line arguments are given.
my $help =
"use: makemeta-vlp [options] file.xml
options:
-debug show debugging info
-dry-run simulate, don't do anything
-replace replace existing index files
-online-mode mode for creating online/permanent files
-archive-mode mode for creating archive/data files
";
logger("INFO", "makemeta-vlp $version");
! 24:
! 25: ###########################################
! 26: # mappings
! 27:
# Name of the input field that carries the document language.
my $lang_field = 'Custom2_Language';

# Generic fields (independent of the bibliographic type):
# input field name => element path in index.meta.
my %gen_map = (
    $lang_field => 'meta/lang',
);

# Field that selects the bibliographic sub type; the mapped path names the
# attribute that receives the sub type's '_name' value.
my %type_map = (
    ReferenceType => 'meta/bib@type',
);

# Per sub type: input field name => element path in index.meta.
# The special key '_name' holds the type name written to the bib type attribute.
# NOTE(review): both 'Secondary_Title' and 'SecondaryTitle' occur as keys below;
# verify against the field names of the FileMaker export.
my %subtype_map = (
    'Book' => {
        _name           => 'book',
        Author          => 'meta/bib/author',
        Title           => 'meta/bib/title',
        Year            => 'meta/bib/year',
        Place_Published => 'meta/bib/city',
        Publisher       => 'meta/bib/publisher',
        Edition         => 'meta/bib/edition',
        Volume          => 'meta/bib/volume',
        NumberOfVolumes => 'meta/bib/number-of-volumes',
        Pages           => 'meta/bib/number-of-pages',
    },
    'Book Section' => {
        _name           => 'inbook',
        Author          => 'meta/bib/author',
        Title           => 'meta/bib/title',
        Year            => 'meta/bib/year',
        Secondary_Title => 'meta/bib/book-title',
        SecondaryAuthor => 'meta/bib/editor',
        Volume          => 'meta/bib/volume',
        NumberOfVolumes => 'meta/bib/number-of-volumes',
        Pages           => 'meta/bib/pages',
    },
    'Edited Book' => {
        _name           => 'edited-book',
        Author          => 'meta/bib/editor',
        Title           => 'meta/bib/title',
        Year            => 'meta/bib/year',
        Place_Published => 'meta/bib/city',
        Publisher       => 'meta/bib/publisher',
        Edition         => 'meta/bib/edition',
        Volume          => 'meta/bib/volume',
        NumberOfVolumes => 'meta/bib/number-of-volumes',
        Pages           => 'meta/bib/number-of-pages',
    },
    'Journal Article' => {
        _name          => 'journal-article',
        Author         => 'meta/bib/author',
        Title          => 'meta/bib/title',
        Year           => 'meta/bib/year',
        SecondaryTitle => 'meta/bib/journal',
        Volume         => 'meta/bib/volume',
        Number_Issue   => 'meta/bib/issue',
        Pages          => 'meta/bib/pages',
    },
    'Magazine Article' => {
        _name           => 'magazine-article',
        Author          => 'meta/bib/author',
        Title           => 'meta/bib/title',
        Year            => 'meta/bib/year',
        Secondary_Title => 'meta/bib/magazine',
        Number_Issue    => 'meta/bib/issue-number',
        Date            => 'meta/bib/issue-date',
        Pages           => 'meta/bib/pages',
    },
    'Report' => {
        _name           => 'report',
        Author          => 'meta/bib/author',
        Title           => 'meta/bib/title',
        Year            => 'meta/bib/year',
        Place_Published => 'meta/bib/city',
        Date            => 'meta/bib/date',
        SecondaryTitle  => 'meta/bib/type',
        Pages           => 'meta/bib/pages',
    },
    'Trade Catalogue' => {
        _name           => 'report',
        Author          => 'meta/bib/author',
        Title           => 'meta/bib/title',
        Year            => 'meta/bib/year',
        Place_Published => 'meta/bib/city',
        Date            => 'meta/bib/date',
        Volume          => 'meta/bib/volume',
        NumberOfVolumes => 'meta/bib/number-of-volumes',
        ReferenceType   => 'meta/bib/type',
        Pages           => 'meta/bib/pages',
    },
    'Thesis' => {
        _name           => 'thesis',
        Author          => 'meta/bib/author',
        Title           => 'meta/bib/title',
        Place_Published => 'meta/bib/city',
        Publisher       => 'meta/bib/university',
        Date            => 'meta/bib/date',
        TypeOfWork      => 'meta/bib/type',
        Pages           => 'meta/bib/number-of-pages',
    },
    'Manuscript' => {
        _name           => 'manuscript',
        Author          => 'meta/bib/author',
        Title           => 'meta/bib/title',
        Year            => 'meta/bib/year',
        Place_Published => 'meta/bib/location',
        Pages           => 'meta/bib/pages',
    },
);

# Language names as they appear in the input => ISO language codes.
my %lang_map = (
    German   => 'de',
    English  => 'en',
    Italian  => 'it',
    French   => 'fr',
    Latin    => 'la',
    Japanese => 'ja',
    Dutch    => 'nl',
    Spanish  => 'es',
    Swedish  => 'sv',
);

# Input field whose value is appended to "lit" to form the storage directory name.
my $arch_id_field = 'ID';
! 150:
! 151: #######################################################
! 152: # internal parameters
! 153: #
! 154:
# storage: base directories of the VLP collection
my $lib_arch_dir = '/mpiwg/archive/data/vlp';
my $lib_online_dir = '/mpiwg/online/permanent/vlp';

# read command line parameters; without any, print the usage text and quit
my $args = MPIWGStor::parseargs;
if (! scalar(%$args)) {
    print $help, "\n";
    exit 1;
}

# debug level
# NOTE(review): $debug is not declared in this file; presumably it is a
# variable exported by MPIWGStor -- confirm.
$debug = (exists $$args{'debug'}) ? $$args{'debug'} : 0;

# simulate action only
my $dry_run = (exists $$args{'dry-run'}) ? $$args{'dry-run'} : 0;
logger('DEBUG', "dry-run: $dry_run");

# replace existing index files
my $do_replace = (exists $$args{'replace'}) ? $$args{'replace'} : 0;
logger('DEBUG', "replace: $do_replace");

# use online mode
my $online_mode = (exists $$args{'online-mode'}) ? $$args{'online-mode'} : 0;
logger('DEBUG', "online_mode: $online_mode");

# use archive mode
my $archive_mode = (exists $$args{'archive-mode'}) ? $$args{'archive-mode'} : 0;
logger('DEBUG', "archive_mode: $archive_mode");

# index.meta namespace (not really implemented!)
my $namespace = "";


# flag and counters shared by the subroutines below
my $xml_changed = 0;
my $errcnt = 0;
my $warncnt = 0;

#######################################################
# check parameters that were passed to the program
#
my $infile = $$args{'path'};
if (! $infile) {
    logger("ABORT", "no input file given!");
    exit 1;
}
# strip double slashes: collapse every run of slashes, not just the first one
$infile =~ s{//+}{/}g;
if (! -f $infile) {
    logger("ABORT", "input file \'$infile\' doesn't exist!");
    exit 1;
}
! 207:
! 208:
! 209: #######################################################
! 210: # subroutines
! 211: #
! 212:
! 213:
# $dir = find_arch_dir($input_node)
#
# Returns the archive directory "$lib_arch_dir/lit<ID>" for the given input
# row, where <ID> is the value of the row's fm:$arch_id_field child.
# Returns nothing (empty list / undef) if the row has no ID or the directory
# does not exist.
sub find_arch_dir {
    my ($input_node) = @_;

    # pass the ID through sstrip exactly as find_permanent_dir does, so both
    # lookups build the directory name from the same cleaned-up value
    my $bib_id = sstrip($input_node->findvalue("fm:$arch_id_field"));
    return unless $bib_id;

    my $dir = "$lib_arch_dir/lit$bib_id";
    #logger('DEBUG', "bibdir: $dir");
    return unless -d $dir;

    logger('DEBUG', "directory $dir exists");
    return $dir;
}
! 229:
# $dir = find_permanent_dir($input_node)
#
# Builds the online directory name "/mpiwg/online/permanent/lit<ID>" from the
# row's fm:$arch_id_field value. The directory is not checked for existence.
# Without an ID an error is logged and counted and nothing is returned.
#
# NOTE(review): the file-level $lib_online_dir ('/mpiwg/online/permanent/vlp')
# is not used here; the base below has no '/vlp' component -- confirm which
# location is intended.
sub find_permanent_dir {
    my ($input_node) = @_;
    my $online_base = '/mpiwg/online/permanent';

    my $dest_id = sstrip($input_node->findvalue("fm:$arch_id_field"));
    if ($dest_id) {
        return "$online_base/lit$dest_id";
    }

    logger('ERROR', "no ID field for online permanent entry");
    $errcnt++;
    return;
}
! 242:
! 243:
# $cnt = convert_bib($input_node, $index_root, $index_doc)
#
# Copies the bibliographic fields of one input row into the index.meta tree
# below $index_root, using %gen_map, %type_map and %subtype_map.
# Returns the number of elements/attributes created. Returns 0 after logging
# and counting an error when the language or the bib type is unknown.
# $index_doc is accepted but not used.
sub convert_bib {
    my ($input_node, $index_root, $index_doc) = @_;
    my $created  = 0;
    my $bib_type = "";

    my @fields = $input_node->getChildNodes();

    # first pass: generic fields and the type switch
    FIELD:
    for my $field (@fields) {
        my $fname = $field->nodeName();
        my $text  = $field->textContent();
        #logger('DEBUG', " NODE: $fname = '$text'");

        if (exists $gen_map{$fname}) {
            if ($fname eq $lang_field) {
                # language names are translated to iso codes
                if (!$text) {
                    logger('WARNING', "no language tag");
                    $warncnt++;
                    next FIELD;
                }
                if (!exists $lang_map{$text}) {
                    logger('ERROR', "unknown language: $text! skipping...");
                    $errcnt++;
                    return 0;
                }
                $text = $lang_map{$text};
            }
            my $target = create_element_path($gen_map{$fname}, $index_root, $namespace);
            $target->appendTextNode($text);
            $created++;
            next FIELD;
        }

        next FIELD unless exists $type_map{$fname};

        # type field: remember the type and write the matching type attribute
        $bib_type = $text;
        if (!exists $subtype_map{$text}) {
            logger('ERROR', "unknown bib type $text! skipping...");
            $errcnt++;
            return 0;
        }
        my $index_type = $subtype_map{$text}->{'_name'};
        create_element_path($type_map{$fname} . '=' . $index_type, $index_root, $namespace);
        $created++;
    }

    # without a type there are no type specific fields to convert
    return $created unless $bib_type;

    # second pass: fields that belong to the detected sub type
    my $field_paths = $subtype_map{$bib_type};
    for my $field (@fields) {
        my $fname = $field->nodeName();
        next unless exists $field_paths->{$fname};
        my $target = create_element_path($field_paths->{$fname}, $index_root, $namespace);
        $target->appendTextNode($field->textContent());
        $created++;
    }
    return $created;
}
! 307:
! 308:
! 309:
# process_all_fm_entries($input_root)
#
# Runs process_fm_entry on every fm:ROW node found below $input_root and
# logs a zero-based running number for each entry.
sub process_all_fm_entries {
    my ($input_root) = @_;

    my @rows = $input_root->findnodes('fm:ROW');
    for my $idx (0 .. $#rows) {
        logger('INFO', "processing entry $idx ...");
        process_fm_entry($rows[$idx]);
    }
}
! 320:
! 321:
# process_fm_entry($input_node)
#
# Creates the index.meta document for one input row:
#  - finds the document directory (archive directory only when archive mode
#    is set and online mode is not; the permanent directory otherwise),
#  - leaves an existing index.meta alone unless replacing was requested,
#  - fills in the fixed entries and the converted bibliographic fields,
#  - writes the file, or only logs it in dry-run mode.
# Errors are logged and counted in $errcnt; the sub then returns early.
sub process_fm_entry {
    my ($input_node) = @_;

    # try to find the document directory
    my $doc_dir = ($archive_mode && !$online_mode)
        ? find_arch_dir($input_node)
        : find_permanent_dir($input_node);
    unless ($doc_dir) {
        logger('ERROR', "document directory not found! skipping...");
        $errcnt++;
        return;
    }

    # keep an existing index.meta unless it may be replaced
    if (-f "$doc_dir/index.meta" and not $do_replace) {
        logger('DEBUG', "index file in $doc_dir exists");
        return;
    }

    # empty index.meta document with its root element
    my $index_doc = XML::LibXML::Document->createDocument('1.0', 'UTF-8');
    my $index_root = $index_doc->createElementNS($namespace, 'resource');
    for my $attr (['version', '1.1'], ['type', 'MPIWG']) {
        $index_root->addChild($index_doc->createAttributeNS($namespace, @$attr));
    }
    $index_doc->setDocumentElement($index_root);

    # standard entries, listed in the order in which they are created
    my ($docname) = split_file_path($doc_dir);
    my @entries = (
        ['name', $docname],
        ['archive-path', $doc_dir],
        ['archive-creation-date', stime(time)],
        ['creator', 'vlp'],
        ['description', 'a scanned document'],
    );
    if ($archive_mode) {
        push @entries,
            # acquisition
            ['meta/acquisition/date', stime(time)],
            ['meta/acquisition/provider/provider-id', 'vlp'],
            ['meta/acquisition/provider/address', 'Max Planck Institute for the History of Science'],
            # image acquisition
            ['meta/image-acquisition/device', 'Flatbed Scanner'],
            ['meta/image-acquisition/image-type', 'Greyscale'],
            ['meta/image-acquisition/production-comment', 'Raw scans in \'raw\' folder, cleaned pages in \'pages\' folder.'];
    }
    # media
    push @entries,
        ['media-type', 'image'],
        ['meta/content-type', 'scanned document'];

    for my $entry (@entries) {
        create_text_path(@$entry, $index_root, $namespace);
    }

    # convert bib entries; a count of 0 means error or nothing to convert
    my $converted = convert_bib($input_node, $index_root, $index_doc);
    if ($converted == 0) {
        logger('ERROR', "no bibliographic metadata!");
        $errcnt++;
        return;
    }

    # write new index.meta file (or just show it when simulating)
    unless ($dry_run) {
        write_xml($index_doc, "$doc_dir/index.meta");
        return;
    }
    logger('DEBUG', "would write $doc_dir/index.meta");
    logger('DEBUG', $index_doc->toString(1));
}
! 393:
! 394:
! 395:
! 396:
! 397:
! 398: #######################################################
! 399: # Main
! 400: #
! 401:
# load the filemaker xml dump
my ($input_doc, $input_root) = read_xml($infile);

# register the root element's namespace under the prefix 'fm', which is the
# prefix used by the XPath expressions in the subroutines above
my $fm_namespace = $input_root->namespaceURI();
$input_root->setNamespace($fm_namespace, 'fm', 1);

# convert every row of the dump
process_all_fm_entries($input_root);

# summary; the script exits with status 1 if any error was counted
logger("INFO", $_) for ("$warncnt warnings", "$errcnt errors");
if ($errcnt > 0) {
    logger("ABORT", "there were errors!");
    exit 1;
}
logger("DONE", "done something successfully!");
! 420:
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>