Annotation of foxridge-archiver/makemeta-vlp.pl, revision 1.10

1.10    ! casties     1: #!/usr/bin/perl -w
1.1       casties     2: 
                      3: use strict;
                      4: use XML::LibXML;
                      5: 
1.2       casties     6: use lib '/usr/local/mpiwg/archive';
1.1       casties     7: use MPIWGStor;
                      8: 
                      9: # make output unbuffered
                     10: $|=1;
                     11: 
                     12: # program version
1.9       casties    13: my $version = "0.2.7 (27.8.2010 ROC)";
                         # usage text; printed further down when no command line arguments are given
1.1       casties    14: my $help = 
                     15: "use: makemeta-vlp [options] file.xml
                     16: options:
                     17:   -debug  show debugging info
                     18:   -dry-run  simulate, don't do anything
                     19:   -replace  replace existing index files
                     20:   -online-mode  mode for creating online/permanent files
                     21:   -archive-mode  mode for creating archive/data files
1.6       casties    22:   -access=free  adds free access tag for online-mode
1.9       casties    23:   -texttool adds texttool tag for online-mode
1.1       casties    24: ";
                         # announce program name and version before anything else happens
                     25: logger("INFO", "makemeta-vlp $version");
                     26: 
                     27: ###########################################
                     28: # mappings
                         #
                         # The tables below are consulted by convert_bib: a hash key is the
                         # name of a child element of an input row, the value is the element
                         # path that is created below the index.meta root element.
                     29: 
                     30: # generic mappings at top level
                         # (applied to every row, independent of the bibliographic type)
                     31: my %gen_map = (
1.6       casties    32:     'Custom2_Language' => 'meta/lang',
1.7       casties    33:     'productionComment' => 'meta/image-acquisition/production-comment',
1.6       casties    34:     'derivedFrom' => 'derived-from/archive-path'
1.1       casties    35:     );
                     36: # sub type switch tag
                         # The text of this input element selects the entry of %subtype_map;
                         # convert_bib appends "=<_name of that entry>" to the path given here
                         # before passing it to create_element_path.
                     37: my %type_map = (
                     38:     'ReferenceType' => 'meta/bib@type'
                     39:     );
                     40: # sub type mappings
                         #
                         # Outer key: value of the type field (see %type_map) in the input row.
                         # Inner hash:
                         #   '_name'   value written as the bib type in index.meta
                         #   '#text'   keys starting with '#' are constants: convert_bib writes
                         #             the text after the '#' to the given path
                         #   other     input element name => index.meta element path
                     41: my %subtype_map = (
                     42:     'Book' => {
                     43:    '_name' => 'book',
                     44:    'Author' => 'meta/bib/author',
                     45:    'Title' => 'meta/bib/title',
                     46:    'Year' => 'meta/bib/year',
                     47:    'Place_Published' => 'meta/bib/city',
                     48:    'Publisher' => 'meta/bib/publisher',
                     49:    'Edition' => 'meta/bib/edition',
                     50:    'Volume' => 'meta/bib/volume',
                     51:    'NumberOfVolumes' => 'meta/bib/number-of-volumes',
                     52:    'Pages' => 'meta/bib/number-of-pages'
                     53:     },
                             # same as 'Book' plus a constant comment
1.3       casties    54:     '(Book)' => {
                     55:    '_name' => 'book',
                     56:    'Author' => 'meta/bib/author',
                     57:    'Title' => 'meta/bib/title',
                     58:    'Year' => 'meta/bib/year',
                     59:    'Place_Published' => 'meta/bib/city',
                     60:    'Publisher' => 'meta/bib/publisher',
                     61:    'Edition' => 'meta/bib/edition',
                     62:    'Volume' => 'meta/bib/volume',
                     63:    'NumberOfVolumes' => 'meta/bib/number-of-volumes',
                     64:    'Pages' => 'meta/bib/number-of-pages',
                     65:    '#Cover pages only, book sections have been extracted' => 'meta/bib/comment'
                     66:     },
1.1       casties    67:     'Book Section' => {
                     68:    '_name' => 'inbook',
                     69:    'Author' => 'meta/bib/author',
                     70:    'Title' => 'meta/bib/title',
                     71:    'Year' => 'meta/bib/year',
1.3       casties    72:    'SecondaryTitle' => 'meta/bib/book-title',
1.1       casties    73:    'SecondaryAuthor' => 'meta/bib/editor',
                     74:    'Volume' => 'meta/bib/volume',
                     75:    'NumberOfVolumes' => 'meta/bib/number-of-volumes',
                     76:    'Pages' => 'meta/bib/pages'
                     77:     },
                             # NOTE(review): here the plain 'Edited Book' carries the constant
                             # comment and '(Edited Book)' does not, the reverse of
                             # 'Book' / '(Book)' above -- confirm this is intended
                     78:     'Edited Book' => {
                     79:    '_name' => 'edited-book',
                     80:    'Author' => 'meta/bib/editor',
                     81:    'Title' => 'meta/bib/title',
                     82:    'Year' => 'meta/bib/year',
                     83:    'Place_Published' => 'meta/bib/city',
                     84:    'Publisher' => 'meta/bib/publisher',
                     85:    'Edition' => 'meta/bib/edition',
                     86:    'Volume' => 'meta/bib/volume',
                     87:    'NumberOfVolumes' => 'meta/bib/number-of-volumes',
1.4       casties    88:    'Pages' => 'meta/bib/number-of-pages',
                     89:    '#Cover pages only, book sections have been extracted' => 'meta/bib/comment'
                     90:     },
                     91:     '(Edited Book)' => {
                     92:    '_name' => 'edited-book',
                     93:    'Author' => 'meta/bib/editor',
                     94:    'Title' => 'meta/bib/title',
                     95:    'Year' => 'meta/bib/year',
                     96:    'Place_Published' => 'meta/bib/city',
                     97:    'Publisher' => 'meta/bib/publisher',
                     98:    'Edition' => 'meta/bib/edition',
                     99:    'Volume' => 'meta/bib/volume',
                    100:    'NumberOfVolumes' => 'meta/bib/number-of-volumes',
1.1       casties   101:    'Pages' => 'meta/bib/number-of-pages'
                    102:     },
                    103:     'Journal Article' => {
                    104:    '_name' => 'journal-article',
                    105:    'Author' => 'meta/bib/author',
                    106:    'Title' => 'meta/bib/title',
                    107:    'Year' => 'meta/bib/year',
                    108:    'SecondaryTitle' => 'meta/bib/journal',
                    109:    'Volume' => 'meta/bib/volume',
                    110:    'Number_Issue' => 'meta/bib/issue',
                    111:    'Pages' => 'meta/bib/pages'
                    112:     },
1.5       casties   113:     '(JournalVolume)' => {
                    114:    '_name' => 'journal-volume',
                    115:    'SecondaryTitle' => 'meta/bib/title',
                    116:    'SecondaryAuthor' => 'meta/bib/editor',
                    117:    'Publisher' => 'meta/bib/publisher',
                    118:    'Place_Published' => 'meta/bib/city',
                    119:    'Year' => 'meta/bib/year',
                    120:    'Volume' => 'meta/bib/volume',
                    121:    'Pages' => 'meta/bib/number-of-pages',
                    122:    '#Cover pages only, articles have been extracted' => 'meta/bib/comment'
                    123:     },
                             # NOTE(review): 'Journal' is written with _name 'report', the same
                             # value as 'Report' and 'Trade Catalogue' -- confirm this is intended
1.9       casties   124:     'Journal' => {
                    125:    '_name' => 'report',
                    126:    'Title' => 'meta/bib/title',
                    127:    'SecondaryTitle' => 'meta/bib/institution',
                    128:    'Author' => 'meta/bib/author',
                    129:    'Place_Published' => 'meta/bib/city',
                    130:    'Year' => 'meta/bib/year',
                    131:    'Date' => 'meta/bib/date',
                    132:    'Pages' => 'meta/bib/pages',
                    133:     },
1.1       casties   134:     'Magazine Article' => {
                    135:    '_name' => 'magazine-article',
                    136:    'Author' => 'meta/bib/author',
                    137:    'Title' => 'meta/bib/title',
                    138:    'Year' => 'meta/bib/year',
                            # NOTE(review): spelled 'Secondary_Title' here but 'SecondaryTitle'
                            # in the other sub types -- verify against the input element name
                    139:    'Secondary_Title' => 'meta/bib/magazine',
                    140:    'Number_Issue' => 'meta/bib/issue-number',
                    141:    'Date' => 'meta/bib/issue-date',
                    142:    'Pages' => 'meta/bib/pages'
                    143:     },
1.9       casties   144:     'Newspaper Article' => {
                    145:    '_name' => 'newspaper-article',
                    146:    'Author' => 'meta/bib/author',
                    147:    'Title' => 'meta/bib/title',
                    148:    'Year' => 'meta/bib/year',
                            # NOTE(review): spelled 'Secondary_Title' here but 'SecondaryTitle'
                            # in the other sub types -- verify against the input element name
                    149:    'Secondary_Title' => 'meta/bib/newspaper',
                    150:    'Date' => 'meta/bib/issue-date',
                    151:    'Pages' => 'meta/bib/pages'
                    152:     },
1.1       casties   153:     'Report' => {
                    154:    '_name' => 'report',
                    155:    'Author' => 'meta/bib/author',
                    156:    'Title' => 'meta/bib/title',
                    157:    'Year' => 'meta/bib/year',
                    158:    'Place_Published' => 'meta/bib/city',
                    159:    'Date' => 'meta/bib/date',
                    160:    'SecondaryTitle' => 'meta/bib/type',
                    161:    'Pages' => 'meta/bib/pages'
                    162:     },
                    163:     'Trade Catalogue' => {
                    164:    '_name' => 'report',
                    165:    'Author' => 'meta/bib/author',
                    166:    'Title' => 'meta/bib/title',
                    167:    'Year' => 'meta/bib/year',
                    168:    'Place_Published' => 'meta/bib/city',
                    169:    'Date' => 'meta/bib/date',
                    170:    'Volume' => 'meta/bib/volume',
                    171:    'NumberOfVolumes' => 'meta/bib/number-of-volumes',
                            # the type field itself is copied as text as well
                    172:    'ReferenceType' => 'meta/bib/type',
                    173:    'Pages' => 'meta/bib/pages'
                    174:     },
                    175:     'Thesis' => {
                    176:    '_name' => 'thesis',
                    177:    'Author' => 'meta/bib/author',
                    178:    'Title' => 'meta/bib/title',
                    179:    'Place_Published' => 'meta/bib/city',
                    180:    'Publisher' => 'meta/bib/university',
                    181:    'Date' => 'meta/bib/date',
                    182:    'TypeOfWork' => 'meta/bib/type',
                    183:    'Pages' => 'meta/bib/number-of-pages'
                    184:     },
                    185:     'Manuscript' => {
                    186:    '_name' => 'manuscript',
                    187:    'Author' => 'meta/bib/author',
                    188:    'Title' => 'meta/bib/title',
                    189:    'Year' => 'meta/bib/year',
                    190:    'Place_Published' => 'meta/bib/location',
                    191:    'Pages' => 'meta/bib/pages'
                    192:     }
                    193:     );
                    194: # language element
                         # (name of the input element whose text is translated through %lang_map;
                         # it also has to be a key of %gen_map, because convert_bib only looks at
                         # it inside the general-field branch)
                    195: my $lang_field = 'Custom2_Language';
                    196: # languages to iso codes
                         # (a language name that is missing here makes convert_bib log an error
                         # and return 0 for the row)
                    197: my %lang_map = (
                    198:     'German' => 'de',
                    199:     'English' => 'en',
                    200:     'Italian' => 'it',
                    201:     'French' => 'fr',
                    202:     'Latin' => 'la',
                    203:     'Japanese' => 'ja',
                    204:     'Dutch' => 'nl',
                    205:     'Spanish' => 'es',
1.8       casties   206:     'Swedish' => 'sv',
                    207:     'Russian' => 'ru',
                    208:     'Polish' => 'pl',
                    209:     'Greek' => 'el'
1.1       casties   210:     );
                    211: # storage fields
                         # input element holding the document ID; the directory name is "lit<ID>"
                    212: my $arch_id_field = 'ID';
                         # input element read in online mode: a true value yields access type
                         # "free", anything else yields "institution" (see process_fm_entry)
1.9       casties   213: my $access_free_field = 'online';
1.1       casties   214: 
                    215: #######################################################
                    216: # internal parameters
                    217: #
                    218: 
                    219: # storage
                         # base directories below which the "lit<ID>" document directories are
                         # looked up (archive mode / online mode)
                    220: my $lib_arch_dir = '/mpiwg/archive/data/vlp';
                    221: my $lib_online_dir = '/mpiwg/online/permanent/vlp';
                    222: 
                    223: # read command line parameters
                         # $args is used as a hash reference keyed by option name; the input file
                         # is expected under the key 'path' (parseargs itself is not part of this
                         # file)
                    224: my $args = MPIWGStor::parseargs;
                         # no arguments at all: print the usage text and stop
                    225: if (! scalar(%$args)) {
                    226:     print $help, "\n";
                    227:     exit 1;
                    228: }
                    229: 
                    230: # debug level
                         # NOTE(review): $debug is not declared in this file; under "use strict"
                         # this only compiles if it is a package variable imported from elsewhere
                         # -- presumably exported by MPIWGStor, verify
                    231: $debug = (exists $$args{'debug'}) ? $$args{'debug'} : 0;
                    232: 
                    233: # simulate action only
                    234: my $dry_run = (exists $$args{'dry-run'}) ? $$args{'dry-run'} : 0;
                    235: logger('DEBUG', "dry-run: $dry_run");
                    236: 
                    237: # replace existing index files
                    238: my $do_replace = (exists $$args{'replace'}) ? $$args{'replace'} : 0;
                    239: logger('DEBUG', "replace: $do_replace");
                    240: 
                    241: # use online mode
                    242: my $online_mode = (exists $$args{'online-mode'}) ? $$args{'online-mode'} : 0;
                    243: logger('DEBUG', "online_mode: $online_mode");
                    244: 
                    245: # use archive mode
                    246: my $archive_mode = (exists $$args{'archive-mode'}) ? $$args{'archive-mode'} : 0;
                    247: logger('DEBUG', "archive_mode: $archive_mode");
                    248: 
1.9       casties   249: # create texttool tag (online mode only)
                         # NOTE(review): defaults to 1 when the option is absent, so the tag is
                         # written in online mode unless the option is given with a false value;
                         # the usage text reads as if the option switched it on -- confirm
                    250: my $texttool = (exists $$args{'texttool'}) ? $$args{'texttool'} : 1;
                    251: logger('DEBUG', "texttool: $texttool");
                    252: # image dir for texttool
                         # (sub directory of the document directory; also the text written into
                         # the texttool image tag)
                    253: my $texttool_img_dir = "pages";
                    254: 
1.6       casties   255: # access type
                         # "free" or the name of an institution; empty when the option is absent
                    256: my $access_type = (exists $$args{'access'}) ? $$args{'access'} : "";
                    257: 
1.1       casties   258: # index.meta namespace (not really implemented!)
                    259: my $namespace = "";
                    260: 
                    261: 
                         # NOTE(review): $xml_changed is not referenced anywhere else in this file
                    262: my $xml_changed = 0;
                         # error and warning counters, reported at the end of the run
                    263: my $errcnt = 0;
                    264: my $warncnt = 0;
                    265: 
                    266: #######################################################
                    267: # check parameters that were passed to the program
                    268: #
                         # the input file is the 'path' entry of the parsed arguments
                    269: my $infile = $$args{'path'};
                    270: if (! $infile) {
                    271:     logger("ABORT", "no input file given!");
                    272:     exit 1;
                    273: }
                    274: # strip double slashes
                         # (every run of two or more slashes, anywhere in the path, becomes one;
                         # without /g only the first "//" would be replaced)
                    275: $infile =~ s{/{2,}}{/}g;
                         # the input has to be an existing plain file
                    276: if (! -f $infile) {
                    277:     logger("ABORT", "input file \'$infile\' doesn't exist!");
                    278:     exit 1;
                    279: }
                    280: 
                    281: 
                    282: #######################################################
                    283: # subroutines
                    284: #
                    285: 
                    286: 
                    287: sub find_arch_dir {
                             # Look up the archive directory "$lib_arch_dir/lit<ID>" of an input row.
                             #
                             # $input_node: node on which the XPath "fm:<ID field>" is evaluated
                             # Returns the directory path if the ID is non-empty and the directory
                             # exists; otherwise a bare return (undef / empty list).
                    288:     my ($input_node) = @_;
                    289:     my $dir = "";
                    290: 
                             # normalise the ID with sstrip, exactly as find_permanent_dir does, so
                             # that both modes derive the directory name from the same value
                    291:     my $bib_id = sstrip($input_node->findvalue("fm:$arch_id_field"));
                    292:     #logger('DEBUG', "bib_id: $bib_id");
                    293:     if ($bib_id) {
                    294:    $dir = "$lib_arch_dir/lit$bib_id";
                    295:    if (-d $dir) {
                    296:        logger('DEBUG', "directory $dir exists"); 
                    297:        return $dir;
                    298:    }
                    299:     }
                             # no ID or no such directory; the caller logs the error
                    300:     return;
                    301: }
                    302: 
                    303: sub find_permanent_dir {
                             # Look up the online directory "$lib_online_dir/lit<ID>" of an input row.
                             #
                             # $input_node: node on which the XPath "fm:<ID field>" is evaluated
                             # Returns the directory path if it exists on disk; otherwise a bare
                             # return. A missing ID is logged as ERROR and counted in $errcnt.
                    304:     my ($input_node) = @_;
1.6       casties   305:     my $online_base = $lib_online_dir;
                             # sstrip is not defined in this file -- presumably a whitespace
                             # normaliser from MPIWGStor; verify
1.1       casties   306:     my $dest_id = sstrip($input_node->findvalue("fm:$arch_id_field"));
                    307:     if (! $dest_id) {
                    308:    logger('ERROR', "no ID field for online permanent entry");
                    309:    $errcnt++;
                    310:    return;
                    311:     }
                    312:     my $dir = "$online_base/lit$dest_id";
1.9       casties   313:     if (-d $dir) {
                    314:         logger('DEBUG', "directory $dir exists"); 
                    315:         return $dir;
                    316:     }
                             # directory does not exist: nothing is logged here, the caller reports it
                    317:     return;
1.1       casties   318: }
                    319: 
                    320: 
                    321: sub convert_bib {
                             # Copy the bibliographic fields of one input row into the index.meta
                             # tree, driven by %gen_map, %type_map and %subtype_map.
                             #
                             # $input_node: input row whose child nodes are examined
                             # $index_root: element below which the mapped paths are created
                             # $index_doc:  accepted but not used in this sub
                             #
                             # Returns the number of elements created from input fields (constant
                             # '#' fields are not counted). Returns 0 after logging an ERROR and
                             # incrementing $errcnt when the language or the bib type is unknown;
                             # elements created before that point stay in the tree.
                    322:     my ($input_node, $index_root, $index_doc) = @_;
                    323:     my $cnt = 0;
                    324:     my $type = "";
                    325:     my $type_path = "";
                    326: 
                    327:     # process general stuff first
                             # (this pass also finds the type field, which the second pass needs)
                    328:     foreach my $n ($input_node->getChildNodes()) {
                    329:    my $name = $n->nodeName();
                    330:    my $val = $n->textContent();
                    331:    #logger('DEBUG', "  NODE: $name = '$val'");
                    332:    if (exists $gen_map{$name}) {
                    333:        # is a general field
                    334:        if ($name eq $lang_field) {
                    335:        # language field
                                # an empty language is only a warning: the field is skipped and
                                # the row is still processed
                    336:        if (not $val) {
                    337:            logger('WARNING', "no language tag");
                    338:            $warncnt++;
                    339:            next;
                    340:        }
                    341:        # convert to iso code
                    342:        if (exists $lang_map{$val}) {
                    343:            $val = $lang_map{$val};
                    344:        } else {
                    345:            logger('ERROR', "unknown language: $val! skipping...");
                    346:            $errcnt++;
                    347:            return 0;
                    348:        }
                    349:        }
                                # create the mapped element path and put the value in as text
                    350:        create_element_path($gen_map{$name}, $index_root, $namespace)
                    351:        ->appendTextNode($val);
                    352:        $cnt++;
                    353:    } elsif (exists $type_map{$name}) {
                    354:        # is a type field
                    355:        $type_path = $type_map{$name};
                    356:        $type = $val;
                    357:        # check with known types
                    358:        if (exists $subtype_map{$val}) {
                                # path string has the form "<type path>=<_name of the sub type>"
                    359:        my $indextype = $subtype_map{$val}->{'_name'};
                    360:        create_element_path("$type_path=$indextype", $index_root, $namespace);
                    361:        $cnt++;
                    362:        } else {
                    363:        logger('ERROR', "unknown bib type $val! skipping...");
                    364:        $errcnt++;
                    365:        return 0;
                    366:        }
                    367:    }
                    368:     }
                    369:     # process sub type fields
                             # (skipped entirely when the row had no type field or an empty one)
                    370:     if ($type) {
                    371:    foreach my $n ($input_node->getChildNodes()) {
                    372:        my $name = $n->nodeName();
                    373:        my $val = $n->textContent();
                    374:        #logger('DEBUG', "  NODE: $name = '$val'");
                    375:        if (exists $subtype_map{$type}->{$name}) {
                    376:        create_element_path($subtype_map{$type}->{$name}, $index_root, $namespace)
                    377:            ->appendTextNode($val);
                    378:        $cnt++;
                    379:        }
                    380:    }
1.3       casties   381:    # append additional constant fields (beginning with #)
                            # the text after the '#' in the key is the value that gets written
                    382:    foreach my $k (keys %{$subtype_map{$type}}) {
                    383:        if ($k =~ /^\#(.*)/) {
                    384:        my $val = $1;
                    385:        create_text_path($subtype_map{$type}->{$k}, $val, $index_root, $namespace);
                    386:        }
                    387:    }
1.1       casties   388:     }
                    389:     return $cnt;
                    390: }
                    391: 
                    392: 
                    393: 
                    394: sub process_all_fm_entries {
                             # Run process_fm_entry on every node matched by "fm:ROW" relative to
                             # $input_root, in document order.
                             #
                             # $input_root: node on which the XPath "fm:ROW" is evaluated
                             # No explicit return value; $cnt is only used for the log message
                             # (entries are numbered from 0).
                    395:     my ($input_root) = @_;
                    396:     my $cnt = 0;
                    397: 
                    398:     foreach my $n ($input_root->findnodes('fm:ROW')) {
                    399:    logger('INFO', "processing entry $cnt ...");
                    400:    process_fm_entry($n);
                    401:    $cnt++;
                    402:     }
                    403: }    
                    404: 
                    405: 
                    406: sub process_fm_entry {
                             # Build an index.meta document for one input row and write it into
                             # the row's document directory.
                             #
                             # $input_node: the input row
                             # No return value. The row is skipped (bare return) when the document
                             # directory is not found, when index.meta already exists and -replace
                             # was not given, or when convert_bib reports nothing converted; the
                             # first and last case are logged as ERROR and counted in $errcnt.
                    407:     my ($input_node) = @_;
                             # new document with root element <resource version="1.1" type="MPIWG">
                    408:     my $index_doc = XML::LibXML::Document->createDocument('1.0', 'UTF-8');
                    409:     my $index_root = $index_doc->createElementNS($namespace, 'resource');
                    410:     $index_root->addChild($index_doc->createAttributeNS($namespace, 'version', '1.1'));
                    411:     $index_root->addChild($index_doc->createAttributeNS($namespace, 'type', 'MPIWG'));
                    412:     $index_doc->setDocumentElement($index_root);
                    413: 
                    414:     # try to find the document directory
                             # (when neither mode is selected the online directory is used, i.e.
                             # the lookup is the same as in online mode)
                    415:     my $doc_dir = "";
                    416:     if ($online_mode) {
                    417:    $doc_dir = find_permanent_dir($input_node);
                    418:     } elsif ($archive_mode) {
                    419:    $doc_dir = find_arch_dir($input_node);
                    420:     } else {
                    421:    $doc_dir = find_permanent_dir($input_node);
                    422:     }
                    423:     if (! $doc_dir) {
                    424:    logger('ERROR', "document directory not found! skipping...");
                    425:    $errcnt++;
                    426:    return;
                    427:     }
                    428: 
                    429:     # check if index.meta exists
                             # (an existing file is kept unless replacing was requested; this is
                             # not counted as an error)
                    430:     if ( -f "$doc_dir/index.meta") {
                    431:    if (not $do_replace) {
                    432:        logger('DEBUG', "index file in $doc_dir exists");
                    433:        return;
                    434:    }
                    435:     }
                    436: 
                    437:     # add standard stuff to index.meta
                             # split_file_path is not defined in this file; only $docname is used
                             # below, $docpath is not
                    438:     my ($docname, $docpath) = split_file_path($doc_dir);
                    439:     # name and date
                    440:     create_text_path('name', $docname, $index_root, $namespace);
                    441:     create_text_path('archive-path', $doc_dir, $index_root, $namespace);
                    442:     create_text_path('archive-creation-date', stime(time), $index_root, $namespace);
                    443:     create_text_path('creator', 'vlp', $index_root, $namespace);
                    444:     create_text_path('description', 'a scanned document', $index_root, $namespace);
                    445:     if ($archive_mode) {
                    446:       # acquisition
                    447:       create_text_path('meta/acquisition/date', stime(time), $index_root, $namespace);
                    448:       create_text_path('meta/acquisition/provider/provider-id', 'vlp', $index_root, $namespace);
                    449:       create_text_path('meta/acquisition/provider/address', 'Max Planck Institute for the History of Science', $index_root, $namespace);
                    450:     }
                    451:     # media
                    452:     create_text_path('media-type', 'image', $index_root, $namespace);
                    453:     create_text_path('meta/content-type', 'scanned document', $index_root, $namespace);
1.6       casties   454:     # access
                             # an -access option takes precedence over the row's own field:
                             # "free" gives type free, any other value is used as institution name
                    455:     if ($access_type) {
                    456:    if ($access_type eq "free") {
                    457:        create_element_path('meta/access-conditions/access@type=free', $index_root, $namespace);
                    458:    } else {
                    459:        my $acc_tag = create_element_path('meta/access-conditions/access@type=institution', $index_root, $namespace);
                    460:        create_text_path('name', $access_type, $acc_tag, $namespace);
                    461:    }
1.9       casties   462:     } elsif ($online_mode) {
                    463:         # read access conditions from "online" field in DB dump
                                 # (true value: free access; otherwise institution "mpiwg")
                    464:         my $online = sstrip($input_node->findvalue("fm:$access_free_field"));
                    465:         if ($online) {
                    466:        create_element_path('meta/access-conditions/access@type=free', $index_root, $namespace);
                    467:    } else {
                    468:        my $acc_tag = create_element_path('meta/access-conditions/access@type=institution', $index_root, $namespace);
                    469:        create_text_path('name', 'mpiwg', $acc_tag, $namespace);
                    470:    }
                    471:     }
                    472: 
                    473:     # texttool tag with image dir
                             # (only written when the image directory exists below the document
                             # directory; a missing directory is a warning, not an error)
                    474:     if ($online_mode && $texttool) {
                    475:    if ( -d "$doc_dir/$texttool_img_dir" ) {
                    476:        create_text_path('meta/texttool/image', $texttool_img_dir,$index_root, $namespace);
                    477:    } else {
                    478:             logger('WARNING', "page image directory missing!");
                    479:             $warncnt++;
                    480:         }
1.6       casties   481:     }
1.1       casties   482: 
                    483:     # convert bib entries
                             # NOTE(review): convert_bib already counts an error before returning 0
                             # for an unknown language or type, so such a row adds two to $errcnt
                    484:     my $cnt = convert_bib($input_node, $index_root, $index_doc);
                    485:     if ($cnt == 0) {
                    486:    # error or nothing to convert
                    487:    logger('ERROR', "no bibliographic metadata!");
                    488:    $errcnt++;
                    489:    return;
                    490:     }
                    491: 
                    492:     # write new index.meta file
                             # (in dry-run mode the document is only logged at DEBUG level)
                    493:     if ($dry_run) {
                    494:    logger('DEBUG', "would write $doc_dir/index.meta");
                    495:    logger('DEBUG', $index_doc->toString(1));
                    496:     } else {
                    497:    write_xml($index_doc, "$doc_dir/index.meta");
                    498:     }
                    499: 
                    500: }
                    501: 
                    502: 
                    503: 
                    504: 
                    505: 
                    506: #######################################################
                    507: # Main
                    508: #
                    509: 
                    510: # load filemaker xml dump
                         # (read_xml is not defined in this file; it is used here as returning the
                         # document and its root element)
                    511: my ($input_doc, $input_root) = read_xml($infile);
                    512: # set namespace prefix
                         # the root element's own namespace URI is bound to the prefix 'fm', which
                         # is the prefix used by the XPath expressions in the subs above
                    513: my $fm_namespace = $input_root->namespaceURI();
                    514: $input_root->setNamespace($fm_namespace, 'fm', 1);
                    515: 
                    516: 
                         # convert every row of the dump
                    517: process_all_fm_entries($input_root);
                    518: 
                    519: 
                         # summary; the exit status is 1 when at least one error was counted
                    520: logger("INFO", "$warncnt warnings");
                    521: logger("INFO", "$errcnt errors");
                    522: if ($errcnt > 0) {
                    523:     logger("ABORT", "there were errors!");
                    524:     exit 1;
                    525: } else {
                    526:     logger("DONE", "done something successfully!");
                    527: }
                    528: 

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>