Annotation of foxridge-archiver/makemeta-lib.pl, revision 1.1
1.1 ! casties 1: #!/usr/local/bin/perl -w
! 2:
! 3: use strict;
! 4: use XML::LibXML;
! 5:
! 6: use lib '/usr/local/mpiwg/archive';
! 7: use MPIWGStor;
! 8:
# Turn on autoflush for the currently selected output handle.
$| = 1;

#######################################################
# internal parameters
#

# Version string of this program; it is logged once at startup.
my $version = "0.1.0 (24.5.2005)";
logger('INFO', "makemeta-lib $version");
! 19:
#
# mappings
#
# Each table maps a field name of the input record (the node name of a
# child of an fm:ROW element) to the element path that convert_bib()
# creates in the index.meta document.

# Generic fields that apply to every record, whatever its type.
# Note: 'Production_Comment' and 'Postproduction' share one target path.
my %gen_map = (
    'Device'             => 'meta/image-acquisition/device',
    'Image_Type'         => 'meta/image-acquisition/image-type',
    'Production_Comment' => 'meta/image-acquisition/production-comment',
    'Postproduction'     => 'meta/image-acquisition/production-comment',
    'Language'           => 'meta/lang',
);

# Field whose value selects the bibliographic sub type, and the
# path/attribute that receives the sub type name.
my %type_map = (
    'Reference_Type' => 'meta/bib@type',
);

# Per-type field mappings, keyed by the value found in the type field.
# The special key '_name' holds the type name used in index.meta.
my %subtype_map = (
    'Book' => {
        '_name'           => 'book',
        'Author'          => 'meta/bib/author',
        'Title'           => 'meta/bib/title',
        'Year'            => 'meta/bib/year',
        'Place_Published' => 'meta/bib/city',
        'Publisher'       => 'meta/bib/publisher',
        'Edition'         => 'meta/bib/edition',
    },
    'Journal Article' => {
        '_name'           => 'journal-article',
        'Author'          => 'meta/bib/author',
        'Title'           => 'meta/bib/title',
        'Year'            => 'meta/bib/year',
        'Secondary_Title' => 'meta/bib/journal',
        'Volume'          => 'meta/bib/volume',
        'Number'          => 'meta/bib/issue',
        'Pages'           => 'meta/bib/pages',
    },
    'In Book' => {
        '_name'           => 'inbook',
        'Author'          => 'meta/bib/author',
        'Title'           => 'meta/bib/title',
        'Year'            => 'meta/bib/year',
        'Secondary_Title' => 'meta/bib/book-title',
        'Pages'           => 'meta/bib/pages',
    },
    'Newspaper Article' => {
        '_name'           => 'newspaper-article',
        'Author'          => 'meta/bib/author',
        'Title'           => 'meta/bib/title',
        'Year'            => 'meta/bib/year',
        'Secondary_Title' => 'meta/bib/newspaper',
        'Place_Published' => 'meta/bib/city',
        'Number'          => 'meta/bib/issue-date',
        'Pages'           => 'meta/bib/pages',
    },
    'Edited Book' => {
        '_name'           => 'edited-book',
        'Author'          => 'meta/bib/editor',
        'Title'           => 'meta/bib/title',
        'Year'            => 'meta/bib/year',
        'Place_Published' => 'meta/bib/city',
        'Publisher'       => 'meta/bib/publisher',
        'Edition'         => 'meta/bib/edition',
    },
    'Manuscript' => {
        '_name'           => 'manuscript',
        'Author'          => 'meta/bib/author',
        'Title'           => 'meta/bib/title',
        'Year'            => 'meta/bib/year',
        'Place_Published' => 'meta/bib/location',
    },
);

# Name of the generic field whose value is a language name.
my $lang_field = 'Language';

# Language names as they appear in the input, mapped to ISO codes.
my %lang_map = (
    'German'  => 'de',
    'English' => 'en',
    'Italian' => 'it',
    'French'  => 'fr',
    'Latin'   => 'la',
);

# Input fields holding the archive ID and the online URL of a record.
my $arch_id_field    = 'ID_Archive';
my $online_url_field = 'URL';

# Base directories of the library archive and of the online documents.
my $lib_arch_dir   = '/mpiwg/archive/data/library';
my $lib_online_dir = '/mpiwg/online/permanent';
! 108:
! 109:
# Parse the command line; the result is used as a hash reference below.
my $args = MPIWGStor::parseargs();

# Debug level, 0 when not given.
# NOTE(review): $debug is not declared in this file -- presumably it is
# exported by MPIWGStor; confirm.
$debug = exists $args->{'debug'} ? $args->{'debug'} : 0;

# When true, entries are handled in einstein-cw mode (see find_cw_dir).
my $cw_mode = exists $args->{'cw-mode'} ? $args->{'cw-mode'} : 0;

# index.meta namespace (not really implemented!)
my $namespace = "";


# Bookkeeping for the whole run.
my $xml_changed = 0;
my $errcnt      = 0;
my $warncnt     = 0;
! 126:
#######################################################
# check parameters that were passed to the program
#

# Path of the input file, taken from the 'path' argument.
my $infile = $$args{'path'};
if (! $infile) {
    logger("ABORT", "no input file given!");
    exit 1;
}
# Collapse every run of slashes into a single slash.  The /g flag is
# needed so that all occurrences are handled, and {2,} so that three or
# more slashes in a row also end up as one.
$infile =~ s{/{2,}}{/}g;
if (! -f $infile) {
    logger("ABORT", "input file '$infile' doesn't exist!");
    exit 1;
}
! 141:
! 142:
! 143: #######################################################
! 144: # subroutines
! 145: #
! 146:
# Find the document directory of an einstein-cw entry and move it into
# the library archive.
#
# The source directory is looked up with find_online_dir() below
# '/mpiwg/archive/data/library/inbox/zwischen_backup'; the destination
# is "$lib_arch_dir/<archive id>", where the archive id is the value of
# the entry's archive ID field.
#
# Parameters:
#   $input_node - node of the entry to process
#
# Returns the destination directory after a successful move.  Returns
# nothing (and counts an error) when the entry has no archive ID or no
# source directory was found; returns nothing when the destination is
# not a directory after the move.  Exits the program with status 1 when
# the rename itself fails.
sub find_cw_dir {
    my ($input_node) = @_;
    my $src_dir = find_online_dir($input_node, '/mpiwg/archive/data/library/inbox/zwischen_backup');
    my $dest_id = $input_node->findvalue("fm:$arch_id_field");
    if (! $dest_id) {
        logger('ERROR', "no ID field for einstein-cw entry");
        $errcnt++;
        return;
    }
    # find_online_dir() returns nothing when the URL does not match or
    # the directory is missing.  Nothing has been moved at this point,
    # so skip the entry instead of handing an undefined path to rename.
    if (! $src_dir) {
        logger('ERROR', "no source directory found for einstein-cw entry $dest_id");
        $errcnt++;
        return;
    }
    my $dir = "$lib_arch_dir/$dest_id";
    logger('DEBUG', "moving $src_dir to $dir");
    if (! rename $src_dir, $dir) {
        logger('ABORT', "unable to rename directory $src_dir to $dir!");
        exit 1;
    }
    if (-d $dir) {
        logger('DEBUG', "directory $dir OK");
        return $dir;
    }
    return;
}
! 169:
# Derive a document directory from the online URL of an entry.
#
# The directory name is the part of the URL between 'fn=permanent/' and
# '/pageimg'; it is appended to the base directory.
#
# Parameters:
#   $input_node - node of the entry; its online URL field is read
#   $base_dir   - optional base directory, defaults to $lib_online_dir
#
# Returns the directory path if it exists as a directory, otherwise
# nothing.
sub find_online_dir {
    my ($input_node, $base_dir) = @_;
    $base_dir ||= $lib_online_dir;

    my $online_url = $input_node->findvalue("fm:$online_url_field");

    # the capture is only defined when the pattern matched
    my ($online_dir) = $online_url =~ /fn=permanent\/(.+)\/pageimg/;
    return unless defined $online_dir;

    my $candidate = "$base_dir/$online_dir";
    return unless -d $candidate;

    logger('DEBUG', "directory $candidate exists");
    return $candidate;
}
! 186:
# Look up the archive directory of an entry.
#
# The value of the entry's archive ID field is taken as a directory
# name below $lib_arch_dir.
#
# Parameters:
#   $input_node - node of the entry
#
# Returns the directory path if the ID is set and the directory exists,
# otherwise nothing.
sub find_arch_dir {
    my ($input_node) = @_;

    my $archive_id = $input_node->findvalue("fm:$arch_id_field");
    return unless $archive_id;

    my $candidate = "$lib_arch_dir/$archive_id";
    return unless -d $candidate;

    logger('DEBUG', "directory $candidate exists");
    return $candidate;
}
! 202:
! 203:
# Convert the fields of one input entry into index.meta elements.
#
# Works in two passes over the child nodes of the entry: the first pass
# handles the generic fields (%gen_map) and the type field (%type_map),
# the second pass handles the fields of the detected sub type
# (%subtype_map).
#
# Parameters:
#   $input_node - node of the entry whose children are converted
#   $index_root - root element of the index.meta document being built
#   $index_doc  - the index.meta document (not used in this sub)
#
# Returns the number of elements created.  Returns 0 and counts an
# error when the language or the bibliographic type is unknown;
# elements created before the error stay in $index_root.
sub convert_bib {
    my ($input_node, $index_root, $index_doc) = @_;
    my $cnt = 0;
    my $type = "";

    # process general stuff first
    foreach my $n ($input_node->getChildNodes()) {
        my $name = $n->nodeName();
        my $val = $n->textContent();
        #logger('DEBUG', "  NODE: $name = '$val'");
        if (exists $gen_map{$name}) {
            # is a general field
            if ($name eq $lang_field) {
                # language field -> convert to iso code
                if (exists $lang_map{$val}) {
                    $val = $lang_map{$val};
                } else {
                    logger('ERROR', "unknown language: $val! skipping...");
                    $errcnt++;
                    return 0;
                }
            }
            create_element_path($gen_map{$name}, $index_root, $namespace)
                ->appendTextNode($val);
            $cnt++;
        } elsif (exists $type_map{$name}) {
            # is a type field
            my $type_path = $type_map{$name};
            $type = $val;
            # check with known types
            if (exists $subtype_map{$val}) {
                my $indextype = $subtype_map{$val}->{'_name'};
                create_element_path("$type_path=$indextype", $index_root, $namespace);
                $cnt++;
            } else {
                # double quotes, so that the offending type is actually
                # shown in the message
                logger('ERROR', "unknown bib type $val! skipping...");
                $errcnt++;
                return 0;
            }
        }
    }
    # process sub type fields
    if ($type) {
        foreach my $n ($input_node->getChildNodes()) {
            my $name = $n->nodeName();
            my $val = $n->textContent();
            #logger('DEBUG', "  NODE: $name = '$val'");
            if (exists $subtype_map{$type}->{$name}) {
                create_element_path($subtype_map{$type}->{$name}, $index_root, $namespace)
                    ->appendTextNode($val);
                $cnt++;
            }
        }
    }
    return $cnt;
}
! 261:
! 262:
! 263:
# Process every fm:ROW element below the given root.
#
# Parameters:
#   $input_root - root element of the input document
#
# Returns the number of entries that were visited.
sub process_all_fm_entries {
    my ($input_root) = @_;
    my $cnt = 0;

    foreach my $n ($input_root->findnodes('fm:ROW')) {
        logger('INFO', "processing entry $cnt ...");
        process_fm_entry($n);
        # advance the counter so that the log message identifies the
        # entry (numbering starts at 0)
        $cnt++;
    }
    return $cnt;
}
! 273:
! 274:
# Build an index.meta document for one input entry and write it into
# the entry's document directory.
#
# Parameters:
#   $input_node - node of the entry
#
# The entry is skipped (an error is counted and nothing is written)
# when no document directory is found or when convert_bib() creates no
# elements.
sub process_fm_entry {
    my ($input_node) = @_;

    # new document with root element <resource version="1.1" type="MPIWG">
    my $index_doc  = XML::LibXML::Document->createDocument('1.0', 'UTF-8');
    my $index_root = $index_doc->createElementNS($namespace, 'resource');
    foreach my $attr (['version', '1.1'], ['type', 'MPIWG']) {
        $index_root->addChild($index_doc->createAttributeNS($namespace, @$attr));
    }
    $index_doc->setDocumentElement($index_root);

    # the lookup of the document directory depends on the mode
    my $doc_dir = $cw_mode
        ? find_cw_dir($input_node)
        : find_arch_dir($input_node);
    unless ($doc_dir) {
        logger('ERROR', "document directory not found! skipping...");
        $errcnt++;
        return;
    }

    # only the first value returned by split_file_path is needed
    my ($docname) = split_file_path($doc_dir);

    # general information about the resource
    create_text_path('name',                  $docname,             $index_root, $namespace);
    create_text_path('archive-path',          $doc_dir,             $index_root, $namespace);
    create_text_path('archive-creation-date', stime(time),          $index_root, $namespace);
    create_text_path('creator',               'digigroup',          $index_root, $namespace);
    create_text_path('description',           'a scanned document', $index_root, $namespace);

    # acquisition
    create_text_path('meta/acquisition/date', stime(time), $index_root, $namespace);
    create_text_path('meta/acquisition/provider/provider-id', 'digigroup', $index_root, $namespace);
    create_text_path('meta/acquisition/provider/address',
                     'Max Planck Institute for the History of Science',
                     $index_root, $namespace);

    # media
    create_text_path('media-type',        'image',            $index_root, $namespace);
    create_text_path('meta/content-type', 'scanned document', $index_root, $namespace);

    # bibliographic fields; a count of 0 means error or nothing to convert
    my $converted = convert_bib($input_node, $index_root, $index_doc);
    if ($converted == 0) {
        logger('ERROR', "no bibliographic metadata!");
        $errcnt++;
        return;
    }

    # write new index.meta file
    write_xml($index_doc, "$doc_dir/index.meta");
}
! 325:
! 326:
! 327:
! 328:
! 329:
#######################################################
# Main
#

# Read the input file (a FileMaker XML dump).
my ($input_doc, $input_root) = read_xml($infile);

# Declare the prefix 'fm' on the root element for the root's own
# namespace URI; the XPath expressions in this file use that prefix.
my $fm_namespace = $input_root->namespaceURI();
$input_root->setNamespace($fm_namespace, 'fm', 1);

process_all_fm_entries($input_root);


# Summary of the run; any error makes the program exit with status 1.
logger('INFO', "$warncnt warnings");
logger('INFO', "$errcnt errors");
if ($errcnt > 0) {
    logger('ABORT', "there were errors!");
    exit 1;
}
logger('DONE', "done something successfully!");
! 351:
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>