Annotation of foxridge-archiver/harvestmeta.pl, revision 1.6

1.6     ! casties     1: #!/usr/bin/perl -w
1.1       casties     2: 
                      3: use strict;
                      4: use XML::SAX;
1.2       casties     5: use XML::LibXML;
1.1       casties     6: use DBI;
                      7: 
1.3       casties     8: use lib '/usr/local/mpiwg/archive';
1.1       casties     9: use MPIWGStor;
                     10: use HarvestmetaHandler;
                     11: 
                     12: # make output unbuffered
                     13: $|=1;
                     14: 
                     15: #######################################################
                     16: # internal parameters
                     17: #
                     18: 
                     19: # program version
1.5       casties    20: my $version = "0.3 (27.9.2004)";
1.1       casties    21: 
                     22: # read command line parameters
                     23: my $args = MPIWGStor::parseargs;
                     24: 
1.2       casties    25: if (! scalar(%$args)) {
                     26:     print "harvestmeta $version\n";
                     27:     print "use: harvestmeta -path=dir\n";
                     28:     print "  reads all metadata info from directory dir into the database\n";
                     29:     print "alternative sources:\n";
                     30:     print "  -indexurl=url : read XML index and follow links\n";
                     31:     print "  -singleurl=url : read single index file\n";
                     32:     print "additional options:\n";
                     33:     print "  -baseurl=url : clean all URL sources relative to this base\n";
                     34:     print "  -debug : output debugging info\n";
                     35:     print "  -purgedb : clear whole database\n";
                     36:     exit 1;
                     37: }
                     38: 
1.1       casties    39: # debug level
                     40: $debug = (exists $$args{'debug'}) ? $$args{'debug'} : 0;
                     41: 
                     42: # XML namespace (not really implemented!)
                     43: my $namespace = "";
                     44: 
                     45: # delete and rebuild database
                     46: my $purgeDB = (exists $$args{'purgedb'});
                     47: 
                     48: # database connection
                     49: my $dbh = DBI->connect("dbi:Pg:dbname=storage", "archiver", "");
                     50: if (! $dbh) {
                     51:     logger('ABORT', "unable to connect to database!");
                     52:     exit 1;
                     53: }
                     54: $dbh->{AutoCommit} = 0;
                     55: my $dbNextFileId;
                     56: my $dbNewFile;
                     57: my $dbNewMeta;
                     58: my $dbClearMeta;
                     59: my $dbFindFileName;
                     60: my $dbFindFilePath;
                     61: my $dbClearFile;
                     62: my $dbFindFileFlag;
                     63: my $dbFindFileFlagPath;
                     64: my $dbSetFileFlag;
                     65: my $dbClearAllFileFlag;
                     66: 
                     67: #######################################################
                     68: # check parameters that were passed to the program
                     69: #
1.2       casties    70: my $baseurl;
                     71: my $indexurl;
                     72: my $singleurl;
1.1       casties    73: my $basedir = $$args{'path'};
1.2       casties    74: if ($basedir) {
                     75:     # strip trailing slashes
                     76:     $basedir =~ s/\/$//;
                     77:     if (! -d $basedir) {
                     78:    logger("ABORT", "document directory \'$basedir\' doesn't exist!");
                     79:    exit 1;
                     80:     }
                     81: } else {
                     82:     # use URL
                     83:     $baseurl = $$args{'baseurl'};
                     84:     $indexurl = $$args{'indexurl'};
                     85:     $singleurl = $$args{'url'};
                     86:     if (! (($indexurl)||($singleurl))) {
                     87:    logger("ABORT", "no document source given!");
                     88:    exit 1;
                     89:     }
1.1       casties    90: }
                     91: 
                     92: my $metaParserHandler = HarvestmetaHandler->new;
                     93: my $metaParser = XML::SAX::ParserFactory->parser(Handler => $metaParserHandler);
                     94: 
                     95: #######################################################
                     96: # internal variables
                     97: #
                     98: 
                     99: # number of errors
                    100: my $errcnt = 0;
                    101: # number of warnings
                    102: my $warncnt = 0;
                    103: 
                    104: # number of files on fs
                    105: my $fcnt = 0;
                    106: # number of index files
                    107: my $idxcnt = 0;
                    108: 
                    109: #######################################################
                    110: # subroutines
                    111: #
                    112: 
                    113: #
                    114: # readAllFiles($realdir, $basedir, \%files, \%dirs)
                    115: #
                    116: # reads all files and directories below $realdir and puts the
                    117: # files in %files and directories in %dirs
                    118: # $basedir is only for recursion, it should be empty when called 
                    119: # from outside
                    120: #
                    121: sub readAllFiles {
                    122:     my ($directory, $basedir) = @_;    
                    123:     my $cnt = 0;
                    124: 
                    125:     if (! opendir DIR, $directory) {
                    126:    return 0;
                    127:     }
                    128:     my @dirfiles = readdir DIR;
                    129:     foreach my $fn (@dirfiles) {
                    130:    # ignore names starting with a dot
                    131:    next if ($fn =~ /^\./);
                    132:    # ignore other silly files
                    133:    next if ($junk_files{$fn});
                    134: 
                    135:    $cnt++;
                    136:    $fcnt++;
                    137:    my $f = "$directory/$fn";
                    138:    my $docf = ($basedir) ? "$basedir/$fn" : $fn;
                    139:    #logger('DEBUG', "fs_file: \"$f\"");
                    140:    if (-f $f) {
                    141:        #logger("  is file");
                    142:        if ($fn eq "index.meta") {
                    143:        harvestFile($fn, $directory);
                    144:        }
                    145:    } elsif (-d _) {
                    146:        #logger("  is dir");
                    147:        # recurse into directory
                    148:        $cnt += readAllFiles($f, $docf);
                    149:    }
                    150:     }
                    151:     return $cnt;
                    152: }
                    153: 
                    154: #
                    155: # cleanUnmarkedFiles($basepath)
                    156: #
                    157: # deletes all unflagged file and meta entries.
                    158: #
                    159: sub cleanUnmarkedFiles {
                    160:     my ($basepath) = @_;
                    161:     my $rv = $dbFindFileFlagPath->execute("${basepath}%");
                    162:     my $ids = $dbFindFileFlagPath->fetchall_arrayref;
                    163:     for my $i (@$ids) {
                    164:    my $id = $$i[0];
                    165:    logger('DEBUG', "cleaning file and meta of id: $id");
                    166:    $dbClearMeta->execute($id);
                    167:    $dbClearFile->execute($id);
                    168:    $dbh->commit;
                    169:     }
                    170: }
                    171: 
                    172: #
                    173: # harvestFile($filename, $filepath)
                    174: #
                    175: # reads the index file $filename at $filepath and puts the contents
                    176: # in the database.
                    177: #
                    178: sub harvestFile {
                    179:     my ($filename, $filepath) = @_;
                    180:     logger('DEBUG', "looking at file '$filename' at '$filepath'");
                    181:     # get file time
                    182:     my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
                    183:    $atime,$mtime,$ctime,$blksize,$blocks)
                    184:    = stat("$filepath/$filename");
                    185:     my $filetime = stime($mtime);
                    186:     # register file in db
                    187:     my $fid = registerFile("$filepath/$filename", $filetime);
                    188:     if ($fid) {
                    189:    # file is new/modified
                    190:    # parse index file
1.2       casties   191:    my $ret = eval{$metaParser->parse_uri("$filepath/$filename")};
                    192:    if ($@) {
                    193:        my $errmsg = $@;
                    194:        logger('ERROR', "error reading XML file '$filepath/$filename' ($errmsg)");
                    195:        $errcnt++;
                    196:        return;
                    197:    }
1.1       casties   198:    my @data = $metaParserHandler->getData();
                    199:    logger('DEBUG', "parsed $#data+1 elements");
1.2       casties   200:    if ($data[0][0] eq "html") {
                    201:        # oops, wrong
                    202:        logger('WARNING', "invalid HTML content in file $filepath/$filename");
                    203:        return;
                    204:    }
                    205:    registerMeta($fid, @data);
                    206:     }
                    207:     $idxcnt++;
                    208:     logger('INFO', "$idxcnt index files of $fcnt") if ($idxcnt % 10 == 0) ;
                    209: }
                    210: 
                    211: #
                    212: # readURLIndex($baseurl)
                    213: #
                    214: # reads the XML index at $baseurl 
                    215: # and processes all its entries
                    216: #
                    217: sub readURLIndex {
                    218:     my ($baseurl) = @_;    
                    219:     my $cnt = 0;
                    220: 
                    221:     # parse index file
                    222:     logger('DEBUG', "parsing $baseurl ...");
                    223:     $metaParser->parse_uri($baseurl);
                    224:     my @indexdata = $metaParserHandler->getData();
                    225:     logger('INFO', "parsed $#indexdata+1 index entries");
                    226:    
                    227:     foreach my $me (@indexdata) {
                    228:    $cnt++;
                    229:    my ($tag, $val, $attr) = @$me;
1.4       casties   230:    my $meta = "";
                    231:    my $file = "";
1.5       casties   232:    my $mtime = "";
1.2       casties   233:    if ($tag =~ /index\/resource$/) {
                    234:        if ($attr =~ /metaLink=\"([^\"]+)\"/) {
                    235:        $meta = $1;
                    236:        }
                    237:        if ($attr =~ /resourceLink=\"([^\"]+)\"/) {
                    238:        $file = $1;
                    239:        }
1.5       casties   240:        if ($attr =~ /modificationDate=\"([^\"]+)\"/) {
                    241:        $mtime = $1;
                    242:        }
1.2       casties   243:        if ($meta =~ /^http:/) {
1.5       casties   244:        harvestURL($meta, $file, $mtime);
1.2       casties   245:        }
                    246:    }
                    247:     }
                    248:     return $cnt;
                    249: }
                    250: 
                    251: #
                    252: # harvestURL($metaurl, $fileurl)
                    253: #
                    254: # reads the index file from $metaurl and puts the contents
                    255: # in the database (under $filepath)
                    256: #
                    257: sub harvestURL {
1.5       casties   258:     my ($metaurl, $fileurl, $filetime) = @_;
1.2       casties   259:     logger('DEBUG', "fetching from url '$metaurl' for '$fileurl'");
1.5       casties   260:     # if no filetime then now
                    261:     $filetime = stime(time) unless ($filetime);
1.2       casties   262:     # register file in db
                    263:     my $fid = registerFile("$fileurl", $filetime);
                    264:     if ($fid) {
1.5       casties   265:    # try to parse index file
                    266:    my $ret = eval{$metaParser->parse_uri($metaurl)};
                    267:    if ($@) {
                    268:        my $errmsg = $@;
                    269:        logger('ERROR', "error reading XML from '$metaurl' ($errmsg)");
                    270:        $errcnt++;
                    271:        return;
                    272:    }
                    273:    my @data = $metaParserHandler->getData();
                    274:    logger('DEBUG', "parsed $#data+1 elements");
                    275:    if (lc $data[0][0] eq "html") {
                    276:        # oops, wrong
                    277:        logger('WARNING', "invalid HTML content from $metaurl");
                    278:        $warncnt++;
                    279:        return;
                    280:    }
1.2       casties   281:    # file is new/modified
1.1       casties   282:    registerMeta($fid, @data);
                    283:     }
                    284:     $idxcnt++;
                    285:     logger('INFO', "$idxcnt index files of $fcnt") if ($idxcnt % 10 == 0) ;
                    286: }
                    287: 
1.2       casties   288: 
1.1       casties   289: #
                    290: # $fileid = registerFile($filepath, $filetime)
                    291: #
                    292: # returns the file ID for the file $filepath. If necessary it
                    293: # will be added to the database. returns 0 if an update is not necessary.
                    294: #
                    295: sub registerFile {
                    296:     my ($filepath, $filetime) = @_;
                    297:     my $fileid = 0;
                    298:     # look if file is in db
                    299:     my $rv = $dbFindFileName->execute($filepath);
                    300:     my $mtime;
                    301:     ($fileid, $mtime) = $dbFindFileName->fetchrow_array;
                    302:     if ($fileid) {
                    303:    # file is in db
                    304:    # update flag
                    305:    $dbSetFileFlag->execute($fileid, 1);
                    306:    $dbh->commit;
                    307:    my $stime = s2stime($mtime);
                    308:    if ($stime ge $filetime) {
                    309:        # if its current return 0
                    310:        logger('DEBUG', "file: $fileid is old! time: '$stime' (vs '$filetime')");
                    311:        return 0;
                    312:    } else {
                    313:        logger('DEBUG', "file: $fileid is new! time: '$stime' (vs '$filetime')");
                    314:    }
                    315:     }
                    316:     if (! $fileid) {
                    317:    # get a new file id
                    318:    my $rv = $dbNextFileId->execute;
                    319:    ($fileid) = $dbNextFileId->fetchrow_array;
                    320:    logger('DEBUG', "DB newfile: id=$fileid filename=$filepath mtime=$filetime");
                    321:    $dbNewFile->execute($fileid, $filepath, $filetime);
                    322:    # update flag
                    323:    $dbSetFileFlag->execute($fileid, 1);
                    324:    $dbh->commit;
                    325:     }
                    326:     return $fileid;
                    327: }
                    328: 
                    329: #
                    330: # registerMeta($fileid, @meta)
                    331: #
                    332: # adds the metadata information @meta for $fileid to the database.
                    333: #
                    334: sub registerMeta {
                    335:     my ($fileid, @meta) = @_;
                    336:     logger('DEBUG', "DB newmeta: fileid=$fileid ($#meta)");
1.2       casties   337:     # clear out old data
                    338:     $dbClearMeta->execute($fileid);
1.1       casties   339:     my $idx = 0;
                    340:     foreach my $keyval (@meta) {
                    341:    #logger('DEBUG', "  DB meta: $$keyval[0]=$$keyval[1]");
                    342:    $dbNewMeta->execute($fileid, $idx++, $$keyval[0], $$keyval[2], $$keyval[1]);
                    343:     }
                    344:     $dbh->commit;
                    345:     logger('INFO', "added $idx elements (file $fileid)");
                    346: }
                    347: 
                    348: #
                    349: # initdb()
                    350: #
                    351: # initialises the database connection.
                    352: #
                    353: sub initDB {
                    354:     my $rv;
                    355:     # clean tables
                    356:     if ($purgeDB) {
                    357:    $rv = $dbh->do("delete from files");
                    358:    $rv = $dbh->do("delete from meta");
                    359:    if ($dbh->err) {
                    360:        logger('ABORT', "unable to clean table!");
                    361:        exit 1;
                    362:    }
                    363:    $dbh->commit;
                    364:     }
                    365: 
                    366:     # clear flags
                    367:     $rv = $dbh->do("create temporary table file_flags ( fileid integer primary key, flag integer )");
                    368:     $dbh->commit;
                    369: 
                    370:     # prepare statements
                    371:     $dbNextFileId = $dbh->prepare("select nextval('files_id_seq')");
                    372:     $dbNewFile = $dbh->prepare("insert into files (id, filename, mtime) values (?,?,?)");
                    373:     $dbFindFileName = $dbh->prepare("select id,mtime from files where filename=?");
                    374:     $dbFindFilePath = $dbh->prepare("select id,filename,flag from files where filename like ?");
                    375:     $dbClearFile = $dbh->prepare("delete from files where id=?");
                    376:     $dbFindFileFlag = $dbh->prepare("select fileid from file_flags where flag=?");
                    377:     $dbFindFileFlagPath = $dbh->prepare("select id from files left outer join file_flags on files.id=file_flags.fileid where filename like ? and flag is null");
                    378:     $dbSetFileFlag = $dbh->prepare("insert into file_flags (fileid, flag) values (?,?)");
                    379:     $dbNewMeta = $dbh->prepare("insert into meta (fileid, idx, tags, attributes, content) values (?,?,?,?,?)");
                    380:     $dbClearMeta = $dbh->prepare("delete from meta where fileid=?");
                    381: 
                    382: }
                    383: 
                    384: #######################################################
                    385: # main
                    386: #
                    387: 
                    388: logger("INFO", "harvestmeta $version");
                    389:  
                    390: initDB();
1.4       casties   391: my $fnum = 0;
1.1       casties   392: 
1.2       casties   393: if ($basedir) {
                    394:     # read and process all files under $basedir
                    395:     $fnum = readAllFiles($basedir, "");
                    396:     # delete orphaned data (under $basedir)
                    397:     cleanUnmarkedFiles($basedir);
1.4       casties   398: } elsif ($indexurl) {
1.2       casties   399:     # read and process XML index
                    400:     $fnum = readURLIndex($indexurl);
                    401:     if ($baseurl) {
                    402:    # delete orphaned data (under $baseurl)
                    403:    cleanUnmarkedFiles($baseurl);
                    404:     }
                    405: } elsif ($singleurl) {
                    406:     # read and process single XML url
                    407:     harvestURL($singleurl, $singleurl);
                    408:     $fnum = 1;
                    409:     if ($baseurl) {
                    410:    # delete orphaned data (under $baseurl)
                    411:    cleanUnmarkedFiles($baseurl);
                    412:     }
                    413: }
1.1       casties   414: 
                    415: logger("INFO", "analysed $idxcnt of $fnum files!");
                    416: logger("INFO", "$warncnt warnings");
                    417: logger("INFO", "$errcnt errors");
                    418: if ($errcnt > 0) {
                    419:     logger("ABORT", "there were errors!");
                    420:     exit 1;
                    421: } else {
                    422:     logger("DONE", "all index files read successfully!");
                    423: }

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>