Annotation of foxridge-archiver/harvestmeta.pl, revision 1.1.1.1

1.1       casties     1: #!/usr/local/bin/perl -w
                      2: 
                      3: use strict;
                      4: use XML::SAX;
                      5: use DBI;
                      6: 
                      7: use lib '/usr/local/mpiwg/archive';
                      8: use MPIWGStor;
                      9: use HarvestmetaHandler;
                     10: 
                     # flush output immediately instead of buffering it
                     $| = 1;

                     #######################################################
                     # internal parameters
                     #

                     # program version
                     my $version = "0.1 (08.06.2004)";

                     # command line parameters; used as a hash reference below
                     my $args = MPIWGStor::parseargs();

                     # debug level (0 unless a 'debug' parameter was given)
                     # NOTE(review): $debug is not declared in this file; presumably it
                     # is exported by MPIWGStor -- confirm.
                     $debug = (exists $args->{'debug'}) ? $args->{'debug'} : 0;

                     # XML namespace (not really implemented!)
                     my $namespace = "";

                     # delete and rebuild database when the 'purgedb' parameter is present
                     my $purgeDB = (exists $args->{'purgedb'});

                     # database connection
                     my $dbh = DBI->connect("dbi:Pg:dbname=storage", "archiver", "");
                     unless ($dbh) {
                         logger('ABORT', "unable to connect to database!");
                         exit 1;
                     }
                     # transactions are committed explicitly by the subroutines
                     $dbh->{AutoCommit} = 0;

                     # statement handles; those that are used get prepared in initDB()
                     # ($dbClearAllFileFlag is declared but never assigned in this file)
                     my ($dbNextFileId, $dbNewFile, $dbNewMeta, $dbClearMeta);
                     my ($dbFindFileName, $dbFindFilePath, $dbClearFile);
                     my ($dbFindFileFlag, $dbFindFileFlagPath, $dbSetFileFlag);
                     my $dbClearAllFileFlag;
                     51: 
                     #######################################################
                     # check parameters that were passed to the program
                     #

                     # document directory to harvest (the 'path' parameter)
                     my $basedir = $$args{'path'};
                     if (! $basedir) {
                         logger("ABORT", "no document directory given!");
                         exit 1;
                     }
                     # strip ALL trailing slashes (not just one), so that file names built
                     # as "$directory/$fn" never contain a doubled slash
                     $basedir =~ s{/+$}{};
                     if (! -d $basedir) {
                         logger("ABORT", "document directory \'$basedir\' doesn't exist!");
                         exit 1;
                     }

                     # SAX parser for index files; the handler collects the parsed data
                     # that harvestFile() later fetches with getData()
                     my $metaParserHandler = HarvestmetaHandler->new;
                     my $metaParser = XML::SAX::ParserFactory->parser(Handler => $metaParserHandler);
                     69: 
                     #######################################################
                     # internal variables
                     #

                     # counters for errors and warnings (reported at the end of the run)
                     my ($errcnt, $warncnt) = (0, 0);

                     # counters for the files seen on the file system and for the
                     # index files processed
                     my ($fcnt, $idxcnt) = (0, 0);
                     84: #######################################################
                     85: # subroutines
                     86: #
                     87: 
                     #
                     # $count = readAllFiles($directory, $basedir)
                     #
                     # Recursively walks the tree below $directory and calls harvestFile()
                     # for every regular file named "index.meta".
                     #
                     # $directory is the real directory to read. $basedir is the path
                     # relative to the starting point; it is only used for the recursion
                     # and should be empty when called from outside.
                     #
                     # Returns the number of entries seen (files and directories, not
                     # counting ignored names); the global $fcnt is incremented likewise.
                     # Returns 0 if $directory cannot be opened.
                     #
                     sub readAllFiles {
                         my ($directory, $basedir) = @_;
                         my $cnt = 0;

                         # use a lexical handle (the function recurses) and close it as
                         # soon as the listing has been read, so no handle stays open
                         # while descending into subdirectories
                         opendir(my $dh, $directory) or return 0;
                         my @dirfiles = readdir $dh;
                         closedir $dh;

                         foreach my $fn (@dirfiles) {
                             # ignore names starting with a dot
                             next if ($fn =~ /^\./);
                             # ignore other silly files
                             # NOTE(review): %junk_files is not defined in this file;
                             # presumably exported by MPIWGStor -- confirm.
                             next if ($junk_files{$fn});

                             $cnt++;
                             $fcnt++;
                             my $f = "$directory/$fn";
                             my $docf = ($basedir) ? "$basedir/$fn" : $fn;
                             if (-f $f) {
                                 # only index files are harvested
                                 harvestFile($fn, $directory) if ($fn eq "index.meta");
                             } elsif (-d _) {
                                 # "_" reuses the stat data of the -f test above
                                 # recurse into directory
                                 $cnt += readAllFiles($f, $docf);
                             }
                         }
                         return $cnt;
                     }
                    128: 
                    #
                    # cleanUnmarkedFiles($basepath)
                    #
                    # Deletes the file and meta entries of all files below $basepath
                    # that have no entry in file_flags, i.e. that were not seen during
                    # this run. Each deletion is committed separately.
                    #
                    sub cleanUnmarkedFiles {
                        my ($basepath) = @_;

                        # Build a LIKE pattern that matches only paths INSIDE $basepath:
                        # - drop trailing slashes and append "/%" so that a sibling
                        #   like "/docs/foobar" is not matched when cleaning "/docs/foo"
                        # - escape the LIKE metacharacters (backslash is the default
                        #   escape character) so "_" and "%" in the path match literally
                        (my $prefix = $basepath) =~ s{/+$}{};
                        $prefix =~ s/([\\%_])/\\$1/g;

                        $dbFindFileFlagPath->execute("$prefix/%");
                        my $ids = $dbFindFileFlagPath->fetchall_arrayref;
                        for my $row (@$ids) {
                            my $id = $row->[0];
                            logger('DEBUG', "cleaning file and meta of id: $id");
                            $dbClearMeta->execute($id);
                            $dbClearFile->execute($id);
                            $dbh->commit;
                        }
                    }
                    146: 
                    #
                    # harvestFile($filename, $filepath)
                    #
                    # Reads the index file $filename in directory $filepath and puts
                    # its contents in the database if registerFile() reports the file
                    # as new or modified.
                    #
                    # A file that cannot be stat'ed or parsed is logged, counted in
                    # $errcnt and skipped, so that one bad index file does not abort
                    # the whole harvest. Every file handled without error increments
                    # $idxcnt.
                    #
                    sub harvestFile {
                        my ($filename, $filepath) = @_;
                        my $file = "$filepath/$filename";
                        logger('DEBUG', "looking at file '$filename' at '$filepath'");

                        # get file time (element 9 of stat() is the modification time)
                        my $mtime = (stat($file))[9];
                        if (! defined $mtime) {
                            # e.g. the file vanished since the directory was read
                            logger('ERROR', "unable to stat file '$file': $!");
                            $errcnt++;
                            return;
                        }
                        my $filetime = stime($mtime);

                        # register file in db
                        my $fid = registerFile($file, $filetime);
                        if ($fid) {
                            # file is new/modified
                            # parse index file; the parser may die on malformed XML
                            my $parsed = eval { $metaParser->parse_uri($file); 1 };
                            if (! $parsed) {
                                logger('ERROR', "unable to parse file '$file': $@");
                                $errcnt++;
                                return;
                            }
                            my @data = $metaParserHandler->getData();
                            # scalar(@data), because "$#data+1" would interpolate
                            # as e.g. "4+1"
                            logger('DEBUG', "parsed " . scalar(@data) . " elements");
                            registerMeta($fid, @data);
                        }
                        $idxcnt++;
                        # progress report every 10 index files
                        logger('INFO', "$idxcnt index files of $fcnt") if ($idxcnt % 10 == 0);
                    }
                    174: 
                    #
                    # $fileid = registerFile($filepath, $filetime)
                    #
                    # Returns the file ID for the file $filepath. A file that is not
                    # yet in the database is added with a new ID. A known file whose
                    # stored time is older than $filetime gets its stored time updated.
                    # In all cases the file is flagged as seen in file_flags.
                    #
                    # Returns 0 if the file is known and current, i.e. no update of
                    # its metadata is necessary.
                    #
                    sub registerFile {
                        my ($filepath, $filetime) = @_;

                        # look if file is in db
                        $dbFindFileName->execute($filepath);
                        my ($fileid, $mtime) = $dbFindFileName->fetchrow_array;

                        if ($fileid) {
                            # file is in db
                            # update flag
                            $dbSetFileFlag->execute($fileid, 1);
                            $dbh->commit;
                            # times are compared as strings
                            # NOTE(review): assumes stime()/s2stime() yield strings
                            # that sort chronologically -- confirm in MPIWGStor.
                            my $stime = s2stime($mtime);
                            if ($stime ge $filetime) {
                                # if its current return 0
                                logger('DEBUG', "file: $fileid is old! time: '$stime' (vs '$filetime')");
                                return 0;
                            }
                            logger('DEBUG', "file: $fileid is new! time: '$stime' (vs '$filetime')");
                            # remember the new file time; otherwise the file would
                            # look modified again on every following run
                            $dbh->do("update files set mtime=? where id=?", undef,
                                     $filetime, $fileid);
                            $dbh->commit;
                            return $fileid;
                        }

                        # file is unknown: get a new file id
                        $dbNextFileId->execute;
                        ($fileid) = $dbNextFileId->fetchrow_array;
                        logger('DEBUG', "DB newfile: id=$fileid filename=$filepath mtime=$filetime");
                        $dbNewFile->execute($fileid, $filepath, $filetime);
                        # update flag
                        $dbSetFileFlag->execute($fileid, 1);
                        $dbh->commit;
                        return $fileid;
                    }
                    214: 
                    #
                    # registerMeta($fileid, @meta)
                    #
                    # Stores the metadata @meta for $fileid in the database, replacing
                    # any metadata stored for that file before.
                    #
                    # Each element of @meta is an array reference; elements 0, 2 and 1
                    # are passed (in this order) as the third to fifth value of the
                    # $dbNewMeta insert statement. The position in @meta is stored as
                    # the idx value.
                    #
                    sub registerMeta {
                        my ($fileid, @meta) = @_;
                        logger('DEBUG', "DB newmeta: fileid=$fileid ($#meta)");

                        # remove the old entries first: a modified file is harvested
                        # again under the same id and would otherwise end up with
                        # duplicate rows. Delete and inserts share one transaction.
                        $dbClearMeta->execute($fileid);

                        my $idx = 0;
                        foreach my $keyval (@meta) {
                            $dbNewMeta->execute($fileid, $idx++,
                                                $keyval->[0], $keyval->[2], $keyval->[1]);
                        }
                        $dbh->commit;
                        logger('INFO', "added $idx elements (file $fileid)");
                    }
                    231: 
                    #
                    # initDB()
                    #
                    # Prepares the database for a run: optionally empties the tables
                    # (if $purgeDB is set), creates the temporary table file_flags and
                    # prepares the statement handles used by the other subroutines.
                    # Exits the program with status 1 if a table cannot be cleaned or
                    # file_flags cannot be created.
                    #
                    sub initDB {
                        # clean tables
                        if ($purgeDB) {
                            # check every statement: $dbh->err only reflects the most
                            # recent call, so one check after both deletes could miss
                            # a failure of the first
                            foreach my $sql ("delete from files", "delete from meta") {
                                if (! defined $dbh->do($sql)) {
                                    logger('ABORT', "unable to clean table!");
                                    exit 1;
                                }
                            }
                            $dbh->commit;
                        }

                        # fresh (empty) flag table for this session; nothing works
                        # without it, so failing to create it is fatal
                        if (! defined $dbh->do("create temporary table file_flags ( fileid integer primary key, flag integer )")) {
                            logger('ABORT', "unable to create table file_flags!");
                            exit 1;
                        }
                        $dbh->commit;

                        # prepare statements
                        $dbNextFileId = $dbh->prepare("select nextval('files_id_seq')");
                        $dbNewFile = $dbh->prepare("insert into files (id, filename, mtime) values (?,?,?)");
                        $dbFindFileName = $dbh->prepare("select id,mtime from files where filename=?");
                        # NOTE(review): selects a "flag" column from files, although
                        # flags are kept in file_flags; the handle is not executed
                        # anywhere in this file -- confirm it is still needed.
                        $dbFindFilePath = $dbh->prepare("select id,filename,flag from files where filename like ?");
                        $dbClearFile = $dbh->prepare("delete from files where id=?");
                        $dbFindFileFlag = $dbh->prepare("select fileid from file_flags where flag=?");
                        # files matching a path pattern that have no flag entry
                        $dbFindFileFlagPath = $dbh->prepare("select id from files left outer join file_flags on files.id=file_flags.fileid where filename like ? and flag is null");
                        $dbSetFileFlag = $dbh->prepare("insert into file_flags (fileid, flag) values (?,?)");
                        $dbNewMeta = $dbh->prepare("insert into meta (fileid, idx, tags, attributes, content) values (?,?,?,?,?)");
                        $dbClearMeta = $dbh->prepare("delete from meta where fileid=?");

                    }
                    267: 
                    #######################################################
                    # main
                    #

                    logger("INFO", "harvestmeta $version");

                    # set up tables and statement handles
                    initDB();

                    # harvest every index file below $basedir ...
                    my $fnum = readAllFiles($basedir, "");
                    # ... then drop database entries (under $basedir) of files not seen
                    cleanUnmarkedFiles($basedir);

                    # summary
                    logger("INFO", "analysed $idxcnt of $fnum files!");
                    logger("INFO", "$warncnt warnings");
                    logger("INFO", "$errcnt errors");

                    # a run with errors ends with a non-zero exit status
                    unless ($errcnt > 0) {
                        logger("DONE", "all index files read successfully!");
                    } else {
                        logger("ABORT", "there were errors!");
                        exit 1;
                    }

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>