Annotation of foxridge-archiver/harvestmeta.pl, revision 1.1

1.1     ! casties     1: #!/usr/local/bin/perl -w
        !             2: 
        !             3: use strict;
        !             4: use XML::SAX;
        !             5: use DBI;
        !             6: 
        !             7: use lib '/usr/local/mpiwg/archive';
        !             8: use MPIWGStor;
        !             9: use HarvestmetaHandler;
        !            10: 
        # Flush STDOUT after every write so progress messages show up at once.
        $| = 1;

        #######################################################
        # internal parameters
        #

        # version string printed in the start-up log line
        my $version = "0.1 (08.06.2004)";

        # command line parameters as parsed by MPIWGStor
        my $args = MPIWGStor::parseargs();

        # debug level; $debug is not declared in this file -- presumably it is
        # exported by MPIWGStor (TODO confirm)
        $debug = exists $args->{'debug'} ? $args->{'debug'} : 0;

        # XML namespace (not really implemented!)
        my $namespace = "";

        # true when the "purgedb" option was given: initDB() then empties the
        # tables before harvesting
        my $purgeDB = exists $args->{'purgedb'};

        # database connection; without a connection nothing can be done
        my $dbh = DBI->connect("dbi:Pg:dbname=storage", "archiver", "");
        unless ($dbh) {
            logger('ABORT', "unable to connect to database!");
            exit 1;
        }
        # changes are committed explicitly by the subroutines below
        $dbh->{AutoCommit} = 0;

        # statement handles; they stay undefined until initDB() prepares them
        my (
            $dbNextFileId,
            $dbNewFile,
            $dbNewMeta,
            $dbClearMeta,
            $dbFindFileName,
            $dbFindFilePath,
            $dbClearFile,
            $dbFindFileFlag,
            $dbFindFileFlagPath,
            $dbSetFileFlag,
            $dbClearAllFileFlag,
        );

        #######################################################
        # check parameters that were passed to the program
        #

        # root directory of the documents to harvest ("path" option)
        my $basedir = $$args{'path'};
        # test for "missing or empty" rather than plain truth so that a
        # directory that happens to be called "0" is still accepted
        if (! defined $basedir || $basedir eq '') {
            logger("ABORT", "no document directory given!");
            exit 1;
        }
        # strip ALL trailing slashes: the file names stored in the database are
        # built from $basedir, so "dir/" and "dir//" must yield the same names
        $basedir =~ s{/+\z}{};
        if (! -d $basedir) {
            logger("ABORT", "document directory \'$basedir\' doesn't exist!");
            exit 1;
        }

        # SAX parser for the index files; the handler collects the parsed
        # elements, which harvestFile() later reads with getData()
        my $metaParserHandler = HarvestmetaHandler->new;
        my $metaParser = XML::SAX::ParserFactory->parser(Handler => $metaParserHandler);

        !            69: 
        #######################################################
        # internal variables
        #

        # number of errors and number of warnings; both are reported at the end
        # of the run and a non-zero error count makes the program exit with 1
        my ($errcnt, $warncnt) = (0, 0);

        # number of directory entries counted by readAllFiles() and number of
        # index files handled by harvestFile()
        my ($fcnt, $idxcnt) = (0, 0);

        !            84: #######################################################
        !            85: # subroutines
        !            86: #
        !            87: 
        #
        # $count = readAllFiles($directory, $basedir)
        #
        # Walks the directory tree below $directory and calls harvestFile() for
        # every plain file named "index.meta". Entries whose name starts with a
        # dot or that are listed in %junk_files are skipped.
        # $basedir is the path relative to the starting directory; it is only
        # needed for the recursion and should be empty when called from outside.
        #
        # Returns the number of entries (files and directories) counted below
        # $directory; the file-level counter $fcnt grows by the same amount.
        # A directory that cannot be opened is logged, counted in $warncnt and
        # contributes 0 to the result.
        #
        sub readAllFiles {
            my ($directory, $basedir) = @_;
            my $cnt = 0;

            # use a lexical handle so that every recursion level owns its handle
            # (the former bareword DIR was one global handle, never closed)
            my $dh;
            if (! opendir $dh, $directory) {
                logger('WARNING', "unable to read directory '$directory': $!");
                $warncnt++;
                return 0;
            }
            # read the whole listing first so the handle can be closed before
            # descending into subdirectories
            my @dirfiles = readdir $dh;
            closedir $dh;

            foreach my $fn (@dirfiles) {
                # ignore names starting with a dot
                next if ($fn =~ /^\./);
                # ignore other silly files; %junk_files is not declared in this
                # file -- presumably exported by MPIWGStor (TODO confirm)
                next if ($junk_files{$fn});

                $cnt++;
                $fcnt++;
                my $f = "$directory/$fn";
                my $docf = ($basedir) ? "$basedir/$fn" : $fn;
                if (-f $f) {
                    # only index files are harvested
                    if ($fn eq "index.meta") {
                        harvestFile($fn, $directory);
                    }
                } elsif (-d _) {
                    # "_" reuses the stat result of the -f test above;
                    # recurse into the directory
                    $cnt += readAllFiles($f, $docf);
                }
            }
            return $cnt;
        }
        !           128: 
        #
        # cleanUnmarkedFiles($basepath)
        #
        # Deletes the file and meta entries of all files below $basepath that
        # have no row in the file_flags table, i.e. that were not flagged by
        # registerFile() during this run. Each file is removed in its own
        # transaction (meta rows first, then the file row).
        #
        sub cleanUnmarkedFiles {
            my ($basepath) = @_;

            # Build a LIKE pattern that matches only paths BELOW $basepath:
            # - the "/" before the wildcard keeps sibling directories that merely
            #   share the prefix (e.g. "/docs/abc" for "/docs/a") out of the
            #   result, which would otherwise be deleted as orphans;
            # - "%", "_" and "\" inside the path are escaped (backslash is the
            #   default LIKE escape character in PostgreSQL) so they only match
            #   themselves.
            (my $prefix = $basepath) =~ s{/+\z}{};
            $prefix =~ s/([\\%_])/\\$1/g;

            if (! $dbFindFileFlagPath->execute("$prefix/%")) {
                # without a result list nothing can be cleaned safely
                logger('ERROR', "unable to search unflagged files under '$basepath'");
                $errcnt++;
                return;
            }
            my $ids = $dbFindFileFlagPath->fetchall_arrayref;
            for my $i (@$ids) {
                my $id = $$i[0];
                logger('DEBUG', "cleaning file and meta of id: $id");
                $dbClearMeta->execute($id);
                $dbClearFile->execute($id);
                $dbh->commit;
            }
        }
        !           146: 
        #
        # harvestFile($filename, $filepath)
        #
        # Reads the index file $filename in directory $filepath and puts its
        # contents in the database. The file is registered with its modification
        # time; it is only parsed when registerFile() returns a file ID, i.e.
        # when the file is new or modified.
        #
        # A file that cannot be stat'ed is logged and counted in $warncnt. A
        # file that cannot be parsed is logged and counted in $errcnt, so one
        # broken index file does not abort the whole run. Every file that was
        # looked at increases $idxcnt.
        #
        sub harvestFile {
            my ($filename, $filepath) = @_;
            my $path = "$filepath/$filename";
            logger('DEBUG', "looking at file '$filename' at '$filepath'");

            # get file time (element 9 of stat is the modification time)
            my $mtime = (stat $path)[9];
            if (! defined $mtime) {
                logger('WARNING', "unable to stat file '$path': $!");
                $warncnt++;
                return;
            }
            my $filetime = stime($mtime);

            # register file in db
            my $fid = registerFile($path, $filetime);
            if ($fid) {
                # file is new/modified: parse index file
                my @data;
                my $ok = eval {
                    $metaParser->parse_uri($path);
                    @data = $metaParserHandler->getData();
                    1;
                };
                if ($ok) {
                    # report the element count, not the text "<last index>+1"
                    logger('DEBUG', "parsed " . scalar(@data) . " elements");
                    registerMeta($fid, @data);
                } else {
                    my $err = $@ || 'unknown error';
                    logger('ERROR', "unable to parse file '$path': $err");
                    $errcnt++;
                }
            }

            $idxcnt++;
            # progress message every 10 index files
            logger('INFO', "$idxcnt index files of $fcnt") if ($idxcnt % 10 == 0);
        }
        !           174: 
        #
        # $fileid = registerFile($filepath, $filetime)
        #
        # Returns the file ID for the file $filepath if its metadata has to be
        # (re)read, and 0 if an update is not necessary.
        #
        # - unknown file: a new ID is taken from the sequence and the file is
        #   inserted with $filetime; the new ID is returned.
        # - known file whose stored time is older than $filetime: the stored time
        #   is updated to $filetime and the existing ID is returned.
        # - known file that is current: 0 is returned.
        #
        # In all cases the file is flagged in file_flags so that
        # cleanUnmarkedFiles() keeps it.
        #
        sub registerFile {
            my ($filepath, $filetime) = @_;

            # look if file is in db
            $dbFindFileName->execute($filepath);
            my ($fileid, $mtime) = $dbFindFileName->fetchrow_array;
            # only the first row is needed; release the statement
            $dbFindFileName->finish;

            if ($fileid) {
                # file is in db: update flag
                $dbSetFileFlag->execute($fileid, 1);
                $dbh->commit;
                # times are compared as strings; this assumes s2stime() and the
                # caller's stime() produce the same sortable format (TODO confirm)
                my $stime = s2stime($mtime);
                if ($stime ge $filetime) {
                    # if its current return 0
                    logger('DEBUG', "file: $fileid is old! time: '$stime' (vs '$filetime')");
                    return 0;
                }
                logger('DEBUG', "file: $fileid is new! time: '$stime' (vs '$filetime')");
                # remember the new modification time; without this the file
                # would be treated as modified again on every following run
                $dbh->do("update files set mtime=? where id=?", undef, $filetime, $fileid);
                $dbh->commit;
                return $fileid;
            }

            # file is not in db: get a new file id
            $dbNextFileId->execute;
            ($fileid) = $dbNextFileId->fetchrow_array;
            $dbNextFileId->finish;
            logger('DEBUG', "DB newfile: id=$fileid filename=$filepath mtime=$filetime");
            $dbNewFile->execute($fileid, $filepath, $filetime);
            # update flag
            $dbSetFileFlag->execute($fileid, 1);
            $dbh->commit;
            return $fileid;
        }
        !           214: 
        #
        # registerMeta($fileid, @meta)
        #
        # Stores the metadata information @meta for $fileid in the database,
        # replacing any meta rows that already exist for that file.
        #
        # Every element of @meta is an array reference: element 0 is stored in
        # column "tags", element 2 in "attributes" and element 1 in "content".
        # The position in @meta is stored as "idx". Deleting the old rows and
        # inserting the new ones happens in one transaction.
        #
        sub registerMeta {
            my ($fileid, @meta) = @_;
            # log the number of elements (not the last index)
            logger('DEBUG', "DB newmeta: fileid=$fileid (" . scalar(@meta) . ")");

            # a modified file is harvested again under its old ID; remove the
            # rows of the previous version so they are not duplicated
            $dbClearMeta->execute($fileid);

            my $idx = 0;
            foreach my $keyval (@meta) {
                #logger('DEBUG', "  DB meta: $$keyval[0]=$$keyval[1]");
                $dbNewMeta->execute($fileid, $idx++, $$keyval[0], $$keyval[2], $$keyval[1]);
            }
            $dbh->commit;
            logger('INFO', "added $idx elements (file $fileid)");
        }
        !           231: 
        #
        # initDB()
        #
        # Prepares the database for a harvesting run:
        # - if $purgeDB is set, deletes all rows from tables "files" and "meta";
        # - creates the temporary table "file_flags" that records which files
        #   were seen in this run;
        # - prepares the statement handles used by the other subroutines.
        #
        # Logs an ABORT message, rolls back and exits with status 1 when a table
        # cannot be cleaned or the flag table cannot be created.
        #
        sub initDB {
            # clean tables
            if ($purgeDB) {
                foreach my $table ('files', 'meta') {
                    # do() returns undef on error ("0E0" for zero rows is true)
                    if (! defined $dbh->do("delete from $table")) {
                        logger('ABORT', "unable to clean table '$table'! (" . $dbh->errstr . ")");
                        $dbh->rollback;
                        exit 1;
                    }
                }
                $dbh->commit;
            }

            # the flags live in a temporary table, so every run starts with no
            # file flagged; without this table neither flagging nor the final
            # clean-up can work, so failing to create it is fatal
            if (! defined $dbh->do("create temporary table file_flags ( fileid integer primary key, flag integer )")) {
                logger('ABORT', "unable to create table file_flags! (" . $dbh->errstr . ")");
                $dbh->rollback;
                exit 1;
            }
            $dbh->commit;

            # prepare statements
            $dbNextFileId = $dbh->prepare("select nextval('files_id_seq')");
            $dbNewFile = $dbh->prepare("insert into files (id, filename, mtime) values (?,?,?)");
            $dbFindFileName = $dbh->prepare("select id,mtime from files where filename=?");
            # NOTE(review): $dbFindFilePath and $dbFindFileFlag are prepared but
            # not executed anywhere in this file
            $dbFindFilePath = $dbh->prepare("select id,filename,flag from files where filename like ?");
            $dbClearFile = $dbh->prepare("delete from files where id=?");
            $dbFindFileFlag = $dbh->prepare("select fileid from file_flags where flag=?");
            # files below a path that have no row in file_flags
            $dbFindFileFlagPath = $dbh->prepare("select id from files left outer join file_flags on files.id=file_flags.fileid where filename like ? and flag is null");
            $dbSetFileFlag = $dbh->prepare("insert into file_flags (fileid, flag) values (?,?)");
            $dbNewMeta = $dbh->prepare("insert into meta (fileid, idx, tags, attributes, content) values (?,?,?,?,?)");
            $dbClearMeta = $dbh->prepare("delete from meta where fileid=?");

        }
        !           267: 
        #######################################################
        # main
        #

        # announce program and version
        logger("INFO", "harvestmeta $version");

        # set up tables and statement handles
        initDB();

        # harvest every index file below $basedir, then delete the database
        # entries under $basedir that were not flagged during this run
        my $fnum = readAllFiles($basedir, "");
        cleanUnmarkedFiles($basedir);

        # summary of the run
        foreach my $line ("analysed $idxcnt of $fnum files!",
                          "$warncnt warnings",
                          "$errcnt errors") {
            logger("INFO", $line);
        }

        # any error makes the program end with a non-zero exit status
        if ($errcnt > 0) {
            logger("ABORT", "there were errors!");
            exit 1;
        }
        logger("DONE", "all index files read successfully!");

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>