#!/usr/local/bin/perl -w
use strict;
use XML::SAX;
use DBI;
use lib '/usr/local/mpiwg/archive';
use MPIWGStor;
use HarvestmetaHandler;
# flush output immediately instead of buffering it
$| = 1;

#######################################################
# internal parameters
#

# program version (shown in the start-up log line)
my $version = "0.1 (08.06.2004)";

# command line parameters as returned by MPIWGStor::parseargs
my $args = MPIWGStor::parseargs();

# debug level: value of the 'debug' parameter, 0 if it was not given
# NOTE(review): $debug is not declared in this file -- presumably it is
# exported by MPIWGStor; confirm
$debug = exists $args->{'debug'} ? $args->{'debug'} : 0;

# XML namespace (not really implemented!)
my $namespace = "";

# true if the 'purgedb' parameter was given: delete and rebuild database
my $purgeDB = exists $args->{'purgedb'};

# database connection; the script cannot do anything without it
my $dbh = DBI->connect("dbi:Pg:dbname=storage", "archiver", "");
unless ($dbh) {
    logger('ABORT', "unable to connect to database!");
    exit 1;
}
# all changes are committed explicitly by the subroutines
$dbh->{AutoCommit} = 0;

# statement handles, filled in by initDB()
my (
    $dbNextFileId,
    $dbNewFile,
    $dbNewMeta,
    $dbClearMeta,
    $dbFindFileName,
    $dbFindFilePath,
    $dbClearFile,
    $dbFindFileFlag,
    $dbFindFileFlagPath,
    $dbSetFileFlag,
    $dbClearAllFileFlag,
);
#######################################################
# check parameters that were passed to the program
#
# document directory to harvest (mandatory 'path' parameter)
my $basedir = $$args{'path'};
if (! $basedir) {
    logger("ABORT", "no document directory given!");
    exit 1;
}
# strip ALL trailing slashes (not just one) so that the file names stored
# in the database do not depend on how the path was typed.
# Note: a path consisting only of slashes becomes empty and is rejected
# by the directory test below.
$basedir =~ s{/+\z}{};
if (! -d $basedir) {
    logger("ABORT", "document directory \'$basedir\' doesn't exist!");
    exit 1;
}
# SAX handler collecting the contents of an index file and the parser
# feeding it; both are reused for every index file
my $metaParserHandler = HarvestmetaHandler->new;
my $metaParser = XML::SAX::ParserFactory->parser(Handler => $metaParserHandler);
#######################################################
# internal variables
#
# run statistics, all starting at zero:
#   $errcnt  - number of errors
#   $warncnt - number of warnings
#   $fcnt    - number of files on fs
#   $idxcnt  - number of index files
my ($errcnt, $warncnt, $fcnt, $idxcnt) = (0, 0, 0, 0);
#######################################################
# subroutines
#
#
# $count = readAllFiles($directory, $basedir)
#
# Recursively walks the directory tree below $directory and calls
# harvestFile() for every plain file named "index.meta".
# Names starting with a dot and names listed in %junk_files are skipped.
# $basedir is the path relative to the starting directory; it is only
# for recursion, it should be empty when called from outside.
# Every entry looked at is also added to the global counter $fcnt.
# Returns the number of entries seen below $directory; a directory that
# cannot be opened is logged, counted in $warncnt and contributes 0.
#
sub readAllFiles {
    my ($directory, $basedir) = @_;
    my $cnt = 0;

    # use a lexical handle: a bareword DIR handle is global and would be
    # shared by every level of the recursion
    my $dh;
    if (! opendir $dh, $directory) {
        # do not skip unreadable directories silently: their database
        # entries will look orphaned to cleanUnmarkedFiles()
        logger('WARNING', "unable to read directory '$directory': $!");
        $warncnt++;
        return 0;
    }
    # read everything first so the handle can be closed before recursing
    my @dirfiles = readdir $dh;
    closedir $dh;

    foreach my $fn (@dirfiles) {
        # ignore names starting with a dot
        next if ($fn =~ /^\./);
        # ignore other silly files
        # (%junk_files is not defined in this file -- presumably exported
        # by MPIWGStor; TODO confirm)
        next if ($junk_files{$fn});
        $cnt++;
        $fcnt++;
        my $f = "$directory/$fn";
        my $docf = ($basedir) ? "$basedir/$fn" : $fn;
        if (-f $f) {
            # only index files are of interest
            if ($fn eq "index.meta") {
                harvestFile($fn, $directory);
            }
        } elsif (-d _) {
            # "_" reuses the stat result of the -f test above
            # recurse into directory
            $cnt += readAllFiles($f, $docf);
        }
    }
    return $cnt;
}
#
# cleanUnmarkedFiles($basepath)
#
# deletes the file and meta entries of all files below the directory
# $basepath that have no entry in file_flags, i.e. that were not seen
# (flagged by registerFile) during this run.
# $basepath must not end with a slash.
#
sub cleanUnmarkedFiles {
    my ($basepath) = @_;

    # escape the LIKE wildcards "%" and "_" and the escape character "\"
    # so that the path itself is matched literally
    (my $pattern = $basepath) =~ s/([\\%_])/\\$1/g;
    # anchor the match at the directory boundary; without the slash
    # "/a/doc" would also select (and delete) entries below "/a/docs"
    $pattern .= '/%';

    my $rv = $dbFindFileFlagPath->execute($pattern);
    if (! $rv) {
        logger('ERROR', "unable to search for unmarked files below '$basepath'!");
        $errcnt++;
        return;
    }
    my $ids = $dbFindFileFlagPath->fetchall_arrayref;
    for my $i (@$ids) {
        my $id = $$i[0];
        logger('DEBUG', "cleaning file and meta of id: $id");
        $dbClearMeta->execute($id);
        $dbClearFile->execute($id);
        # one transaction per file
        $dbh->commit;
    }
}
#
# harvestFile($filename, $filepath)
#
# reads the index file $filename at $filepath and puts the contents
# in the database.
# The file is registered with its modification time; it is only parsed
# when registerFile() returns a file id (new or modified file).
# A file that cannot be stat'ed is skipped with a warning; a file that
# cannot be parsed is logged and counted in $errcnt instead of aborting
# the whole run.
# Increments $idxcnt and logs the progress for every 10th index file.
#
sub harvestFile {
    my ($filename, $filepath) = @_;
    my $fullpath = "$filepath/$filename";
    logger('DEBUG', "looking at file '$filename' at '$filepath'");

    # get file time (element 9 of stat() is the modification time)
    my $mtime = (stat($fullpath))[9];
    if (defined $mtime) {
        my $filetime = stime($mtime);
        # register file in db
        my $fid = registerFile($fullpath, $filetime);
        if ($fid) {
            # file is new/modified
            # parse index file; the parser is expected to die on malformed
            # XML, so keep one bad file from killing the harvest
            my @data;
            my $parsed = eval {
                $metaParser->parse_uri($fullpath);
                @data = $metaParserHandler->getData();
                1;
            };
            if ($parsed) {
                # scalar(@data) is the element count ("$#data+1" inside a
                # string would print the last index followed by "+1")
                logger('DEBUG', "parsed " . scalar(@data) . " elements");
                registerMeta($fid, @data);
            } else {
                my $err = $@ || 'unknown error';
                logger('ERROR', "unable to parse '$fullpath': $err");
                $errcnt++;
            }
        }
    } else {
        logger('WARNING', "unable to stat '$fullpath': $!");
        $warncnt++;
    }

    $idxcnt++;
    logger('INFO', "$idxcnt index files of $fcnt") if ($idxcnt % 10 == 0) ;
}
#
# $fileid = registerFile($filepath, $filetime)
#
# returns the file ID for the file $filepath. If necessary it
# will be added to the database. returns 0 if an update is not necessary.
# In every case the file is flagged in file_flags as seen in this run.
# For a file that is already known but has a newer $filetime the old
# meta entries are deleted and the stored mtime is updated, so that the
# caller can add the fresh metadata under the same ID.
#
sub registerFile {
    my ($filepath, $filetime) = @_;

    # look if file is in db
    $dbFindFileName->execute($filepath);
    my ($fileid, $mtime) = $dbFindFileName->fetchrow_array;

    if ($fileid) {
        # file is in db
        # update flag
        $dbSetFileFlag->execute($fileid, 1);
        $dbh->commit;
        my $stime = s2stime($mtime);
        # time stamps are compared as strings
        if ($stime ge $filetime) {
            # if its current return 0
            logger('DEBUG', "file: $fileid is old! time: '$stime' (vs '$filetime')");
            return 0;
        }
        logger('DEBUG', "file: $fileid is new! time: '$stime' (vs '$filetime')");
        # drop the outdated metadata and remember the new file time;
        # otherwise the new metadata would be added on top of the old
        # and the file would look modified again on every run
        $dbClearMeta->execute($fileid);
        $dbh->do("update files set mtime=? where id=?", undef, $filetime, $fileid);
        $dbh->commit;
        return $fileid;
    }

    # file is not in db: get a new file id
    $dbNextFileId->execute;
    ($fileid) = $dbNextFileId->fetchrow_array;
    logger('DEBUG', "DB newfile: id=$fileid filename=$filepath mtime=$filetime");
    $dbNewFile->execute($fileid, $filepath, $filetime);
    # update flag
    $dbSetFileFlag->execute($fileid, 1);
    $dbh->commit;
    return $fileid;
}
#
# registerMeta($fileid, @meta)
#
# stores the metadata entries @meta for the file $fileid in the meta
# table and commits. Every entry of @meta is an array reference; its
# elements 0, 1 and 2 go into the columns tags, content and attributes.
# The position of the entry in @meta is stored as idx.
#
sub registerMeta {
    my ($fileid, @meta) = @_;
    logger('DEBUG', "DB newmeta: fileid=$fileid ($#meta)");
    my $count = 0;
    for my $entry (@meta) {
        my ($tags, $content, $attributes) = @{$entry}[0, 1, 2];
        # column order of the insert is: fileid, idx, tags, attributes, content
        $dbNewMeta->execute($fileid, $count, $tags, $attributes, $content);
        $count++;
    }
    $dbh->commit;
    logger('INFO', "added $count elements (file $fileid)");
}
#
# initDB()
#
# prepares the database for a harvest run: optionally empties the
# tables files and meta (if $purgeDB is set), creates the temporary
# table file_flags and prepares all statement handles.
# Logs an ABORT message and exits with status 1 if a table cannot be
# cleaned or file_flags cannot be created.
#
sub initDB {
    # clean tables
    if ($purgeDB) {
        # check every delete on its own; do() returns undef on failure
        for my $table ('files', 'meta') {
            if (! defined $dbh->do("delete from $table")) {
                logger('ABORT', "unable to clean table!");
                exit 1;
            }
        }
        $dbh->commit;
    }
    # clear flags: the flags live in a temporary table that starts out empty
    my $rv = $dbh->do("create temporary table file_flags ( fileid integer primary key, flag integer )");
    if (! defined $rv) {
        # without this table neither flagging nor the clean-up can work
        logger('ABORT', "unable to create table file_flags!");
        exit 1;
    }
    $dbh->commit;
    # prepare statements
    $dbNextFileId = $dbh->prepare("select nextval('files_id_seq')");
    $dbNewFile = $dbh->prepare("insert into files (id, filename, mtime) values (?,?,?)");
    $dbFindFileName = $dbh->prepare("select id,mtime from files where filename=?");
    # NOTE(review): selects "flag" from files, while flags are otherwise
    # kept in file_flags -- confirm the column exists; the handle is not
    # executed anywhere in this part of the file
    $dbFindFilePath = $dbh->prepare("select id,filename,flag from files where filename like ?");
    $dbClearFile = $dbh->prepare("delete from files where id=?");
    $dbFindFileFlag = $dbh->prepare("select fileid from file_flags where flag=?");
    # files matching a path pattern that have no entry in file_flags
    $dbFindFileFlagPath = $dbh->prepare("select id from files left outer join file_flags on files.id=file_flags.fileid where filename like ? and flag is null");
    $dbSetFileFlag = $dbh->prepare("insert into file_flags (fileid, flag) values (?,?)");
    $dbNewMeta = $dbh->prepare("insert into meta (fileid, idx, tags, attributes, content) values (?,?,?,?,?)");
    $dbClearMeta = $dbh->prepare("delete from meta where fileid=?");
}
#######################################################
# main
#
logger("INFO", "harvestmeta $version");
# set up tables and statement handles
initDB();
# read and process all files under $basedir
my $fnum = readAllFiles($basedir, "");
# delete orphaned data (under $basedir)
cleanUnmarkedFiles($basedir);
# the subroutines commit their own work, so anything still pending is
# discarded (as it would be on an implicit disconnect) before the
# connection is closed explicitly
$dbh->rollback;
$dbh->disconnect;
# summary
logger("INFO", "analysed $idxcnt of $fnum files!");
logger("INFO", "$warncnt warnings");
logger("INFO", "$errcnt errors");
if ($errcnt > 0) {
    logger("ABORT", "there were errors!");
    exit 1;
} else {
    logger("DONE", "all index files read successfully!");
}
# FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>