#!/usr/local/bin/perl -w

use strict;

use XML::SAX;
use DBI;

use lib '/usr/local/mpiwg/archive';
use MPIWGStor;
use HarvestmetaHandler;

# make output unbuffered
$|=1;

#######################################################
# internal parameters
#

# program version
my $version = "0.1 (08.06.2004)";

# read command line parameters
my $args = MPIWGStor::parseargs;

# debug level
# NOTE(review): $debug is not declared in this file; presumably it is
# exported by MPIWGStor (as are logger(), stime(), s2stime() and
# %junk_files) -- confirm against the module.
$debug = (exists $$args{'debug'}) ? $$args{'debug'} : 0;

# XML namespace (not really implemented!)
my $namespace = "";

# delete and rebuild database
my $purgeDB = (exists $$args{'purgedb'});

# database connection
my $dbh = DBI->connect("dbi:Pg:dbname=storage", "archiver", "");
if (! $dbh) {
    logger('ABORT', "unable to connect to database!");
    exit 1;
}
# all changes are committed explicitly
$dbh->{AutoCommit} = 0;

# statement handles, prepared in initDB()
my $dbNextFileId;
my $dbNewFile;
my $dbNewMeta;
my $dbClearMeta;
my $dbFindFileName;
my $dbFindFilePath;
my $dbClearFile;
my $dbFindFileFlag;
my $dbFindFileFlagPath;
my $dbSetFileFlag;
my $dbClearAllFileFlag;

#######################################################
# check parameters that were passed to the program
#
my $basedir = $$args{'path'};
if (! $basedir) {
    logger("ABORT", "no document directory given!");
    exit 1;
}
# strip trailing slashes (all of them, not just the last one)
$basedir =~ s{/+$}{};
if (! -d $basedir) {
    logger("ABORT", "document directory \'$basedir\' doesn't exist!");
    exit 1;
}

my $metaParserHandler = HarvestmetaHandler->new;
my $metaParser = XML::SAX::ParserFactory->parser(Handler => $metaParserHandler);

#######################################################
# internal variables
#

# number of errors
my $errcnt = 0;
# number of warnings
my $warncnt = 0;

# number of files on fs
my $fcnt = 0;
# number of index files
my $idxcnt = 0;

#######################################################
# subroutines
#

#
# $cnt = readAllFiles($realdir, $basedir)
#
# walks recursively through all files and directories below $realdir
# and calls harvestFile() for every file named "index.meta".
# Names starting with a dot and names listed in %junk_files are skipped.
# $basedir is the path relative to the start directory; it is only used
# for the recursion and should be empty when called from outside.
#
# returns the number of directory entries seen below $realdir (also
# added to the global $fcnt). A directory that can not be opened is
# logged, counted in $warncnt and contributes 0.
#
sub readAllFiles {
    my ($directory, $basedir) = @_;
    my $cnt = 0;

    # lexical handle: this sub recurses, so a shared bareword handle
    # would be re-opened by every nested call
    my $dirh;
    if (! opendir $dirh, $directory) {
        logger('WARNING', "unable to read directory '$directory': $!");
        $warncnt++;
        return 0;
    }
    # read the complete listing first so the handle can be closed
    # before descending into subdirectories
    my @dirfiles = readdir $dirh;
    closedir $dirh;

    foreach my $fn (@dirfiles) {
        # ignore names starting with a dot
        next if ($fn =~ /^\./);
        # ignore other silly files
        next if ($junk_files{$fn});

        $cnt++;
        $fcnt++;
        my $f = "$directory/$fn";
        my $docf = ($basedir) ? "$basedir/$fn" : $fn;
        #logger('DEBUG', "fs_file: \"$f\"");
        if (-f $f) {
            # plain file: only index files are of interest
            if ($fn eq "index.meta") {
                harvestFile($fn, $directory);
            }
        } elsif (-d _) {
            # directory ("_" reuses the stat result of the -f test above):
            # recurse into it
            $cnt += readAllFiles($f, $docf);
        }
    }
    return $cnt;
}


#
# cleanUnmarkedFiles($basepath)
#
# deletes the file and meta entries of all files below $basepath that
# have no entry in the file_flags table, i.e. that were not seen in
# this run.
#
sub cleanUnmarkedFiles {
    my ($basepath) = @_;

    # build a LIKE pattern that matches only paths *below* $basepath:
    # - "\", "%" and "_" in the path are escaped so they match literally
    #   (backslash is the default LIKE escape character in PostgreSQL)
    # - the "/" before the wildcard keeps siblings like "${basepath}2/..."
    #   from being matched and deleted
    (my $pattern = $basepath) =~ s{/+$}{};
    $pattern =~ s/([\\%_])/\\$1/g;

    if (! $dbFindFileFlagPath->execute("$pattern/%")) {
        logger('ERROR', "unable to query unflagged files below '$basepath'!");
        $errcnt++;
        return;
    }
    my $ids = $dbFindFileFlagPath->fetchall_arrayref || [];
    for my $row (@$ids) {
        my $id = $$row[0];
        logger('DEBUG', "cleaning file and meta of id: $id");
        $dbClearMeta->execute($id);
        $dbClearFile->execute($id);
        $dbh->commit;
    }
}

#
# harvestFile($filename, $filepath)
#
# reads the index file $filename at $filepath and puts the contents
# in the database.
#
# Files that can not be stat'ed or parsed are logged and counted in
# $errcnt; the run continues with the next file.
#
sub harvestFile {
    my ($filename, $filepath) = @_;
    my $file = "$filepath/$filename";
    logger('DEBUG', "looking at file '$filename' at '$filepath'");

    # get file time (element 9 of stat() is the modification time)
    my $mtime = (stat($file))[9];
    if (defined $mtime) {
        my $filetime = stime($mtime);
        # register file in db
        my $fid = registerFile($file, $filetime);
        if ($fid) {
            # file is new/modified
            # parse index file; guard against exceptions from the parser
            # so one broken file does not abort the whole run
            my @data;
            my $ok = eval {
                $metaParser->parse_uri($file);
                @data = $metaParserHandler->getData();
                1;
            };
            if ($ok) {
                logger('DEBUG', "parsed " . scalar(@data) . " elements");
                registerMeta($fid, @data);
            } else {
                my $err = $@ || 'unknown error';
                logger('ERROR', "unable to parse index file '$file': $err");
                $errcnt++;
                # NOTE(review): the file stays registered with its current
                # mtime, so it is only re-parsed after it changes again.
            }
        }
    } else {
        logger('ERROR', "unable to stat file '$file': $!");
        $errcnt++;
    }

    $idxcnt++;
    logger('INFO', "$idxcnt index files of $fcnt") if ($idxcnt % 10 == 0);
}

#
# $fileid = registerFile($filepath, $filetime)
#
# returns the file ID for the file $filepath. If necessary it
# will be added to the database. returns 0 if an update is not necessary.
#
# In every case the file is flagged as seen in the file_flags table.
# If the file is already known but older in the database than
# $filetime, its old meta entries are removed and the stored mtime is
# set to $filetime so the caller can add the fresh metadata.
#
sub registerFile {
    my ($filepath, $filetime) = @_;

    # look if file is in db
    $dbFindFileName->execute($filepath);
    my ($fileid, $mtime) = $dbFindFileName->fetchrow_array;

    if ($fileid) {
        # file is in db
        # update flag
        $dbSetFileFlag->execute($fileid, 1);
        $dbh->commit;
        # NOTE(review): s2stime() presumably converts the database time
        # into the format returned by stime() -- confirm in MPIWGStor.
        my $stime = s2stime($mtime);
        if ($stime ge $filetime) {
            # if its current return 0
            logger('DEBUG', "file: $fileid is old! time: '$stime' (vs '$filetime')");
            return 0;
        }
        logger('DEBUG', "file: $fileid is new! time: '$stime' (vs '$filetime')");
        # drop the outdated metadata and remember the new file time;
        # otherwise the meta rows would pile up and the file would be
        # harvested again on every run
        $dbClearMeta->execute($fileid);
        $dbh->do("update files set mtime=? where id=?", undef, $filetime, $fileid);
        $dbh->commit;
        return $fileid;
    }

    # file is unknown: get a new file id
    $dbNextFileId->execute;
    ($fileid) = $dbNextFileId->fetchrow_array;
    if (! $fileid) {
        logger('ERROR', "unable to get a new file id for '$filepath'!");
        $errcnt++;
        return 0;
    }
    logger('DEBUG', "DB newfile: id=$fileid filename=$filepath mtime=$filetime");
    $dbNewFile->execute($fileid, $filepath, $filetime);
    # update flag
    $dbSetFileFlag->execute($fileid, 1);
    $dbh->commit;
    return $fileid;
}

#
# registerMeta($fileid, @meta)
#
# adds the metadata information @meta for $fileid to the database.
#
# every element of @meta is an array reference; element 0 is stored in
# the column "tags", element 2 in "attributes" and element 1 in
# "content". The position in @meta is stored in the column "idx".
#
sub registerMeta {
    my ($fileid, @meta) = @_;
    logger('DEBUG', "DB newmeta: fileid=$fileid (" . scalar(@meta) . ")");
    my $idx = 0;
    foreach my $keyval (@meta) {
        #logger('DEBUG', "  DB meta: $$keyval[0]=$$keyval[1]");
        $dbNewMeta->execute($fileid, $idx++, $$keyval[0], $$keyval[2], $$keyval[1]);
    }
    $dbh->commit;
    logger('INFO', "added $idx elements (file $fileid)");
}

#
# initDB()
#
# prepares the database for a run: empties the tables "files" and
# "meta" if $purgeDB is set, creates the temporary table file_flags
# and prepares all statement handles.
# exits the program if a table can not be cleaned or created.
#
sub initDB {
    my $rv;
    # clean tables
    if ($purgeDB) {
        # check every statement on its own so the failing table is reported
        for my $table ('files', 'meta') {
            $rv = $dbh->do("delete from $table");
            if ($dbh->err) {
                logger('ABORT', "unable to clean table! ($table)");
                exit 1;
            }
        }
        $dbh->commit;
    }

    # clear flags: the flags live in a fresh temporary table
    $rv = $dbh->do("create temporary table file_flags ( fileid integer primary key, flag integer )");
    if (! defined $rv) {
        logger('ABORT', "unable to create table file_flags!");
        exit 1;
    }
    $dbh->commit;

    # prepare statements
    $dbNextFileId = $dbh->prepare("select nextval('files_id_seq')");
    $dbNewFile = $dbh->prepare("insert into files (id, filename, mtime) values (?,?,?)");
    $dbFindFileName = $dbh->prepare("select id,mtime from files where filename=?");
    # NOTE(review): this handle is never executed in this file
    $dbFindFilePath = $dbh->prepare("select id,filename,flag from files where filename like ?");
    $dbClearFile = $dbh->prepare("delete from files where id=?");
    $dbFindFileFlag = $dbh->prepare("select fileid from file_flags where flag=?");
    # files that have no flag entry, i.e. that were not seen in this run
    $dbFindFileFlagPath = $dbh->prepare("select id from files left outer join file_flags on files.id=file_flags.fileid where filename like ? and flag is null");
    $dbSetFileFlag = $dbh->prepare("insert into file_flags (fileid, flag) values (?,?)");
    $dbNewMeta = $dbh->prepare("insert into meta (fileid, idx, tags, attributes, content) values (?,?,?,?,?)");
    $dbClearMeta = $dbh->prepare("delete from meta where fileid=?");
}

#######################################################
# main
#

logger("INFO", "harvestmeta $version");

initDB();

# read and process all files under $basedir
my $fnum = readAllFiles($basedir, "");
# delete orphaned data (under $basedir)
cleanUnmarkedFiles($basedir);

logger("INFO", "analysed $idxcnt of $fnum files!");
logger("INFO", "$warncnt warnings");
logger("INFO", "$errcnt errors");
if ($errcnt > 0) {
    logger("ABORT", "there were errors!");
    exit 1;
} else {
    logger("DONE", "all index files read successfully!");
}