Mercurial > hg > foxridge-archiver
diff harvestmeta.pl @ 0:30497c6a3eca
Initial revision
author | casties |
---|---|
date | Thu, 17 Jun 2004 17:58:42 +0200 |
parents | |
children | 1a51f94d5dbd |
line wrap: on
line diff
#!/usr/local/bin/perl -w
#
# harvestmeta.pl
#
# Walks a document directory tree, parses every "index.meta" file found
# with a SAX parser and stores the resulting metadata records in the
# "storage" PostgreSQL database (tables "files" and "meta").  Database
# entries below the given path whose index file no longer exists on
# disk are removed at the end of the run.
#
# Command line parameters (as returned by MPIWGStor::parseargs):
#   path     document directory to harvest (required)
#   debug    debug level (optional)
#   purgedb  if present, empty the "files" and "meta" tables first
#

use strict;
use XML::SAX;
use DBI;

use lib '/usr/local/mpiwg/archive';
use MPIWGStor;
use HarvestmetaHandler;

# make output unbuffered
$| = 1;

#######################################################
# internal parameters
#

# program version
my $version = "0.1 (08.06.2004)";

# read command line parameters
my $args = MPIWGStor::parseargs();

# debug level
# NOTE(review): $debug, %junk_files, logger(), stime() and s2stime() are
# not defined in this file; presumably they are exported by MPIWGStor.
$debug = (exists $$args{'debug'}) ? $$args{'debug'} : 0;

# XML namespace (not really implemented!)
my $namespace = "";

# delete and rebuild database
my $purgeDB = (exists $$args{'purgedb'});

# database connection
my $dbh = DBI->connect("dbi:Pg:dbname=storage", "archiver", "");
if (! $dbh) {
    logger('ABORT', "unable to connect to database!");
    exit 1;
}
# all changes are committed explicitly
$dbh->{AutoCommit} = 0;

# prepared statement handles (set up in initDB)
my $dbNextFileId;
my $dbNewFile;
my $dbUpdateFile;
my $dbNewMeta;
my $dbClearMeta;
my $dbFindFileName;
my $dbFindFilePath;
my $dbClearFile;
my $dbFindFileFlag;
my $dbFindFileFlagPath;
my $dbSetFileFlag;
my $dbClearAllFileFlag;

#######################################################
# check parameters that were passed to the program
#
my $basedir = $$args{'path'};
if (! $basedir) {
    logger("ABORT", "no document directory given!");
    exit 1;
}
# strip all trailing slashes
$basedir =~ s{/+$}{};
if (! -d $basedir) {
    logger("ABORT", "document directory \'$basedir\' doesn't exist!");
    exit 1;
}

my $metaParserHandler = HarvestmetaHandler->new;
my $metaParser = XML::SAX::ParserFactory->parser(Handler => $metaParserHandler);

#######################################################
# internal variables
#

# number of errors
my $errcnt = 0;
# number of warnings
my $warncnt = 0;

# number of files on fs
my $fcnt = 0;
# number of index files
my $idxcnt = 0;

#######################################################
# subroutines
#

#
# $cnt = readAllFiles($directory, $basedir)
#
# Recursively walks $directory and calls harvestFile() for every file
# named "index.meta".  Names starting with a dot and names listed in
# %junk_files are skipped.
# $basedir is the path relative to the starting directory; it is only
# used for the recursion and should be empty when called from outside.
#
# Returns the number of (non-ignored) directory entries seen.  Returns 0
# if $directory can not be opened.
#
sub readAllFiles {
    my ($directory, $basedir) = @_;
    my $cnt = 0;

    # use a lexical handle: this sub is recursive and a bareword handle
    # would be one global shared by every recursion level
    opendir(my $dh, $directory) or return 0;
    my @dirfiles = readdir $dh;
    closedir $dh;

    foreach my $fn (@dirfiles) {
        # ignore names starting with a dot
        next if ($fn =~ /^\./);
        # ignore other silly files
        next if ($junk_files{$fn});

        $cnt++;
        $fcnt++;
        my $f = "$directory/$fn";
        my $docf = ($basedir) ? "$basedir/$fn" : $fn;
        if (-f $f) {
            if ($fn eq "index.meta") {
                harvestFile($fn, $directory);
            }
        } elsif (-d _) {
            # "_" reuses the stat result of the -f test above
            # recurse into directory
            $cnt += readAllFiles($f, $docf);
        }
    }
    return $cnt;
}

#
# cleanUnmarkedFiles($basepath)
#
# Deletes the file and meta entries of all files below $basepath that
# have no entry in the file_flags table, i.e. that were not seen by
# registerFile() during this run.  Each deletion is committed separately.
#
sub cleanUnmarkedFiles {
    my ($basepath) = @_;
    # escape the LIKE metacharacters so that e.g. "_" in a directory name
    # only matches itself (backslash is the default LIKE escape character)
    (my $pattern = $basepath) =~ s/([\\%_])/\\$1/g;
    # match only entries inside $basepath, not siblings sharing the prefix
    # (harvestFile registers every file as "<directory>/<name>")
    $dbFindFileFlagPath->execute("$pattern/%");
    my $ids = $dbFindFileFlagPath->fetchall_arrayref;
    for my $row (@$ids) {
        my $id = $$row[0];
        logger('DEBUG', "cleaning file and meta of id: $id");
        $dbClearMeta->execute($id);
        $dbClearFile->execute($id);
        $dbh->commit;
    }
}

#
# harvestFile($filename, $filepath)
#
# Reads the index file $filename at $filepath and puts the contents
# in the database.  Files that are already in the database with a
# current modification time are not parsed again.
#
sub harvestFile {
    my ($filename, $filepath) = @_;
    my $fullname = "$filepath/$filename";
    logger('DEBUG', "looking at file '$filename' at '$filepath'");
    # get file time (element 9 of stat is mtime)
    my $mtime = (stat($fullname))[9];
    my $filetime = stime($mtime);
    # register file in db
    my $fid = registerFile($fullname, $filetime);
    if ($fid) {
        # file is new/modified
        # parse index file
        $metaParser->parse_uri($fullname);
        my @data = $metaParserHandler->getData();
        logger('DEBUG', "parsed " . scalar(@data) . " elements");
        # Replace instead of append: drop the records of a previous
        # harvest and remember the new file time, otherwise a modified
        # file would be harvested again (and its records duplicated) on
        # every run.  Both statements are committed together with the
        # new records by registerMeta().
        $dbClearMeta->execute($fid);
        $dbUpdateFile->execute($filetime, $fid);
        registerMeta($fid, @data);
    }
    $idxcnt++;
    logger('INFO', "$idxcnt index files of $fcnt") if ($idxcnt % 10 == 0);
}

#
# $fileid = registerFile($filepath, $filetime)
#
# Returns the file ID for the file $filepath.  If necessary it
# will be added to the database.  The file is flagged as seen in the
# file_flags table in any case.
# Returns 0 if an update is not necessary, i.e. the time stored in the
# database is not older than $filetime.
#
sub registerFile {
    my ($filepath, $filetime) = @_;
    # look if file is in db
    $dbFindFileName->execute($filepath);
    my ($fileid, $mtime) = $dbFindFileName->fetchrow_array;
    if ($fileid) {
        # file is in db
        # update flag
        $dbSetFileFlag->execute($fileid, 1);
        $dbh->commit;
        # times are compared as strings
        my $stime = s2stime($mtime);
        if ($stime ge $filetime) {
            # if its current return 0
            logger('DEBUG', "file: $fileid is old! time: '$stime' (vs '$filetime')");
            return 0;
        }
        logger('DEBUG', "file: $fileid is new! time: '$stime' (vs '$filetime')");
        return $fileid;
    }
    # file is unknown: get a new file id
    $dbNextFileId->execute;
    ($fileid) = $dbNextFileId->fetchrow_array;
    logger('DEBUG', "DB newfile: id=$fileid filename=$filepath mtime=$filetime");
    $dbNewFile->execute($fileid, $filepath, $filetime);
    # update flag
    $dbSetFileFlag->execute($fileid, 1);
    $dbh->commit;
    return $fileid;
}

#
# registerMeta($fileid, @meta)
#
# Adds the metadata information @meta for $fileid to the database and
# commits.  Each element of @meta is an array reference; element 0 is
# stored in the column "tags", element 2 in "attributes" and element 1
# in "content".  The position in @meta is stored in "idx".
#
sub registerMeta {
    my ($fileid, @meta) = @_;
    logger('DEBUG', "DB newmeta: fileid=$fileid (" . scalar(@meta) . ")");
    my $idx = 0;
    foreach my $keyval (@meta) {
        $dbNewMeta->execute($fileid, $idx++, $$keyval[0], $$keyval[2], $$keyval[1]);
    }
    $dbh->commit;
    logger('INFO', "added $idx elements (file $fileid)");
}

#
# initDB()
#
# Prepares the database for a run: empties the tables if requested
# (parameter purgedb), creates the temporary file_flags table and
# prepares all SQL statements.  Exits the program if one of the
# set-up statements fails.
#
sub initDB {
    # clean tables
    if ($purgeDB) {
        foreach my $table ('files', 'meta') {
            # do() returns undef on failure ("0E0" for zero rows)
            if (! defined $dbh->do("delete from $table")) {
                logger('ABORT', "unable to clean table!");
                exit 1;
            }
        }
        $dbh->commit;
    }

    # clear flags: flags live in a temporary table
    if (! defined $dbh->do("create temporary table file_flags ( fileid integer primary key, flag integer )")) {
        logger('ABORT', "unable to create flag table!");
        exit 1;
    }
    $dbh->commit;

    # prepare statements
    $dbNextFileId = $dbh->prepare("select nextval('files_id_seq')");
    $dbNewFile = $dbh->prepare("insert into files (id, filename, mtime) values (?,?,?)");
    $dbUpdateFile = $dbh->prepare("update files set mtime=? where id=?");
    $dbFindFileName = $dbh->prepare("select id,mtime from files where filename=?");
    # (not executed anywhere in this script)
    $dbFindFilePath = $dbh->prepare("select id,filename,flag from files where filename like ?");
    $dbClearFile = $dbh->prepare("delete from files where id=?");
    # (not executed anywhere in this script)
    $dbFindFileFlag = $dbh->prepare("select fileid from file_flags where flag=?");
    $dbFindFileFlagPath = $dbh->prepare("select id from files left outer join file_flags on files.id=file_flags.fileid where filename like ? and flag is null");
    $dbSetFileFlag = $dbh->prepare("insert into file_flags (fileid, flag) values (?,?)");
    $dbNewMeta = $dbh->prepare("insert into meta (fileid, idx, tags, attributes, content) values (?,?,?,?,?)");
    $dbClearMeta = $dbh->prepare("delete from meta where fileid=?");
}

#######################################################
# main
#

logger("INFO", "harvestmeta $version");

initDB();

# read and process all files under $basedir
my $fnum = readAllFiles($basedir, "");
# delete orphaned data (under $basedir)
cleanUnmarkedFiles($basedir);

logger("INFO", "analysed $idxcnt of $fnum files!");
logger("INFO", "$warncnt warnings");
logger("INFO", "$errcnt errors");
if ($errcnt > 0) {
    logger("ABORT", "there were errors!");
    exit 1;
} else {
    logger("DONE", "all index files read successfully!");
}