diff harvestmeta.pl @ 0:30497c6a3eca

Initial revision
author casties
date Thu, 17 Jun 2004 17:58:42 +0200
parents
children 1a51f94d5dbd
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/harvestmeta.pl	Thu Jun 17 17:58:42 2004 +0200
@@ -0,0 +1,289 @@
+#!/usr/local/bin/perl -w
+
+use strict;
+use XML::SAX;
+use DBI;
+
+use lib '/usr/local/mpiwg/archive';
+use MPIWGStor;
+use HarvestmetaHandler;
+
+# make output unbuffered
+$|=1;
+
+#######################################################
+# internal parameters
+#
+
+# version string shown in the start-up log line
+my $version = "0.1 (08.06.2004)";
+
+# command line parameters as returned by MPIWGStor::parseargs
+# (used below as a hash reference)
+my $args = MPIWGStor::parseargs();
+
+# debug level: value of the 'debug' parameter, 0 if it was not given
+# NOTE(review): $debug is not declared in this script; presumably it is
+# exported by MPIWGStor -- confirm
+$debug = 0;
+if (exists $args->{'debug'}) {
+    $debug = $args->{'debug'};
+}
+
+# XML namespace (not really implemented!)
+my $namespace = "";
+
+# true if the 'purgedb' parameter was given: empty the tables before
+# harvesting (see initDB)
+my $purgeDB = exists $args->{'purgedb'};
+
+# database connection; the script gives up if it can not be opened
+my $dbh = DBI->connect("dbi:Pg:dbname=storage", "archiver", "");
+unless ($dbh) {
+    logger('ABORT', "unable to connect to database!");
+    exit 1;
+}
+# all changes are committed explicitly
+$dbh->{AutoCommit} = 0;
+
+# statement handles, filled in by initDB
+my ($dbNextFileId, $dbNewFile, $dbNewMeta, $dbClearMeta);
+my ($dbFindFileName, $dbFindFilePath, $dbClearFile);
+my ($dbFindFileFlag, $dbFindFileFlagPath, $dbSetFileFlag);
+# declared but not assigned anywhere in this script
+my $dbClearAllFileFlag;
+
+#######################################################
+# check parameters that were passed to the program
+#
+# directory to harvest, taken from the 'path' parameter
+my $basedir = $$args{'path'};
+if (! $basedir) {
+    logger("ABORT", "no document directory given!");
+    exit 1;
+}
+# strip all trailing slashes (not just one), so that the file names built
+# from $basedir look the same no matter how the path was typed
+$basedir =~ s/\/+$//;
+if (! -d $basedir) {
+    logger("ABORT", "document directory \'$basedir\' doesn't exist!");
+    exit 1;
+}
+
+# SAX parser for the index files; the handler collects the parsed data
+# which is later fetched with getData()
+my $metaParserHandler = HarvestmetaHandler->new;
+my $metaParser = XML::SAX::ParserFactory->parser(Handler => $metaParserHandler);
+
+#######################################################
+# internal variables
+#
+
+# number of errors
+my $errcnt = 0;
+# number of warnings
+my $warncnt = 0;
+
+# number of files on fs
+my $fcnt = 0;
+# number of index files
+my $idxcnt = 0;
+
+#######################################################
+# subroutines
+#
+
+#
+# $count = readAllFiles($directory, $basedir)
+#
+# walks the directory tree below $directory and calls harvestFile() for
+# every regular file named "index.meta". names starting with a dot and
+# names found in %junk_files are skipped.
+# $basedir is the path relative to the start directory; it is only handed
+# down the recursion and should be empty when called from outside.
+# returns the number of entries seen below $directory (0 if the directory
+# can not be read). every entry is also counted in $fcnt.
+#
+sub readAllFiles {
+    my ($directory, $basedir) = @_;
+    my $cnt = 0;
+
+    # lexical handle: the recursive calls below must not share (and thereby
+    # reopen) one global handle
+    my $dh;
+    if (! opendir $dh, $directory) {
+        # report the unreadable directory instead of skipping it silently
+        logger('WARNING', "unable to read directory '$directory': $!");
+        $warncnt++;
+        return 0;
+    }
+    # read the whole listing first so the handle can be closed before recursing
+    my @dirfiles = readdir $dh;
+    closedir $dh;
+
+    foreach my $fn (@dirfiles) {
+        # ignore names starting with a dot
+        next if ($fn =~ /^\./);
+        # ignore other silly files
+        # NOTE(review): %junk_files is not declared in this script;
+        # presumably it comes from MPIWGStor -- confirm
+        next if ($junk_files{$fn});
+
+        $cnt++;
+        $fcnt++;
+        my $f = "$directory/$fn";
+        my $docf = ($basedir) ? "$basedir/$fn" : $fn;
+        #logger('DEBUG', "fs_file: \"$f\"");
+        if (-f $f) {
+            # only index files are harvested, other files are just counted
+            if ($fn eq "index.meta") {
+                harvestFile($fn, $directory);
+            }
+        } elsif (-d _) {
+            # "_" reuses the stat result of the -f test above
+            # recurse into directory
+            $cnt += readAllFiles($f, $docf);
+        }
+    }
+    return $cnt;
+}
+
+#
+# cleanUnmarkedFiles($basepath)
+#
+# deletes the file and meta entries of all files below the directory
+# $basepath that have no entry in the file_flags table, i.e. that were
+# not seen during this run. every file is committed separately.
+#
+sub cleanUnmarkedFiles {
+    my ($basepath) = @_;
+    # "%" and "_" are wildcards in LIKE and backslash is the escape
+    # character, so they have to be escaped to be matched literally
+    (my $pattern = $basepath) =~ s/([\\%_])/\\$1/g;
+    # the "/" keeps the match inside the directory; without it "/a/lib"
+    # would also match (and delete) the entries of "/a/library"
+    $dbFindFileFlagPath->execute("$pattern/%");
+    my $ids = $dbFindFileFlagPath->fetchall_arrayref;
+    for my $i (@$ids) {
+        my $id = $$i[0];
+        logger('DEBUG', "cleaning file and meta of id: $id");
+        $dbClearMeta->execute($id);
+        $dbClearFile->execute($id);
+        $dbh->commit;
+    }
+}
+
+#
+# harvestFile($filename, $filepath)
+#
+# registers the index file $filename in directory $filepath in the
+# database and, if registerFile() reports it as new or modified, parses
+# it and stores its contents with registerMeta().
+# a file that can not be read or parsed is logged and counted in $errcnt;
+# the run continues with the next file.
+#
+sub harvestFile {
+    my ($filename, $filepath) = @_;
+    my $fullname = "$filepath/$filename";
+    logger('DEBUG', "looking at file '$filename' at '$filepath'");
+    # get file time (element 9 of the stat list is the modification time)
+    my $mtime = (stat($fullname))[9];
+    if (! defined $mtime) {
+        logger('ERROR', "unable to stat file '$fullname': $!");
+        $errcnt++;
+        return;
+    }
+    my $filetime = stime($mtime);
+    # register file in db
+    my $fid = registerFile($fullname, $filetime);
+    if ($fid) {
+        # file is new/modified
+        # parse index file; the parser throws an exception on broken XML,
+        # which must not end the whole harvest
+        my @data;
+        my $parsed = eval {
+            $metaParser->parse_uri($fullname);
+            @data = $metaParserHandler->getData();
+            1;
+        };
+        if ($parsed) {
+            logger('DEBUG', "parsed " . scalar(@data) . " elements");
+            registerMeta($fid, @data);
+        } else {
+            logger('ERROR', "unable to parse file '$fullname': $@");
+            $errcnt++;
+            # drop whatever is still uncommitted for this file
+            $dbh->rollback;
+        }
+    }
+    $idxcnt++;
+    # progress message for every 10th index file
+    logger('INFO', "$idxcnt index files of $fcnt") if ($idxcnt % 10 == 0);
+}
+
+#
+# $fileid = registerFile($filepath, $filetime)
+#
+# returns the file ID for the file $filepath and marks the file as seen
+# in the file_flags table. a file that is not yet in the database is
+# added. for a file whose stored time is older than $filetime the old
+# metadata is removed and the stored time is updated.
+# returns 0 if the stored time is not older than $filetime, i.e. an
+# update is not necessary.
+#
+sub registerFile {
+    my ($filepath, $filetime) = @_;
+    # look if file is in db
+    $dbFindFileName->execute($filepath);
+    my ($fileid, $mtime) = $dbFindFileName->fetchrow_array;
+    if ($fileid) {
+        # file is in db
+        # update flag
+        $dbSetFileFlag->execute($fileid, 1);
+        $dbh->commit;
+        my $stime = s2stime($mtime);
+        if ($stime ge $filetime) {
+            # if its current return 0
+            logger('DEBUG', "file: $fileid is old! time: '$stime' (vs '$filetime')");
+            return 0;
+        }
+        logger('DEBUG', "file: $fileid is new! time: '$stime' (vs '$filetime')");
+        # remove the outdated metadata and remember the new file time,
+        # otherwise the new rows would pile up on top of the old ones and
+        # the file would look modified on every run.
+        # both changes are left uncommitted on purpose: they become
+        # permanent with the commit in registerMeta(), together with the
+        # new metadata.
+        $dbClearMeta->execute($fileid);
+        $dbh->do("update files set mtime=? where id=?", undef, $filetime, $fileid);
+        return $fileid;
+    }
+
+    # file is not in db: get a new file id
+    $dbNextFileId->execute;
+    ($fileid) = $dbNextFileId->fetchrow_array;
+    logger('DEBUG', "DB newfile: id=$fileid filename=$filepath mtime=$filetime");
+    $dbNewFile->execute($fileid, $filepath, $filetime);
+    # update flag
+    $dbSetFileFlag->execute($fileid, 1);
+    $dbh->commit;
+    return $fileid;
+}
+
+#
+# registerMeta($fileid, @meta)
+#
+# stores the metadata entries @meta for the file $fileid in the database
+# and commits them. every entry is an array reference: element 0 goes to
+# the "tags" column, element 2 to "attributes" and element 1 to "content";
+# the position of the entry in @meta becomes its "idx".
+#
+sub registerMeta {
+    my ($fileid, @meta) = @_;
+    logger('DEBUG', "DB newmeta: fileid=$fileid ($#meta)");
+    my $stored = 0;
+    for my $pos (0 .. $#meta) {
+        my $entry = $meta[$pos];
+        #logger('DEBUG', "  DB meta: $entry->[0]=$entry->[1]");
+        $dbNewMeta->execute($fileid, $pos, $entry->[0], $entry->[2], $entry->[1]);
+        $stored = $pos + 1;
+    }
+    # one commit for all rows of this file
+    $dbh->commit;
+    logger('INFO', "added $stored elements (file $fileid)");
+}
+
+#
+# initDB()
+#
+# prepares the database for a harvesting run: empties the tables "files"
+# and "meta" if $purgeDB is set, creates the temporary table "file_flags"
+# and prepares the statement handles used by the other subroutines.
+# exits the program if a table can not be emptied or created.
+#
+sub initDB {
+    # clean tables
+    if ($purgeDB) {
+        foreach my $table ('files', 'meta') {
+            # do() returns undef on failure; check every statement, not
+            # only the last one
+            if (! defined $dbh->do("delete from $table")) {
+                logger('ABORT', "unable to clean table!");
+                exit 1;
+            }
+        }
+        $dbh->commit;
+    }
+
+    # the flags live in a fresh temporary table, so they start out empty;
+    # nothing works without it
+    if (! defined $dbh->do("create temporary table file_flags ( fileid integer primary key, flag integer )")) {
+        logger('ABORT', "unable to create table file_flags!");
+        exit 1;
+    }
+    $dbh->commit;
+
+    # prepare statements
+    $dbNextFileId = $dbh->prepare("select nextval('files_id_seq')");
+    $dbNewFile = $dbh->prepare("insert into files (id, filename, mtime) values (?,?,?)");
+    $dbFindFileName = $dbh->prepare("select id,mtime from files where filename=?");
+    # NOTE(review): this statement is not executed anywhere in this script
+    # and selects a "flag" column from "files" that no other statement
+    # here uses -- confirm that the column exists
+    $dbFindFilePath = $dbh->prepare("select id,filename,flag from files where filename like ?");
+    $dbClearFile = $dbh->prepare("delete from files where id=?");
+    $dbFindFileFlag = $dbh->prepare("select fileid from file_flags where flag=?");
+    # files matching a pattern that have no row in file_flags
+    $dbFindFileFlagPath = $dbh->prepare("select id from files left outer join file_flags on files.id=file_flags.fileid where filename like ? and flag is null");
+    $dbSetFileFlag = $dbh->prepare("insert into file_flags (fileid, flag) values (?,?)");
+    $dbNewMeta = $dbh->prepare("insert into meta (fileid, idx, tags, attributes, content) values (?,?,?,?,?)");
+    $dbClearMeta = $dbh->prepare("delete from meta where fileid=?");
+
+}
+
+#######################################################
+# main
+#
+
+# announce the program, then set up tables and statement handles
+logger("INFO", "harvestmeta $version");
+
+initDB();
+
+# walk the tree below $basedir and harvest every index file found
+my $fnum = readAllFiles($basedir, "");
+# remove the entries of files that were not seen in this run
+# (only those below $basedir)
+cleanUnmarkedFiles($basedir);
+
+# summary
+logger("INFO", "analysed $idxcnt of $fnum files!");
+logger("INFO", "$warncnt warnings");
+logger("INFO", "$errcnt errors");
+
+# a run with errors ends with exit status 1
+if ($errcnt > 0) {
+    logger("ABORT", "there were errors!");
+    exit 1;
+}
+logger("DONE", "all index files read successfully!");