view harvestmeta.pl @ 18:fdf4ceb36db1

fixed problem with dir names in metacheck new version of metacheck defaults to not change index file new version of archiver uses new version of metacheck
author casties
date Tue, 20 Sep 2005 19:24:57 +0200
parents 65895eec9e30
children 2208ed7370cb
line wrap: on
line source

#!/usr/local/bin/perl -w

use strict;
use XML::SAX;
use XML::LibXML;
use DBI;

use lib '/usr/local/mpiwg/archive';
use MPIWGStor;
use HarvestmetaHandler;

# make output unbuffered
$|=1;

#######################################################
# internal parameters
#

# program version
my $version = "0.3 (27.9.2004)";

# read command line parameters
my $args = MPIWGStor::parseargs;

if (! scalar(%$args)) {
    print "harvestmeta $version\n";
    print "use: harvestmeta -path=dir\n";
    print "  reads all metadata info from directory dir into the database\n";
    print "alternative sources:\n";
    print "  -indexurl=url : read XML index and follow links\n";
    print "  -singleurl=url : read single index file\n";
    print "additional options:\n";
    print "  -baseurl=url : clean all URL sources relative to this base\n";
    print "  -debug : output debugging info\n";
    print "  -purgedb : clear whole database\n";
    exit 1;
}

# debug level
$debug = (exists $$args{'debug'}) ? $$args{'debug'} : 0;

# XML namespace (not really implemented!)
my $namespace = "";

# delete and rebuild database
my $purgeDB = (exists $$args{'purgedb'});

# database connection
my $dbh = DBI->connect("dbi:Pg:dbname=storage", "archiver", "");
if (! $dbh) {
    logger('ABORT', "unable to connect to database!");
    exit 1;
}
$dbh->{AutoCommit} = 0;
my $dbNextFileId;
my $dbNewFile;
my $dbNewMeta;
my $dbClearMeta;
my $dbFindFileName;
my $dbFindFilePath;
my $dbClearFile;
my $dbFindFileFlag;
my $dbFindFileFlagPath;
my $dbSetFileFlag;
my $dbClearAllFileFlag;

#######################################################
# check parameters that were passed to the program
#
my $baseurl;
my $indexurl;
my $singleurl;
my $basedir = $$args{'path'};
if ($basedir) {
    # strip trailing slashes
    $basedir =~ s/\/$//;
    if (! -d $basedir) {
	logger("ABORT", "document directory \'$basedir\' doesn't exist!");
	exit 1;
    }
} else {
    # use URL
    $baseurl = $$args{'baseurl'};
    $indexurl = $$args{'indexurl'};
    $singleurl = $$args{'url'};
    if (! (($indexurl)||($singleurl))) {
	logger("ABORT", "no document source given!");
	exit 1;
    }
}

my $metaParserHandler = HarvestmetaHandler->new;
my $metaParser = XML::SAX::ParserFactory->parser(Handler => $metaParserHandler);

#######################################################
# internal variables
#

# number of errors
my $errcnt = 0;
# number of warnings
my $warncnt = 0;

# number of files on fs
my $fcnt = 0;
# number of index files
my $idxcnt = 0;

#######################################################
# subroutines
#

#
# readAllFiles($realdir, $basedir, \%files, \%dirs)
#
# reads all files and directories below $realdir and puts the
# files in %files and directories in %dirs
# $basedir is only for recursion, it should be empty when called 
# from outside
#
sub readAllFiles {
    my ($directory, $basedir) = @_;    
    my $cnt = 0;

    if (! opendir DIR, $directory) {
	return 0;
    }
    my @dirfiles = readdir DIR;
    foreach my $fn (@dirfiles) {
	# ignore names starting with a dot
	next if ($fn =~ /^\./);
	# ignore other silly files
	next if ($junk_files{$fn});

	$cnt++;
	$fcnt++;
	my $f = "$directory/$fn";
	my $docf = ($basedir) ? "$basedir/$fn" : $fn;
	#logger('DEBUG', "fs_file: \"$f\"");
	if (-f $f) {
	    #logger("  is file");
	    if ($fn eq "index.meta") {
		harvestFile($fn, $directory);
	    }
	} elsif (-d _) {
	    #logger("  is dir");
	    # recurse into directory
	    $cnt += readAllFiles($f, $docf);
	}
    }
    return $cnt;
}

#
# cleanUnmarkedFiles($basepath)
#
# deletes all unflagged file and meta entries.
#
sub cleanUnmarkedFiles {
    my ($basepath) = @_;
    my $rv = $dbFindFileFlagPath->execute("${basepath}%");
    my $ids = $dbFindFileFlagPath->fetchall_arrayref;
    for my $i (@$ids) {
	my $id = $$i[0];
	logger('DEBUG', "cleaning file and meta of id: $id");
	$dbClearMeta->execute($id);
	$dbClearFile->execute($id);
	$dbh->commit;
    }
}

#
# harvestFile($filename, $filepath)
#
# reads the index file $filename at $filepath and puts the contents
# in the database.
#
sub harvestFile {
    my ($filename, $filepath) = @_;
    logger('DEBUG', "looking at file '$filename' at '$filepath'");
    # get file time
    my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
	$atime,$mtime,$ctime,$blksize,$blocks)
	= stat("$filepath/$filename");
    my $filetime = stime($mtime);
    # register file in db
    my $fid = registerFile("$filepath/$filename", $filetime);
    if ($fid) {
	# file is new/modified
	# parse index file
	my $ret = eval{$metaParser->parse_uri("$filepath/$filename")};
	if ($@) {
	    my $errmsg = $@;
	    logger('ERROR', "error reading XML file '$filepath/$filename' ($errmsg)");
	    $errcnt++;
	    return;
	}
	my @data = $metaParserHandler->getData();
	logger('DEBUG', "parsed $#data+1 elements");
	if ($data[0][0] eq "html") {
	    # oops, wrong
	    logger('WARNING', "invalid HTML content in file $filepath/$filename");
	    return;
	}
	registerMeta($fid, @data);
    }
    $idxcnt++;
    logger('INFO', "$idxcnt index files of $fcnt") if ($idxcnt % 10 == 0) ;
}

#
# readURLIndex($baseurl)
#
# reads the XML index at $baseurl 
# and processes all its entries
#
sub readURLIndex {
    my ($baseurl) = @_;    
    my $cnt = 0;

    # parse index file
    logger('DEBUG', "parsing $baseurl ...");
    $metaParser->parse_uri($baseurl);
    my @indexdata = $metaParserHandler->getData();
    logger('INFO', "parsed $#indexdata+1 index entries");
   
    foreach my $me (@indexdata) {
	$cnt++;
	my ($tag, $val, $attr) = @$me;
	my $meta = "";
	my $file = "";
	my $mtime = "";
	if ($tag =~ /index\/resource$/) {
	    if ($attr =~ /metaLink=\"([^\"]+)\"/) {
		$meta = $1;
	    }
	    if ($attr =~ /resourceLink=\"([^\"]+)\"/) {
		$file = $1;
	    }
	    if ($attr =~ /modificationDate=\"([^\"]+)\"/) {
		$mtime = $1;
	    }
	    if ($meta =~ /^http:/) {
		harvestURL($meta, $file, $mtime);
	    }
	}
    }
    return $cnt;
}

#
# harvestURL($metaurl, $fileurl)
#
# reads the index file from $metaurl and puts the contents
# in the database (under $filepath)
#
sub harvestURL {
    my ($metaurl, $fileurl, $filetime) = @_;
    logger('DEBUG', "fetching from url '$metaurl' for '$fileurl'");
    # if no filetime then now
    $filetime = stime(time) unless ($filetime);
    # register file in db
    my $fid = registerFile("$fileurl", $filetime);
    if ($fid) {
	# try to parse index file
	my $ret = eval{$metaParser->parse_uri($metaurl)};
	if ($@) {
	    my $errmsg = $@;
	    logger('ERROR', "error reading XML from '$metaurl' ($errmsg)");
	    $errcnt++;
	    return;
	}
	my @data = $metaParserHandler->getData();
	logger('DEBUG', "parsed $#data+1 elements");
	if (lc $data[0][0] eq "html") {
	    # oops, wrong
	    logger('WARNING', "invalid HTML content from $metaurl");
	    $warncnt++;
	    return;
	}
	# file is new/modified
	registerMeta($fid, @data);
    }
    $idxcnt++;
    logger('INFO', "$idxcnt index files of $fcnt") if ($idxcnt % 10 == 0) ;
}


#
# $fileid = registerFile($filepath, $filetime)
#
# returns the file ID for the file $filepath. If necessary it
# will be added to the database. returns 0 if an update is not necessary.
#
sub registerFile {
    my ($filepath, $filetime) = @_;
    my $fileid = 0;
    # look if file is in db
    my $rv = $dbFindFileName->execute($filepath);
    my $mtime;
    ($fileid, $mtime) = $dbFindFileName->fetchrow_array;
    if ($fileid) {
	# file is in db
	# update flag
	$dbSetFileFlag->execute($fileid, 1);
	$dbh->commit;
	my $stime = s2stime($mtime);
	if ($stime ge $filetime) {
	    # if its current return 0
	    logger('DEBUG', "file: $fileid is old! time: '$stime' (vs '$filetime')");
	    return 0;
	} else {
	    logger('DEBUG', "file: $fileid is new! time: '$stime' (vs '$filetime')");
	}
    }
    if (! $fileid) {
	# get a new file id
	my $rv = $dbNextFileId->execute;
	($fileid) = $dbNextFileId->fetchrow_array;
	logger('DEBUG', "DB newfile: id=$fileid filename=$filepath mtime=$filetime");
	$dbNewFile->execute($fileid, $filepath, $filetime);
	# update flag
	$dbSetFileFlag->execute($fileid, 1);
	$dbh->commit;
    }
    return $fileid;
}

#
# registerMeta($fileid, @meta)
#
# adds the metadata information @meta for $fileid to the database.
#
sub registerMeta {
    my ($fileid, @meta) = @_;
    logger('DEBUG', "DB newmeta: fileid=$fileid ($#meta)");
    # clear out old data
    $dbClearMeta->execute($fileid);
    my $idx = 0;
    foreach my $keyval (@meta) {
	#logger('DEBUG', "  DB meta: $$keyval[0]=$$keyval[1]");
	$dbNewMeta->execute($fileid, $idx++, $$keyval[0], $$keyval[2], $$keyval[1]);
    }
    $dbh->commit;
    logger('INFO', "added $idx elements (file $fileid)");
}

#
# initdb()
#
# initialises the database connection.
#
sub initDB {
    my $rv;
    # clean tables
    if ($purgeDB) {
	$rv = $dbh->do("delete from files");
	$rv = $dbh->do("delete from meta");
	if ($dbh->err) {
	    logger('ABORT', "unable to clean table!");
	    exit 1;
	}
	$dbh->commit;
    }

    # clear flags
    $rv = $dbh->do("create temporary table file_flags ( fileid integer primary key, flag integer )");
    $dbh->commit;

    # prepare statements
    $dbNextFileId = $dbh->prepare("select nextval('files_id_seq')");
    $dbNewFile = $dbh->prepare("insert into files (id, filename, mtime) values (?,?,?)");
    $dbFindFileName = $dbh->prepare("select id,mtime from files where filename=?");
    $dbFindFilePath = $dbh->prepare("select id,filename,flag from files where filename like ?");
    $dbClearFile = $dbh->prepare("delete from files where id=?");
    $dbFindFileFlag = $dbh->prepare("select fileid from file_flags where flag=?");
    $dbFindFileFlagPath = $dbh->prepare("select id from files left outer join file_flags on files.id=file_flags.fileid where filename like ? and flag is null");
    $dbSetFileFlag = $dbh->prepare("insert into file_flags (fileid, flag) values (?,?)");
    $dbNewMeta = $dbh->prepare("insert into meta (fileid, idx, tags, attributes, content) values (?,?,?,?,?)");
    $dbClearMeta = $dbh->prepare("delete from meta where fileid=?");

}

#######################################################
# main
#

logger("INFO", "harvestmeta $version");
 
initDB();
my $fnum = 0;

if ($basedir) {
    # read and process all files under $basedir
    $fnum = readAllFiles($basedir, "");
    # delete orphaned data (under $basedir)
    cleanUnmarkedFiles($basedir);
} elsif ($indexurl) {
    # read and process XML index
    $fnum = readURLIndex($indexurl);
    if ($baseurl) {
	# delete orphaned data (under $baseurl)
	cleanUnmarkedFiles($baseurl);
    }
} elsif ($singleurl) {
    # read and process single XML url
    harvestURL($singleurl, $singleurl);
    $fnum = 1;
    if ($baseurl) {
	# delete orphaned data (under $baseurl)
	cleanUnmarkedFiles($baseurl);
    }
}

logger("INFO", "analysed $idxcnt of $fnum files!");
logger("INFO", "$warncnt warnings");
logger("INFO", "$errcnt errors");
if ($errcnt > 0) {
    logger("ABORT", "there were errors!");
    exit 1;
} else {
    logger("DONE", "all index files read successfully!");
}