version 1.1, 2004/06/17 15:58:42
|
version 1.4, 2004/07/13 18:32:49
|
Line 2
|
Line 2
|
|
|
use strict; |
use strict; |
use XML::SAX; |
use XML::SAX; |
|
use XML::LibXML; |
use DBI; |
use DBI; |
|
|
use lib '/usr/local/mpiwg/archive'; |
use lib '/usr/local/mpiwg/archive'; |
Line 16 $|=1;
|
Line 17 $|=1;
|
# |
# |
|
|
# program version |
# program version |
my $version = "0.1 (08.06.2004)"; |
my $version = "0.2.1 (13.07.2004)"; |
|
|
# read command line parameters |
# read command line parameters |
my $args = MPIWGStor::parseargs; |
my $args = MPIWGStor::parseargs; |
|
|
|
# No command line arguments given: print usage information and exit.
if (! scalar(%$args)) {
    print "harvestmeta $version\n";
    print "use: harvestmeta -path=dir\n";
    print " reads all metadata info from directory dir into the database\n";
    print "alternative sources:\n";
    print " -indexurl=url : read XML index and follow links\n";
    print " -singleurl=url : read single index file\n";
    print "additional options:\n";
    print " -baseurl=url : clean all URL sources relative to this base\n";
    print " -debug : output debugging info\n";
    print " -purgedb : clear whole database\n";
    exit 1;
}
|
|
# debug level |
# debug level |
$debug = (exists $$args{'debug'}) ? $$args{'debug'} : 0; |
$debug = (exists $$args{'debug'}) ? $$args{'debug'} : 0; |
|
|
Line 52 my $dbClearAllFileFlag;
|
Line 67 my $dbClearAllFileFlag;
|
####################################################### |
####################################################### |
# check parameters that were passed to the program |
# check parameters that were passed to the program |
# |
# |
|
# Determine the metadata source: either a local directory tree
# (-path) or one of the URL modes (-indexurl / -url).
my $baseurl;
my $indexurl;
my $singleurl;
my $basedir = $$args{'path'};
if ($basedir) {
    # strip trailing slashes
    $basedir =~ s/\/$//;
    if (! -d $basedir) {
        logger("ABORT", "document directory \'$basedir\' doesn't exist!");
        exit 1;
    }
} else {
    # use URL sources instead of a local directory
    $baseurl = $$args{'baseurl'};
    $indexurl = $$args{'indexurl'};
    $singleurl = $$args{'url'};
    if (! (($indexurl)||($singleurl))) {
        logger("ABORT", "no document source given!");
        exit 1;
    }
}
|
|
my $metaParserHandler = HarvestmetaHandler->new; |
my $metaParserHandler = HarvestmetaHandler->new; |
my $metaParser = XML::SAX::ParserFactory->parser(Handler => $metaParserHandler); |
my $metaParser = XML::SAX::ParserFactory->parser(Handler => $metaParserHandler); |
Line 163 sub harvestFile {
|
Line 188 sub harvestFile {
|
if ($fid) { |
if ($fid) { |
# file is new/modified |
# file is new/modified |
# parse index file |
# parse index file |
$metaParser->parse_uri("$filepath/$filename"); |
my $ret = eval{$metaParser->parse_uri("$filepath/$filename")}; |
|
if ($@) { |
|
my $errmsg = $@; |
|
logger('ERROR', "error reading XML file '$filepath/$filename' ($errmsg)"); |
|
$errcnt++; |
|
return; |
|
} |
|
my @data = $metaParserHandler->getData(); |
|
logger('DEBUG', "parsed $#data+1 elements"); |
|
if ($data[0][0] eq "html") { |
|
# oops, wrong |
|
logger('WARNING', "invalid HTML content in file $filepath/$filename"); |
|
return; |
|
} |
|
registerMeta($fid, @data); |
|
} |
|
$idxcnt++; |
|
logger('INFO', "$idxcnt index files of $fcnt") if ($idxcnt % 10 == 0) ; |
|
} |
|
|
|
# |
|
# readURLIndex($baseurl) |
|
# |
|
# reads the XML index at $baseurl |
|
# and processes all its entries |
|
# |
|
#
# $cnt = readURLIndex($baseurl)
#
# Reads the XML index at $baseurl and processes all of its
# index/resource entries: every entry whose metaLink attribute
# is an http URL is harvested via harvestURL().
# Returns the number of index entries seen.
#
sub readURLIndex {
    my ($baseurl) = @_;
    my $cnt = 0;

    # parse index file; guard against malformed XML the same way
    # harvestURL() does instead of letting the parser die
    logger('DEBUG', "parsing $baseurl ...");
    my $ret = eval{$metaParser->parse_uri($baseurl)};
    if ($@) {
        my $errmsg = $@;
        logger('ERROR', "error reading XML index from '$baseurl' ($errmsg)");
        $errcnt++;
        return $cnt;
    }
    my @indexdata = $metaParserHandler->getData();
    logger('INFO', "parsed $#indexdata+1 index entries");

    foreach my $me (@indexdata) {
        $cnt++;
        my ($tag, $val, $attr) = @$me;
        my $meta = "";
        my $file = "";
        if ($tag =~ /index\/resource$/) {
            # the attribute string carries the metadata and resource URLs
            if ($attr =~ /metaLink=\"([^\"]+)\"/) {
                $meta = $1;
            }
            if ($attr =~ /resourceLink=\"([^\"]+)\"/) {
                $file = $1;
            }
            if ($meta =~ /^http:/) {
                harvestURL($meta, $file);
            }
        }
    }
    return $cnt;
}
|
|
|
# |
|
# harvestURL($metaurl, $fileurl) |
|
# |
|
# reads the index file from $metaurl and puts the contents |
|
# in the database (under $filepath) |
|
# |
|
#
# harvestURL($metaurl, $fileurl)
#
# Reads the metadata index file from $metaurl and puts the contents
# in the database, registered under $fileurl. Increments $errcnt on
# parse failure, $warncnt when the server returned HTML instead of
# XML, and $idxcnt on success.
#
sub harvestURL {
    my ($metaurl, $fileurl) = @_;
    logger('DEBUG', "fetching from url '$metaurl' for '$fileurl'");
    # try to parse index file; eval so a malformed document
    # doesn't kill the whole harvesting run
    my $ret = eval{$metaParser->parse_uri($metaurl)};
    if ($@) {
        my $errmsg = $@;
        logger('ERROR', "error reading XML from '$metaurl' ($errmsg)");
        $errcnt++;
        return;
    }
    my @data = $metaParserHandler->getData();
    logger('DEBUG', "parsed $#data+1 elements");
    if (lc $data[0][0] eq "html") {
        # oops, wrong: the server answered with an HTML page, not XML
        logger('WARNING', "invalid HTML content from $metaurl");
        $warncnt++;
        return;
    }
    # filetime is now
    my $filetime = stime(time);
    # register file in db
    my $fid = registerFile("$fileurl", $filetime);
    if ($fid) {
        # file is new/modified
        registerMeta($fid, @data);
    }
    $idxcnt++;
    logger('INFO', "$idxcnt index files of $fcnt") if ($idxcnt % 10 == 0) ;
}
|
|
|
|
# |
# |
# $fileid = registerFile($filepath, $filetime) |
# $fileid = registerFile($filepath, $filetime) |
# |
# |
Line 220 sub registerFile {
|
Line 330 sub registerFile {
|
sub registerMeta { |
sub registerMeta { |
my ($fileid, @meta) = @_; |
my ($fileid, @meta) = @_; |
logger('DEBUG', "DB newmeta: fileid=$fileid ($#meta)"); |
logger('DEBUG', "DB newmeta: fileid=$fileid ($#meta)"); |
|
# clear out old data |
|
$dbClearMeta->execute($fileid); |
my $idx = 0; |
my $idx = 0; |
foreach my $keyval (@meta) { |
foreach my $keyval (@meta) { |
#logger('DEBUG', " DB meta: $$keyval[0]=$$keyval[1]"); |
#logger('DEBUG', " DB meta: $$keyval[0]=$$keyval[1]"); |
Line 272 sub initDB {
|
Line 384 sub initDB {
|
# Main program: announce version, open the database, then dispatch
# on the configured source mode (directory, XML index URL, or a
# single metadata URL).
logger("INFO", "harvestmeta $version");

initDB();

my $fnum = 0;

if ($basedir) {
    # read and process all files under $basedir
    # (assign, don't re-declare: a 'my' here would shadow $fnum
    # and break the final report)
    $fnum = readAllFiles($basedir, "");
    # delete orphaned data (under $basedir)
    cleanUnmarkedFiles($basedir);
} elsif ($indexurl) {
    # read and process XML index
    $fnum = readURLIndex($indexurl);
    if ($baseurl) {
        # delete orphaned data (under $baseurl)
        cleanUnmarkedFiles($baseurl);
    }
} elsif ($singleurl) {
    # read and process single XML url
    harvestURL($singleurl, $singleurl);
    $fnum = 1;
    if ($baseurl) {
        # delete orphaned data (under $baseurl)
        cleanUnmarkedFiles($baseurl);
    }
}

logger("INFO", "analysed $idxcnt of $fnum files!");
logger("INFO", "$warncnt warnings");