# HG changeset patch # User casties # Date 1089321724 -7200 # Node ID 1a51f94d5dbdcb12931d9b53f23c98a2d44c0de5 # Parent b7259a1c85aa3f62da5d9b82117ae12780447d07 new version also reads XML index feeds via HTTP diff -r b7259a1c85aa -r 1a51f94d5dbd HarvestmetaHandler.pm --- a/HarvestmetaHandler.pm Thu Jul 08 19:30:05 2004 +0200 +++ b/HarvestmetaHandler.pm Thu Jul 08 23:22:04 2004 +0200 @@ -8,15 +8,18 @@ use base qw(XML::SAX::Base); -use lib '/usr/local/mpiwg/archive'; +use lib '/usr/local/mpiwg/archive_devel'; use MPIWGStor; +my $debugElem = 0; +my $debugCont = 0; + my @currElemPath; my $currElem; my $currText; my $currAttrib; my @elements; - + sub getData { return @elements; } @@ -24,7 +27,7 @@ sub start_document { my ($self, $doc) = @_; # process document start event - #logger('DEBUG', "startdoc: $self, $doc"); + logger('DEBUG', "startdoc: $self, $doc") if ($debugElem); @currElemPath = (); $currElem = ""; $currText = ""; @@ -35,7 +38,7 @@ sub start_element { my ($self, $el) = @_; # process element start event - #logger('DEBUG', "startelem: $self, $el"); + logger('DEBUG', "startelem: $self, $$el{'LocalName'}") if ($debugElem); # check if the last element needs to be finished if ($currElem) { my $elem = join "/", @currElemPath; @@ -47,12 +50,13 @@ #logger('DEBUG', " name: $name"); # assemble attributes string $currAttrib =""; - foreach $a (values %{$$el{'Attributes'}}) { - my $n = $$a{'LocalName'}; - $n = $$a{'Name'} unless ($n); - my $v = $$a{'Value'}; - $currAttrib .= "$n=\"$v\" "; + foreach my $attr (values %{$$el{'Attributes'}}) { + my $key = $$attr{'LocalName'}; + $key = $$attr{'Name'} unless ($key); + my $val = $$attr{'Value'}; + $currAttrib .= "$key=\"$val\" "; } + $currAttrib = sstrip($currAttrib); # start element name push @currElemPath, $name; $currElem = $name; @@ -62,7 +66,7 @@ sub end_element { my ($self, $el) = @_; # process element end event - #logger('DEBUG', "endelem: $self, $el"); + logger('DEBUG', "endelem: $self, $$el{'LocalName'}") if 
($debugElem); # check element name my $name = $$el{'LocalName'}; $name = $$el{'Name'} unless ($name); @@ -75,10 +79,10 @@ # strip whitespace from element content $currText =~ s/^\s*//; $currText =~ s/\s*$//; - if ($currText) { + if (($currText)||($currAttrib)) { # put pair in elements array push @elements, [$elem, $currText, $currAttrib]; - #logger('DEBUG', " elem: $elem = $currText"); + logger('DEBUG', " elem: $elem = $currText ($currAttrib)") if ($debugCont); } # end element name pop @currElemPath; @@ -90,10 +94,10 @@ sub characters { my ($self, $char) = @_; # process character data event - #logger('DEBUG', "characters: $self, $char"); + logger('DEBUG', "characters: $self, $char") if ($debugElem); # add to current content $currText .= $$char{'Data'}; - #logger('DEBUG', " Text: $currText"); + logger('DEBUG', " Text: $currText") if ($debugCont); } diff -r b7259a1c85aa -r 1a51f94d5dbd harvestmeta.pl --- a/harvestmeta.pl Thu Jul 08 19:30:05 2004 +0200 +++ b/harvestmeta.pl Thu Jul 08 23:22:04 2004 +0200 @@ -2,9 +2,10 @@ use strict; use XML::SAX; +use XML::LibXML; use DBI; -use lib '/usr/local/mpiwg/archive'; +use lib '/usr/local/mpiwg/archive_devel'; use MPIWGStor; use HarvestmetaHandler; @@ -16,11 +17,25 @@ # # program version -my $version = "0.1 (08.06.2004)"; +my $version = "0.2 (08.07.2004)"; # read command line parameters my $args = MPIWGStor::parseargs; +if (! scalar(%$args)) { + print "harvestmeta $version\n"; + print "use: harvestmeta -path=dir\n"; + print " reads all metadata info from directory dir into the database\n"; + print "alternative sources:\n"; + print " -indexurl=url : read XML index and follow links\n"; + print " -singleurl=url : read single index file\n"; + print "additional options:\n"; + print " -baseurl=url : clean all URL sources relative to this base\n"; + print " -debug : output debugging info\n"; + print " -purgedb : clear whole database\n"; + exit 1; +} + # debug level $debug = (exists $$args{'debug'}) ? 
$$args{'debug'} : 0; @@ -52,16 +67,26 @@ ####################################################### # check parameters that were passed to the program # +my $baseurl; +my $indexurl; +my $singleurl; my $basedir = $$args{'path'}; -if (! $basedir) { - logger("ABORT", "no document directory given!"); - exit 1; -} -# strip trailing slashes -$basedir =~ s/\/$//; -if (! -d $basedir) { - logger("ABORT", "document directory \'$basedir\' doesn't exist!"); - exit 1; +if ($basedir) { + # strip trailing slashes + $basedir =~ s/\/$//; + if (! -d $basedir) { + logger("ABORT", "document directory \'$basedir\' doesn't exist!"); + exit 1; + } +} else { + # use URL + $baseurl = $$args{'baseurl'}; + $indexurl = $$args{'indexurl'}; + $singleurl = $$args{'singleurl'}; + if (! (($indexurl)||($singleurl))) { + logger("ABORT", "no document source given!"); + exit 1; + } } my $metaParserHandler = HarvestmetaHandler->new; @@ -163,9 +188,20 @@ if ($fid) { # file is new/modified # parse index file - $metaParser->parse_uri("$filepath/$filename"); + my $ret = eval{$metaParser->parse_uri("$filepath/$filename")}; + if ($@) { + my $errmsg = $@; + logger('ERROR', "error reading XML file '$filepath/$filename' ($errmsg)"); + $errcnt++; + return; + } my @data = $metaParserHandler->getData(); logger('DEBUG', "parsed $#data+1 elements"); + if ($data[0][0] eq "html") { + # oops, wrong + logger('WARNING', "invalid HTML content in file $filepath/$filename"); + return; + } registerMeta($fid, @data); } $idxcnt++; @@ -173,6 +209,80 @@ } # +# readURLIndex($baseurl) +# +# reads the XML index at $baseurl +# and processes all its entries +# +sub readURLIndex { + my ($baseurl) = @_; + my $cnt = 0; + + # parse index file + logger('DEBUG', "parsing $baseurl ..."); + $metaParser->parse_uri($baseurl); + my @indexdata = $metaParserHandler->getData(); + logger('INFO', "parsed $#indexdata+1 index entries"); + + foreach my $me (@indexdata) { + $cnt++; + my ($tag, $val, $attr) = @$me; + my $meta; + my $file; + if ($tag =~
/index\/resource$/) { + if ($attr =~ /metaLink=\"([^\"]+)\"/) { + $meta = $1; + } + if ($attr =~ /resourceLink=\"([^\"]+)\"/) { + $file = $1; + } + if ($meta =~ /^http:/) { + harvestURL($meta, $file); + } + } + } + return $cnt; +} + +# +# harvestURL($metaurl, $fileurl) +# +# reads the index file from $metaurl and puts the contents +# in the database (under $filepath) +# +sub harvestURL { + my ($metaurl, $fileurl) = @_; + logger('DEBUG', "fetching from url '$metaurl' for '$fileurl'"); + # try to parse index file + my $ret = eval{$metaParser->parse_uri($metaurl)}; + if ($@) { + my $errmsg = $@; + logger('ERROR', "error reading XML from '$metaurl' ($errmsg)"); + $errcnt++; + return; + } + my @data = $metaParserHandler->getData(); + logger('DEBUG', "parsed $#data+1 elements"); + if ($data[0][0] eq "html") { + # oops, wrong + logger('WARNING', "invalid HTML content from $metaurl"); + $warncnt++; + return; + } + # filetime is now + my $filetime = stime(time); + # register file in db + my $fid = registerFile("$fileurl", $filetime); + if ($fid) { + # file is new/modified + registerMeta($fid, @data); + } + $idxcnt++; + logger('INFO', "$idxcnt index files of $fcnt") if ($idxcnt % 10 == 0) ; +} + + +# # $fileid = registerFile($filepath, $filetime) # # returns the file ID for the file $filepath. 
If necessary it @@ -220,6 +330,8 @@ sub registerMeta { my ($fileid, @meta) = @_; logger('DEBUG', "DB newmeta: fileid=$fileid ($#meta)"); + # clear out old data + $dbClearMeta->execute($fileid); my $idx = 0; foreach my $keyval (@meta) { #logger('DEBUG', " DB meta: $$keyval[0]=$$keyval[1]"); @@ -272,11 +384,29 @@ logger("INFO", "harvestmeta $version"); initDB(); +my $fnum; -# read and process all files under $basedir -my $fnum = readAllFiles($basedir, ""); -# delete orphaned data (under $basedir) -cleanUnmarkedFiles($basedir); +if ($basedir) { + # read and process all files under $basedir + $fnum = readAllFiles($basedir, ""); + # delete orphaned data (under $basedir) + cleanUnmarkedFiles($basedir); +} elsif ($indexurl) { + # read and process XML index + $fnum = readURLIndex($indexurl); + if ($baseurl) { + # delete orphaned data (under $baseurl) + cleanUnmarkedFiles($baseurl); + } +} elsif ($singleurl) { + # read and process single XML url + harvestURL($singleurl, $singleurl); + $fnum = 1; + if ($baseurl) { + # delete orphaned data (under $baseurl) + cleanUnmarkedFiles($baseurl); + } +} logger("INFO", "analysed $idxcnt of $fnum files!"); logger("INFO", "$warncnt warnings");