Diff for /foxridge-archiver/harvestmeta.pl between versions 1.1 and 1.2

version 1.1, 2004/06/17 15:58:42 version 1.2, 2004/07/08 21:22:04
Line 2 Line 2
   
 use strict;  use strict;
 use XML::SAX;  use XML::SAX;
   use XML::LibXML;
 use DBI;  use DBI;
   
 use lib '/usr/local/mpiwg/archive';  use lib '/usr/local/mpiwg/archive_devel';
 use MPIWGStor;  use MPIWGStor;
 use HarvestmetaHandler;  use HarvestmetaHandler;
   
Line 16  $|=1; Line 17  $|=1;
 #  #
   
 # program version  # program version
 my $version = "0.1 (08.06.2004)";  my $version = "0.2 (08.07.2004)";
   
 # read command line parameters  # read command line parameters
 my $args = MPIWGStor::parseargs;  my $args = MPIWGStor::parseargs;
   
   if (! scalar(%$args)) {
       print "harvestmeta $version\n";
       print "use: harvestmeta -path=dir\n";
       print "  reads all metadata info from directory dir into the database\n";
       print "alternative sources:\n";
       print "  -indexurl=url : read XML index and follow links\n";
       print "  -singleurl=url : read single index file\n";
       print "additional options:\n";
       print "  -baseurl=url : clean all URL sources relative to this base\n";
       print "  -debug : output debugging info\n";
       print "  -purgedb : clear whole database\n";
       exit 1;
   }
   
 # debug level  # debug level
 $debug = (exists $$args{'debug'}) ? $$args{'debug'} : 0;  $debug = (exists $$args{'debug'}) ? $$args{'debug'} : 0;
   
Line 52  my $dbClearAllFileFlag; Line 67  my $dbClearAllFileFlag;
 #######################################################  #######################################################
 # check parameters that were passed to the program  # check parameters that were passed to the program
 #  #
   my $baseurl;
   my $indexurl;
   my $singleurl;
 my $basedir = $$args{'path'};  my $basedir = $$args{'path'};
 if (! $basedir) {  if ($basedir) {
     logger("ABORT", "no document directory given!");  
     exit 1;  
 }  
 # strip trailing slashes  # strip trailing slashes
 $basedir =~ s/\/$//;  $basedir =~ s/\/$//;
 if (! -d $basedir) {  if (! -d $basedir) {
     logger("ABORT", "document directory \'$basedir\' doesn't exist!");      logger("ABORT", "document directory \'$basedir\' doesn't exist!");
     exit 1;      exit 1;
 }  }
   } else {
       # use URL
       $baseurl = $$args{'baseurl'};
       $indexurl = $$args{'indexurl'};
       $singleurl = $$args{'url'};
       if (! (($indexurl)||($singleurl))) {
       logger("ABORT", "no document source given!");
       exit 1;
       }
   }
   
 my $metaParserHandler = HarvestmetaHandler->new;  my $metaParserHandler = HarvestmetaHandler->new;
 my $metaParser = XML::SAX::ParserFactory->parser(Handler => $metaParserHandler);  my $metaParser = XML::SAX::ParserFactory->parser(Handler => $metaParserHandler);
Line 163  sub harvestFile { Line 188  sub harvestFile {
     if ($fid) {      if ($fid) {
     # file is new/modified      # file is new/modified
     # parse index file      # parse index file
     $metaParser->parse_uri("$filepath/$filename");      my $ret = eval{$metaParser->parse_uri("$filepath/$filename")};
       if ($@) {
           my $errmsg = $@;
           logger('ERROR', "error reading XML file '$filepath/$filename' ($errmsg)");
           $errcnt++;
           return;
       }
       my @data = $metaParserHandler->getData();
       logger('DEBUG', "parsed $#data+1 elements");
       if ($data[0][0] eq "html") {
           # oops, wrong
           logger('WARNING', "invalid HTML content in file $filepath/$filename");
           return;
       }
       registerMeta($fid, @data);
       }
       $idxcnt++;
       logger('INFO', "$idxcnt index files of $fcnt") if ($idxcnt % 10 == 0) ;
   }
   
   #
   # readURLIndex($baseurl)
   #
   # reads the XML index at $baseurl 
   # and processes all its entries
   #
   sub readURLIndex {
       my ($baseurl) = @_;    
       my $cnt = 0;
   
       # parse index file
       logger('DEBUG', "parsing $baseurl ...");
       $metaParser->parse_uri($baseurl);
       my @indexdata = $metaParserHandler->getData();
       logger('INFO', "parsed $#indexdata+1 index entries");
      
       foreach my $me (@indexdata) {
       $cnt++;
       my ($tag, $val, $attr) = @$me;
       my $meta;
       my $file;
       if ($tag =~ /index\/resource$/) {
           if ($attr =~ /metaLink=\"([^\"]+)\"/) {
           $meta = $1;
           }
           if ($attr =~ /resourceLink=\"([^\"]+)\"/) {
           $file = $1;
           }
           if ($meta =~ /^http:/) {
           harvestURL($meta, $file);
           }
       }
       }
       return $cnt;
   }
   
   #
   # harvestURL($metaurl, $fileurl)
   #
   # reads the index file from $metaurl and puts the contents
   # in the database (under $fileurl)
   #
   sub harvestURL {
       my ($metaurl, $fileurl) = @_;
       logger('DEBUG', "fetching from url '$metaurl' for '$fileurl'");
       # try to parse index file
       my $ret = eval{$metaParser->parse_uri($metaurl)};
       if ($@) {
       my $errmsg = $@;
       logger('ERROR', "error reading XML from '$metaurl' ($errmsg)");
       $errcnt++;
       return;
       }
     my @data = $metaParserHandler->getData();      my @data = $metaParserHandler->getData();
     logger('DEBUG', "parsed $#data+1 elements");      logger('DEBUG', "parsed $#data+1 elements");
       if ($data[0][0] eq "html") {
       # oops, wrong
       logger('WARNING', "invalid HTML content from $metaurl");
       $warncnt++;
       return;
       }
       # filetime is now
       my $filetime = stime(time);
       # register file in db
       my $fid = registerFile("$fileurl", $filetime);
       if ($fid) {
       # file is new/modified
     registerMeta($fid, @data);      registerMeta($fid, @data);
     }      }
     $idxcnt++;      $idxcnt++;
     logger('INFO', "$idxcnt index files of $fcnt") if ($idxcnt % 10 == 0) ;      logger('INFO', "$idxcnt index files of $fcnt") if ($idxcnt % 10 == 0) ;
 }  }
   
   
 #  #
 # $fileid = registerFile($filepath, $filetime)  # $fileid = registerFile($filepath, $filetime)
 #  #
Line 220  sub registerFile { Line 330  sub registerFile {
 sub registerMeta {  sub registerMeta {
     my ($fileid, @meta) = @_;      my ($fileid, @meta) = @_;
     logger('DEBUG', "DB newmeta: fileid=$fileid ($#meta)");      logger('DEBUG', "DB newmeta: fileid=$fileid ($#meta)");
       # clear out old data
       $dbClearMeta->execute($fileid);
     my $idx = 0;      my $idx = 0;
     foreach my $keyval (@meta) {      foreach my $keyval (@meta) {
     #logger('DEBUG', "  DB meta: $$keyval[0]=$$keyval[1]");      #logger('DEBUG', "  DB meta: $$keyval[0]=$$keyval[1]");
Line 272  sub initDB { Line 384  sub initDB {
 logger("INFO", "harvestmeta $version");  logger("INFO", "harvestmeta $version");
     
 initDB();  initDB();
   my $fnum;
   
   if ($basedir) {
 # read and process all files under $basedir  # read and process all files under $basedir
 my $fnum = readAllFiles($basedir, "");      $fnum = readAllFiles($basedir, "");
 # delete orphaned data (under $basedir)  # delete orphaned data (under $basedir)
 cleanUnmarkedFiles($basedir);  cleanUnmarkedFiles($basedir);
   } elsif ($baseurl) {
       # read and process XML index
       $fnum = readURLIndex($indexurl);
       if ($baseurl) {
       # delete orphaned data (under $baseurl)
       cleanUnmarkedFiles($baseurl);
       }
   } elsif ($singleurl) {
       # read and process single XML url
       harvestURL($singleurl, $singleurl);
       $fnum = 1;
       if ($baseurl) {
       # delete orphaned data (under $baseurl)
       cleanUnmarkedFiles($baseurl);
       }
   }
   
 logger("INFO", "analysed $idxcnt of $fnum files!");  logger("INFO", "analysed $idxcnt of $fnum files!");
 logger("INFO", "$warncnt warnings");  logger("INFO", "$warncnt warnings");

Removed from v.1.1  
changed lines
  Added in v.1.2


FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>