File:  [Repository] / foxridge-archiver / harvestmeta.pl
Revision 1.6: download - view: text, annotated - select for diffs - revision graph
Thu Mar 16 17:00:43 2017 UTC (7 years, 2 months ago) by casties
Branches: MAIN
CVS tags: HEAD
updated to Ubuntu Perl paths.

    1: #!/usr/bin/perl -w
    2: 
    3: use strict;
    4: use XML::SAX;
    5: use XML::LibXML;
    6: use DBI;
    7: 
    8: use lib '/usr/local/mpiwg/archive';
    9: use MPIWGStor;
   10: use HarvestmetaHandler;
   11: 
   12: # make output unbuffered
   13: $|=1;
   14: 
   15: #######################################################
   16: # internal parameters
   17: #
   18: 
   19: # program version
   20: my $version = "0.3 (27.9.2004)";
   21: 
   22: # read command line parameters
   23: my $args = MPIWGStor::parseargs;
   24: 
   25: if (! scalar(%$args)) {
   26:     print "harvestmeta $version\n";
   27:     print "use: harvestmeta -path=dir\n";
   28:     print "  reads all metadata info from directory dir into the database\n";
   29:     print "alternative sources:\n";
   30:     print "  -indexurl=url : read XML index and follow links\n";
   31:     print "  -singleurl=url : read single index file\n";
   32:     print "additional options:\n";
   33:     print "  -baseurl=url : clean all URL sources relative to this base\n";
   34:     print "  -debug : output debugging info\n";
   35:     print "  -purgedb : clear whole database\n";
   36:     exit 1;
   37: }
   38: 
   39: # debug level
   40: $debug = (exists $$args{'debug'}) ? $$args{'debug'} : 0;
   41: 
   42: # XML namespace (not really implemented!)
   43: my $namespace = "";
   44: 
   45: # delete and rebuild database
   46: my $purgeDB = (exists $$args{'purgedb'});
   47: 
   48: # database connection
   49: my $dbh = DBI->connect("dbi:Pg:dbname=storage", "archiver", "");
   50: if (! $dbh) {
   51:     logger('ABORT', "unable to connect to database!");
   52:     exit 1;
   53: }
   54: $dbh->{AutoCommit} = 0;
   55: my $dbNextFileId;
   56: my $dbNewFile;
   57: my $dbNewMeta;
   58: my $dbClearMeta;
   59: my $dbFindFileName;
   60: my $dbFindFilePath;
   61: my $dbClearFile;
   62: my $dbFindFileFlag;
   63: my $dbFindFileFlagPath;
   64: my $dbSetFileFlag;
   65: my $dbClearAllFileFlag;
   66: 
   67: #######################################################
   68: # check parameters that were passed to the program
   69: #
   70: my $baseurl;
   71: my $indexurl;
   72: my $singleurl;
   73: my $basedir = $$args{'path'};
   74: if ($basedir) {
   75:     # strip trailing slashes
   76:     $basedir =~ s/\/$//;
   77:     if (! -d $basedir) {
   78: 	logger("ABORT", "document directory \'$basedir\' doesn't exist!");
   79: 	exit 1;
   80:     }
   81: } else {
   82:     # use URL
   83:     $baseurl = $$args{'baseurl'};
   84:     $indexurl = $$args{'indexurl'};
   85:     $singleurl = $$args{'url'};
   86:     if (! (($indexurl)||($singleurl))) {
   87: 	logger("ABORT", "no document source given!");
   88: 	exit 1;
   89:     }
   90: }
   91: 
   92: my $metaParserHandler = HarvestmetaHandler->new;
   93: my $metaParser = XML::SAX::ParserFactory->parser(Handler => $metaParserHandler);
   94: 
   95: #######################################################
   96: # internal variables
   97: #
   98: 
   99: # number of errors
  100: my $errcnt = 0;
  101: # number of warnings
  102: my $warncnt = 0;
  103: 
  104: # number of files on fs
  105: my $fcnt = 0;
  106: # number of index files
  107: my $idxcnt = 0;
  108: 
  109: #######################################################
  110: # subroutines
  111: #
  112: 
  113: #
  114: # readAllFiles($realdir, $basedir, \%files, \%dirs)
  115: #
  116: # reads all files and directories below $realdir and puts the
  117: # files in %files and directories in %dirs
  118: # $basedir is only for recursion, it should be empty when called 
  119: # from outside
  120: #
  121: sub readAllFiles {
  122:     my ($directory, $basedir) = @_;    
  123:     my $cnt = 0;
  124: 
  125:     if (! opendir DIR, $directory) {
  126: 	return 0;
  127:     }
  128:     my @dirfiles = readdir DIR;
  129:     foreach my $fn (@dirfiles) {
  130: 	# ignore names starting with a dot
  131: 	next if ($fn =~ /^\./);
  132: 	# ignore other silly files
  133: 	next if ($junk_files{$fn});
  134: 
  135: 	$cnt++;
  136: 	$fcnt++;
  137: 	my $f = "$directory/$fn";
  138: 	my $docf = ($basedir) ? "$basedir/$fn" : $fn;
  139: 	#logger('DEBUG', "fs_file: \"$f\"");
  140: 	if (-f $f) {
  141: 	    #logger("  is file");
  142: 	    if ($fn eq "index.meta") {
  143: 		harvestFile($fn, $directory);
  144: 	    }
  145: 	} elsif (-d _) {
  146: 	    #logger("  is dir");
  147: 	    # recurse into directory
  148: 	    $cnt += readAllFiles($f, $docf);
  149: 	}
  150:     }
  151:     return $cnt;
  152: }
  153: 
  154: #
  155: # cleanUnmarkedFiles($basepath)
  156: #
  157: # deletes all unflagged file and meta entries.
  158: #
  159: sub cleanUnmarkedFiles {
  160:     my ($basepath) = @_;
  161:     my $rv = $dbFindFileFlagPath->execute("${basepath}%");
  162:     my $ids = $dbFindFileFlagPath->fetchall_arrayref;
  163:     for my $i (@$ids) {
  164: 	my $id = $$i[0];
  165: 	logger('DEBUG', "cleaning file and meta of id: $id");
  166: 	$dbClearMeta->execute($id);
  167: 	$dbClearFile->execute($id);
  168: 	$dbh->commit;
  169:     }
  170: }
  171: 
  172: #
  173: # harvestFile($filename, $filepath)
  174: #
  175: # reads the index file $filename at $filepath and puts the contents
  176: # in the database.
  177: #
  178: sub harvestFile {
  179:     my ($filename, $filepath) = @_;
  180:     logger('DEBUG', "looking at file '$filename' at '$filepath'");
  181:     # get file time
  182:     my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
  183: 	$atime,$mtime,$ctime,$blksize,$blocks)
  184: 	= stat("$filepath/$filename");
  185:     my $filetime = stime($mtime);
  186:     # register file in db
  187:     my $fid = registerFile("$filepath/$filename", $filetime);
  188:     if ($fid) {
  189: 	# file is new/modified
  190: 	# parse index file
  191: 	my $ret = eval{$metaParser->parse_uri("$filepath/$filename")};
  192: 	if ($@) {
  193: 	    my $errmsg = $@;
  194: 	    logger('ERROR', "error reading XML file '$filepath/$filename' ($errmsg)");
  195: 	    $errcnt++;
  196: 	    return;
  197: 	}
  198: 	my @data = $metaParserHandler->getData();
  199: 	logger('DEBUG', "parsed $#data+1 elements");
  200: 	if ($data[0][0] eq "html") {
  201: 	    # oops, wrong
  202: 	    logger('WARNING', "invalid HTML content in file $filepath/$filename");
  203: 	    return;
  204: 	}
  205: 	registerMeta($fid, @data);
  206:     }
  207:     $idxcnt++;
  208:     logger('INFO', "$idxcnt index files of $fcnt") if ($idxcnt % 10 == 0) ;
  209: }
  210: 
  211: #
  212: # readURLIndex($baseurl)
  213: #
  214: # reads the XML index at $baseurl 
  215: # and processes all its entries
  216: #
  217: sub readURLIndex {
  218:     my ($baseurl) = @_;    
  219:     my $cnt = 0;
  220: 
  221:     # parse index file
  222:     logger('DEBUG', "parsing $baseurl ...");
  223:     $metaParser->parse_uri($baseurl);
  224:     my @indexdata = $metaParserHandler->getData();
  225:     logger('INFO', "parsed $#indexdata+1 index entries");
  226:    
  227:     foreach my $me (@indexdata) {
  228: 	$cnt++;
  229: 	my ($tag, $val, $attr) = @$me;
  230: 	my $meta = "";
  231: 	my $file = "";
  232: 	my $mtime = "";
  233: 	if ($tag =~ /index\/resource$/) {
  234: 	    if ($attr =~ /metaLink=\"([^\"]+)\"/) {
  235: 		$meta = $1;
  236: 	    }
  237: 	    if ($attr =~ /resourceLink=\"([^\"]+)\"/) {
  238: 		$file = $1;
  239: 	    }
  240: 	    if ($attr =~ /modificationDate=\"([^\"]+)\"/) {
  241: 		$mtime = $1;
  242: 	    }
  243: 	    if ($meta =~ /^http:/) {
  244: 		harvestURL($meta, $file, $mtime);
  245: 	    }
  246: 	}
  247:     }
  248:     return $cnt;
  249: }
  250: 
  251: #
  252: # harvestURL($metaurl, $fileurl)
  253: #
  254: # reads the index file from $metaurl and puts the contents
  255: # in the database (under $filepath)
  256: #
  257: sub harvestURL {
  258:     my ($metaurl, $fileurl, $filetime) = @_;
  259:     logger('DEBUG', "fetching from url '$metaurl' for '$fileurl'");
  260:     # if no filetime then now
  261:     $filetime = stime(time) unless ($filetime);
  262:     # register file in db
  263:     my $fid = registerFile("$fileurl", $filetime);
  264:     if ($fid) {
  265: 	# try to parse index file
  266: 	my $ret = eval{$metaParser->parse_uri($metaurl)};
  267: 	if ($@) {
  268: 	    my $errmsg = $@;
  269: 	    logger('ERROR', "error reading XML from '$metaurl' ($errmsg)");
  270: 	    $errcnt++;
  271: 	    return;
  272: 	}
  273: 	my @data = $metaParserHandler->getData();
  274: 	logger('DEBUG', "parsed $#data+1 elements");
  275: 	if (lc $data[0][0] eq "html") {
  276: 	    # oops, wrong
  277: 	    logger('WARNING', "invalid HTML content from $metaurl");
  278: 	    $warncnt++;
  279: 	    return;
  280: 	}
  281: 	# file is new/modified
  282: 	registerMeta($fid, @data);
  283:     }
  284:     $idxcnt++;
  285:     logger('INFO', "$idxcnt index files of $fcnt") if ($idxcnt % 10 == 0) ;
  286: }
  287: 
  288: 
  289: #
  290: # $fileid = registerFile($filepath, $filetime)
  291: #
  292: # returns the file ID for the file $filepath. If necessary it
  293: # will be added to the database. returns 0 if an update is not necessary.
  294: #
  295: sub registerFile {
  296:     my ($filepath, $filetime) = @_;
  297:     my $fileid = 0;
  298:     # look if file is in db
  299:     my $rv = $dbFindFileName->execute($filepath);
  300:     my $mtime;
  301:     ($fileid, $mtime) = $dbFindFileName->fetchrow_array;
  302:     if ($fileid) {
  303: 	# file is in db
  304: 	# update flag
  305: 	$dbSetFileFlag->execute($fileid, 1);
  306: 	$dbh->commit;
  307: 	my $stime = s2stime($mtime);
  308: 	if ($stime ge $filetime) {
  309: 	    # if its current return 0
  310: 	    logger('DEBUG', "file: $fileid is old! time: '$stime' (vs '$filetime')");
  311: 	    return 0;
  312: 	} else {
  313: 	    logger('DEBUG', "file: $fileid is new! time: '$stime' (vs '$filetime')");
  314: 	}
  315:     }
  316:     if (! $fileid) {
  317: 	# get a new file id
  318: 	my $rv = $dbNextFileId->execute;
  319: 	($fileid) = $dbNextFileId->fetchrow_array;
  320: 	logger('DEBUG', "DB newfile: id=$fileid filename=$filepath mtime=$filetime");
  321: 	$dbNewFile->execute($fileid, $filepath, $filetime);
  322: 	# update flag
  323: 	$dbSetFileFlag->execute($fileid, 1);
  324: 	$dbh->commit;
  325:     }
  326:     return $fileid;
  327: }
  328: 
  329: #
  330: # registerMeta($fileid, @meta)
  331: #
  332: # adds the metadata information @meta for $fileid to the database.
  333: #
  334: sub registerMeta {
  335:     my ($fileid, @meta) = @_;
  336:     logger('DEBUG', "DB newmeta: fileid=$fileid ($#meta)");
  337:     # clear out old data
  338:     $dbClearMeta->execute($fileid);
  339:     my $idx = 0;
  340:     foreach my $keyval (@meta) {
  341: 	#logger('DEBUG', "  DB meta: $$keyval[0]=$$keyval[1]");
  342: 	$dbNewMeta->execute($fileid, $idx++, $$keyval[0], $$keyval[2], $$keyval[1]);
  343:     }
  344:     $dbh->commit;
  345:     logger('INFO', "added $idx elements (file $fileid)");
  346: }
  347: 
  348: #
  349: # initdb()
  350: #
  351: # initialises the database connection.
  352: #
  353: sub initDB {
  354:     my $rv;
  355:     # clean tables
  356:     if ($purgeDB) {
  357: 	$rv = $dbh->do("delete from files");
  358: 	$rv = $dbh->do("delete from meta");
  359: 	if ($dbh->err) {
  360: 	    logger('ABORT', "unable to clean table!");
  361: 	    exit 1;
  362: 	}
  363: 	$dbh->commit;
  364:     }
  365: 
  366:     # clear flags
  367:     $rv = $dbh->do("create temporary table file_flags ( fileid integer primary key, flag integer )");
  368:     $dbh->commit;
  369: 
  370:     # prepare statements
  371:     $dbNextFileId = $dbh->prepare("select nextval('files_id_seq')");
  372:     $dbNewFile = $dbh->prepare("insert into files (id, filename, mtime) values (?,?,?)");
  373:     $dbFindFileName = $dbh->prepare("select id,mtime from files where filename=?");
  374:     $dbFindFilePath = $dbh->prepare("select id,filename,flag from files where filename like ?");
  375:     $dbClearFile = $dbh->prepare("delete from files where id=?");
  376:     $dbFindFileFlag = $dbh->prepare("select fileid from file_flags where flag=?");
  377:     $dbFindFileFlagPath = $dbh->prepare("select id from files left outer join file_flags on files.id=file_flags.fileid where filename like ? and flag is null");
  378:     $dbSetFileFlag = $dbh->prepare("insert into file_flags (fileid, flag) values (?,?)");
  379:     $dbNewMeta = $dbh->prepare("insert into meta (fileid, idx, tags, attributes, content) values (?,?,?,?,?)");
  380:     $dbClearMeta = $dbh->prepare("delete from meta where fileid=?");
  381: 
  382: }
  383: 
  384: #######################################################
  385: # main
  386: #
  387: 
  388: logger("INFO", "harvestmeta $version");
  389:  
  390: initDB();
  391: my $fnum = 0;
  392: 
  393: if ($basedir) {
  394:     # read and process all files under $basedir
  395:     $fnum = readAllFiles($basedir, "");
  396:     # delete orphaned data (under $basedir)
  397:     cleanUnmarkedFiles($basedir);
  398: } elsif ($indexurl) {
  399:     # read and process XML index
  400:     $fnum = readURLIndex($indexurl);
  401:     if ($baseurl) {
  402: 	# delete orphaned data (under $baseurl)
  403: 	cleanUnmarkedFiles($baseurl);
  404:     }
  405: } elsif ($singleurl) {
  406:     # read and process single XML url
  407:     harvestURL($singleurl, $singleurl);
  408:     $fnum = 1;
  409:     if ($baseurl) {
  410: 	# delete orphaned data (under $baseurl)
  411: 	cleanUnmarkedFiles($baseurl);
  412:     }
  413: }
  414: 
  415: logger("INFO", "analysed $idxcnt of $fnum files!");
  416: logger("INFO", "$warncnt warnings");
  417: logger("INFO", "$errcnt errors");
  418: if ($errcnt > 0) {
  419:     logger("ABORT", "there were errors!");
  420:     exit 1;
  421: } else {
  422:     logger("DONE", "all index files read successfully!");
  423: }

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>