File:  [Repository] / foxridge-archiver / harvestmeta.pl
Revision 1.1: download - view: text, annotated - select for diffs - revision graph
Thu Jun 17 15:58:42 2004 UTC (19 years, 11 months ago) by casties
Branches: MAIN
CVS tags: HEAD
Initial revision

    1: #!/usr/local/bin/perl -w
    2: 
    3: use strict;
    4: use XML::SAX;
    5: use DBI;
    6: 
    7: use lib '/usr/local/mpiwg/archive';
    8: use MPIWGStor;
    9: use HarvestmetaHandler;
   10: 
   # make output unbuffered
   $| = 1;

   #######################################################
   # internal parameters
   #

   # program version
   my $version = "0.1 (08.06.2004)";

   # read command line parameters (used below as a hash reference)
   my $args = MPIWGStor::parseargs;

   # debug level
   # NOTE(review): $debug is not declared in this file; presumably it is a
   # package variable exported by MPIWGStor -- confirm.
   $debug = (exists $$args{'debug'}) ? $$args{'debug'} : 0;

   # XML namespace (not really implemented!)
   my $namespace = "";

   # delete and rebuild database (true if the 'purgedb' option was given)
   my $purgeDB = (exists $$args{'purgedb'});

   # database connection
   my $dbh = DBI->connect("dbi:Pg:dbname=storage", "archiver", "");
   if (! $dbh) {
       logger('ABORT', "unable to connect to database!");
       exit 1;
   }
   # all changes are committed explicitly by the subroutines
   $dbh->{AutoCommit} = 0;

   # prepared statement handles; they are filled in by initDB()
   my $dbNextFileId;
   my $dbNewFile;
   my $dbNewMeta;
   my $dbClearMeta;
   my $dbFindFileName;
   my $dbFindFilePath;
   my $dbClearFile;
   my $dbFindFileFlag;
   my $dbFindFileFlagPath;
   my $dbSetFileFlag;
   my $dbClearAllFileFlag;

   #######################################################
   # check parameters that were passed to the program
   #
   my $basedir = $$args{'path'};
   if (! $basedir) {
       logger("ABORT", "no document directory given!");
       exit 1;
   }
   # strip all trailing slashes (not just the last one), so that the paths
   # built from $basedir never contain a doubled separator
   $basedir =~ s{/+$}{};
   if (! -d $basedir) {
       logger("ABORT", "document directory '$basedir' doesn't exist!");
       exit 1;
   }

   # SAX parser for the index files; the handler collects the parsed data
   my $metaParserHandler = HarvestmetaHandler->new;
   my $metaParser = XML::SAX::ParserFactory->parser(Handler => $metaParserHandler);

   #######################################################
   # internal variables
   #

   # number of errors
   my $errcnt = 0;
   # number of warnings
   my $warncnt = 0;

   # number of files on fs
   my $fcnt = 0;
   # number of index files
   my $idxcnt = 0;
   83: 
   84: #######################################################
   85: # subroutines
   86: #
   87: 
   #
   # $count = readAllFiles($directory, $basedir)
   #
   # Recursively walks $directory. Every plain file named "index.meta" is
   # passed to harvestFile(); every subdirectory is descended into.
   # Names starting with a dot and names listed in %junk_files are skipped.
   # $basedir is the path relative to the starting directory. It is only
   # passed along for the recursion and should be empty when called from
   # outside.
   #
   # Returns the number of entries (files and directories) counted below
   # $directory, or 0 if the directory cannot be opened. The global
   # counter $fcnt is incremented for every counted entry.
   #
   sub readAllFiles {
       my ($directory, $basedir) = @_;
       my $cnt = 0;

       # use a lexical directory handle: a bareword handle would be shared
       # by all levels of the recursion and was never closed
       my $dh;
       if (! opendir $dh, $directory) {
           # an unreadable directory is reported instead of silently skipped
           logger('WARNING', "unable to read directory '$directory': $!");
           $warncnt++;
           return 0;
       }
       # read the whole listing first so the handle can be closed before
       # recursing
       my @dirfiles = readdir $dh;
       closedir $dh;

       foreach my $fn (@dirfiles) {
           # ignore names starting with a dot
           next if ($fn =~ /^\./);
           # ignore other silly files
           # NOTE(review): %junk_files is not declared in this file;
           # presumably exported by MPIWGStor -- confirm.
           next if ($junk_files{$fn});

           $cnt++;
           $fcnt++;
           my $f = "$directory/$fn";
           if (-f $f) {
               if ($fn eq "index.meta") {
                   harvestFile($fn, $directory);
               }
           } elsif (-d _) {
               # "_" reuses the stat information from the -f test above
               # recurse into directory
               $cnt += readAllFiles($f, ($basedir) ? "$basedir/$fn" : $fn);
           }
       }
       return $cnt;
   }
  128: 
  #
  # cleanUnmarkedFiles($basepath)
  #
  # Deletes the file and meta entries of all files below $basepath that
  # have no entry in the file_flags table, i.e. that were not flagged
  # during this run. Every file is deleted and committed separately.
  #
  sub cleanUnmarkedFiles {
      my ($basepath) = @_;

      # Build the LIKE pattern for "everything below $basepath":
      #  - drop trailing slashes so that exactly one separator is added
      #  - escape the LIKE wildcards (and the escape character itself) so
      #    that e.g. an underscore in the path only matches an underscore
      #  - append "/%" rather than "%", otherwise the entries of sibling
      #    directories sharing the prefix (".../foo" vs. ".../foobar")
      #    would be matched and deleted as well
      (my $pattern = $basepath) =~ s{/+$}{};
      $pattern =~ s/([\\%_])/\\$1/g;

      $dbFindFileFlagPath->execute("$pattern/%");
      my $ids = $dbFindFileFlagPath->fetchall_arrayref;
      for my $row (@$ids) {
          my $id = $$row[0];
          logger('DEBUG', "cleaning file and meta of id: $id");
          $dbClearMeta->execute($id);
          $dbClearFile->execute($id);
          $dbh->commit;
      }
  }
  146: 
  #
  # harvestFile($filename, $filepath)
  #
  # Reads the index file $filename in directory $filepath and puts its
  # contents in the database. The file is registered with its modification
  # time first; it is only parsed if registerFile() returns a file ID,
  # i.e. if the file is new or modified.
  #
  # Failures to stat or parse the file are logged and counted in $errcnt
  # instead of aborting the whole run.
  #
  sub harvestFile {
      my ($filename, $filepath) = @_;
      my $fullpath = "$filepath/$filename";
      logger('DEBUG', "looking at file '$filename' at '$filepath'");

      # get file time (element 9 of the stat list is the modification time)
      my $mtime = (stat($fullpath))[9];
      if (! defined $mtime) {
          logger('ERROR', "unable to stat file '$fullpath': $!");
          $errcnt++;
          return;
      }
      my $filetime = stime($mtime);

      # register file in db
      my $fid = registerFile($fullpath, $filetime);
      if ($fid) {
          # file is new/modified
          # parse index file; the parser may die on a broken file, so the
          # exception is caught here to keep the other files going
          my @data;
          my $parsed = eval {
              $metaParser->parse_uri($fullpath);
              @data = $metaParserHandler->getData();
              1;
          };
          if ($parsed) {
              # scalar(@data) is the element count ("$#data+1" inside a
              # string would print the last index followed by "+1")
              logger('DEBUG', "parsed " . scalar(@data) . " elements");
              registerMeta($fid, @data);
          } else {
              my $err = $@ || 'unknown error';
              chomp $err;
              logger('ERROR', "unable to parse file '$fullpath': $err");
              $errcnt++;
              # discard any database changes for this file that are still
              # uncommitted
              $dbh->rollback;
          }
      }
      $idxcnt++;
      logger('INFO', "$idxcnt index files of $fcnt") if ($idxcnt % 10 == 0);
  }
  174: 
  #
  # $fileid = registerFile($filepath, $filetime)
  #
  # Returns the file ID for the file $filepath. If necessary it will be
  # added to the database. Returns 0 if an update is not necessary, i.e.
  # if the file is already in the database with a modification time that
  # is not older than $filetime.
  #
  # In every case the file is flagged as seen in the file_flags table.
  # For a known but modified file the old meta entries are removed and the
  # stored modification time is updated, so that the caller can add the
  # new metadata.
  #
  sub registerFile {
      my ($filepath, $filetime) = @_;

      # look if file is in db
      $dbFindFileName->execute($filepath);
      my ($fileid, $mtime) = $dbFindFileName->fetchrow_array;
      # only the first row is needed
      $dbFindFileName->finish;

      if ($fileid) {
          # file is in db
          # update flag
          $dbSetFileFlag->execute($fileid, 1);
          $dbh->commit;
          # times are compared as strings
          # NOTE(review): assumes s2stime() and the caller's $filetime use
          # the same sortable format -- confirm in MPIWGStor.
          my $stime = s2stime($mtime);
          if ($stime ge $filetime) {
              # if its current return 0
              logger('DEBUG', "file: $fileid is old! time: '$stime' (vs '$filetime')");
              return 0;
          }
          logger('DEBUG', "file: $fileid is new! time: '$stime' (vs '$filetime')");
          # The file was modified: remove the outdated meta entries and
          # remember the new modification time. Without this the old
          # entries would stay next to the new ones and the file would be
          # harvested again on every run.
          # These changes are deliberately not committed here; the next
          # commit on $dbh makes them permanent, a rollback discards them.
          $dbClearMeta->execute($fileid);
          $dbh->do("update files set mtime=? where id=?", undef, $filetime, $fileid);
          return $fileid;
      }

      # file is not in db: get a new file id
      $dbNextFileId->execute;
      ($fileid) = $dbNextFileId->fetchrow_array;
      logger('DEBUG', "DB newfile: id=$fileid filename=$filepath mtime=$filetime");
      $dbNewFile->execute($fileid, $filepath, $filetime);
      # update flag
      $dbSetFileFlag->execute($fileid, 1);
      $dbh->commit;
      return $fileid;
  }
  214: 
  #
  # registerMeta($fileid, @meta)
  #
  # Writes the metadata entries in @meta for the file $fileid to the meta
  # table and commits. Every entry of @meta is an array reference; its
  # elements 0, 1 and 2 are stored in the columns tags, content and
  # attributes. The position of the entry in @meta is stored as idx.
  #
  sub registerMeta {
      my ($fileid, @meta) = @_;
      my $lastidx = $#meta;
      logger('DEBUG', "DB newmeta: fileid=$fileid ($lastidx)");

      my $idx = 0;
      for my $pos (0 .. $lastidx) {
          my ($tags, $content, $attributes) = @{ $meta[$pos] }[0, 1, 2];
          # note the column order of the insert: tags, attributes, content
          $dbNewMeta->execute($fileid, $pos, $tags, $attributes, $content);
          $idx = $pos + 1;
      }

      $dbh->commit;
      logger('INFO', "added $idx elements (file $fileid)");
  }
  231: 
  #
  # initDB()
  #
  # Prepares the database for a run on the already opened connection $dbh:
  #  - empties the tables files and meta if $purgeDB is set
  #  - creates the temporary table file_flags that marks the files seen
  #    during this run
  #  - prepares all statements used by the other subroutines
  #
  # Logs an ABORT message and exits with status 1 if the tables cannot be
  # emptied or the temporary table cannot be created.
  #
  sub initDB {
      # clean tables
      if ($purgeDB) {
          # do() returns undef on failure; check every statement, not just
          # the last one
          for my $table ('files', 'meta') {
              if (! defined $dbh->do("delete from $table")) {
                  logger('ABORT', "unable to clean table!");
                  exit 1;
              }
          }
          $dbh->commit;
      }

      # fresh (empty) flag table for this run
      my $rv = $dbh->do("create temporary table file_flags ( fileid integer primary key, flag integer )");
      if (! defined $rv) {
          logger('ABORT', "unable to create temporary table file_flags!");
          exit 1;
      }
      $dbh->commit;

      # prepare statements
      $dbNextFileId = $dbh->prepare("select nextval('files_id_seq')");
      $dbNewFile = $dbh->prepare("insert into files (id, filename, mtime) values (?,?,?)");
      $dbFindFileName = $dbh->prepare("select id,mtime from files where filename=?");
      # NOTE(review): this statement is not executed anywhere in this file
      # and selects a column "flag" from files -- confirm it is still needed.
      $dbFindFilePath = $dbh->prepare("select id,filename,flag from files where filename like ?");
      $dbClearFile = $dbh->prepare("delete from files where id=?");
      $dbFindFileFlag = $dbh->prepare("select fileid from file_flags where flag=?");
      # files matching a path pattern that have no flag entry
      $dbFindFileFlagPath = $dbh->prepare("select id from files left outer join file_flags on files.id=file_flags.fileid where filename like ? and flag is null");
      $dbSetFileFlag = $dbh->prepare("insert into file_flags (fileid, flag) values (?,?)");
      $dbNewMeta = $dbh->prepare("insert into meta (fileid, idx, tags, attributes, content) values (?,?,?,?,?)");
      $dbClearMeta = $dbh->prepare("delete from meta where fileid=?");

  }
  267: 
  268: #######################################################
  269: # main
  270: #
  271: 
  logger("INFO", "harvestmeta $version");

  # optionally purge the tables, create the flag table, prepare statements
  initDB();

  # read and process all files under $basedir
  my $fnum = readAllFiles($basedir, "");
  # delete orphaned data (under $basedir)
  cleanUnmarkedFiles($basedir);

  # close the database connection explicitly instead of relying on the
  # implicit cleanup at program exit
  $dbh->disconnect;

  logger("INFO", "analysed $idxcnt of $fnum files!");
  logger("INFO", "$warncnt warnings");
  logger("INFO", "$errcnt errors");
  if ($errcnt > 0) {
      logger("ABORT", "there were errors!");
      exit 1;
  } else {
      logger("DONE", "all index files read successfully!");
  }

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>