Annotation of foxridge-archiver/harvestmeta.pl, revision 1.1
1.1 ! casties 1: #!/usr/local/bin/perl -w
! 2:
! 3: use strict;
! 4: use XML::SAX;
! 5: use DBI;
! 6:
! 7: use lib '/usr/local/mpiwg/archive';
! 8: use MPIWGStor;
! 9: use HarvestmetaHandler;
! 10:
# switch off output buffering so log lines show up immediately
$| = 1;

#######################################################
# internal parameters
#

# version string, reported in the start-up log line
my $version = "0.1 (08.06.2004)";

# command line parameters (used as a hash reference below)
my $args = MPIWGStor::parseargs();

# debug level, 0 unless given on the command line
# NOTE(review): $debug is not declared in this file; presumably it is
# exported by MPIWGStor -- confirm.
$debug = (exists $args->{'debug'}) ? $args->{'debug'} : 0;

# XML namespace (not really implemented!)
my $namespace = "";

# true if the 'purgedb' parameter was given: delete and rebuild database
my $purgeDB = exists $args->{'purgedb'};

# database connection; work is committed explicitly, so AutoCommit is
# switched off right after connecting
my $dbh = DBI->connect("dbi:Pg:dbname=storage", "archiver", "");
unless ($dbh) {
    logger('ABORT', "unable to connect to database!");
    exit 1;
}
$dbh->{AutoCommit} = 0;

# statement handles, filled in by initDB()
my (
    $dbNextFileId,
    $dbNewFile,
    $dbNewMeta,
    $dbClearMeta,
    $dbFindFileName,
    $dbFindFilePath,
    $dbClearFile,
    $dbFindFileFlag,
    $dbFindFileFlagPath,
    $dbSetFileFlag,
    $dbClearAllFileFlag,
);
! 51:
#######################################################
# check parameters that were passed to the program
#

# document directory to harvest ('path' parameter, required)
my $basedir = $$args{'path'};
if (! $basedir) {
    logger("ABORT", "no document directory given!");
    exit 1;
}
# strip ALL trailing slashes, so that "dir", "dir/" and "dir//" lead to the
# same file names in the database. The look-behind keeps the first
# character, so a lone "/" is not reduced to the empty string.
$basedir =~ s{(?<=.)/+$}{};
if (! -d $basedir) {
    logger("ABORT", "document directory \'$basedir\' doesn't exist!");
    exit 1;
}

# SAX handler that collects the contents of an index file and the
# parser that feeds it
my $metaParserHandler = HarvestmetaHandler->new;
my $metaParser = XML::SAX::ParserFactory->parser(Handler => $metaParserHandler);

#######################################################
# internal variables
#

# number of errors
my $errcnt = 0;
# number of warnings
my $warncnt = 0;

# number of files on fs
my $fcnt = 0;
# number of index files
my $idxcnt = 0;
! 83:
! 84: #######################################################
! 85: # subroutines
! 86: #
! 87:
#
# $count = readAllFiles($directory, $basedir)
#
# walks recursively through all files and directories below $directory
# and calls harvestFile() for every plain file named "index.meta".
# names starting with a dot and names listed in %junk_files are skipped.
# $basedir is only for recursion (it carries the path relative to the
# starting directory), it should be empty when called from outside.
#
# returns the number of entries counted below $directory (0 if the
# directory could not be opened). every counted entry also increments
# the global counter $fcnt.
#
sub readAllFiles {
    my ($directory, $basedir) = @_;
    my $cnt = 0;

    # lexical handle: the function recurses, so a shared bareword handle
    # would be reopened by every nested call
    my $dirhandle;
    if (! opendir $dirhandle, $directory) {
        # don't skip a subtree silently
        # NOTE(review): assumes logger() accepts 'WARNING' as a level -- confirm.
        logger('WARNING', "unable to read directory '$directory': $!");
        $warncnt++;
        return 0;
    }
    # read everything first so the handle can be closed before recursing
    my @dirfiles = readdir $dirhandle;
    closedir $dirhandle;

    foreach my $fn (@dirfiles) {
        # ignore names starting with a dot
        next if ($fn =~ /^\./);
        # ignore other silly files
        next if ($junk_files{$fn});

        $cnt++;
        $fcnt++;
        my $f = "$directory/$fn";
        if (-f $f) {
            if ($fn eq "index.meta") {
                harvestFile($fn, $directory);
            }
        } elsif (-d _) {
            # "_" reuses the stat data of the -f test above
            # recurse into directory
            my $docf = ($basedir) ? "$basedir/$fn" : $fn;
            $cnt += readAllFiles($f, $docf);
        }
    }
    return $cnt;
}
! 128:
#
# cleanUnmarkedFiles($basepath)
#
# deletes the file and meta entries of all files below $basepath that
# have no entry in the file_flags table, i.e. that were not seen in
# this run.
#
sub cleanUnmarkedFiles {
    my ($basepath) = @_;

    # build a LIKE pattern that matches only files BELOW $basepath:
    # - the "/" before the wildcard keeps "/a/doc" from matching "/a/docs2/..."
    # - "%", "_" and "\" in the path are escaped so that they are taken
    #   literally (backslash is the default LIKE escape character)
    my $pattern = $basepath;
    $pattern =~ s{/+$}{};
    $pattern =~ s/([\\%_])/\\$1/g;
    # an empty base path keeps its old meaning: match everything
    $pattern = ($pattern eq '') ? '%' : "$pattern/%";

    $dbFindFileFlagPath->execute($pattern);
    my $ids = $dbFindFileFlagPath->fetchall_arrayref;
    for my $row (@$ids) {
        my $id = $row->[0];
        logger('DEBUG', "cleaning file and meta of id: $id");
        # meta rows first, then the file row; one commit per file
        $dbClearMeta->execute($id);
        $dbClearFile->execute($id);
        $dbh->commit;
    }
}
! 146:
#
# harvestFile($filename, $filepath)
#
# reads the index file $filename at $filepath and puts the contents
# in the database (only if registerFile() reports it as new or modified).
#
# a file that can't be stat'ed or parsed is logged and counted in
# $errcnt; the run continues with the next file.
#
sub harvestFile {
    my ($filename, $filepath) = @_;
    my $fullpath = "$filepath/$filename";
    logger('DEBUG', "looking at file '$filename' at '$filepath'");

    # get file time (element 9 of the stat list is the mtime)
    my $mtime = (stat($fullpath))[9];
    if (! defined $mtime) {
        # NOTE(review): assumes logger() accepts 'ERROR' as a level -- confirm.
        logger('ERROR', "unable to stat file '$fullpath': $!");
        $errcnt++;
        return;
    }
    my $filetime = stime($mtime);

    # register file in db
    my $fid = registerFile($fullpath, $filetime);
    if ($fid) {
        # file is new/modified
        # parse index file; the parser reports malformed XML by dying,
        # which must not end the whole run
        my $parsed = eval { $metaParser->parse_uri($fullpath); 1 };
        if ($parsed) {
            my @data = $metaParserHandler->getData();
            logger('DEBUG', "parsed " . scalar(@data) . " elements");
            registerMeta($fid, @data);
        } else {
            my $err = $@ || 'unknown error';
            logger('ERROR', "unable to parse file '$fullpath': $err");
            $errcnt++;
            # make sure nothing half-finished for this file gets committed
            # together with the changes for the next file
            $dbh->rollback;
            # NOTE(review): registerFile() commits the row of a NEW file
            # before parsing, so such a file is not tried again until its
            # mtime changes.
        }
    }
    $idxcnt++;
    logger('INFO', "$idxcnt index files of $fcnt") if ($idxcnt % 10 == 0);
}
! 174:
#
# $fileid = registerFile($filepath, $filetime)
#
# returns the file ID for the file $filepath. If necessary it
# will be added to the database. returns 0 if an update is not necessary.
#
# every file found is flagged in file_flags so that it is not removed
# as an orphan later. for a modified file the old meta entries are
# deleted and the stored mtime is set to $filetime.
#
sub registerFile {
    my ($filepath, $filetime) = @_;

    # look if file is in db
    $dbFindFileName->execute($filepath);
    my ($fileid, $mtime) = $dbFindFileName->fetchrow_array;

    if ($fileid) {
        # file is in db
        # update flag
        $dbSetFileFlag->execute($fileid, 1);
        $dbh->commit;
        my $stime = s2stime($mtime);
        # timestamps are compared as strings
        if ($stime ge $filetime) {
            # if its current return 0
            logger('DEBUG', "file: $fileid is old! time: '$stime' (vs '$filetime')");
            return 0;
        }
        logger('DEBUG', "file: $fileid is new! time: '$stime' (vs '$filetime')");
        # file was modified: drop the outdated meta entries and remember the
        # new mtime. without this the file would be harvested again on every
        # run and its meta entries would pile up.
        # deliberately NOT committed here: both changes stay in the open
        # transaction and become permanent with the commit that follows the
        # insertion of the new meta entries (registerMeta() commits).
        $dbClearMeta->execute($fileid);
        $dbh->do("update files set mtime=? where id=?", undef, $filetime, $fileid);
        return $fileid;
    }

    # file is not in db: get a new file id
    $dbNextFileId->execute;
    ($fileid) = $dbNextFileId->fetchrow_array;
    logger('DEBUG', "DB newfile: id=$fileid filename=$filepath mtime=$filetime");
    $dbNewFile->execute($fileid, $filepath, $filetime);
    # update flag
    $dbSetFileFlag->execute($fileid, 1);
    $dbh->commit;
    return $fileid;
}
! 214:
#
# registerMeta($fileid, @meta)
#
# stores the metadata entries @meta for $fileid in the database.
# every entry is an array reference; element 0 goes to the "tags"
# column, element 1 to "content" and element 2 to "attributes".
# the position in @meta is stored as "idx". commits when done.
#
sub registerMeta {
    my ($fileid, @meta) = @_;
    logger('DEBUG', "DB newmeta: fileid=$fileid ($#meta)");
    my $idx = 0;
    while ($idx < scalar @meta) {
        my $entry = $meta[$idx];
        # column order of the insert is (fileid, idx, tags, attributes, content)
        $dbNewMeta->execute($fileid, $idx, $entry->[0], $entry->[2], $entry->[1]);
        $idx++;
    }
    $dbh->commit;
    logger('INFO', "added $idx elements (file $fileid)");
}
! 231:
#
# initDB()
#
# prepares the database for a run: empties the tables if $purgeDB is
# set, creates the temporary table file_flags and prepares all
# statement handles. aborts the program if a table can't be emptied
# or the flag table can't be created.
#
sub initDB {
    # clean tables
    if ($purgeDB) {
        # meta before files, the same order as in cleanUnmarkedFiles().
        # do() returns undef on failure; check every statement, not only
        # the last one.
        foreach my $table ('meta', 'files') {
            if (! defined $dbh->do("delete from $table")) {
                logger('ABORT', "unable to clean table!");
                exit 1;
            }
        }
        $dbh->commit;
    }

    # clear flags: the temporary table starts out empty for every run
    if (! defined $dbh->do("create temporary table file_flags ( fileid integer primary key, flag integer )")) {
        logger('ABORT', "unable to create flag table!");
        exit 1;
    }
    $dbh->commit;

    # prepare statements
    $dbNextFileId = $dbh->prepare("select nextval('files_id_seq')");
    $dbNewFile = $dbh->prepare("insert into files (id, filename, mtime) values (?,?,?)");
    $dbFindFileName = $dbh->prepare("select id,mtime from files where filename=?");
    # NOTE(review): this selects a "flag" column from files although the
    # flags are kept in file_flags; the statement is not used in this
    # file -- check against the schema.
    $dbFindFilePath = $dbh->prepare("select id,filename,flag from files where filename like ?");
    $dbClearFile = $dbh->prepare("delete from files where id=?");
    $dbFindFileFlag = $dbh->prepare("select fileid from file_flags where flag=?");
    # files matching a path pattern that have no entry in file_flags
    $dbFindFileFlagPath = $dbh->prepare("select id from files left outer join file_flags on files.id=file_flags.fileid where filename like ? and flag is null");
    $dbSetFileFlag = $dbh->prepare("insert into file_flags (fileid, flag) values (?,?)");
    $dbNewMeta = $dbh->prepare("insert into meta (fileid, idx, tags, attributes, content) values (?,?,?,?,?)");
    $dbClearMeta = $dbh->prepare("delete from meta where fileid=?");

}
! 267:
#######################################################
# main
#

logger("INFO", "harvestmeta $version");

initDB();

# walk the tree under $basedir and harvest every index file found
my $fnum = readAllFiles($basedir, "");
# remove database entries (under $basedir) that were not seen in this run
cleanUnmarkedFiles($basedir);

# summary
foreach my $line ("analysed $idxcnt of $fnum files!",
                  "$warncnt warnings",
                  "$errcnt errors") {
    logger("INFO", $line);
}

# the exit status tells the caller whether there were errors
if ($errcnt <= 0) {
    logger("DONE", "all index files read successfully!");
} else {
    logger("ABORT", "there were errors!");
    exit 1;
}
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>