Annotation of foxridge-archiver/harvestmeta.pl, revision 1.2
1.1 casties 1: #!/usr/local/bin/perl -w
2:
3: use strict;
4: use XML::SAX;
1.2 ! casties 5: use XML::LibXML;
1.1 casties 6: use DBI;
7:
1.2 ! casties 8: use lib '/usr/local/mpiwg/archive_devel';
1.1 casties 9: use MPIWGStor;
10: use HarvestmetaHandler;
11:
12: # make output unbuffered
13: $|=1;
14:
15: #######################################################
16: # internal parameters
17: #
18:
19: # program version
1.2 ! casties 20: my $version = "0.2 (08.07.2004)";
1.1 casties 21:
22: # read command line parameters
23: my $args = MPIWGStor::parseargs;
24:
1.2 ! casties 25: if (! scalar(%$args)) {
! 26: print "harvestmeta $version\n";
! 27: print "use: harvestmeta -path=dir\n";
! 28: print " reads all metadata info from directory dir into the database\n";
! 29: print "alternative sources:\n";
! 30: print " -indexurl=url : read XML index and follow links\n";
! 31: print " -singleurl=url : read single index file\n";
! 32: print "additional options:\n";
! 33: print " -baseurl=url : clean all URL sources relative to this base\n";
! 34: print " -debug : output debugging info\n";
! 35: print " -purgedb : clear whole database\n";
! 36: exit 1;
! 37: }
! 38:
1.1 casties 39: # debug level
40: $debug = (exists $$args{'debug'}) ? $$args{'debug'} : 0;
41:
42: # XML namespace (not really implemented!)
43: my $namespace = "";
44:
45: # delete and rebuild database
46: my $purgeDB = (exists $$args{'purgedb'});
47:
48: # database connection
49: my $dbh = DBI->connect("dbi:Pg:dbname=storage", "archiver", "");
50: if (! $dbh) {
51: logger('ABORT', "unable to connect to database!");
52: exit 1;
53: }
54: $dbh->{AutoCommit} = 0;
55: my $dbNextFileId;
56: my $dbNewFile;
57: my $dbNewMeta;
58: my $dbClearMeta;
59: my $dbFindFileName;
60: my $dbFindFilePath;
61: my $dbClearFile;
62: my $dbFindFileFlag;
63: my $dbFindFileFlagPath;
64: my $dbSetFileFlag;
65: my $dbClearAllFileFlag;
66:
67: #######################################################
68: # check parameters that were passed to the program
69: #
1.2 ! casties 70: my $baseurl;
! 71: my $indexurl;
! 72: my $singleurl;
1.1 casties 73: my $basedir = $$args{'path'};
1.2 ! casties 74: if ($basedir) {
! 75: # strip trailing slashes
! 76: $basedir =~ s/\/$//;
! 77: if (! -d $basedir) {
! 78: logger("ABORT", "document directory \'$basedir\' doesn't exist!");
! 79: exit 1;
! 80: }
! 81: } else {
! 82: # use URL
! 83: $baseurl = $$args{'baseurl'};
! 84: $indexurl = $$args{'indexurl'};
! 85: $singleurl = $$args{'url'};
! 86: if (! (($indexurl)||($singleurl))) {
! 87: logger("ABORT", "no document source given!");
! 88: exit 1;
! 89: }
1.1 casties 90: }
91:
92: my $metaParserHandler = HarvestmetaHandler->new;
93: my $metaParser = XML::SAX::ParserFactory->parser(Handler => $metaParserHandler);
94:
95: #######################################################
96: # internal variables
97: #
98:
99: # number of errors
100: my $errcnt = 0;
101: # number of warnings
102: my $warncnt = 0;
103:
104: # number of files on fs
105: my $fcnt = 0;
106: # number of index files
107: my $idxcnt = 0;
108:
109: #######################################################
110: # subroutines
111: #
112:
113: #
114: # readAllFiles($realdir, $basedir, \%files, \%dirs)
115: #
116: # reads all files and directories below $realdir and puts the
117: # files in %files and directories in %dirs
118: # $basedir is only for recursion, it should be empty when called
119: # from outside
120: #
121: sub readAllFiles {
122: my ($directory, $basedir) = @_;
123: my $cnt = 0;
124:
125: if (! opendir DIR, $directory) {
126: return 0;
127: }
128: my @dirfiles = readdir DIR;
129: foreach my $fn (@dirfiles) {
130: # ignore names starting with a dot
131: next if ($fn =~ /^\./);
132: # ignore other silly files
133: next if ($junk_files{$fn});
134:
135: $cnt++;
136: $fcnt++;
137: my $f = "$directory/$fn";
138: my $docf = ($basedir) ? "$basedir/$fn" : $fn;
139: #logger('DEBUG', "fs_file: \"$f\"");
140: if (-f $f) {
141: #logger(" is file");
142: if ($fn eq "index.meta") {
143: harvestFile($fn, $directory);
144: }
145: } elsif (-d _) {
146: #logger(" is dir");
147: # recurse into directory
148: $cnt += readAllFiles($f, $docf);
149: }
150: }
151: return $cnt;
152: }
153:
154: #
155: # cleanUnmarkedFiles($basepath)
156: #
157: # deletes all unflagged file and meta entries.
158: #
159: sub cleanUnmarkedFiles {
160: my ($basepath) = @_;
161: my $rv = $dbFindFileFlagPath->execute("${basepath}%");
162: my $ids = $dbFindFileFlagPath->fetchall_arrayref;
163: for my $i (@$ids) {
164: my $id = $$i[0];
165: logger('DEBUG', "cleaning file and meta of id: $id");
166: $dbClearMeta->execute($id);
167: $dbClearFile->execute($id);
168: $dbh->commit;
169: }
170: }
171:
172: #
173: # harvestFile($filename, $filepath)
174: #
175: # reads the index file $filename at $filepath and puts the contents
176: # in the database.
177: #
178: sub harvestFile {
179: my ($filename, $filepath) = @_;
180: logger('DEBUG', "looking at file '$filename' at '$filepath'");
181: # get file time
182: my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
183: $atime,$mtime,$ctime,$blksize,$blocks)
184: = stat("$filepath/$filename");
185: my $filetime = stime($mtime);
186: # register file in db
187: my $fid = registerFile("$filepath/$filename", $filetime);
188: if ($fid) {
189: # file is new/modified
190: # parse index file
1.2 ! casties 191: my $ret = eval{$metaParser->parse_uri("$filepath/$filename")};
! 192: if ($@) {
! 193: my $errmsg = $@;
! 194: logger('ERROR', "error reading XML file '$filepath/$filename' ($errmsg)");
! 195: $errcnt++;
! 196: return;
! 197: }
1.1 casties 198: my @data = $metaParserHandler->getData();
199: logger('DEBUG', "parsed $#data+1 elements");
1.2 ! casties 200: if ($data[0][0] eq "html") {
! 201: # oops, wrong
! 202: logger('WARNING', "invalid HTML content in file $filepath/$filename");
! 203: return;
! 204: }
! 205: registerMeta($fid, @data);
! 206: }
! 207: $idxcnt++;
! 208: logger('INFO', "$idxcnt index files of $fcnt") if ($idxcnt % 10 == 0) ;
! 209: }
! 210:
! 211: #
! 212: # readURLIndex($baseurl)
! 213: #
! 214: # reads the XML index at $baseurl
! 215: # and processes all its entries
! 216: #
! 217: sub readURLIndex {
! 218: my ($baseurl) = @_;
! 219: my $cnt = 0;
! 220:
! 221: # parse index file
! 222: logger('DEBUG', "parsing $baseurl ...");
! 223: $metaParser->parse_uri($baseurl);
! 224: my @indexdata = $metaParserHandler->getData();
! 225: logger('INFO', "parsed $#indexdata+1 index entries");
! 226:
! 227: foreach my $me (@indexdata) {
! 228: $cnt++;
! 229: my ($tag, $val, $attr) = @$me;
! 230: my $meta;
! 231: my $file;
! 232: if ($tag =~ /index\/resource$/) {
! 233: if ($attr =~ /metaLink=\"([^\"]+)\"/) {
! 234: $meta = $1;
! 235: }
! 236: if ($attr =~ /resourceLink=\"([^\"]+)\"/) {
! 237: $file = $1;
! 238: }
! 239: if ($meta =~ /^http:/) {
! 240: harvestURL($meta, $file);
! 241: }
! 242: }
! 243: }
! 244: return $cnt;
! 245: }
! 246:
! 247: #
! 248: # harvestURL($metaurl, $fileurl)
! 249: #
! 250: # reads the index file from $metaurl and puts the contents
! 251: # in the database (under $filepath)
! 252: #
! 253: sub harvestURL {
! 254: my ($metaurl, $fileurl) = @_;
! 255: logger('DEBUG', "fetching from url '$metaurl' for '$fileurl'");
! 256: # try to parse index file
! 257: my $ret = eval{$metaParser->parse_uri($metaurl)};
! 258: if ($@) {
! 259: my $errmsg = $@;
! 260: logger('ERROR', "error reading XML from '$metaurl' ($errmsg)");
! 261: $errcnt++;
! 262: return;
! 263: }
! 264: my @data = $metaParserHandler->getData();
! 265: logger('DEBUG', "parsed $#data+1 elements");
! 266: if ($data[0][0] eq "html") {
! 267: # oops, wrong
! 268: logger('WARNING', "invalid HTML content from $metaurl");
! 269: $warncnt++;
! 270: return;
! 271: }
! 272: # filetime is now
! 273: my $filetime = stime(time);
! 274: # register file in db
! 275: my $fid = registerFile("$fileurl", $filetime);
! 276: if ($fid) {
! 277: # file is new/modified
1.1 casties 278: registerMeta($fid, @data);
279: }
280: $idxcnt++;
281: logger('INFO', "$idxcnt index files of $fcnt") if ($idxcnt % 10 == 0) ;
282: }
283:
1.2 ! casties 284:
1.1 casties 285: #
286: # $fileid = registerFile($filepath, $filetime)
287: #
288: # returns the file ID for the file $filepath. If necessary it
289: # will be added to the database. returns 0 if an update is not necessary.
290: #
291: sub registerFile {
292: my ($filepath, $filetime) = @_;
293: my $fileid = 0;
294: # look if file is in db
295: my $rv = $dbFindFileName->execute($filepath);
296: my $mtime;
297: ($fileid, $mtime) = $dbFindFileName->fetchrow_array;
298: if ($fileid) {
299: # file is in db
300: # update flag
301: $dbSetFileFlag->execute($fileid, 1);
302: $dbh->commit;
303: my $stime = s2stime($mtime);
304: if ($stime ge $filetime) {
305: # if its current return 0
306: logger('DEBUG', "file: $fileid is old! time: '$stime' (vs '$filetime')");
307: return 0;
308: } else {
309: logger('DEBUG', "file: $fileid is new! time: '$stime' (vs '$filetime')");
310: }
311: }
312: if (! $fileid) {
313: # get a new file id
314: my $rv = $dbNextFileId->execute;
315: ($fileid) = $dbNextFileId->fetchrow_array;
316: logger('DEBUG', "DB newfile: id=$fileid filename=$filepath mtime=$filetime");
317: $dbNewFile->execute($fileid, $filepath, $filetime);
318: # update flag
319: $dbSetFileFlag->execute($fileid, 1);
320: $dbh->commit;
321: }
322: return $fileid;
323: }
324:
325: #
326: # registerMeta($fileid, @meta)
327: #
328: # adds the metadata information @meta for $fileid to the database.
329: #
330: sub registerMeta {
331: my ($fileid, @meta) = @_;
332: logger('DEBUG', "DB newmeta: fileid=$fileid ($#meta)");
1.2 ! casties 333: # clear out old data
! 334: $dbClearMeta->execute($fileid);
1.1 casties 335: my $idx = 0;
336: foreach my $keyval (@meta) {
337: #logger('DEBUG', " DB meta: $$keyval[0]=$$keyval[1]");
338: $dbNewMeta->execute($fileid, $idx++, $$keyval[0], $$keyval[2], $$keyval[1]);
339: }
340: $dbh->commit;
341: logger('INFO', "added $idx elements (file $fileid)");
342: }
343:
344: #
345: # initdb()
346: #
347: # initialises the database connection.
348: #
349: sub initDB {
350: my $rv;
351: # clean tables
352: if ($purgeDB) {
353: $rv = $dbh->do("delete from files");
354: $rv = $dbh->do("delete from meta");
355: if ($dbh->err) {
356: logger('ABORT', "unable to clean table!");
357: exit 1;
358: }
359: $dbh->commit;
360: }
361:
362: # clear flags
363: $rv = $dbh->do("create temporary table file_flags ( fileid integer primary key, flag integer )");
364: $dbh->commit;
365:
366: # prepare statements
367: $dbNextFileId = $dbh->prepare("select nextval('files_id_seq')");
368: $dbNewFile = $dbh->prepare("insert into files (id, filename, mtime) values (?,?,?)");
369: $dbFindFileName = $dbh->prepare("select id,mtime from files where filename=?");
370: $dbFindFilePath = $dbh->prepare("select id,filename,flag from files where filename like ?");
371: $dbClearFile = $dbh->prepare("delete from files where id=?");
372: $dbFindFileFlag = $dbh->prepare("select fileid from file_flags where flag=?");
373: $dbFindFileFlagPath = $dbh->prepare("select id from files left outer join file_flags on files.id=file_flags.fileid where filename like ? and flag is null");
374: $dbSetFileFlag = $dbh->prepare("insert into file_flags (fileid, flag) values (?,?)");
375: $dbNewMeta = $dbh->prepare("insert into meta (fileid, idx, tags, attributes, content) values (?,?,?,?,?)");
376: $dbClearMeta = $dbh->prepare("delete from meta where fileid=?");
377:
378: }
379:
380: #######################################################
381: # main
382: #
383:
384: logger("INFO", "harvestmeta $version");
385:
386: initDB();
1.2 ! casties 387: my $fnum;
1.1 casties 388:
1.2 ! casties 389: if ($basedir) {
! 390: # read and process all files under $basedir
! 391: $fnum = readAllFiles($basedir, "");
! 392: # delete orphaned data (under $basedir)
! 393: cleanUnmarkedFiles($basedir);
! 394: } elsif ($baseurl) {
! 395: # read and process XML index
! 396: $fnum = readURLIndex($indexurl);
! 397: if ($baseurl) {
! 398: # delete orphaned data (under $baseurl)
! 399: cleanUnmarkedFiles($baseurl);
! 400: }
! 401: } elsif ($singleurl) {
! 402: # read and process single XML url
! 403: harvestURL($singleurl, $singleurl);
! 404: $fnum = 1;
! 405: if ($baseurl) {
! 406: # delete orphaned data (under $baseurl)
! 407: cleanUnmarkedFiles($baseurl);
! 408: }
! 409: }
1.1 casties 410:
411: logger("INFO", "analysed $idxcnt of $fnum files!");
412: logger("INFO", "$warncnt warnings");
413: logger("INFO", "$errcnt errors");
414: if ($errcnt > 0) {
415: logger("ABORT", "there were errors!");
416: exit 1;
417: } else {
418: logger("DONE", "all index files read successfully!");
419: }
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>