1: #!/usr/local/bin/perl -w
2:
3: use strict;
4: use XML::SAX;
5: use DBI;
6:
7: use lib '/usr/local/mpiwg/archive';
8: use MPIWGStor;
9: use HarvestmetaHandler;
10:
# flush every write to the output immediately
$| = 1;

#######################################################
# internal parameters
#

# version string of this program (logged at startup)
my $version = "0.1 (08.06.2004)";

# command line parameters as a hash reference
my $args = MPIWGStor::parseargs();

# debug level: value of the 'debug' parameter, 0 when it was not given
# ($debug is not declared here -- presumably exported by MPIWGStor)
if (exists $args->{'debug'}) {
    $debug = $args->{'debug'};
} else {
    $debug = 0;
}

# XML namespace (not really implemented!)
my $namespace = "";

# true when the 'purgedb' parameter was given: delete and rebuild database
my $purgeDB = exists $args->{'purgedb'};

# database connection
my $dbh = DBI->connect("dbi:Pg:dbname=storage", "archiver", "");
unless ($dbh) {
    logger('ABORT', "unable to connect to database!");
    exit 1;
}
# all changes are committed explicitly
$dbh->{AutoCommit} = 0;

# prepared statement handles, filled in by initDB()
my (
    $dbNextFileId,
    $dbNewFile,
    $dbNewMeta,
    $dbClearMeta,
    $dbFindFileName,
    $dbFindFilePath,
    $dbClearFile,
    $dbFindFileFlag,
    $dbFindFileFlagPath,
    $dbSetFileFlag,
    $dbClearAllFileFlag,
);
51:
#######################################################
# check parameters that were passed to the program
#

# root directory of the documents to harvest (parameter 'path')
my $basedir = $$args{'path'};
if (! $basedir) {
    logger("ABORT", "no document directory given!");
    exit 1;
}
# strip ALL trailing slashes, so that paths built as "$basedir/$name"
# (and stored in the database) never contain a doubled slash
$basedir =~ s{/+$}{};
if (! -d $basedir) {
    logger("ABORT", "document directory \'$basedir\' doesn't exist!");
    exit 1;
}

# SAX parser for the index files; the parsed data is fetched from the
# handler with getData() after each parse
my $metaParserHandler = HarvestmetaHandler->new;
my $metaParser = XML::SAX::ParserFactory->parser(Handler => $metaParserHandler);
69:
#######################################################
# internal variables
#

# running totals of errors and warnings
my ($errcnt, $warncnt) = (0, 0);

# running totals of entries seen on the file system and of index files
my ($fcnt, $idxcnt) = (0, 0);
83:
84: #######################################################
85: # subroutines
86: #
87:
#
# $cnt = readAllFiles($directory, $basedir)
#
# walks the directory tree below $directory and calls harvestFile()
# for every plain file named "index.meta".
# Names starting with a dot and names listed in %junk_files
# (not declared in this file -- presumably exported by MPIWGStor)
# are skipped.
# $basedir is the path relative to the start directory. It is only
# passed along in the recursion and should be empty when called
# from outside.
# Returns the number of entries (files and directories) seen below
# $directory, or 0 if $directory can not be read. Every entry is also
# counted in the global $fcnt.
#
sub readAllFiles {
    my ($directory, $basedir) = @_;
    my $cnt = 0;

    # use a lexical handle: the sub calls itself recursively and a
    # global bareword handle would be shared by all levels
    my $dh;
    if (! opendir $dh, $directory) {
        logger('WARNING', "unable to read directory '$directory': $!");
        $warncnt++;
        return 0;
    }
    my @dirfiles = readdir $dh;
    closedir $dh;
    foreach my $fn (@dirfiles) {
        # ignore names starting with a dot
        next if ($fn =~ /^\./);
        # ignore other silly files
        next if ($junk_files{$fn});

        $cnt++;
        $fcnt++;
        my $f = "$directory/$fn";
        my $docf = ($basedir) ? "$basedir/$fn" : $fn;
        if (-f $f) {
            # only the index files are harvested
            if ($fn eq "index.meta") {
                harvestFile($fn, $directory);
            }
        } elsif (-d _) {
            # "_" reuses the stat result of the -f test above
            # recurse into directory
            $cnt += readAllFiles($f, $docf);
        }
    }
    return $cnt;
}
128:
#
# cleanUnmarkedFiles($basepath)
#
# deletes the file and meta entries of all files below $basepath
# that have no entry in the file_flags table, i.e. that were not
# flagged by registerFile() in this run.
# Every deletion is committed separately.
#
sub cleanUnmarkedFiles {
    my ($basepath) = @_;
    # build the LIKE pattern for "everything below $basepath":
    # - drop trailing slashes, the separator is added below
    # - escape the LIKE wildcards (and the escape character itself)
    #   so that the path is matched literally
    # - end the prefix with "/" so that sibling paths that merely start
    #   with the same characters (e.g. "/a/foobar" for "/a/foo")
    #   are not matched
    (my $pattern = $basepath) =~ s{/+$}{};
    $pattern =~ s/([\\%_])/\\$1/g;
    $dbFindFileFlagPath->execute("$pattern/%");
    my $ids = $dbFindFileFlagPath->fetchall_arrayref;
    for my $i (@$ids) {
        my $id = $$i[0];
        logger('DEBUG', "cleaning file and meta of id: $id");
        $dbClearMeta->execute($id);
        $dbClearFile->execute($id);
        $dbh->commit;
    }
}
146:
#
# harvestFile($filename, $filepath)
#
# reads the index file $filename in directory $filepath and puts its
# contents in the database if registerFile() reports the file as new
# or modified.
# An index file that can not be parsed is logged and counted in
# $errcnt; the run continues with the next file.
# Counts every index file in the global $idxcnt and logs the progress
# after every 10th index file.
#
sub harvestFile {
    my ($filename, $filepath) = @_;
    my $file = "$filepath/$filename";
    logger('DEBUG', "looking at file '$filename' at '$filepath'");
    # get file time (field 9 of stat is the modification time)
    my $mtime = (stat($file))[9];
    my $filetime = stime($mtime);
    # register file in db
    my $fid = registerFile($file, $filetime);
    if ($fid) {
        # file is new/modified
        # parse index file; an exception raised by the parser is caught
        # here so that one broken file doesn't abort the whole run
        if (eval { $metaParser->parse_uri($file); 1 }) {
            my @data = $metaParserHandler->getData();
            my $num = scalar @data;
            logger('DEBUG', "parsed $num elements");
            registerMeta($fid, @data);
        } else {
            $errcnt++;
            logger('ERROR', "unable to parse '$file': $@");
        }
    }
    $idxcnt++;
    logger('INFO', "$idxcnt index files of $fcnt") if ($idxcnt % 10 == 0);
}
174:
#
# $fileid = registerFile($filepath, $filetime)
#
# returns the file ID for the file $filepath if its contents have to be
# (re-)read: the file is either not yet in the database (then it is
# added with a new ID) or its stored time is older than $filetime
# (then the stored time is updated).
# Returns 0 if an update is not necessary.
# In all cases the file is flagged in file_flags as seen in this run.
#
sub registerFile {
    my ($filepath, $filetime) = @_;
    # look if file is in db
    $dbFindFileName->execute($filepath);
    my ($fileid, $mtime) = $dbFindFileName->fetchrow_array;
    # only one row is needed
    $dbFindFileName->finish;
    if ($fileid) {
        # file is in db
        # update flag
        $dbSetFileFlag->execute($fileid, 1);
        $dbh->commit;
        my $stime = s2stime($mtime);
        # the time stamps are compared as strings
        if ($stime ge $filetime) {
            # if its current return 0
            logger('DEBUG', "file: $fileid is old! time: '$stime' (vs '$filetime')");
            return 0;
        }
        logger('DEBUG', "file: $fileid is new! time: '$stime' (vs '$filetime')");
        # remember the new file time; otherwise the file would count as
        # modified again in every following run
        $dbh->do("update files set mtime=? where id=?", undef, $filetime, $fileid);
        $dbh->commit;
        return $fileid;
    }
    # file is not in db: get a new file id
    $dbNextFileId->execute;
    ($fileid) = $dbNextFileId->fetchrow_array;
    logger('DEBUG', "DB newfile: id=$fileid filename=$filepath mtime=$filetime");
    $dbNewFile->execute($fileid, $filepath, $filetime);
    # update flag
    $dbSetFileFlag->execute($fileid, 1);
    $dbh->commit;
    return $fileid;
}
214:
#
# registerMeta($fileid, @meta)
#
# stores the metadata information @meta for $fileid in the database,
# replacing any meta entries already stored for $fileid.
# Every element of @meta is an array reference; element 0 goes to the
# column "tags", element 2 to "attributes" and element 1 to "content".
# The entries are numbered from 0 in the column "idx".
#
sub registerMeta {
    my ($fileid, @meta) = @_;
    logger('DEBUG', "DB newmeta: fileid=$fileid ($#meta)");
    # remove the entries of an earlier version of the file first, so
    # that a modified file doesn't end up with old and new entries
    # (deletes nothing for a new file)
    $dbClearMeta->execute($fileid);
    my $idx = 0;
    foreach my $keyval (@meta) {
        #logger('DEBUG', "  DB meta: $$keyval[0]=$$keyval[1]");
        $dbNewMeta->execute($fileid, $idx++, $$keyval[0], $$keyval[2], $$keyval[1]);
    }
    # delete and inserts are committed together
    $dbh->commit;
    logger('INFO', "added $idx elements (file $fileid)");
}
231:
#
# initDB()
#
# prepares the database for a harvesting run:
# - empties the tables "files" and "meta" if $purgeDB is set
# - creates the temporary table "file_flags" that records which
#   files were seen in this run
# - prepares the statement handles used by the other subroutines
# Logs an ABORT message and exits with status 1 if a table can not be
# emptied or the flag table can not be created.
#
sub initDB {
    # clean tables
    if ($purgeDB) {
        # check after each statement: the error state is reset by the
        # next DBI call, so one check at the end would miss a failure
        # of the first delete
        foreach my $sql ("delete from files", "delete from meta") {
            $dbh->do($sql);
            if ($dbh->err) {
                logger('ABORT', "unable to clean table!");
                exit 1;
            }
        }
        $dbh->commit;
    }

    # create the (empty) flag table for this run
    $dbh->do("create temporary table file_flags ( fileid integer primary key, flag integer )");
    if ($dbh->err) {
        logger('ABORT', "unable to create flag table!");
        exit 1;
    }
    $dbh->commit;

    # prepare statements
    $dbNextFileId = $dbh->prepare("select nextval('files_id_seq')");
    $dbNewFile = $dbh->prepare("insert into files (id, filename, mtime) values (?,?,?)");
    $dbFindFileName = $dbh->prepare("select id,mtime from files where filename=?");
    # NOTE(review): this statement reads a column "flag" from "files" and
    # is not executed anywhere in this file -- confirm it is still needed
    $dbFindFilePath = $dbh->prepare("select id,filename,flag from files where filename like ?");
    $dbClearFile = $dbh->prepare("delete from files where id=?");
    $dbFindFileFlag = $dbh->prepare("select fileid from file_flags where flag=?");
    # files matching a name pattern that have no entry in file_flags
    $dbFindFileFlagPath = $dbh->prepare("select id from files left outer join file_flags on files.id=file_flags.fileid where filename like ? and flag is null");
    $dbSetFileFlag = $dbh->prepare("insert into file_flags (fileid, flag) values (?,?)");
    $dbNewMeta = $dbh->prepare("insert into meta (fileid, idx, tags, attributes, content) values (?,?,?,?,?)");
    $dbClearMeta = $dbh->prepare("delete from meta where fileid=?");

}
267:
#######################################################
# main
#

logger("INFO", "harvestmeta $version");

# set up the flag table and the prepared statements
initDB();

# walk the tree below $basedir and harvest the index files found
my $fnum = readAllFiles($basedir, "");
# remove the database entries (under $basedir) that were not flagged
cleanUnmarkedFiles($basedir);

# summary of the run
foreach my $line ("analysed $idxcnt of $fnum files!",
                  "$warncnt warnings",
                  "$errcnt errors") {
    logger("INFO", $line);
}

# any counted error makes the run fail
if ($errcnt > 0) {
    logger("ABORT", "there were errors!");
    exit 1;
}
logger("DONE", "all index files read successfully!");
# FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>