comparison harvestmeta.pl @ 0:30497c6a3eca

Initial revision
author casties
date Thu, 17 Jun 2004 17:58:42 +0200
parents
children 1a51f94d5dbd
comparison
equal deleted inserted replaced
-1:000000000000 0:30497c6a3eca
1 #!/usr/local/bin/perl -w
2
3 use strict;
4 use XML::SAX;
5 use DBI;
6
7 use lib '/usr/local/mpiwg/archive';
8 use MPIWGStor;
9 use HarvestmetaHandler;
10
# flush output after every write so that log lines show up immediately
$| = 1;

#######################################################
# internal parameters
#

# program version
my $version = "0.1 (08.06.2004)";

# command line parameters; used below as a reference to a hash
my $args = MPIWGStor::parseargs();

# debug level: the value of the 'debug' parameter if present, otherwise 0
# NOTE(review): $debug is not declared in this file -- presumably it is
# exported by MPIWGStor; confirm.
if (exists $args->{'debug'}) {
    $debug = $args->{'debug'};
} else {
    $debug = 0;
}

# XML namespace (not really implemented!)
my $namespace = "";

# delete and rebuild database (true when the 'purgedb' parameter is present)
my $purgeDB = exists $args->{'purgedb'};

# database connection
my $dbh = DBI->connect("dbi:Pg:dbname=storage", "archiver", "");
unless ($dbh) {
    logger('ABORT', "unable to connect to database!");
    exit 1;
}
# no auto-commit: changes become permanent only through the explicit
# $dbh->commit calls in the subroutines below
$dbh->{AutoCommit} = 0;

# statement handles; they stay undefined until initDB() prepares them
my ($dbNextFileId, $dbNewFile, $dbNewMeta, $dbClearMeta);
my ($dbFindFileName, $dbFindFilePath, $dbClearFile);
my ($dbFindFileFlag, $dbFindFileFlagPath, $dbSetFileFlag, $dbClearAllFileFlag);
51
#######################################################
# check parameters that were passed to the program
#

# directory to harvest, taken from the 'path' parameter
my $basedir = $$args{'path'};
if (! $basedir) {
    logger("ABORT", "no document directory given!");
    exit 1;
}
# strip all trailing slashes (not just one), because file names are later
# built as "$basedir/..." and should not contain doubled separators
$basedir =~ s{/+$}{};
if (! -d $basedir) {
    logger("ABORT", "document directory \'$basedir\' doesn't exist!");
    exit 1;
}

# handler object that receives the SAX events of an index file, and the
# parser that delivers the events to it
my $metaParserHandler = HarvestmetaHandler->new;
my $metaParser = XML::SAX::ParserFactory->parser(Handler => $metaParserHandler);
69
#######################################################
# internal variables
#

# number of errors and number of warnings (both reported at the end)
my ($errcnt, $warncnt) = (0, 0);

# number of entries seen on the file system (incremented by readAllFiles)
# and number of index files looked at (incremented by harvestFile)
my ($fcnt, $idxcnt) = (0, 0);
84 #######################################################
85 # subroutines
86 #
87
#
# $count = readAllFiles($directory, $basedir)
#
# Walks the directory tree below $directory and calls harvestFile() for
# every regular file named "index.meta".
#
#   $directory  directory to read
#   $basedir    path of $directory relative to the starting point; it is
#               only carried along through the recursion and should be
#               empty when called from outside
#
# Returns the number of entries (files and directories) found below
# $directory. Names starting with a dot and names listed in %junk_files
# are skipped and not counted. Every counted entry also increments the
# file-level counter $fcnt.
# If $directory cannot be opened, a warning is logged, $warncnt is
# incremented and 0 is returned.
#
sub readAllFiles {
    my ($directory, $basedir) = @_;
    my $cnt = 0;

    # use a lexical handle: this sub recurses, and a shared bareword
    # handle would be reused by every level and never closed
    my $dh;
    if (! opendir $dh, $directory) {
        logger('WARNING', "unable to read directory '$directory': $!");
        $warncnt++;
        return 0;
    }
    # read the complete listing first so the handle can be closed before
    # descending into subdirectories
    my @dirfiles = readdir $dh;
    closedir $dh;

    foreach my $fn (@dirfiles) {
        # ignore names starting with a dot
        next if ($fn =~ /^\./);
        # ignore other silly files
        # NOTE(review): %junk_files is not defined in this file --
        # presumably exported by MPIWGStor; confirm.
        next if ($junk_files{$fn});

        $cnt++;
        $fcnt++;
        my $f = "$directory/$fn";
        my $docf = ($basedir) ? "$basedir/$fn" : $fn;
        if (-f $f) {
            # regular file: only index files are harvested
            if ($fn eq "index.meta") {
                harvestFile($fn, $directory);
            }
        } elsif (-d _) {
            # "_" reuses the stat information of the -f test above
            # recurse into directory
            $cnt += readAllFiles($f, $docf);
        }
    }
    return $cnt;
}
128
#
# cleanUnmarkedFiles($basepath)
#
# Deletes the file and meta entries of all files below $basepath that
# have no row in the file_flags table, i.e. files that registerFile()
# did not flag during this run.
#
#   $basepath  directory path (without trailing slash) whose entries
#              are checked
#
# Each file is removed in its own transaction (meta rows first, then the
# file row).
#
sub cleanUnmarkedFiles {
    my ($basepath) = @_;
    # escape the LIKE metacharacters so the path is matched literally
    # (backslash is the default LIKE escape character in PostgreSQL)
    (my $prefix = $basepath) =~ s/([\\%_])/\\$1/g;
    # anchor the pattern at the directory separator: file names are stored
    # as "<directory>/<name>", and a bare prefix match for "/a/b" would
    # also hit entries of a sibling directory like "/a/bc"
    $dbFindFileFlagPath->execute("$prefix/%");
    my $ids = $dbFindFileFlagPath->fetchall_arrayref;
    for my $row (@$ids) {
        my $id = $$row[0];
        logger('DEBUG', "cleaning file and meta of id: $id");
        $dbClearMeta->execute($id);
        $dbClearFile->execute($id);
        $dbh->commit;
    }
}
146
#
# harvestFile($filename, $filepath)
#
# Reads the index file $filename in directory $filepath and puts its
# contents in the database.
#
# The file is registered with its modification time; it is parsed and
# its metadata stored only when registerFile() returns a file id (file
# is new or modified). A file that cannot be stat'ed or parsed is logged
# as an error and counted in $errcnt; the harvest then continues with
# the next file.
# Always increments $idxcnt and logs the progress every 10 index files.
#
sub harvestFile {
    my ($filename, $filepath) = @_;
    my $file = "$filepath/$filename";
    logger('DEBUG', "looking at file '$filename' at '$filepath'");
    # get file time (element 9 of the stat list is the modification time;
    # the slice is undef when stat fails)
    my $mtime = (stat($file))[9];
    if (! defined $mtime) {
        logger('ERROR', "unable to stat file '$file': $!");
        $errcnt++;
    } elsif (my $fid = registerFile($file, stime($mtime))) {
        # file is new/modified
        # parse index file; SAX parsers report fatal errors by throwing
        # an exception, which must not abort the whole harvest
        my @data;
        my $parsed = eval {
            $metaParser->parse_uri($file);
            @data = $metaParserHandler->getData();
            1;
        };
        if ($parsed) {
            logger('DEBUG', "parsed " . scalar(@data) . " elements");
            registerMeta($fid, @data);
        } else {
            # NOTE(review): registerFile() has already committed the file
            # row at this point, so a new file whose parse fails may be
            # treated as up to date on the next run -- confirm whether
            # the row should be removed again.
            logger('ERROR', "unable to parse file '$file': $@");
            $errcnt++;
        }
    }
    $idxcnt++;
    logger('INFO', "$idxcnt index files of $fcnt") if ($idxcnt % 10 == 0);
}
174
#
# $fileid = registerFile($filepath, $filetime)
#
# Returns the file ID for the file $filepath, or 0 if an update is not
# necessary.
#
#   $filepath  full path of the file, used as key in the files table
#   $filetime  modification time of the file on disk, in the format
#              stored in files.mtime
#
# In every case the file is flagged as seen in the file_flags table.
#  - unknown file:   a new row is inserted in files, its id is returned
#  - unchanged file: nothing else happens, 0 is returned
#  - modified file:  the old meta rows are deleted and files.mtime is set
#                    to $filetime, the existing id is returned
#
sub registerFile {
    my ($filepath, $filetime) = @_;
    # look if file is in db
    $dbFindFileName->execute($filepath);
    my ($fileid, $mtime) = $dbFindFileName->fetchrow_array;
    if ($fileid) {
        # file is in db
        # update flag
        $dbSetFileFlag->execute($fileid, 1);
        $dbh->commit;
        my $stime = s2stime($mtime);
        # string comparison -- assumes both times are in the same,
        # lexically sortable format (TODO confirm against stime/s2stime)
        if ($stime ge $filetime) {
            # if its current return 0
            logger('DEBUG', "file: $fileid is old! time: '$stime' (vs '$filetime')");
            return 0;
        }
        logger('DEBUG', "file: $fileid is new! time: '$stime' (vs '$filetime')");
        # the caller stores the metadata again: remove the old rows so
        # they are not duplicated, and remember the new time so the file
        # is not harvested again on every following run
        $dbClearMeta->execute($fileid);
        $dbh->do("update files set mtime=? where id=?", undef, $filetime, $fileid);
        $dbh->commit;
        return $fileid;
    }
    # file is not in db: get a new file id
    $dbNextFileId->execute;
    ($fileid) = $dbNextFileId->fetchrow_array;
    logger('DEBUG', "DB newfile: id=$fileid filename=$filepath mtime=$filetime");
    $dbNewFile->execute($fileid, $filepath, $filetime);
    # update flag
    $dbSetFileFlag->execute($fileid, 1);
    $dbh->commit;
    return $fileid;
}
214
#
# registerMeta($fileid, @meta)
#
# Stores the metadata @meta for the file $fileid in the meta table.
#
#   $fileid  id of the file the metadata belongs to
#   @meta    list of array references; elements 0, 1 and 2 of each are
#            written to the columns tags, content and attributes
#
# The rows are numbered from 0 in the order of @meta (column idx) and
# committed together after the last insert.
#
sub registerMeta {
    my ($fileid, @meta) = @_;
    logger('DEBUG', "DB newmeta: fileid=$fileid ($#meta)");
    my $idx = 0;
    while ($idx < @meta) {
        my $entry = $meta[$idx];
        my ($tags, $content, $attributes) = @{$entry}[0, 1, 2];
        # column order of the statement is tags, attributes, content
        $dbNewMeta->execute($fileid, $idx, $tags, $attributes, $content);
        $idx++;
    }
    $dbh->commit;
    logger('INFO', "added $idx elements (file $fileid)");
}
231
#
# initDB()
#
# Prepares the database for a harvest run:
#  - if $purgeDB is set, deletes all rows from the tables files and meta
#  - creates the temporary table file_flags that records which files
#    were seen during this run
#  - prepares the statement handles used by the other subroutines
#
# Logs an ABORT message and exits with status 1 if the tables cannot be
# cleaned or the temporary table cannot be created.
#
sub initDB {
    # clean tables
    if ($purgeDB) {
        foreach my $table ('files', 'meta') {
            # do() returns undef on failure; check every statement so a
            # failure of the first delete is not hidden by the second
            if (! defined $dbh->do("delete from $table")) {
                logger('ABORT', "unable to clean table!");
                exit 1;
            }
        }
        $dbh->commit;
    }

    # clear flags: a freshly created temporary table holds no flags
    if (! defined $dbh->do("create temporary table file_flags ( fileid integer primary key, flag integer )")) {
        logger('ABORT', "unable to create temporary table file_flags!");
        exit 1;
    }
    $dbh->commit;

    # prepare statements
    $dbNextFileId = $dbh->prepare("select nextval('files_id_seq')");
    $dbNewFile = $dbh->prepare("insert into files (id, filename, mtime) values (?,?,?)");
    $dbFindFileName = $dbh->prepare("select id,mtime from files where filename=?");
    # NOTE(review): this statement selects a column "flag" from files
    # although the flags are kept in file_flags; it is not executed
    # anywhere in the visible code -- confirm it is still needed.
    $dbFindFilePath = $dbh->prepare("select id,filename,flag from files where filename like ?");
    $dbClearFile = $dbh->prepare("delete from files where id=?");
    $dbFindFileFlag = $dbh->prepare("select fileid from file_flags where flag=?");
    # files without a row in file_flags ("flag is null" after the outer
    # join) are the ones that were not seen during this run
    $dbFindFileFlagPath = $dbh->prepare("select id from files left outer join file_flags on files.id=file_flags.fileid where filename like ? and flag is null");
    $dbSetFileFlag = $dbh->prepare("insert into file_flags (fileid, flag) values (?,?)");
    $dbNewMeta = $dbh->prepare("insert into meta (fileid, idx, tags, attributes, content) values (?,?,?,?,?)");
    $dbClearMeta = $dbh->prepare("delete from meta where fileid=?");

}
267
#######################################################
# main
#

logger("INFO", "harvestmeta $version");

initDB();

# walk the tree below $basedir and harvest every index file found;
# $fnum is the number of file system entries that were looked at
my $fnum = readAllFiles($basedir, "");
# remove database entries (under $basedir) of files that were not seen
cleanUnmarkedFiles($basedir);

# summary
logger("INFO", "analysed $idxcnt of $fnum files!");
logger("INFO", "$warncnt warnings");
logger("INFO", "$errcnt errors");

# any error makes the whole run fail with exit status 1
if ($errcnt > 0) {
    logger("ABORT", "there were errors!");
    exit 1;
}
logger("DONE", "all index files read successfully!");