0
|
1 #!/usr/local/bin/perl -w
|
|
2
|
|
3 use strict;
|
|
4 use XML::SAX;
|
|
5 use DBI;
|
|
6
|
|
7 use lib '/usr/local/mpiwg/archive';
|
|
8 use MPIWGStor;
|
|
9 use HarvestmetaHandler;
|
|
10
|
|
# switch off buffering on the currently selected output handle
$| = 1;

#######################################################
# internal parameters
#

# version string written to the log when the program starts
my $version = "0.1 (08.06.2004)";

# command line parameters, parsed by MPIWGStor into a hash reference
my $args = MPIWGStor::parseargs();

# debug level: value of the 'debug' parameter, 0 when it was not given
# NOTE(review): $debug is not declared in this file -- presumably it is
# exported by MPIWGStor; confirm.
$debug = 0;
$debug = $args->{'debug'} if exists $args->{'debug'};

# XML namespace (not really implemented!)
my $namespace = "";

# true when 'purgedb' was given: delete and rebuild the database
my $purgeDB = exists $args->{'purgedb'};
|
|
32
|
|
# open the connection to database "storage" as user "archiver"
my $dbh = DBI->connect("dbi:Pg:dbname=storage", "archiver", "");
unless ($dbh) {
    logger('ABORT', "unable to connect to database!");
    exit 1;
}
# no automatic commits: the subroutines below call commit themselves
$dbh->{AutoCommit} = 0;

# prepared statement handles, assigned in initDB()
# NOTE(review): $dbClearAllFileFlag is declared here but no assignment
# to it is visible in this file.
my (
    $dbNextFileId,
    $dbNewFile,
    $dbNewMeta,
    $dbClearMeta,
    $dbFindFileName,
    $dbFindFilePath,
    $dbClearFile,
    $dbFindFileFlag,
    $dbFindFileFlagPath,
    $dbSetFileFlag,
    $dbClearAllFileFlag,
);
|
|
51
|
|
#######################################################
# check parameters that were passed to the program
#

# directory below which index files are harvested ('path' parameter)
my $basedir = $$args{'path'};
if (! $basedir) {
    logger("ABORT", "no document directory given!");
    exit 1;
}
# strip ALL trailing slashes, so that "dir", "dir/" and "dir//" lead to
# the same file names in the database. a path made up of slashes only
# becomes empty and is rejected by the directory test below.
$basedir =~ s{/+$}{};
if (! -d $basedir) {
    logger("ABORT", "document directory '$basedir' doesn't exist!");
    exit 1;
}

# SAX handler that collects the data of an index file, and the parser
# that drives it
my $metaParserHandler = HarvestmetaHandler->new;
my $metaParser = XML::SAX::ParserFactory->parser(Handler => $metaParserHandler);
|
|
69
|
|
#######################################################
# internal variables
#

# number of errors and number of warnings, reported at the end of the run
my ($errcnt, $warncnt) = (0, 0);

# number of directory entries seen on the file system (counted in
# readAllFiles) and number of index files processed (counted in
# harvestFile)
my ($fcnt, $idxcnt) = (0, 0);
|
|
83
|
|
84 #######################################################
|
|
85 # subroutines
|
|
86 #
|
|
87
|
|
#
# $cnt = readAllFiles($directory, $basedir)
#
# walks the directory tree below $directory and calls harvestFile()
# for every plain file named "index.meta". names starting with a dot
# and names found in %junk_files are skipped.
# $basedir is the path relative to the start directory. it is only
# for recursion and should be empty when called from outside.
#
# returns the number of entries (files and directories) found below
# $directory, or 0 if $directory could not be opened.
#
sub readAllFiles {
    my ($directory, $basedir) = @_;
    my $cnt = 0;

    # a lexical handle keeps the recursion levels apart and can be
    # closed as soon as the names have been read
    my $dh;
    if (! opendir $dh, $directory) {
        # an unreadable directory must not go unnoticed: its index files
        # are not flagged in this run and cleanUnmarkedFiles() will
        # remove them from the database.
        # NOTE(review): assumes logger() accepts a 'WARNING' level --
        # confirm against MPIWGStor.
        logger('WARNING', "unable to read directory '$directory': $!");
        $warncnt++;
        return 0;
    }
    my @dirfiles = readdir $dh;
    closedir $dh;

    foreach my $fn (@dirfiles) {
        # ignore names starting with a dot
        next if ($fn =~ /^\./);
        # ignore other silly files
        next if ($junk_files{$fn});

        $cnt++;
        $fcnt++;
        my $f = "$directory/$fn";
        my $docf = ($basedir) ? "$basedir/$fn" : $fn;
        #logger('DEBUG', "fs_file: \"$f\"");
        if (-f $f) {
            #logger(" is file");
            if ($fn eq "index.meta") {
                harvestFile($fn, $directory);
            }
        } elsif (-d _) {
            # "_" reuses the stat data of the -f test above
            #logger(" is dir");
            # recurse into directory
            $cnt += readAllFiles($f, $docf);
        }
    }
    return $cnt;
}
|
|
128
|
|
#
# cleanUnmarkedFiles($basepath)
#
# deletes the file and meta entries of all files below $basepath
# that have no entry in file_flags, i.e. that were not flagged by
# registerFile() in this run.
#
sub cleanUnmarkedFiles {
    my ($basepath) = @_;

    # build a LIKE pattern that matches only names inside $basepath:
    # - trailing slashes are dropped so exactly one separator is added
    # - backslash, % and _ are escaped (backslash is the default LIKE
    #   escape character in PostgreSQL) so the path is taken literally
    # - the separator keeps "/a/b" from selecting "/a/bc/..." as well
    (my $pattern = $basepath) =~ s{/+$}{};
    $pattern =~ s/([\\%_])/\\$1/g;
    $dbFindFileFlagPath->execute("$pattern/%");

    my $ids = $dbFindFileFlagPath->fetchall_arrayref;
    for my $row (@$ids) {
        my $id = $$row[0];
        logger('DEBUG', "cleaning file and meta of id: $id");
        $dbClearMeta->execute($id);
        $dbClearFile->execute($id);
        # commit per file, so finished deletions survive a later failure
        $dbh->commit;
    }
}
|
|
146
|
|
#
# harvestFile($filename, $filepath)
#
# looks at the index file $filename in directory $filepath. if
# registerFile() reports the file as new or modified it is parsed and
# its contents are put in the database with registerMeta().
# a file that cannot be parsed is logged and counted in $errcnt
# instead of ending the whole run.
#
sub harvestFile {
    my ($filename, $filepath) = @_;
    my $fullpath = "$filepath/$filename";
    logger('DEBUG', "looking at file '$filename' at '$filepath'");
    # get file time (field 9 of stat is the modification time)
    my $mtime = (stat($fullpath))[9];
    my $filetime = stime($mtime);
    # register file in db
    my $fid = registerFile($fullpath, $filetime);
    if ($fid) {
        # file is new/modified
        # parse index file; SAX parsers report fatal errors by dying
        my @data;
        my $parsed = eval {
            $metaParser->parse_uri($fullpath);
            @data = $metaParserHandler->getData();
            1;
        };
        if ($parsed) {
            logger('DEBUG', "parsed " . scalar(@data) . " elements");
            registerMeta($fid, @data);
        } else {
            # NOTE(review): assumes logger() accepts an 'ERROR' level --
            # confirm against MPIWGStor.
            logger('ERROR', "unable to parse '$fullpath': $@");
            $errcnt++;
        }
    }
    $idxcnt++;
    # progress message after every 10th index file
    logger('INFO', "$idxcnt index files of $fcnt") if ($idxcnt % 10 == 0);
}
|
|
174
|
|
#
# $fileid = registerFile($filepath, $filetime)
#
# returns the file ID for the file $filepath. If necessary it
# will be added to the database. returns 0 if an update is not
# necessary, i.e. the time stored in the database is not older than
# $filetime.
# in every case the file is flagged in file_flags as seen in this run.
#
sub registerFile {
    my ($filepath, $filetime) = @_;

    # look if file is in db
    $dbFindFileName->execute($filepath);
    my ($fileid, $mtime) = $dbFindFileName->fetchrow_array;
    # only the first row is of interest
    $dbFindFileName->finish;

    if ($fileid) {
        # file is in db
        # update flag
        $dbSetFileFlag->execute($fileid, 1);
        $dbh->commit;
        my $stime = s2stime($mtime);
        if ($stime ge $filetime) {
            # if its current return 0
            logger('DEBUG', "file: $fileid is old! time: '$stime' (vs '$filetime')");
            return 0;
        }
        logger('DEBUG', "file: $fileid is new! time: '$stime' (vs '$filetime')");
        # store the new time; otherwise the file would look modified
        # and be harvested again on every following run
        $dbh->do("update files set mtime=? where id=?", undef, $filetime, $fileid);
        $dbh->commit;
        return $fileid;
    }

    # file is not in db: get a new file id
    $dbNextFileId->execute;
    ($fileid) = $dbNextFileId->fetchrow_array;
    $dbNextFileId->finish;
    logger('DEBUG', "DB newfile: id=$fileid filename=$filepath mtime=$filetime");
    $dbNewFile->execute($fileid, $filepath, $filetime);
    # update flag
    $dbSetFileFlag->execute($fileid, 1);
    $dbh->commit;
    return $fileid;
}
|
|
214
|
|
#
# registerMeta($fileid, @meta)
#
# stores the metadata information @meta for $fileid in the database,
# replacing whatever was stored for $fileid before.
# every element of @meta is an array reference; element 0 goes to
# column "tags", element 1 to "content" and element 2 to "attributes".
# the position of the element in @meta is stored as "idx".
#
sub registerMeta {
    my ($fileid, @meta) = @_;
    logger('DEBUG', "DB newmeta: fileid=$fileid (" . scalar(@meta) . ")");
    # remove the rows of an earlier harvest of this file, so that a
    # modified index file does not end up with old and new rows mixed
    $dbClearMeta->execute($fileid);
    my $idx = 0;
    foreach my $keyval (@meta) {
        #logger('DEBUG', " DB meta: $$keyval[0]=$$keyval[1]");
        $dbNewMeta->execute($fileid, $idx++, $$keyval[0], $$keyval[2], $$keyval[1]);
    }
    # delete and inserts are committed together
    $dbh->commit;
    logger('INFO', "added $idx elements (file $fileid)");
}
|
|
231
|
|
#
# initDB()
#
# prepares the database for a run: empties the tables "files" and
# "meta" if $purgeDB is set, creates the temporary table file_flags
# and prepares the statements used by the other subroutines.
# ends the program with exit code 1 if a table can not be cleaned or
# file_flags can not be created.
#
sub initDB {
    # clean tables
    if ($purgeDB) {
        # check every statement on its own: do() returns undef on
        # failure, and the error state of the handle only reflects the
        # most recent call
        foreach my $table ('files', 'meta') {
            if (! defined $dbh->do("delete from $table")) {
                logger('ABORT', "unable to clean table!");
                exit 1;
            }
        }
        $dbh->commit;
    }

    # clear flags: the flags live in a temporary table that is created
    # empty here
    if (! defined $dbh->do("create temporary table file_flags ( fileid integer primary key, flag integer )")) {
        logger('ABORT', "unable to create table file_flags!");
        exit 1;
    }
    $dbh->commit;

    # prepare statements
    $dbNextFileId = $dbh->prepare("select nextval('files_id_seq')");
    $dbNewFile = $dbh->prepare("insert into files (id, filename, mtime) values (?,?,?)");
    $dbFindFileName = $dbh->prepare("select id,mtime from files where filename=?");
    # NOTE(review): this selects "flag" from "files", while the insert
    # above only names id, filename and mtime. the statement is not
    # executed anywhere in this file -- confirm the column exists.
    $dbFindFilePath = $dbh->prepare("select id,filename,flag from files where filename like ?");
    $dbClearFile = $dbh->prepare("delete from files where id=?");
    $dbFindFileFlag = $dbh->prepare("select fileid from file_flags where flag=?");
    # files matching a name pattern that have no row in file_flags
    $dbFindFileFlagPath = $dbh->prepare("select id from files left outer join file_flags on files.id=file_flags.fileid where filename like ? and flag is null");
    $dbSetFileFlag = $dbh->prepare("insert into file_flags (fileid, flag) values (?,?)");
    $dbNewMeta = $dbh->prepare("insert into meta (fileid, idx, tags, attributes, content) values (?,?,?,?,?)");
    $dbClearMeta = $dbh->prepare("delete from meta where fileid=?");
}
|
|
267
|
|
#######################################################
# main
#

logger("INFO", "harvestmeta $version");

# clean/create tables and prepare the statements
initDB();

# walk the tree below $basedir; index files are harvested on the way
my $fnum = readAllFiles($basedir, "");
# remove database entries (under $basedir) that were not flagged
cleanUnmarkedFiles($basedir);

# summary of the run
foreach my $summary ("analysed $idxcnt of $fnum files!",
                     "$warncnt warnings",
                     "$errcnt errors") {
    logger("INFO", $summary);
}
if ($errcnt <= 0) {
    logger("DONE", "all index files read successfully!");
} else {
    logger("ABORT", "there were errors!");
    exit 1;
}
|