Mercurial > hg > foxridge-archiver
comparison harvestmeta.pl @ 0:30497c6a3eca
Initial revision
author | casties |
---|---|
date | Thu, 17 Jun 2004 17:58:42 +0200 |
parents | |
children | 1a51f94d5dbd |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:30497c6a3eca |
---|---|
1 #!/usr/local/bin/perl -w | |
2 | |
3 use strict; | |
4 use XML::SAX; | |
5 use DBI; | |
6 | |
7 use lib '/usr/local/mpiwg/archive'; | |
8 use MPIWGStor; | |
9 use HarvestmetaHandler; | |
10 | |
# flush STDOUT after every write so log lines show up immediately
$| = 1;

#######################################################
# internal parameters
#

# version string printed in the start-up log line
my $version = "0.1 (08.06.2004)";

# command line parameters as returned by MPIWGStor::parseargs
# (used below as a hash reference)
my $args = MPIWGStor::parseargs();

# debug level; 0 unless a 'debug' parameter was given
# NOTE(review): $debug is not declared in this file -- presumably it is
# exported by MPIWGStor; confirm.
$debug = exists $args->{debug} ? $args->{debug} : 0;

# XML namespace (not really implemented!)
my $namespace = "";

# true if the 'purgedb' parameter was given: empty the tables before harvesting
my $purgeDB = exists $args->{purgedb};

# database connection; all changes are committed explicitly
my $dbh = DBI->connect("dbi:Pg:dbname=storage", "archiver", "");
unless ($dbh) {
    logger('ABORT', "unable to connect to database!");
    exit 1;
}
$dbh->{AutoCommit} = 0;

# statement handles, filled in by initDB()
my (
    $dbNextFileId,
    $dbNewFile,
    $dbNewMeta,
    $dbClearMeta,
    $dbFindFileName,
    $dbFindFilePath,
    $dbClearFile,
    $dbFindFileFlag,
    $dbFindFileFlagPath,
    $dbSetFileFlag,
    $dbClearAllFileFlag,
);
51 | |
#######################################################
# check parameters that were passed to the program
#

# directory tree to harvest (parameter 'path'); mandatory
my $basedir = $$args{'path'};
if (! $basedir) {
    logger("ABORT", "no document directory given!");
    exit 1;
}
# strip ALL trailing slashes (not just one) so that the same directory
# always produces the same file names, however the path was typed
$basedir =~ s{/+$}{};
if (! -d $basedir) {
    logger("ABORT", "document directory '$basedir' doesn't exist!");
    exit 1;
}

# SAX parser for the index files; the handler object collects the parsed
# data, which is read back through its getData() method
my $metaParserHandler = HarvestmetaHandler->new;
my $metaParser = XML::SAX::ParserFactory->parser(Handler => $metaParserHandler);
69 | |
#######################################################
# internal variables
#

# problem counters: errors and warnings seen during the run
my ($errcnt, $warncnt) = (0, 0);

# progress counters: directory entries seen on the file system and
# index files handled so far
my ($fcnt, $idxcnt) = (0, 0);
83 | |
84 ####################################################### | |
85 # subroutines | |
86 # | |
87 | |
#
# $count = readAllFiles($directory, $basedir)
#
# Walks the directory tree below $directory and calls harvestFile() for
# every plain file named "index.meta".
#
# $basedir is the path of $directory relative to the start directory; it
# is only needed for the recursion and should be empty when called from
# outside.
#
# Returns the number of directory entries counted below $directory
# (names starting with a dot and junk files are not counted). Returns 0
# if $directory cannot be opened; that case is logged as a warning and
# counted in $warncnt. Every counted entry also increments $fcnt.
#
sub readAllFiles {
    my ($directory, $basedir) = @_;
    my $cnt = 0;

    # lexical handle: each recursion level gets its own directory handle
    # instead of sharing one global bareword handle
    my $dh;
    if (! opendir $dh, $directory) {
        logger('WARNING', "unable to read directory '$directory': $!");
        $warncnt++;
        return 0;
    }
    # read the complete listing first and release the handle before recursing
    my @dirfiles = readdir $dh;
    closedir $dh;

    foreach my $fn (@dirfiles) {
        # ignore names starting with a dot
        next if ($fn =~ /^\./);
        # ignore other silly files
        # NOTE(review): %junk_files is not declared in this file --
        # presumably exported by MPIWGStor; confirm.
        next if ($junk_files{$fn});

        $cnt++;
        $fcnt++;
        my $f = "$directory/$fn";
        my $docf = ($basedir) ? "$basedir/$fn" : $fn;
        if (-f $f) {
            # only index files are harvested
            if ($fn eq "index.meta") {
                harvestFile($fn, $directory);
            }
        } elsif (-d _) {
            # "_" reuses the stat result of the -f test above;
            # recurse into directory
            $cnt += readAllFiles($f, $docf);
        }
    }
    return $cnt;
}
128 | |
#
# cleanUnmarkedFiles($basepath)
#
# Deletes the file and meta entries of all files below $basepath that
# have no row in file_flags, i.e. that were not flagged during this run.
# Each file is deleted and committed separately.
#
# An empty $basepath selects every file.
#
sub cleanUnmarkedFiles {
    my ($basepath) = @_;

    # Build the LIKE pattern. The path is matched as a directory prefix
    # ("$basepath/%"), so that cleaning "/data/foo" does not also hit the
    # unvisited sibling tree "/data/foobar". LIKE metacharacters in the
    # path are escaped with a backslash (the default LIKE escape
    # character in PostgreSQL) so that they match literally.
    my $pattern = '%';
    if (defined $basepath && length $basepath) {
        (my $dir = $basepath) =~ s{/+\z}{};
        $dir =~ s/([\\%_])/\\$1/g;
        $pattern = "$dir/%";
    }

    $dbFindFileFlagPath->execute($pattern);
    my $ids = $dbFindFileFlagPath->fetchall_arrayref;
    for my $row (@$ids) {
        my $id = $row->[0];
        logger('DEBUG', "cleaning file and meta of id: $id");
        # remove the meta rows first, then the file row itself
        $dbClearMeta->execute($id);
        $dbClearFile->execute($id);
        $dbh->commit;
    }
}
146 | |
#
# harvestFile($filename, $filepath)
#
# Reads the index file $filename in directory $filepath and puts its
# contents in the database, unless registerFile() reports that the
# database entry is already up to date.
#
# A file that cannot be stat'ed is logged as a warning, counted in
# $warncnt and skipped. Every handled file increments $idxcnt; a
# progress line is logged for every tenth file.
#
sub harvestFile {
    my ($filename, $filepath) = @_;
    my $fullpath = "$filepath/$filename";
    logger('DEBUG', "looking at file '$filename' at '$filepath'");

    # get file time (element 9 of the stat list is the modification time)
    my $mtime = (stat $fullpath)[9];
    if (! defined $mtime) {
        logger('WARNING', "unable to stat '$fullpath': $!");
        $warncnt++;
        return;
    }
    my $filetime = stime($mtime);

    # register file in db
    my $fid = registerFile($fullpath, $filetime);
    if ($fid) {
        # file is new/modified
        # parse index file
        $metaParser->parse_uri($fullpath);
        my @data = $metaParserHandler->getData();
        logger('DEBUG', "parsed " . scalar(@data) . " elements");

        # Drop the rows of an earlier harvest of this file (only after
        # the parse succeeded), otherwise a modified index file would
        # get its new rows added on top of the old ones. The delete is
        # committed together with the inserts by registerMeta().
        $dbClearMeta->execute($fid);
        registerMeta($fid, @data);

        # Store the current file time; without this a modified file
        # would look outdated -- and be harvested again -- on every run.
        $dbh->do("update files set mtime=? where id=?", undef, $filetime, $fid);
        $dbh->commit;
    }
    $idxcnt++;
    logger('INFO', "$idxcnt index files of $fcnt") if ($idxcnt % 10 == 0);
}
174 | |
#
# $fileid = registerFile($filepath, $filetime)
#
# Looks up $filepath in the files table and flags it as seen in this run.
#
# - unknown file: a new id is taken from the sequence, the file is
#   inserted with $filetime and the new id is returned.
# - known file whose stored time is not older than $filetime: returns 0
#   (no update necessary).
# - known file whose stored time is older: returns the existing id.
#
# Times are compared as strings, after converting the stored value with
# s2stime().
#
sub registerFile {
    my ($filepath, $filetime) = @_;

    # look if file is in db
    $dbFindFileName->execute($filepath);
    my ($knownId, $dbMtime) = $dbFindFileName->fetchrow_array;

    unless ($knownId) {
        # not in db yet: fetch a fresh id and create the entry
        $dbNextFileId->execute;
        my ($newId) = $dbNextFileId->fetchrow_array;
        logger('DEBUG', "DB newfile: id=$newId filename=$filepath mtime=$filetime");
        $dbNewFile->execute($newId, $filepath, $filetime);
        # mark as seen
        $dbSetFileFlag->execute($newId, 1);
        $dbh->commit;
        return $newId;
    }

    # file is in db: mark as seen
    $dbSetFileFlag->execute($knownId, 1);
    $dbh->commit;

    my $stime = s2stime($dbMtime);
    if ($stime ge $filetime) {
        # db entry is current, nothing to do
        logger('DEBUG', "file: $knownId is old! time: '$stime' (vs '$filetime')");
        return 0;
    }
    logger('DEBUG', "file: $knownId is new! time: '$stime' (vs '$filetime')");
    return $knownId;
}
214 | |
#
# registerMeta($fileid, @meta)
#
# Writes the metadata entries @meta for the file $fileid to the meta
# table and commits. Every entry is an array reference; element 0 goes
# to the "tags" column, element 2 to "attributes" and element 1 to
# "content". The position of the entry in @meta is stored as "idx".
#
sub registerMeta {
    my ($fileid, @meta) = @_;
    logger('DEBUG', "DB newmeta: fileid=$fileid ($#meta)");

    for my $pos (0 .. $#meta) {
        my $entry = $meta[$pos];
        $dbNewMeta->execute($fileid, $pos, $entry->[0], $entry->[2], $entry->[1]);
    }
    $dbh->commit;

    my $added = scalar @meta;
    logger('INFO', "added $added elements (file $fileid)");
}
231 | |
#
# initDB()
#
# Prepares the database for a harvest run:
# - if $purgeDB is set, deletes all rows from the files and meta tables,
# - creates the temporary table file_flags that records which files
#   were seen during this run,
# - prepares all statement handles used by the other subroutines.
#
# Logs an ABORT message and exits with status 1 if a table cannot be
# cleaned or the temporary table cannot be created.
#
sub initDB {
    # clean tables
    if ($purgeDB) {
        # check every statement on its own; a single check after both
        # would only see the status of the last one
        foreach my $table ('files', 'meta') {
            my $rv = $dbh->do("delete from $table");
            if (! defined $rv || $dbh->err) {
                logger('ABORT', "unable to clean table!");
                exit 1;
            }
        }
        $dbh->commit;
    }

    # fresh (empty) flag table for this session
    my $rv = $dbh->do("create temporary table file_flags ( fileid integer primary key, flag integer )");
    if (! defined $rv) {
        logger('ABORT', "unable to create table file_flags!");
        exit 1;
    }
    $dbh->commit;

    # prepare statements
    $dbNextFileId = $dbh->prepare("select nextval('files_id_seq')");
    $dbNewFile = $dbh->prepare("insert into files (id, filename, mtime) values (?,?,?)");
    $dbFindFileName = $dbh->prepare("select id,mtime from files where filename=?");
    # NOTE(review): this selects a "flag" column from files, while the
    # flags otherwise live in file_flags -- verify against the schema.
    $dbFindFilePath = $dbh->prepare("select id,filename,flag from files where filename like ?");
    $dbClearFile = $dbh->prepare("delete from files where id=?");
    $dbFindFileFlag = $dbh->prepare("select fileid from file_flags where flag=?");
    # files matching a name pattern that have no row in file_flags
    $dbFindFileFlagPath = $dbh->prepare("select id from files left outer join file_flags on files.id=file_flags.fileid where filename like ? and flag is null");
    $dbSetFileFlag = $dbh->prepare("insert into file_flags (fileid, flag) values (?,?)");
    $dbNewMeta = $dbh->prepare("insert into meta (fileid, idx, tags, attributes, content) values (?,?,?,?,?)");
    $dbClearMeta = $dbh->prepare("delete from meta where fileid=?");

}
267 | |
#######################################################
# main
#

logger("INFO", "harvestmeta $version");

# purge tables if requested, create the flag table, prepare statements
initDB();

# walk the tree below $basedir and harvest every index file found
my $fnum = readAllFiles($basedir, "");
# remove database entries (under $basedir) that were not seen in this run
cleanUnmarkedFiles($basedir);

# summary
my @summary = (
    "analysed $idxcnt of $fnum files!",
    "$warncnt warnings",
    "$errcnt errors",
);
foreach my $line (@summary) {
    logger("INFO", $line);
}

# any error makes the whole run fail
if ($errcnt > 0) {
    logger("ABORT", "there were errors!");
    exit 1;
}
logger("DONE", "all index files read successfully!");