annotate harvestmeta.pl @ 18:fdf4ceb36db1

fixed problem with dir names in metacheck new version of metacheck defaults to not change index file new version of archiver uses new version of metacheck
author casties
date Tue, 20 Sep 2005 19:24:57 +0200
parents 65895eec9e30
children 2208ed7370cb
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
30497c6a3eca Initial revision
casties
parents:
diff changeset
1 #!/usr/local/bin/perl -w
30497c6a3eca Initial revision
casties
parents:
diff changeset
2
30497c6a3eca Initial revision
casties
parents:
diff changeset
3 use strict;
30497c6a3eca Initial revision
casties
parents:
diff changeset
4 use XML::SAX;
3
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
5 use XML::LibXML;
0
30497c6a3eca Initial revision
casties
parents:
diff changeset
6 use DBI;
30497c6a3eca Initial revision
casties
parents:
diff changeset
7
4
046d584ed7b3 forgot lib path...
casties
parents: 3
diff changeset
8 use lib '/usr/local/mpiwg/archive';
0
30497c6a3eca Initial revision
casties
parents:
diff changeset
9 use MPIWGStor;
30497c6a3eca Initial revision
casties
parents:
diff changeset
10 use HarvestmetaHandler;
30497c6a3eca Initial revision
casties
parents:
diff changeset
11
30497c6a3eca Initial revision
casties
parents:
diff changeset
# switch off output buffering so messages show up immediately
$| = 1;

#######################################################
# internal parameters
#

# program version
my $version = "0.3 (27.9.2004)";

# command line parameters (hash reference returned by MPIWGStor::parseargs)
my $args = MPIWGStor::parseargs;

# called without any parameter: print the usage text and stop
unless (scalar(%$args)) {
    print "harvestmeta $version\n",
          "use: harvestmeta -path=dir\n",
          " reads all metadata info from directory dir into the database\n",
          "alternative sources:\n",
          " -indexurl=url : read XML index and follow links\n",
          " -singleurl=url : read single index file\n",
          "additional options:\n",
          " -baseurl=url : clean all URL sources relative to this base\n",
          " -debug : output debugging info\n",
          " -purgedb : clear whole database\n";
    exit 1;
}

# debug level
# NOTE(review): $debug is assigned without "my"; presumably it is a
# variable exported by MPIWGStor -- confirm.
$debug = exists $args->{'debug'} ? $args->{'debug'} : 0;

# XML namespace (not really implemented!)
my $namespace = "";

# true if the database should be deleted and rebuilt
my $purgeDB = exists $args->{'purgedb'};

# database connection (transactions are committed explicitly)
my $dbh = DBI->connect("dbi:Pg:dbname=storage", "archiver", "");
unless ($dbh) {
    logger('ABORT', "unable to connect to database!");
    exit 1;
}
$dbh->{AutoCommit} = 0;

# database statement handles; only declared here, they are
# presumably prepared further down in the script
my ($dbNextFileId, $dbNewFile, $dbNewMeta, $dbClearMeta);
my ($dbFindFileName, $dbFindFilePath, $dbClearFile);
my ($dbFindFileFlag, $dbFindFileFlagPath, $dbSetFileFlag, $dbClearAllFileFlag);
30497c6a3eca Initial revision
casties
parents:
diff changeset
66
30497c6a3eca Initial revision
casties
parents:
diff changeset
#######################################################
# check parameters that were passed to the program
#
# Exactly one kind of document source is used: a local directory
# (-path) or, if no path is given, a URL source (-indexurl or
# -singleurl). The program aborts if neither is present.
#
my $baseurl;
my $indexurl;
my $singleurl;
my $basedir = $$args{'path'};
if ($basedir) {
    # strip all trailing slashes
    $basedir =~ s/\/+$//;
    if (! -d $basedir) {
        logger("ABORT", "document directory \'$basedir\' doesn't exist!");
        exit 1;
    }
} else {
    # use URL
    $baseurl = $$args{'baseurl'};
    $indexurl = $$args{'indexurl'};
    # the usage text documents "-singleurl"; the old key "url" is
    # still accepted so that existing callers keep working
    $singleurl = $$args{'singleurl'} || $$args{'url'};
    if (! (($indexurl)||($singleurl))) {
        logger("ABORT", "no document source given!");
        exit 1;
    }
}

# SAX parser and its handler; the handler collects the parsed data
# which is read back with getData() after each parse
my $metaParserHandler = HarvestmetaHandler->new;
my $metaParser = XML::SAX::ParserFactory->parser(Handler => $metaParserHandler);

#######################################################
# internal variables
#

# number of errors
my $errcnt = 0;
# number of warnings
my $warncnt = 0;

# number of files on fs
my $fcnt = 0;
# number of index files
my $idxcnt = 0;
30497c6a3eca Initial revision
casties
parents:
diff changeset
108
30497c6a3eca Initial revision
casties
parents:
diff changeset
109 #######################################################
30497c6a3eca Initial revision
casties
parents:
diff changeset
110 # subroutines
30497c6a3eca Initial revision
casties
parents:
diff changeset
111 #
30497c6a3eca Initial revision
casties
parents:
diff changeset
112
30497c6a3eca Initial revision
casties
parents:
diff changeset
#
# $cnt = readAllFiles($directory, $basedir)
#
# walks through all files and directories below $directory and calls
# harvestFile() for every plain file named "index.meta".
# Names starting with a dot and names found in %junk_files are ignored.
# $basedir is only for recursion, it should be empty when called
# from outside.
# Returns the number of directory entries that were looked at (the
# global $fcnt is increased by the same amount) or 0 if $directory
# can not be read.
#
sub readAllFiles {
    my ($directory, $basedir) = @_;
    my $cnt = 0;

    # use a lexical handle: a bareword handle is global and would be
    # shared by all levels of the recursion
    my $dh;
    if (! opendir($dh, $directory)) {
        logger('WARNING', "unable to read directory '$directory' ($!)");
        $warncnt++;
        return 0;
    }
    # read the complete listing and close the handle before recursing
    my @dirfiles = readdir $dh;
    closedir $dh;

    foreach my $fn (@dirfiles) {
        # ignore names starting with a dot
        next if ($fn =~ /^\./);
        # ignore other silly files
        # (%junk_files is not defined in this part of the script;
        # presumably it is exported by MPIWGStor -- confirm)
        next if ($junk_files{$fn});

        $cnt++;
        $fcnt++;
        my $f = "$directory/$fn";
        my $docf = ($basedir) ? "$basedir/$fn" : $fn;
        if (-f $f) {
            if ($fn eq "index.meta") {
                harvestFile($fn, $directory);
            }
        } elsif (-d _) {
            # "_" reuses the stat result of the -f test above
            # recurse into directory
            $cnt += readAllFiles($f, $docf);
        }
    }
    return $cnt;
}
30497c6a3eca Initial revision
casties
parents:
diff changeset
153
30497c6a3eca Initial revision
casties
parents:
diff changeset
#
# cleanUnmarkedFiles($basepath)
#
# deletes all unflagged file and meta entries.
# The ids are looked up with $dbFindFileFlagPath using the pattern
# "$basepath%"; for every id the meta and the file statement are
# executed and committed.
# NOTE(review): the SQL behind the statement handles is not visible
# here -- which rows count as "unflagged" is decided there.
#
sub cleanUnmarkedFiles {
    my ($basepath) = @_;

    # find all ids below the base path
    $dbFindFileFlagPath->execute($basepath . '%');
    my $rows = $dbFindFileFlagPath->fetchall_arrayref;

    foreach my $row (@$rows) {
        # the id is the first column of each result row
        my $fileid = $row->[0];
        logger('DEBUG', "cleaning file and meta of id: $fileid");
        $dbClearMeta->execute($fileid);
        $dbClearFile->execute($fileid);
        # one transaction per id
        $dbh->commit;
    }
}
30497c6a3eca Initial revision
casties
parents:
diff changeset
171
30497c6a3eca Initial revision
casties
parents:
diff changeset
#
# harvestFile($filename, $filepath)
#
# reads the index file $filename at $filepath and puts the contents
# in the database.
# The file is registered with its modification time; it is only parsed
# if registerFile() returns a file id (i.e. the file is new or modified).
# Errors and warnings are logged and counted in $errcnt/$warncnt,
# nothing useful is returned.
#
sub harvestFile {
    my ($filename, $filepath) = @_;
    my $fullpath = "$filepath/$filename";
    logger('DEBUG', "looking at file '$filename' at '$filepath'");
    # get file time (element 9 of stat is the modification time)
    my $mtime = (stat($fullpath))[9];
    if (! defined $mtime) {
        logger('ERROR', "unable to stat file '$fullpath' ($!)");
        $errcnt++;
        return;
    }
    my $filetime = stime($mtime);
    # register file in db
    my $fid = registerFile($fullpath, $filetime);
    if ($fid) {
        # file is new/modified
        # parse index file
        # NOTE(review): registerFile() has already stored a new file at
        # this point, so a file that fails to parse may look current on
        # the next run -- confirm against the SQL statements.
        if (! eval { $metaParser->parse_uri($fullpath); 1 }) {
            my $errmsg = $@;
            logger('ERROR', "error reading XML file '$fullpath' ($errmsg)");
            $errcnt++;
            return;
        }
        my @data = $metaParserHandler->getData();
        logger('DEBUG', "parsed " . scalar(@data) . " elements");
        # the first entry holds the tag of the first element; check
        # @data first so that an empty result is not autovivified
        if (@data && defined $data[0][0] && lc($data[0][0]) eq "html") {
            # oops, wrong
            logger('WARNING', "invalid HTML content in file $fullpath");
            $warncnt++;
            return;
        }
        registerMeta($fid, @data);
    }
    $idxcnt++;
    logger('INFO', "$idxcnt index files of $fcnt") if ($idxcnt % 10 == 0) ;
}
30497c6a3eca Initial revision
casties
parents:
diff changeset
210
30497c6a3eca Initial revision
casties
parents:
diff changeset
#
# $cnt = readURLIndex($baseurl)
#
# reads the XML index at $baseurl
# and processes all its entries.
# For every entry whose tag ends in "index/resource" the attributes
# metaLink, resourceLink and modificationDate are extracted and
# harvestURL() is called if metaLink starts with "http:".
# Returns the number of index entries (of any tag) that were seen.
# The parser call is not wrapped in eval, so an exception while
# reading the index propagates to the caller.
#
sub readURLIndex {
    my ($baseurl) = @_;
    my $cnt = 0;

    # parse index file
    logger('DEBUG', "parsing $baseurl ...");
    $metaParser->parse_uri($baseurl);
    my @indexdata = $metaParserHandler->getData();
    logger('INFO', "parsed " . scalar(@indexdata) . " index entries");

    foreach my $me (@indexdata) {
        $cnt++;
        # each entry is an array reference: tag, value, attribute string
        my ($tag, $val, $attr) = @$me;
        next unless (defined $tag && $tag =~ /index\/resource$/);
        # an element without attributes must not trigger warnings
        $attr = "" unless (defined $attr);

        my $meta = "";
        my $file = "";
        my $mtime = "";
        $meta = $1 if ($attr =~ /metaLink=\"([^\"]+)\"/);
        $file = $1 if ($attr =~ /resourceLink=\"([^\"]+)\"/);
        $mtime = $1 if ($attr =~ /modificationDate=\"([^\"]+)\"/);

        # only absolute http links can be fetched
        if ($meta =~ /^http:/) {
            harvestURL($meta, $file, $mtime);
        }
    }
    return $cnt;
}
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
250
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
#
# harvestURL($metaurl, $fileurl, $filetime)
#
# reads the index file from $metaurl and puts the contents
# in the database (registered under $fileurl).
# $filetime is the modification time of the resource; if it is empty
# the current time is used.
# Errors and warnings are logged and counted in $errcnt/$warncnt,
# nothing useful is returned.
#
sub harvestURL {
    my ($metaurl, $fileurl, $filetime) = @_;
    logger('DEBUG', "fetching from url '$metaurl' for '$fileurl'");
    # if no filetime then now
    $filetime = stime(time) unless ($filetime);
    # register file in db
    my $fid = registerFile("$fileurl", $filetime);
    if ($fid) {
        # file is new/modified
        # try to parse index file
        if (! eval { $metaParser->parse_uri($metaurl); 1 }) {
            my $errmsg = $@;
            logger('ERROR', "error reading XML from '$metaurl' ($errmsg)");
            $errcnt++;
            return;
        }
        my @data = $metaParserHandler->getData();
        logger('DEBUG', "parsed " . scalar(@data) . " elements");
        # the first entry holds the tag of the first element; check
        # @data first so that an empty result is not autovivified
        if (@data && defined $data[0][0] && lc($data[0][0]) eq "html") {
            # oops, wrong
            logger('WARNING', "invalid HTML content from $metaurl");
            $warncnt++;
            return;
        }
        registerMeta($fid, @data);
    }
    $idxcnt++;
    logger('INFO', "$idxcnt index files of $fcnt") if ($idxcnt % 10 == 0) ;
}
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
287
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
288
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
#
# $fileid = registerFile($filepath, $filetime)
#
# looks up $filepath in the database and returns its file ID.
# A file that is not in the database yet is added with a new ID and
# $filetime. In both cases the flag of the file is set to 1.
# returns 0 if the stored time is not older than $filetime, i.e. if an
# update is not necessary.
#
sub registerFile {
    my ($filepath, $filetime) = @_;

    # look if file is in db
    $dbFindFileName->execute($filepath);
    my ($knownId, $dbMtime) = $dbFindFileName->fetchrow_array;

    if ($knownId) {
        # file is in db: update flag
        $dbSetFileFlag->execute($knownId, 1);
        $dbh->commit;
        # compare the stored time with the given time (as strings)
        my $storedTime = s2stime($dbMtime);
        my $upToDate = ($storedTime ge $filetime);
        my $state = $upToDate ? "old" : "new";
        logger('DEBUG', "file: $knownId is $state! time: '$storedTime' (vs '$filetime')");
        # if its current return 0
        return 0 if ($upToDate);
        return $knownId;
    }

    # file is not in db: get a new file id
    $dbNextFileId->execute;
    my ($newId) = $dbNextFileId->fetchrow_array;
    logger('DEBUG', "DB newfile: id=$newId filename=$filepath mtime=$filetime");
    $dbNewFile->execute($newId, $filepath, $filetime);
    # update flag
    $dbSetFileFlag->execute($newId, 1);
    $dbh->commit;
    return $newId;
}
30497c6a3eca Initial revision
casties
parents:
diff changeset
328
30497c6a3eca Initial revision
casties
parents:
diff changeset
#
# registerMeta($fileid, @meta)
#
# stores the metadata entries @meta for the file $fileid in the meta table.
# Any rows already stored for $fileid are deleted first. Each entry of
# @meta is an array reference; element 0 goes to the "tags" column,
# element 1 to "content" and element 2 to "attributes". The position of
# the entry in @meta is stored as "idx".
#
sub registerMeta {
    my ($fileid, @meta) = @_;
    logger('DEBUG', "DB newmeta: fileid=$fileid ($#meta)");

    # clear out old data
    $dbClearMeta->execute($fileid);

    my $idx = 0;
    for my $entry (@meta) {
        my ($tags, $content, $attributes) = @{$entry}[0 .. 2];
        $dbNewMeta->execute($fileid, $idx, $tags, $attributes, $content);
        $idx++;
    }

    $dbh->commit;
    logger('INFO', "added $idx elements (file $fileid)");
}
30497c6a3eca Initial revision
casties
parents:
diff changeset
347
30497c6a3eca Initial revision
casties
parents:
diff changeset
#
# initDB()
#
# initialises the database connection.
#
# If $purgeDB is set the files and meta tables are emptied first. Then the
# temporary table file_flags is created and all statement handles used by
# the other subs are prepared and stored in the $db... variables, which are
# declared outside this sub.
# Logs an ABORT message and exits with status 1 if a table can not be
# cleaned or the temporary table can not be created.
#
sub initDB {
    # clean tables
    if ($purgeDB) {
        # check after every statement: DBI resets err() at the start of
        # each method call, so one check after the last delete would not
        # see a failure of an earlier one.
        foreach my $sql ("delete from files", "delete from meta") {
            $dbh->do($sql);
            if ($dbh->err) {
                logger('ABORT', "unable to clean table!");
                exit 1;
            }
        }
        $dbh->commit;
    }

    # clear flags
    # (the flags live in a temporary table, so they start out empty)
    $dbh->do("create temporary table file_flags ( fileid integer primary key, flag integer )");
    if ($dbh->err) {
        # all flag statements below need this table
        logger('ABORT', "unable to create temporary table!");
        exit 1;
    }
    $dbh->commit;

    # prepare statements
    $dbNextFileId = $dbh->prepare("select nextval('files_id_seq')");
    $dbNewFile = $dbh->prepare("insert into files (id, filename, mtime) values (?,?,?)");
    $dbFindFileName = $dbh->prepare("select id,mtime from files where filename=?");
    $dbFindFilePath = $dbh->prepare("select id,filename,flag from files where filename like ?");
    $dbClearFile = $dbh->prepare("delete from files where id=?");
    $dbFindFileFlag = $dbh->prepare("select fileid from file_flags where flag=?");
    $dbFindFileFlagPath = $dbh->prepare("select id from files left outer join file_flags on files.id=file_flags.fileid where filename like ? and flag is null");
    $dbSetFileFlag = $dbh->prepare("insert into file_flags (fileid, flag) values (?,?)");
    $dbNewMeta = $dbh->prepare("insert into meta (fileid, idx, tags, attributes, content) values (?,?,?,?,?)");
    $dbClearMeta = $dbh->prepare("delete from meta where fileid=?");

}
30497c6a3eca Initial revision
casties
parents:
diff changeset
383
30497c6a3eca Initial revision
casties
parents:
diff changeset
384 #######################################################
30497c6a3eca Initial revision
casties
parents:
diff changeset
385 # main
30497c6a3eca Initial revision
casties
parents:
diff changeset
386 #
30497c6a3eca Initial revision
casties
parents:
diff changeset
387
30497c6a3eca Initial revision
casties
parents:
diff changeset
logger("INFO", "harvestmeta $version");

initDB();

# number of files that were looked at
my $fnum = 0;

# exactly one input mode is used, in this order of precedence:
# local directory tree, XML index feed, single XML url
if ($basedir) {
    # read and process all files under $basedir
    $fnum = readAllFiles($basedir, "");
    # delete orphaned data (under $basedir)
    cleanUnmarkedFiles($basedir);
}
elsif ($indexurl) {
    # read and process XML index
    $fnum = readURLIndex($indexurl);
    # delete orphaned data (under $baseurl)
    cleanUnmarkedFiles($baseurl) if $baseurl;
}
elsif ($singleurl) {
    # read and process single XML url
    harvestURL($singleurl, $singleurl);
    $fnum = 1;
    # delete orphaned data (under $baseurl)
    cleanUnmarkedFiles($baseurl) if $baseurl;
}

# final report
logger("INFO", "analysed $idxcnt of $fnum files!");
logger("INFO", "$warncnt warnings");
logger("INFO", "$errcnt errors");

# any error makes the whole run fail
if ($errcnt > 0) {
    logger("ABORT", "there were errors!");
    exit 1;
}
logger("DONE", "all index files read successfully!");