annotate harvestmeta.pl @ 6:a3feffd94021

small fixes
author casties
date Tue, 13 Jul 2004 20:32:49 +0200
parents 046d584ed7b3
children 65895eec9e30
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
30497c6a3eca Initial revision
casties
parents:
diff changeset
1 #!/usr/local/bin/perl -w
30497c6a3eca Initial revision
casties
parents:
diff changeset
2
30497c6a3eca Initial revision
casties
parents:
diff changeset
3 use strict;
30497c6a3eca Initial revision
casties
parents:
diff changeset
4 use XML::SAX;
3
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
5 use XML::LibXML;
0
30497c6a3eca Initial revision
casties
parents:
diff changeset
6 use DBI;
30497c6a3eca Initial revision
casties
parents:
diff changeset
7
4
046d584ed7b3 forgot lib path...
casties
parents: 3
diff changeset
8 use lib '/usr/local/mpiwg/archive';
0
30497c6a3eca Initial revision
casties
parents:
diff changeset
9 use MPIWGStor;
30497c6a3eca Initial revision
casties
parents:
diff changeset
10 use HarvestmetaHandler;
30497c6a3eca Initial revision
casties
parents:
diff changeset
11
30497c6a3eca Initial revision
casties
parents:
diff changeset
12 # make output unbuffered
30497c6a3eca Initial revision
casties
parents:
diff changeset
13 $|=1;
30497c6a3eca Initial revision
casties
parents:
diff changeset
14
30497c6a3eca Initial revision
casties
parents:
diff changeset
15 #######################################################
30497c6a3eca Initial revision
casties
parents:
diff changeset
16 # internal parameters
30497c6a3eca Initial revision
casties
parents:
diff changeset
17 #
30497c6a3eca Initial revision
casties
parents:
diff changeset
18
30497c6a3eca Initial revision
casties
parents:
diff changeset
19 # program version
6
a3feffd94021 small fixes
casties
parents: 4
diff changeset
20 my $version = "0.2.1 (13.07.2004)";
0
30497c6a3eca Initial revision
casties
parents:
diff changeset
21
30497c6a3eca Initial revision
casties
parents:
diff changeset
22 # read command line parameters
30497c6a3eca Initial revision
casties
parents:
diff changeset
23 my $args = MPIWGStor::parseargs;
30497c6a3eca Initial revision
casties
parents:
diff changeset
24
3
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
25 if (! scalar(%$args)) {
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
26 print "harvestmeta $version\n";
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
27 print "use: harvestmeta -path=dir\n";
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
28 print " reads all metadata info from directory dir into the database\n";
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
29 print "alternative sources:\n";
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
30 print " -indexurl=url : read XML index and follow links\n";
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
31 print " -singleurl=url : read single index file\n";
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
32 print "additional options:\n";
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
33 print " -baseurl=url : clean all URL sources relative to this base\n";
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
34 print " -debug : output debugging info\n";
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
35 print " -purgedb : clear whole database\n";
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
36 exit 1;
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
37 }
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
38
0
30497c6a3eca Initial revision
casties
parents:
diff changeset
39 # debug level
30497c6a3eca Initial revision
casties
parents:
diff changeset
40 $debug = (exists $$args{'debug'}) ? $$args{'debug'} : 0;
30497c6a3eca Initial revision
casties
parents:
diff changeset
41
30497c6a3eca Initial revision
casties
parents:
diff changeset
42 # XML namespace (not really implemented!)
30497c6a3eca Initial revision
casties
parents:
diff changeset
43 my $namespace = "";
30497c6a3eca Initial revision
casties
parents:
diff changeset
44
30497c6a3eca Initial revision
casties
parents:
diff changeset
45 # delete and rebuild database
30497c6a3eca Initial revision
casties
parents:
diff changeset
46 my $purgeDB = (exists $$args{'purgedb'});
30497c6a3eca Initial revision
casties
parents:
diff changeset
47
30497c6a3eca Initial revision
casties
parents:
diff changeset
48 # database connection
30497c6a3eca Initial revision
casties
parents:
diff changeset
49 my $dbh = DBI->connect("dbi:Pg:dbname=storage", "archiver", "");
30497c6a3eca Initial revision
casties
parents:
diff changeset
50 if (! $dbh) {
30497c6a3eca Initial revision
casties
parents:
diff changeset
51 logger('ABORT', "unable to connect to database!");
30497c6a3eca Initial revision
casties
parents:
diff changeset
52 exit 1;
30497c6a3eca Initial revision
casties
parents:
diff changeset
53 }
30497c6a3eca Initial revision
casties
parents:
diff changeset
54 $dbh->{AutoCommit} = 0;
30497c6a3eca Initial revision
casties
parents:
diff changeset
55 my $dbNextFileId;
30497c6a3eca Initial revision
casties
parents:
diff changeset
56 my $dbNewFile;
30497c6a3eca Initial revision
casties
parents:
diff changeset
57 my $dbNewMeta;
30497c6a3eca Initial revision
casties
parents:
diff changeset
58 my $dbClearMeta;
30497c6a3eca Initial revision
casties
parents:
diff changeset
59 my $dbFindFileName;
30497c6a3eca Initial revision
casties
parents:
diff changeset
60 my $dbFindFilePath;
30497c6a3eca Initial revision
casties
parents:
diff changeset
61 my $dbClearFile;
30497c6a3eca Initial revision
casties
parents:
diff changeset
62 my $dbFindFileFlag;
30497c6a3eca Initial revision
casties
parents:
diff changeset
63 my $dbFindFileFlagPath;
30497c6a3eca Initial revision
casties
parents:
diff changeset
64 my $dbSetFileFlag;
30497c6a3eca Initial revision
casties
parents:
diff changeset
65 my $dbClearAllFileFlag;
30497c6a3eca Initial revision
casties
parents:
diff changeset
66
30497c6a3eca Initial revision
casties
parents:
diff changeset
67 #######################################################
30497c6a3eca Initial revision
casties
parents:
diff changeset
68 # check parameters that were passed to the program
30497c6a3eca Initial revision
casties
parents:
diff changeset
69 #
3
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
70 my $baseurl;
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
71 my $indexurl;
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
72 my $singleurl;
0
30497c6a3eca Initial revision
casties
parents:
diff changeset
73 my $basedir = $$args{'path'};
3
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
74 if ($basedir) {
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
75 # strip trailing slashes
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
76 $basedir =~ s/\/$//;
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
77 if (! -d $basedir) {
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
78 logger("ABORT", "document directory \'$basedir\' doesn't exist!");
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
79 exit 1;
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
80 }
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
81 } else {
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
82 # use URL
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
83 $baseurl = $$args{'baseurl'};
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
84 $indexurl = $$args{'indexurl'};
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
85 $singleurl = $$args{'url'};
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
86 if (! (($indexurl)||($singleurl))) {
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
87 logger("ABORT", "no document source given!");
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
88 exit 1;
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
89 }
0
30497c6a3eca Initial revision
casties
parents:
diff changeset
90 }
30497c6a3eca Initial revision
casties
parents:
diff changeset
91
30497c6a3eca Initial revision
casties
parents:
diff changeset
92 my $metaParserHandler = HarvestmetaHandler->new;
30497c6a3eca Initial revision
casties
parents:
diff changeset
93 my $metaParser = XML::SAX::ParserFactory->parser(Handler => $metaParserHandler);
30497c6a3eca Initial revision
casties
parents:
diff changeset
94
30497c6a3eca Initial revision
casties
parents:
diff changeset
95 #######################################################
30497c6a3eca Initial revision
casties
parents:
diff changeset
96 # internal variables
30497c6a3eca Initial revision
casties
parents:
diff changeset
97 #
30497c6a3eca Initial revision
casties
parents:
diff changeset
98
30497c6a3eca Initial revision
casties
parents:
diff changeset
99 # number of errors
30497c6a3eca Initial revision
casties
parents:
diff changeset
100 my $errcnt = 0;
30497c6a3eca Initial revision
casties
parents:
diff changeset
101 # number of warnings
30497c6a3eca Initial revision
casties
parents:
diff changeset
102 my $warncnt = 0;
30497c6a3eca Initial revision
casties
parents:
diff changeset
103
30497c6a3eca Initial revision
casties
parents:
diff changeset
104 # number of files on fs
30497c6a3eca Initial revision
casties
parents:
diff changeset
105 my $fcnt = 0;
30497c6a3eca Initial revision
casties
parents:
diff changeset
106 # number of index files
30497c6a3eca Initial revision
casties
parents:
diff changeset
107 my $idxcnt = 0;
30497c6a3eca Initial revision
casties
parents:
diff changeset
108
30497c6a3eca Initial revision
casties
parents:
diff changeset
109 #######################################################
30497c6a3eca Initial revision
casties
parents:
diff changeset
110 # subroutines
30497c6a3eca Initial revision
casties
parents:
diff changeset
111 #
30497c6a3eca Initial revision
casties
parents:
diff changeset
112
30497c6a3eca Initial revision
casties
parents:
diff changeset
113 #
30497c6a3eca Initial revision
casties
parents:
diff changeset
114 # readAllFiles($realdir, $basedir, \%files, \%dirs)
30497c6a3eca Initial revision
casties
parents:
diff changeset
115 #
30497c6a3eca Initial revision
casties
parents:
diff changeset
116 # reads all files and directories below $realdir and puts the
30497c6a3eca Initial revision
casties
parents:
diff changeset
117 # files in %files and directories in %dirs
30497c6a3eca Initial revision
casties
parents:
diff changeset
118 # $basedir is only for recursion, it should be empty when called
30497c6a3eca Initial revision
casties
parents:
diff changeset
119 # from outside
30497c6a3eca Initial revision
casties
parents:
diff changeset
120 #
30497c6a3eca Initial revision
casties
parents:
diff changeset
121 sub readAllFiles {
30497c6a3eca Initial revision
casties
parents:
diff changeset
122 my ($directory, $basedir) = @_;
30497c6a3eca Initial revision
casties
parents:
diff changeset
123 my $cnt = 0;
30497c6a3eca Initial revision
casties
parents:
diff changeset
124
30497c6a3eca Initial revision
casties
parents:
diff changeset
125 if (! opendir DIR, $directory) {
30497c6a3eca Initial revision
casties
parents:
diff changeset
126 return 0;
30497c6a3eca Initial revision
casties
parents:
diff changeset
127 }
30497c6a3eca Initial revision
casties
parents:
diff changeset
128 my @dirfiles = readdir DIR;
30497c6a3eca Initial revision
casties
parents:
diff changeset
129 foreach my $fn (@dirfiles) {
30497c6a3eca Initial revision
casties
parents:
diff changeset
130 # ignore names starting with a dot
30497c6a3eca Initial revision
casties
parents:
diff changeset
131 next if ($fn =~ /^\./);
30497c6a3eca Initial revision
casties
parents:
diff changeset
132 # ignore other silly files
30497c6a3eca Initial revision
casties
parents:
diff changeset
133 next if ($junk_files{$fn});
30497c6a3eca Initial revision
casties
parents:
diff changeset
134
30497c6a3eca Initial revision
casties
parents:
diff changeset
135 $cnt++;
30497c6a3eca Initial revision
casties
parents:
diff changeset
136 $fcnt++;
30497c6a3eca Initial revision
casties
parents:
diff changeset
137 my $f = "$directory/$fn";
30497c6a3eca Initial revision
casties
parents:
diff changeset
138 my $docf = ($basedir) ? "$basedir/$fn" : $fn;
30497c6a3eca Initial revision
casties
parents:
diff changeset
139 #logger('DEBUG', "fs_file: \"$f\"");
30497c6a3eca Initial revision
casties
parents:
diff changeset
140 if (-f $f) {
30497c6a3eca Initial revision
casties
parents:
diff changeset
141 #logger(" is file");
30497c6a3eca Initial revision
casties
parents:
diff changeset
142 if ($fn eq "index.meta") {
30497c6a3eca Initial revision
casties
parents:
diff changeset
143 harvestFile($fn, $directory);
30497c6a3eca Initial revision
casties
parents:
diff changeset
144 }
30497c6a3eca Initial revision
casties
parents:
diff changeset
145 } elsif (-d _) {
30497c6a3eca Initial revision
casties
parents:
diff changeset
146 #logger(" is dir");
30497c6a3eca Initial revision
casties
parents:
diff changeset
147 # recurse into directory
30497c6a3eca Initial revision
casties
parents:
diff changeset
148 $cnt += readAllFiles($f, $docf);
30497c6a3eca Initial revision
casties
parents:
diff changeset
149 }
30497c6a3eca Initial revision
casties
parents:
diff changeset
150 }
30497c6a3eca Initial revision
casties
parents:
diff changeset
151 return $cnt;
30497c6a3eca Initial revision
casties
parents:
diff changeset
152 }
30497c6a3eca Initial revision
casties
parents:
diff changeset
153
30497c6a3eca Initial revision
casties
parents:
diff changeset
154 #
30497c6a3eca Initial revision
casties
parents:
diff changeset
155 # cleanUnmarkedFiles($basepath)
30497c6a3eca Initial revision
casties
parents:
diff changeset
156 #
30497c6a3eca Initial revision
casties
parents:
diff changeset
157 # deletes all unflagged file and meta entries.
30497c6a3eca Initial revision
casties
parents:
diff changeset
158 #
30497c6a3eca Initial revision
casties
parents:
diff changeset
159 sub cleanUnmarkedFiles {
30497c6a3eca Initial revision
casties
parents:
diff changeset
160 my ($basepath) = @_;
30497c6a3eca Initial revision
casties
parents:
diff changeset
161 my $rv = $dbFindFileFlagPath->execute("${basepath}%");
30497c6a3eca Initial revision
casties
parents:
diff changeset
162 my $ids = $dbFindFileFlagPath->fetchall_arrayref;
30497c6a3eca Initial revision
casties
parents:
diff changeset
163 for my $i (@$ids) {
30497c6a3eca Initial revision
casties
parents:
diff changeset
164 my $id = $$i[0];
30497c6a3eca Initial revision
casties
parents:
diff changeset
165 logger('DEBUG', "cleaning file and meta of id: $id");
30497c6a3eca Initial revision
casties
parents:
diff changeset
166 $dbClearMeta->execute($id);
30497c6a3eca Initial revision
casties
parents:
diff changeset
167 $dbClearFile->execute($id);
30497c6a3eca Initial revision
casties
parents:
diff changeset
168 $dbh->commit;
30497c6a3eca Initial revision
casties
parents:
diff changeset
169 }
30497c6a3eca Initial revision
casties
parents:
diff changeset
170 }
30497c6a3eca Initial revision
casties
parents:
diff changeset
171
30497c6a3eca Initial revision
casties
parents:
diff changeset
172 #
30497c6a3eca Initial revision
casties
parents:
diff changeset
173 # harvestFile($filename, $filepath)
30497c6a3eca Initial revision
casties
parents:
diff changeset
174 #
30497c6a3eca Initial revision
casties
parents:
diff changeset
175 # reads the index file $filename at $filepath and puts the contents
30497c6a3eca Initial revision
casties
parents:
diff changeset
176 # in the database.
30497c6a3eca Initial revision
casties
parents:
diff changeset
177 #
30497c6a3eca Initial revision
casties
parents:
diff changeset
178 sub harvestFile {
30497c6a3eca Initial revision
casties
parents:
diff changeset
179 my ($filename, $filepath) = @_;
30497c6a3eca Initial revision
casties
parents:
diff changeset
180 logger('DEBUG', "looking at file '$filename' at '$filepath'");
30497c6a3eca Initial revision
casties
parents:
diff changeset
181 # get file time
30497c6a3eca Initial revision
casties
parents:
diff changeset
182 my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
30497c6a3eca Initial revision
casties
parents:
diff changeset
183 $atime,$mtime,$ctime,$blksize,$blocks)
30497c6a3eca Initial revision
casties
parents:
diff changeset
184 = stat("$filepath/$filename");
30497c6a3eca Initial revision
casties
parents:
diff changeset
185 my $filetime = stime($mtime);
30497c6a3eca Initial revision
casties
parents:
diff changeset
186 # register file in db
30497c6a3eca Initial revision
casties
parents:
diff changeset
187 my $fid = registerFile("$filepath/$filename", $filetime);
30497c6a3eca Initial revision
casties
parents:
diff changeset
188 if ($fid) {
30497c6a3eca Initial revision
casties
parents:
diff changeset
189 # file is new/modified
30497c6a3eca Initial revision
casties
parents:
diff changeset
190 # parse index file
3
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
191 my $ret = eval{$metaParser->parse_uri("$filepath/$filename")};
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
192 if ($@) {
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
193 my $errmsg = $@;
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
194 logger('ERROR', "error reading XML file '$filepath/$filename' ($errmsg)");
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
195 $errcnt++;
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
196 return;
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
197 }
0
30497c6a3eca Initial revision
casties
parents:
diff changeset
198 my @data = $metaParserHandler->getData();
30497c6a3eca Initial revision
casties
parents:
diff changeset
199 logger('DEBUG', "parsed $#data+1 elements");
3
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
200 if ($data[0][0] eq "html") {
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
201 # oops, wrong
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
202 logger('WARNING', "invalid HTML content in file $filepath/$filename");
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
203 return;
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
204 }
0
30497c6a3eca Initial revision
casties
parents:
diff changeset
205 registerMeta($fid, @data);
30497c6a3eca Initial revision
casties
parents:
diff changeset
206 }
30497c6a3eca Initial revision
casties
parents:
diff changeset
207 $idxcnt++;
30497c6a3eca Initial revision
casties
parents:
diff changeset
208 logger('INFO', "$idxcnt index files of $fcnt") if ($idxcnt % 10 == 0) ;
30497c6a3eca Initial revision
casties
parents:
diff changeset
209 }
30497c6a3eca Initial revision
casties
parents:
diff changeset
210
30497c6a3eca Initial revision
casties
parents:
diff changeset
211 #
3
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
212 # readURLIndex($baseurl)
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
213 #
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
214 # reads the XML index at $baseurl
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
215 # and processes all its entries
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
216 #
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
217 sub readURLIndex {
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
218 my ($baseurl) = @_;
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
219 my $cnt = 0;
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
220
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
221 # parse index file
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
222 logger('DEBUG', "parsing $baseurl ...");
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
223 $metaParser->parse_uri($baseurl);
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
224 my @indexdata = $metaParserHandler->getData();
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
225 logger('INFO', "parsed $#indexdata+1 index entries");
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
226
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
227 foreach my $me (@indexdata) {
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
228 $cnt++;
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
229 my ($tag, $val, $attr) = @$me;
6
a3feffd94021 small fixes
casties
parents: 4
diff changeset
230 my $meta = "";
a3feffd94021 small fixes
casties
parents: 4
diff changeset
231 my $file = "";
3
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
232 if ($tag =~ /index\/resource$/) {
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
233 if ($attr =~ /metaLink=\"([^\"]+)\"/) {
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
234 $meta = $1;
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
235 }
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
236 if ($attr =~ /resourceLink=\"([^\"]+)\"/) {
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
237 $file = $1;
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
238 }
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
239 if ($meta =~ /^http:/) {
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
240 harvestURL($meta, $file);
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
241 }
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
242 }
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
243 }
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
244 return $cnt;
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
245 }
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
246
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
247 #
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
248 # harvestURL($metaurl, $fileurl)
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
249 #
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
250 # reads the index file from $metaurl and puts the contents
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
251 # in the database (under $filepath)
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
252 #
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
253 sub harvestURL {
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
254 my ($metaurl, $fileurl) = @_;
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
255 logger('DEBUG', "fetching from url '$metaurl' for '$fileurl'");
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
256 # try to parse index file
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
257 my $ret = eval{$metaParser->parse_uri($metaurl)};
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
258 if ($@) {
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
259 my $errmsg = $@;
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
260 logger('ERROR', "error reading XML from '$metaurl' ($errmsg)");
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
261 $errcnt++;
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
262 return;
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
263 }
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
264 my @data = $metaParserHandler->getData();
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
265 logger('DEBUG', "parsed $#data+1 elements");
6
a3feffd94021 small fixes
casties
parents: 4
diff changeset
266 if (lc $data[0][0] eq "html") {
3
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
267 # oops, wrong
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
268 logger('WARNING', "invalid HTML content from $metaurl");
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
269 $warncnt++;
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
270 return;
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
271 }
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
272 # filetime is now
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
273 my $filetime = stime(time);
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
274 # register file in db
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
275 my $fid = registerFile("$fileurl", $filetime);
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
276 if ($fid) {
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
277 # file is new/modified
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
278 registerMeta($fid, @data);
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
279 }
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
280 $idxcnt++;
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
281 logger('INFO', "$idxcnt index files of $fcnt") if ($idxcnt % 10 == 0) ;
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
282 }
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
283
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
284
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
285 #
0
30497c6a3eca Initial revision
casties
parents:
diff changeset
286 # $fileid = registerFile($filepath, $filetime)
30497c6a3eca Initial revision
casties
parents:
diff changeset
287 #
30497c6a3eca Initial revision
casties
parents:
diff changeset
288 # returns the file ID for the file $filepath. If necessary it
30497c6a3eca Initial revision
casties
parents:
diff changeset
289 # will be added to the database. returns 0 if an update is not necessary.
30497c6a3eca Initial revision
casties
parents:
diff changeset
290 #
30497c6a3eca Initial revision
casties
parents:
diff changeset
291 sub registerFile {
30497c6a3eca Initial revision
casties
parents:
diff changeset
292 my ($filepath, $filetime) = @_;
30497c6a3eca Initial revision
casties
parents:
diff changeset
293 my $fileid = 0;
30497c6a3eca Initial revision
casties
parents:
diff changeset
294 # look if file is in db
30497c6a3eca Initial revision
casties
parents:
diff changeset
295 my $rv = $dbFindFileName->execute($filepath);
30497c6a3eca Initial revision
casties
parents:
diff changeset
296 my $mtime;
30497c6a3eca Initial revision
casties
parents:
diff changeset
297 ($fileid, $mtime) = $dbFindFileName->fetchrow_array;
30497c6a3eca Initial revision
casties
parents:
diff changeset
298 if ($fileid) {
30497c6a3eca Initial revision
casties
parents:
diff changeset
299 # file is in db
30497c6a3eca Initial revision
casties
parents:
diff changeset
300 # update flag
30497c6a3eca Initial revision
casties
parents:
diff changeset
301 $dbSetFileFlag->execute($fileid, 1);
30497c6a3eca Initial revision
casties
parents:
diff changeset
302 $dbh->commit;
30497c6a3eca Initial revision
casties
parents:
diff changeset
303 my $stime = s2stime($mtime);
30497c6a3eca Initial revision
casties
parents:
diff changeset
304 if ($stime ge $filetime) {
30497c6a3eca Initial revision
casties
parents:
diff changeset
305 # if its current return 0
30497c6a3eca Initial revision
casties
parents:
diff changeset
306 logger('DEBUG', "file: $fileid is old! time: '$stime' (vs '$filetime')");
30497c6a3eca Initial revision
casties
parents:
diff changeset
307 return 0;
30497c6a3eca Initial revision
casties
parents:
diff changeset
308 } else {
30497c6a3eca Initial revision
casties
parents:
diff changeset
309 logger('DEBUG', "file: $fileid is new! time: '$stime' (vs '$filetime')");
30497c6a3eca Initial revision
casties
parents:
diff changeset
310 }
30497c6a3eca Initial revision
casties
parents:
diff changeset
311 }
30497c6a3eca Initial revision
casties
parents:
diff changeset
312 if (! $fileid) {
30497c6a3eca Initial revision
casties
parents:
diff changeset
313 # get a new file id
30497c6a3eca Initial revision
casties
parents:
diff changeset
314 my $rv = $dbNextFileId->execute;
30497c6a3eca Initial revision
casties
parents:
diff changeset
315 ($fileid) = $dbNextFileId->fetchrow_array;
30497c6a3eca Initial revision
casties
parents:
diff changeset
316 logger('DEBUG', "DB newfile: id=$fileid filename=$filepath mtime=$filetime");
30497c6a3eca Initial revision
casties
parents:
diff changeset
317 $dbNewFile->execute($fileid, $filepath, $filetime);
30497c6a3eca Initial revision
casties
parents:
diff changeset
318 # update flag
30497c6a3eca Initial revision
casties
parents:
diff changeset
319 $dbSetFileFlag->execute($fileid, 1);
30497c6a3eca Initial revision
casties
parents:
diff changeset
320 $dbh->commit;
30497c6a3eca Initial revision
casties
parents:
diff changeset
321 }
30497c6a3eca Initial revision
casties
parents:
diff changeset
322 return $fileid;
30497c6a3eca Initial revision
casties
parents:
diff changeset
323 }
30497c6a3eca Initial revision
casties
parents:
diff changeset
324
30497c6a3eca Initial revision
casties
parents:
diff changeset
325 #
30497c6a3eca Initial revision
casties
parents:
diff changeset
326 # registerMeta($fileid, @meta)
30497c6a3eca Initial revision
casties
parents:
diff changeset
327 #
30497c6a3eca Initial revision
casties
parents:
diff changeset
328 # adds the metadata information @meta for $fileid to the database.
30497c6a3eca Initial revision
casties
parents:
diff changeset
329 #
30497c6a3eca Initial revision
casties
parents:
diff changeset
330 sub registerMeta {
30497c6a3eca Initial revision
casties
parents:
diff changeset
331 my ($fileid, @meta) = @_;
30497c6a3eca Initial revision
casties
parents:
diff changeset
332 logger('DEBUG', "DB newmeta: fileid=$fileid ($#meta)");
3
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
333 # clear out old data
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
334 $dbClearMeta->execute($fileid);
0
30497c6a3eca Initial revision
casties
parents:
diff changeset
335 my $idx = 0;
30497c6a3eca Initial revision
casties
parents:
diff changeset
336 foreach my $keyval (@meta) {
30497c6a3eca Initial revision
casties
parents:
diff changeset
337 #logger('DEBUG', " DB meta: $$keyval[0]=$$keyval[1]");
30497c6a3eca Initial revision
casties
parents:
diff changeset
338 $dbNewMeta->execute($fileid, $idx++, $$keyval[0], $$keyval[2], $$keyval[1]);
30497c6a3eca Initial revision
casties
parents:
diff changeset
339 }
30497c6a3eca Initial revision
casties
parents:
diff changeset
340 $dbh->commit;
30497c6a3eca Initial revision
casties
parents:
diff changeset
341 logger('INFO', "added $idx elements (file $fileid)");
30497c6a3eca Initial revision
casties
parents:
diff changeset
342 }
30497c6a3eca Initial revision
casties
parents:
diff changeset
343
30497c6a3eca Initial revision
casties
parents:
diff changeset
344 #
30497c6a3eca Initial revision
casties
parents:
diff changeset
345 # initdb()
30497c6a3eca Initial revision
casties
parents:
diff changeset
346 #
30497c6a3eca Initial revision
casties
parents:
diff changeset
347 # initialises the database connection.
30497c6a3eca Initial revision
casties
parents:
diff changeset
348 #
30497c6a3eca Initial revision
casties
parents:
diff changeset
349 sub initDB {
30497c6a3eca Initial revision
casties
parents:
diff changeset
350 my $rv;
30497c6a3eca Initial revision
casties
parents:
diff changeset
351 # clean tables
30497c6a3eca Initial revision
casties
parents:
diff changeset
352 if ($purgeDB) {
30497c6a3eca Initial revision
casties
parents:
diff changeset
353 $rv = $dbh->do("delete from files");
30497c6a3eca Initial revision
casties
parents:
diff changeset
354 $rv = $dbh->do("delete from meta");
30497c6a3eca Initial revision
casties
parents:
diff changeset
355 if ($dbh->err) {
30497c6a3eca Initial revision
casties
parents:
diff changeset
356 logger('ABORT', "unable to clean table!");
30497c6a3eca Initial revision
casties
parents:
diff changeset
357 exit 1;
30497c6a3eca Initial revision
casties
parents:
diff changeset
358 }
30497c6a3eca Initial revision
casties
parents:
diff changeset
359 $dbh->commit;
30497c6a3eca Initial revision
casties
parents:
diff changeset
360 }
30497c6a3eca Initial revision
casties
parents:
diff changeset
361
30497c6a3eca Initial revision
casties
parents:
diff changeset
362 # clear flags
30497c6a3eca Initial revision
casties
parents:
diff changeset
363 $rv = $dbh->do("create temporary table file_flags ( fileid integer primary key, flag integer )");
30497c6a3eca Initial revision
casties
parents:
diff changeset
364 $dbh->commit;
30497c6a3eca Initial revision
casties
parents:
diff changeset
365
30497c6a3eca Initial revision
casties
parents:
diff changeset
366 # prepare statements
30497c6a3eca Initial revision
casties
parents:
diff changeset
367 $dbNextFileId = $dbh->prepare("select nextval('files_id_seq')");
30497c6a3eca Initial revision
casties
parents:
diff changeset
368 $dbNewFile = $dbh->prepare("insert into files (id, filename, mtime) values (?,?,?)");
30497c6a3eca Initial revision
casties
parents:
diff changeset
369 $dbFindFileName = $dbh->prepare("select id,mtime from files where filename=?");
30497c6a3eca Initial revision
casties
parents:
diff changeset
370 $dbFindFilePath = $dbh->prepare("select id,filename,flag from files where filename like ?");
30497c6a3eca Initial revision
casties
parents:
diff changeset
371 $dbClearFile = $dbh->prepare("delete from files where id=?");
30497c6a3eca Initial revision
casties
parents:
diff changeset
372 $dbFindFileFlag = $dbh->prepare("select fileid from file_flags where flag=?");
30497c6a3eca Initial revision
casties
parents:
diff changeset
373 $dbFindFileFlagPath = $dbh->prepare("select id from files left outer join file_flags on files.id=file_flags.fileid where filename like ? and flag is null");
30497c6a3eca Initial revision
casties
parents:
diff changeset
374 $dbSetFileFlag = $dbh->prepare("insert into file_flags (fileid, flag) values (?,?)");
30497c6a3eca Initial revision
casties
parents:
diff changeset
375 $dbNewMeta = $dbh->prepare("insert into meta (fileid, idx, tags, attributes, content) values (?,?,?,?,?)");
30497c6a3eca Initial revision
casties
parents:
diff changeset
376 $dbClearMeta = $dbh->prepare("delete from meta where fileid=?");
30497c6a3eca Initial revision
casties
parents:
diff changeset
377
30497c6a3eca Initial revision
casties
parents:
diff changeset
378 }
30497c6a3eca Initial revision
casties
parents:
diff changeset
379
30497c6a3eca Initial revision
casties
parents:
diff changeset
380 #######################################################
30497c6a3eca Initial revision
casties
parents:
diff changeset
381 # main
30497c6a3eca Initial revision
casties
parents:
diff changeset
382 #
30497c6a3eca Initial revision
casties
parents:
diff changeset
383
30497c6a3eca Initial revision
casties
parents:
diff changeset
384 logger("INFO", "harvestmeta $version");
30497c6a3eca Initial revision
casties
parents:
diff changeset
385
30497c6a3eca Initial revision
casties
parents:
diff changeset
386 initDB();
6
a3feffd94021 small fixes
casties
parents: 4
diff changeset
387 my $fnum = 0;
0
30497c6a3eca Initial revision
casties
parents:
diff changeset
388
3
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
389 if ($basedir) {
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
390 # read and process all files under $basedir
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
391 $fnum = readAllFiles($basedir, "");
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
392 # delete orphaned data (under $basedir)
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
393 cleanUnmarkedFiles($basedir);
6
a3feffd94021 small fixes
casties
parents: 4
diff changeset
394 } elsif ($indexurl) {
3
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
395 # read and process XML index
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
396 $fnum = readURLIndex($indexurl);
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
397 if ($baseurl) {
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
398 # delete orphaned data (under $baseurl)
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
399 cleanUnmarkedFiles($baseurl);
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
400 }
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
401 } elsif ($singleurl) {
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
402 # read and process single XML url
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
403 harvestURL($singleurl, $singleurl);
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
404 $fnum = 1;
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
405 if ($baseurl) {
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
406 # delete orphaned data (under $baseurl)
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
407 cleanUnmarkedFiles($baseurl);
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
408 }
1a51f94d5dbd new version also reads XML index feeds via HTTP
casties
parents: 0
diff changeset
409 }
0
30497c6a3eca Initial revision
casties
parents:
diff changeset
410
30497c6a3eca Initial revision
casties
parents:
diff changeset
411 logger("INFO", "analysed $idxcnt of $fnum files!");
30497c6a3eca Initial revision
casties
parents:
diff changeset
412 logger("INFO", "$warncnt warnings");
30497c6a3eca Initial revision
casties
parents:
diff changeset
413 logger("INFO", "$errcnt errors");
30497c6a3eca Initial revision
casties
parents:
diff changeset
414 if ($errcnt > 0) {
30497c6a3eca Initial revision
casties
parents:
diff changeset
415 logger("ABORT", "there were errors!");
30497c6a3eca Initial revision
casties
parents:
diff changeset
416 exit 1;
30497c6a3eca Initial revision
casties
parents:
diff changeset
417 } else {
30497c6a3eca Initial revision
casties
parents:
diff changeset
418 logger("DONE", "all index files read successfully!");
30497c6a3eca Initial revision
casties
parents:
diff changeset
419 }