foxridge-archiver: harvestmeta.pl annotate

annotate harvestmeta.pl @ 18:fdf4ceb36db1

fixed problem with dir names in metacheck; new version of metacheck defaults to not change index file; new version of archiver uses new version of metacheck

author	casties
date	Tue, 20 Sep 2005 19:24:57 +0200
parents	65895eec9e30
children	2208ed7370cb

rev	line source
0 30497c6a3eca Initial revision casties parents: diff changeset	1 #!/usr/local/bin/perl -w
30497c6a3eca Initial revision casties parents: diff changeset	2
30497c6a3eca Initial revision casties parents: diff changeset	3 use strict;
30497c6a3eca Initial revision casties parents: diff changeset	4 use XML::SAX;
3 1a51f94d5dbd new version also reads XML index feeds via HTTP casties parents: 0 diff changeset	5 use XML::LibXML;
0 30497c6a3eca Initial revision casties parents: diff changeset	6 use DBI;
30497c6a3eca Initial revision casties parents: diff changeset	7
4 046d584ed7b3 forgot lib path... casties parents: 3 diff changeset	8 use lib '/usr/local/mpiwg/archive';
0 30497c6a3eca Initial revision casties parents: diff changeset	9 use MPIWGStor;
30497c6a3eca Initial revision casties parents: diff changeset	10 use HarvestmetaHandler;
30497c6a3eca Initial revision casties parents: diff changeset	11
# switch off output buffering so that messages show up immediately
$| = 1;

#######################################################
# internal parameters
#

# program version (shown in the usage message)
my $version = "0.3 (27.9.2004)";

# command line parameters; used below as a hash reference
my $args = MPIWGStor::parseargs();

# called without any parameter: print the usage message and stop
unless (scalar(%$args)) {
    print
        "harvestmeta $version\n",
        "use: harvestmeta -path=dir\n",
        " reads all metadata info from directory dir into the database\n",
        "alternative sources:\n",
        " -indexurl=url : read XML index and follow links\n",
        " -singleurl=url : read single index file\n",
        "additional options:\n",
        " -baseurl=url : clean all URL sources relative to this base\n",
        " -debug : output debugging info\n",
        " -purgedb : clear whole database\n";
    exit 1;
}

# debug level: value of the -debug parameter, 0 when it was not given
# NOTE(review): $debug is not declared in this file; presumably it is a
# package variable exported by MPIWGStor -- confirm.
$debug = 0;
$debug = $$args{'debug'} if (exists $$args{'debug'});

# XML namespace (not really implemented!)
my $namespace = "";

# true when the -purgedb parameter was given (delete and rebuild database)
my $purgeDB = exists $$args{'purgedb'};

# database connection
my $dbh = DBI->connect("dbi:Pg:dbname=storage", "archiver", "");
unless ($dbh) {
    logger('ABORT', "unable to connect to database!");
    exit 1;
}
# no automatic commits; the subroutines below call $dbh->commit themselves
$dbh->{AutoCommit} = 0;

# statement handles used by the subroutines below
# (only declared here, they get their values later)
my (
    $dbNextFileId,
    $dbNewFile,
    $dbNewMeta,
    $dbClearMeta,
    $dbFindFileName,
    $dbFindFilePath,
    $dbClearFile,
    $dbFindFileFlag,
    $dbFindFileFlagPath,
    $dbSetFileFlag,
    $dbClearAllFileFlag,
);

#######################################################
# check parameters that were passed to the program
#
# Exactly one kind of source is used: a local directory (-path) or, when no
# path was given, an URL source (-indexurl or -singleurl).
#
my $baseurl;
my $indexurl;
my $singleurl;
my $basedir = $$args{'path'};
if ($basedir) {
    # strip all trailing slashes so that "dir", "dir/" and "dir//" lead to
    # the same file paths
    $basedir =~ s/\/+$//;
    if (! -d $basedir) {
        logger("ABORT", "document directory \'$basedir\' doesn't exist!");
        exit 1;
    }
} else {
    # use URL
    $baseurl = $$args{'baseurl'};
    $indexurl = $$args{'indexurl'};
    # the usage message documents -singleurl; the older parameter name -url
    # is still accepted as a fallback
    $singleurl = $$args{'singleurl'};
    $singleurl = $$args{'url'} unless ($singleurl);
    if (! (($indexurl)||($singleurl))) {
        logger("ABORT", "no document source given!");
        exit 1;
    }
}

# SAX parser for index files; the parsed data is collected by the handler
# and fetched with $metaParserHandler->getData()
my $metaParserHandler = HarvestmetaHandler->new;
my $metaParser = XML::SAX::ParserFactory->parser(Handler => $metaParserHandler);

#######################################################
# internal variables
#

# counters that are updated by the subroutines below:
#   $errcnt  - number of errors
#   $warncnt - number of warnings
#   $fcnt    - number of files on fs
#   $idxcnt  - number of index files
my ($errcnt, $warncnt, $fcnt, $idxcnt) = (0, 0, 0, 0);

30497c6a3eca Initial revision casties parents: diff changeset	109 #######################################################
30497c6a3eca Initial revision casties parents: diff changeset	110 # subroutines
30497c6a3eca Initial revision casties parents: diff changeset	111 #
30497c6a3eca Initial revision casties parents: diff changeset	112
#
# $cnt = readAllFiles($directory, $basedir)
#
# walks through all files and directories below $directory and calls
# harvestFile() for every file named "index.meta".
# $basedir is only for recursion, it should be empty when called
# from outside.
# Names starting with a dot and names listed in %junk_files are ignored.
# Returns the number of directory entries that were looked at (including
# those of all subdirectories); the global counter $fcnt is increased by
# the same amount.
#
sub readAllFiles {
    my ($directory, $basedir) = @_;
    my $cnt = 0;

    # lexical handle: every recursion level gets its own handle instead of
    # sharing one global bareword handle
    my $dh;
    if (! opendir $dh, $directory) {
        # an unreadable directory is skipped, but no longer silently
        logger('WARNING', "unable to read directory '$directory' ($!)");
        $warncnt++;
        return 0;
    }
    my @dirfiles = readdir $dh;
    # all names are in memory now, release the handle before recursing
    closedir $dh;
    foreach my $fn (@dirfiles) {
        # ignore names starting with a dot
        next if ($fn =~ /^\./);
        # ignore other silly files
        next if ($junk_files{$fn});

        $cnt++;
        $fcnt++;
        my $f = "$directory/$fn";
        my $docf = ($basedir) ? "$basedir/$fn" : $fn;
        #logger('DEBUG', "fs_file: \"$f\"");
        if (-f $f) {
            #logger(" is file");
            if ($fn eq "index.meta") {
                harvestFile($fn, $directory);
            }
        } elsif (-d _) {
            # "_" reuses the file information of the -f test above
            #logger(" is dir");
            # recurse into directory
            $cnt += readAllFiles($f, $docf);
        }
    }
    return $cnt;
}
30497c6a3eca Initial revision casties parents: diff changeset	153
#
# cleanUnmarkedFiles($basepath)
#
# deletes all unflagged file and meta entries.
# The ids are looked up with the $dbFindFileFlagPath statement using the
# pattern "$basepath%"; for every id the $dbClearMeta and $dbClearFile
# statements are executed and the change is committed.
#
sub cleanUnmarkedFiles {
    my ($basepath) = @_;
    $dbFindFileFlagPath->execute("${basepath}%");
    my $rows = $dbFindFileFlagPath->fetchall_arrayref;
    foreach my $row (@$rows) {
        # the id is the first column of each row
        my ($id) = @$row;
        logger('DEBUG', "cleaning file and meta of id: $id");
        $dbClearMeta->execute($id);
        $dbClearFile->execute($id);
        $dbh->commit;
    }
}
30497c6a3eca Initial revision casties parents: diff changeset	171
#
# harvestFile($filename, $filepath)
#
# reads the index file $filename at $filepath and puts the contents
# in the database.
# The file is only parsed when registerFile() returns a file id, i.e. when
# the file is new or modified. Errors and warnings are logged and counted in
# $errcnt and $warncnt; in these cases the file is not counted in $idxcnt.
#
sub harvestFile {
    my ($filename, $filepath) = @_;
    my $fullpath = "$filepath/$filename";
    logger('DEBUG', "looking at file '$filename' at '$filepath'");
    # get file time (element 9 of the stat list is the modification time)
    my $mtime = (stat($fullpath))[9];
    if (! defined $mtime) {
        logger('ERROR', "unable to stat file '$fullpath' ($!)");
        $errcnt++;
        return;
    }
    my $filetime = stime($mtime);
    # register file in db
    my $fid = registerFile($fullpath, $filetime);
    if ($fid) {
        # file is new/modified
        # parse index file
        eval { $metaParser->parse_uri($fullpath) };
        if ($@) {
            my $errmsg = $@;
            logger('ERROR', "error reading XML file '$fullpath' ($errmsg)");
            $errcnt++;
            return;
        }
        my @data = $metaParserHandler->getData();
        # "$#data+1" inside a string would print the last index followed by
        # a literal "+1", so the count is computed outside the string
        logger('DEBUG', "parsed " . scalar(@data) . " elements");
        # same check as in harvestURL: case-insensitive and counted as warning
        if (@data && defined $data[0][0] && lc($data[0][0]) eq "html") {
            # oops, wrong
            logger('WARNING', "invalid HTML content in file $fullpath");
            $warncnt++;
            return;
        }
        registerMeta($fid, @data);
    }
    $idxcnt++;
    logger('INFO', "$idxcnt index files of $fcnt") if ($idxcnt % 10 == 0);
}
30497c6a3eca Initial revision casties parents: diff changeset	210
#
# $cnt = readURLIndex($baseurl)
#
# reads the XML index at $baseurl
# and processes all its entries.
# For every entry whose tag ends in "index/resource" the attributes
# metaLink, resourceLink and modificationDate are extracted; when metaLink
# starts with "http:" the entry is passed to harvestURL().
# Returns the number of entries in the parsed index (not only resources).
# Parse errors of the index itself are not caught here.
#
sub readURLIndex {
    my ($baseurl) = @_;
    my $cnt = 0;

    # parse index file
    logger('DEBUG', "parsing $baseurl ...");
    $metaParser->parse_uri($baseurl);
    my @indexdata = $metaParserHandler->getData();
    # "$#indexdata+1" inside a string would print the last index followed by
    # a literal "+1", so the count is computed outside the string
    logger('INFO', "parsed " . scalar(@indexdata) . " index entries");

    foreach my $me (@indexdata) {
        $cnt++;
        my ($tag, $val, $attr) = @$me;
        next unless ($tag =~ /index\/resource$/);
        # entries without attributes must not be matched as undef
        $attr = "" unless (defined $attr);
        my $meta = "";
        my $file = "";
        my $mtime = "";
        if ($attr =~ /metaLink=\"([^\"]+)\"/) {
            $meta = $1;
        }
        if ($attr =~ /resourceLink=\"([^\"]+)\"/) {
            $file = $1;
        }
        if ($attr =~ /modificationDate=\"([^\"]+)\"/) {
            $mtime = $1;
        }
        if ($meta =~ /^http:/) {
            harvestURL($meta, $file, $mtime);
        }
    }
    return $cnt;
}
1a51f94d5dbd new version also reads XML index feeds via HTTP casties parents: 0 diff changeset	250
#
# harvestURL($metaurl, $fileurl, $filetime)
#
# reads the index file from $metaurl and puts the contents
# in the database (under $fileurl).
# $filetime is the modification time of the resource; when it is empty the
# current time is used. The index file is only fetched when registerFile()
# returns a file id, i.e. when the entry is new or modified. Errors and
# warnings are logged and counted in $errcnt and $warncnt; in these cases
# the entry is not counted in $idxcnt.
#
sub harvestURL {
    my ($metaurl, $fileurl, $filetime) = @_;
    logger('DEBUG', "fetching from url '$metaurl' for '$fileurl'");
    # if no filetime then now
    $filetime = stime(time) unless ($filetime);
    # register file in db
    my $fid = registerFile("$fileurl", $filetime);
    if ($fid) {
        # try to parse index file
        eval { $metaParser->parse_uri($metaurl) };
        if ($@) {
            my $errmsg = $@;
            logger('ERROR', "error reading XML from '$metaurl' ($errmsg)");
            $errcnt++;
            return;
        }
        my @data = $metaParserHandler->getData();
        # "$#data+1" inside a string would print the last index followed by
        # a literal "+1", so the count is computed outside the string
        logger('DEBUG', "parsed " . scalar(@data) . " elements");
        if (@data && defined $data[0][0] && lc($data[0][0]) eq "html") {
            # oops, wrong
            logger('WARNING', "invalid HTML content from $metaurl");
            $warncnt++;
            return;
        }
        # file is new/modified
        registerMeta($fid, @data);
    }
    $idxcnt++;
    logger('INFO', "$idxcnt index files of $fcnt") if ($idxcnt % 10 == 0);
}
1a51f94d5dbd new version also reads XML index feeds via HTTP casties parents: 0 diff changeset	287
1a51f94d5dbd new version also reads XML index feeds via HTTP casties parents: 0 diff changeset	288
#
# $fileid = registerFile($filepath, $filetime)
#
# returns the file ID for the file $filepath. If necessary it
# will be added to the database. returns 0 if an update is not necessary.
# In both cases the flag of the file is set to 1 and committed.
#
sub registerFile {
    my ($filepath, $filetime) = @_;

    # look if file is in db
    $dbFindFileName->execute($filepath);
    my ($fileid, $mtime) = $dbFindFileName->fetchrow_array;

    if ($fileid) {
        # known file: update flag
        $dbSetFileFlag->execute($fileid, 1);
        $dbh->commit;
        # compare the time from the db with the given time (string comparison)
        my $stime = s2stime($mtime);
        if ($stime ge $filetime) {
            # db entry is current, nothing to do
            logger('DEBUG', "file: $fileid is old! time: '$stime' (vs '$filetime')");
            return 0;
        }
        logger('DEBUG', "file: $fileid is new! time: '$stime' (vs '$filetime')");
        return $fileid;
    }

    # unknown file: get a new file id and create the entry
    $dbNextFileId->execute;
    ($fileid) = $dbNextFileId->fetchrow_array;
    logger('DEBUG', "DB newfile: id=$fileid filename=$filepath mtime=$filetime");
    $dbNewFile->execute($fileid, $filepath, $filetime);
    # update flag
    $dbSetFileFlag->execute($fileid, 1);
    $dbh->commit;
    return $fileid;
}
30497c6a3eca Initial revision casties parents: diff changeset	328
#
# registerMeta($fileid, @meta)
#
# Replaces the metadata stored for $fileid with the entries in @meta.
#
#   $fileid - id of the file in the files table
#   @meta   - list of array references; element 0 goes into the tags
#             column, element 1 into content and element 2 into
#             attributes
#
# The old rows are deleted and the new ones inserted before a single
# commit. Each row gets its position in @meta as idx.
#
sub registerMeta {
    my ($fileid, @meta) = @_;
    # note: $#meta is the last index, i.e. the element count minus one
    logger('DEBUG', "DB newmeta: fileid=$fileid ($#meta)");

    # clear out old data
    $dbClearMeta->execute($fileid);

    for my $pos (0 .. $#meta) {
        my $entry = $meta[$pos];
        #logger('DEBUG', "  DB meta: $entry->[0]=$entry->[1]");
        $dbNewMeta->execute($fileid, $pos, $entry->[0], $entry->[2], $entry->[1]);
    }
    $dbh->commit;

    my $idx = scalar @meta;
    logger('INFO', "added $idx elements (file $fileid)");
}
30497c6a3eca Initial revision casties parents: diff changeset	347
#
# initDB()
#
# Prepares the database connection $dbh for a harvesting run:
#
#   - if $purgeDB is set, deletes all rows from the files and meta tables
#   - creates the temporary table file_flags, which records the files
#     seen during this run
#   - prepares all statement handles used by the rest of the script
#
# Logs an ABORT message and exits with status 1 if purging a table or
# creating the temporary table fails.
#
sub initDB {
    # clean tables
    if ($purgeDB) {
        for my $table (qw(files meta)) {
            # check every do() on its own: DBI resets $dbh->err at the
            # start of each method call, so testing err once after both
            # deletes would miss a failure of the first one
            unless (defined $dbh->do("delete from $table")) {
                logger('ABORT', "unable to clean table!");
                exit 1;
            }
        }
        $dbh->commit;
    }

    # create the per-run flag table (temporary, so it starts out empty)
    my $rv = $dbh->do("create temporary table file_flags ( fileid integer primary key, flag integer )");
    unless (defined $rv) {
        # without this table none of the flag statements below can work
        logger('ABORT', "unable to create temporary table file_flags!");
        exit 1;
    }
    $dbh->commit;

    # prepare statements
    $dbNextFileId = $dbh->prepare("select nextval('files_id_seq')");
    $dbNewFile = $dbh->prepare("insert into files (id, filename, mtime) values (?,?,?)");
    $dbFindFileName = $dbh->prepare("select id,mtime from files where filename=?");
    $dbFindFilePath = $dbh->prepare("select id,filename,flag from files where filename like ?");
    $dbClearFile = $dbh->prepare("delete from files where id=?");
    $dbFindFileFlag = $dbh->prepare("select fileid from file_flags where flag=?");
    $dbFindFileFlagPath = $dbh->prepare("select id from files left outer join file_flags on files.id=file_flags.fileid where filename like ? and flag is null");
    $dbSetFileFlag = $dbh->prepare("insert into file_flags (fileid, flag) values (?,?)");
    $dbNewMeta = $dbh->prepare("insert into meta (fileid, idx, tags, attributes, content) values (?,?,?,?,?)");
    $dbClearMeta = $dbh->prepare("delete from meta where fileid=?");

}
30497c6a3eca Initial revision casties parents: diff changeset	383
30497c6a3eca Initial revision casties parents: diff changeset	384 #######################################################
30497c6a3eca Initial revision casties parents: diff changeset	385 # main
30497c6a3eca Initial revision casties parents: diff changeset	386 #
30497c6a3eca Initial revision casties parents: diff changeset	387
logger("INFO", "harvestmeta $version");

initDB();
my $fnum = 0;

# exactly one input mode is used; $basedir wins over $indexurl, which
# wins over $singleurl
if ($basedir) {
    # read and process all files under $basedir
    $fnum = readAllFiles($basedir, "");
    # delete orphaned data (under $basedir)
    cleanUnmarkedFiles($basedir);
}
elsif ($indexurl) {
    # read and process XML index
    $fnum = readURLIndex($indexurl);
    # delete orphaned data (under $baseurl)
    cleanUnmarkedFiles($baseurl) if $baseurl;
}
elsif ($singleurl) {
    # read and process single XML url
    harvestURL($singleurl, $singleurl);
    $fnum = 1;
    # delete orphaned data (under $baseurl)
    cleanUnmarkedFiles($baseurl) if $baseurl;
}

# summary of the run
logger("INFO", "analysed $idxcnt of $fnum files!");
logger("INFO", "$warncnt warnings");
logger("INFO", "$errcnt errors");

# any error makes the whole run fail with exit status 1
if ($errcnt > 0) {
    logger("ABORT", "there were errors!");
    exit 1;
}
logger("DONE", "all index files read successfully!");

Mercurial > hg > foxridge-archiver

annotate harvestmeta.pl @ 18:fdf4ceb36db1