# HG changeset patch
# User dwinter
# Date 1350289064 -7200
# Node ID 768cb72843745931fa57265e41ea113d70b6fb46
# Parent 57e2aa489383e4afc97c55680054f864ab6b8315
lots of debug statements

diff -r 57e2aa489383 -r 768cb7284374 ColorStreamHandler.py
--- a/ColorStreamHandler.py	Fri Oct 12 15:23:33 2012 +0200
+++ b/ColorStreamHandler.py	Mon Oct 15 10:17:44 2012 +0200
@@ -35,17 +35,19 @@
         return msg
 
     def emit(self, record):
-        record.msg = record.msg.encode('utf-8', 'ignore')
-        msg = self.format(record)
+        try:
+            record.msg = record.msg.encode('utf-8', 'ignore')
+            msg = self.format(record)
 
-        # This just removes the date and milliseconds from asctime
-        temp = msg.split(']')
-        msg = '[' + temp[0].split(' ')[1].split(',')[0] + ']' + temp[1]
+            # This just removes the date and milliseconds from asctime
+            temp = msg.split(']')
+            msg = '[' + temp[0].split(' ')[1].split(',')[0] + ']' + temp[1]
 
-        if self.use_colors:
+            if self.use_colors:
                 msg = self.color(msg, record.levelname)
-        print msg
-
+            print msg
+        except:
+            pass
 # 'record' has the following attributes:
 # threadName
 # name
@@ -65,4 +67,4 @@
 # funcName
 # relativeCreated
 # levelname
-# msecs
\ No newline at end of file
+# msecs
diff -r 57e2aa489383 -r 768cb7284374 PyCrawler.db
Binary file PyCrawler.db has changed
diff -r 57e2aa489383 -r 768cb7284374 PyCrawler.py
--- a/PyCrawler.py	Fri Oct 12 15:23:33 2012 +0200
+++ b/PyCrawler.py	Mon Oct 15 10:17:44 2012 +0200
@@ -31,9 +31,12 @@
 
 def crawl():
     logger.info("Starting (%s)..." % sys.argv[1])
+    logger.info(":::")
     while True:
+        logger.info("XXXX")
         url = cdb.dequeue()
         u = urlparse.urlparse(url)
+        logger.info(url)
         robot.set_url('http://'+u[1]+"/robots.txt")
         if not robot.can_fetch('PyCrawler', url.encode('ascii', 'replace')):
             logger.warning("Url disallowed by robots.txt: %s " % url)
@@ -43,14 +46,19 @@
             continue
 
         if cdb.checkCrawled(url):
+            logger.info("already done")
             continue
         if url is False:
             break
         status = 0
-        req = urllib2.Request(str(url))
+
+        try:
+            req = urllib2.Request(str(url))
+        except:
+            continue;
         req.add_header('User-Agent', 'PyCrawler 0.2.0')
         request = None
-
+        logger.info("now opening")
         try:
             request = urllib2.urlopen(req)
         except urllib2.URLError, e:
@@ -61,12 +69,16 @@
         if status == 0:
             status = 200
         data = request.read()
+        logger.info("read")
         processor.setInfo(str(url), status, data)
+        logger.info("read2")
         ret = processor.process()
+        logger.info("read3")
         if status != 200:
             continue
         add_queue = []
         for q in ret:
+            logger.debug(".")
             if not cdb.checkCrawled(q):
                 add_queue.append(q)
 
@@ -78,7 +90,7 @@
             cdb.enqueue(add_queue)
         cdb.addPage(processor.getDataDict())
         processor.reset()
-
+        logger.info("done")
     logger.info("Finishing...")
     cdb.close()
     logger.info("Done! Goodbye!")
diff -r 57e2aa489383 -r 768cb7284374 content_processor.py
--- a/content_processor.py	Fri Oct 12 15:23:33 2012 +0200
+++ b/content_processor.py	Mon Oct 15 10:17:44 2012 +0200
@@ -76,8 +76,10 @@
 
     # returns links to queue
     def processBody(self):
+        logger.info("body1")
        queue = ready_queue(self.url, self.body)
-        #print "found %i links to queue" % len(queue)
+        logger.info("body2")
+        print "found %i links to queue" % len(queue)
         self.text = stripPunctuation(self.remove_html_tags(stripScript(self.body)))
         if len(self.text) > 5000:
             offset = 0
@@ -123,17 +125,22 @@
 
     # returns the queue from processBody
     def process(self):
+
+        logger.info("process1")
         text_lower = self.text.lower()
         self.title = self.text[text_lower.find('
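
Note on the ColorStreamHandler hunk: the new emit() traps every failure with a bare "except: pass", which hides real formatting bugs along with the unicode errors it was presumably added for. The stdlib convention is to report handler failures through logging.Handler.handleError() instead. A minimal sketch of that, in the repo's Python 2 style; color() and use_colors mirror the diff, the rest is assumed for illustration:

    import logging

    class SafeColorStreamHandler(logging.Handler):
        """Sketch only: same asctime-trimming as the diff, safer error path."""

        def __init__(self, use_colors=True):
            logging.Handler.__init__(self)
            self.use_colors = use_colors

        def color(self, msg, levelname):
            return msg  # stand-in for the real color() earlier in the file

        def emit(self, record):
            try:
                msg = self.format(record)
                # Trim the date and milliseconds out of asctime, as the diff does.
                temp = msg.split(']')
                msg = '[' + temp[0].split(' ')[1].split(',')[0] + ']' + temp[1]
                if self.use_colors:
                    msg = self.color(msg, record.levelname)
                print msg
            except Exception:
                # KeyboardInterrupt/SystemExit still propagate; real errors
                # are reported via the logging module's error machinery
                # instead of vanishing silently.
                self.handleError(record)

    handler = SafeColorStreamHandler()
    handler.setFormatter(logging.Formatter('[%(asctime)s] %(message)s'))
    log = logging.getLogger('demo')
    log.addHandler(handler)
    log.setLevel(logging.INFO)
    log.info('handler failures no longer vanish silently')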
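
Note on the PyCrawler.py fetch hunk: the patch guards urllib2.Request() with another bare except plus a stray semicolon ("continue;"). A malformed URL surfaces from urllib2 as ValueError, and str(url) on a non-ASCII unicode URL raises UnicodeEncodeError, which is a ValueError subclass, so the same protection can be written with named exceptions that also record why a URL was dropped. A hedged sketch; fetch_page is a hypothetical helper, not part of PyCrawler:

    import urllib2

    def fetch_page(url, timeout=10):
        """Return (status, data), or (None, None) when the URL is skipped."""
        try:
            req = urllib2.Request(str(url))
            req.add_header('User-Agent', 'PyCrawler 0.2.0')
            response = urllib2.urlopen(req, timeout=timeout)
        except ValueError, e:
            # Malformed or non-ASCII URL (UnicodeEncodeError lands here too).
            print "skipping bad url %r: %s" % (url, e)
            return None, None
        except urllib2.HTTPError, e:
            # Server answered, but with an error status.
            return e.code, None
        except urllib2.URLError, e:
            # DNS failure, refused connection, timeout, ...
            print "skipping unreachable url %r: %s" % (url, e)
            return None, None
        # Mirror the original's "status 0 -> 200" fallback.
        return response.getcode() or 200, response.read()

    status, data = fetch_page('http://example.com/')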
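
Finally, the debug statements themselves: markers like logger.info("XXXX") and logger.info("read2") ship at INFO level, so they cannot be silenced without editing the code again. A sketch of the same trace points at DEBUG level with lazy "%s" formatting, switchable by log level alone; logger, cdb.dequeue, and cdb.checkCrawled follow the diff, dequeue_and_trace is hypothetical:

    import logging

    logger = logging.getLogger('PyCrawler')

    def dequeue_and_trace(cdb):
        """One crawl-loop step with the diff's markers as DEBUG messages."""
        url = cdb.dequeue()
        logger.debug("dequeued %s", url)              # instead of info("XXXX")
        if cdb.checkCrawled(url):
            logger.debug("already crawled: %s", url)  # instead of "already done"
            return None
        logger.debug("opening %s", url)               # instead of "now opening"
        return url

    # One switch shows or hides every marker:
    logging.basicConfig(level=logging.DEBUG)   # verbose while debugging
    # logging.basicConfig(level=logging.INFO)  # quiet in normal runs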