changeset 1:768cb7284374
lots of debug statements
| author | dwinter |
|---|---|
| date | Mon, 15 Oct 2012 10:17:44 +0200 |
| parents | 57e2aa489383 |
| children | 6d8b6a689b2b |
| files | ColorStreamHandler.py PyCrawler.db PyCrawler.py content_processor.py query.py ready_queue.py |
| diffstat | 6 files changed, 56 insertions(+), 26 deletions(-) |
--- a/ColorStreamHandler.py  Fri Oct 12 15:23:33 2012 +0200
+++ b/ColorStreamHandler.py  Mon Oct 15 10:17:44 2012 +0200
@@ -35,17 +35,19 @@
         return msg
 
     def emit(self, record):
-        record.msg = record.msg.encode('utf-8', 'ignore')
-        msg = self.format(record)
+        try:
+            record.msg = record.msg.encode('utf-8', 'ignore')
+            msg = self.format(record)
 
-        # This just removes the date and milliseconds from asctime
-        temp = msg.split(']')
-        msg = '[' + temp[0].split(' ')[1].split(',')[0] + ']' + temp[1]
+            # This just removes the date and milliseconds from asctime
+            temp = msg.split(']')
+            msg = '[' + temp[0].split(' ')[1].split(',')[0] + ']' + temp[1]
 
-        if self.use_colors:
+            if self.use_colors:
             msg = self.color(msg, record.levelname)
-        print msg
-
+            print msg
+        except:
+            pass
 # 'record' has the following attributes:
 # threadName
 # name
@@ -65,4 +67,4 @@
 # funcName
 # relativeCreated
 # levelname
-# msecs
\ No newline at end of file
+# msecs
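The new try/except keeps a bad log record from crashing the crawler, but the bare `except: pass` also hides why the record failed. A minimal sketch of an alternative (a hypothetical `SafeColorStreamHandler`, not code from this repository) that gets the same protection while still surfacing failures through logging's standard error hook:

```
import logging

class SafeColorStreamHandler(logging.StreamHandler):
    """Hypothetical variant of the handler above: still refuses to let a
    bad record crash the caller, but reports the failure through logging's
    own error hook instead of discarding it."""

    def emit(self, record):
        try:
            msg = self.format(record)
            self.stream.write(msg + "\n")
            self.flush()
        except Exception:
            # handleError() prints a traceback when logging.raiseExceptions
            # is set, so the cause of the failure is not silently lost.
            self.handleError(record)
```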
--- a/PyCrawler.py  Fri Oct 12 15:23:33 2012 +0200
+++ b/PyCrawler.py  Mon Oct 15 10:17:44 2012 +0200
@@ -31,9 +31,12 @@
 
 def crawl():
     logger.info("Starting (%s)..." % sys.argv[1])
+    logger.info(":::")
     while True:
+        logger.info("XXXX")
         url = cdb.dequeue()
         u = urlparse.urlparse(url)
+        logger.info(url)
         robot.set_url('http://'+u[1]+"/robots.txt")
         if not robot.can_fetch('PyCrawler', url.encode('ascii', 'replace')):
             logger.warning("Url disallowed by robots.txt: %s " % url)
@@ -43,14 +46,19 @@
             continue
 
         if cdb.checkCrawled(url):
+            logger.info("already done")
             continue
         if url is False:
             break
         status = 0
-        req = urllib2.Request(str(url))
+
+        try:
+            req = urllib2.Request(str(url))
+        except:
+            continue;
         req.add_header('User-Agent', 'PyCrawler 0.2.0')
         request = None
-
+        logger.info("now opening")
         try:
             request = urllib2.urlopen(req)
         except urllib2.URLError, e:
@@ -61,12 +69,16 @@
         if status == 0:
             status = 200
         data = request.read()
+        logger.info("read")
         processor.setInfo(str(url), status, data)
+        logger.info("read2")
         ret = processor.process()
+        logger.info("read3")
         if status != 200:
             continue
         add_queue = []
         for q in ret:
+            logger.debug(".")
             if not cdb.checkCrawled(q):
                 add_queue.append(q)
 
@@ -78,7 +90,7 @@
         cdb.enqueue(add_queue)
         cdb.addPage(processor.getDataDict())
         processor.reset()
-
+        logger.info("done")
     logger.info("Finishing...")
     cdb.close()
     logger.info("Done! Goodbye!")
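Taken together, the changes in crawl() guard the two fragile steps of the fetch: building the `urllib2.Request` and opening it, with `URLError`/`HTTPError` mapped to a status code. A compressed sketch of that step, using a hypothetical `fetch()` helper that is not part of PyCrawler itself:

```
import urllib2

def fetch(url, user_agent='PyCrawler 0.2.0'):
    """Sketch of the fetch step in crawl(): returns (status, body)."""
    try:
        req = urllib2.Request(str(url))
        req.add_header('User-Agent', user_agent)
        response = urllib2.urlopen(req)
    except urllib2.HTTPError, e:
        # The server answered, but with an error status.
        return e.code, ''
    except (urllib2.URLError, ValueError):
        # Unreachable host, or a URL urllib2 cannot parse.
        return 0, ''
    return 200, response.read()
```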
--- a/content_processor.py  Fri Oct 12 15:23:33 2012 +0200
+++ b/content_processor.py  Mon Oct 15 10:17:44 2012 +0200
@@ -76,8 +76,10 @@
 
     # returns links to queue
     def processBody(self):
+        logger.info("body1")
         queue = ready_queue(self.url, self.body)
-        #print "found %i links to queue" % len(queue)
+        logger.info("body2")
+        print "found %i links to queue" % len(queue)
         self.text = stripPunctuation(self.remove_html_tags(stripScript(self.body)))
         if len(self.text) > 5000:
             offset = 0
@@ -123,17 +125,22 @@
 
     # returns the queue from processBody
     def process(self):
+
+        logger.info("process1")
         text_lower = self.text.lower()
         self.title = self.text[text_lower.find('<title')+6:text_lower.find('</title>')]
         self.head = self.text[text_lower.find('<head')+5:text_lower.find('</head>')]
         self.processHead()
+        logger.info("process2")
         self.body = self.text[text_lower.find('<body'):text_lower.find('</body>')]
         queue = self.processBody()
+        logger.info("process3")
         self.combineKeywordLists()
+        logger.info("process4")
         return queue
 
     def getDataDict(self):
         for k,v in self.keywords.items():
             if v < 3:
                 del self.keywords[k]
-        return {"address":self.url, "title":self.title, "status":self.status, "size":self.size, "keywords":self.keywords}
\ No newline at end of file
+        return {"address":self.url, "title":self.title, "status":self.status, "size":self.size, "keywords":self.keywords}
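process() extracts the title, head, and body by searching a lowercased copy of the page and slicing the original text. A stripped-down illustration of that idea (not project code; it skips the full `'<title>'` tag, whereas the `+6` offset above leaves the closing `'>'` of a plain tag in the slice):

```
# Search a lowercased copy so '<TITLE>' is matched too; slice the original
# so the page's own casing is kept.
html = '<html><head><TITLE>PyCrawler demo</TITLE></head><body>...</body></html>'
lower = html.lower()
title = html[lower.find('<title>') + len('<title>'):lower.find('</title>')]
print title  # -> PyCrawler demo
```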
--- a/query.py  Fri Oct 12 15:23:33 2012 +0200
+++ b/query.py  Mon Oct 15 10:17:44 2012 +0200
@@ -56,8 +56,11 @@
             return False
         if len(urls) == 0:
             return True
-        args = [{'address':unicode(u)} for u in urls]
-        result = self.connection.execute(self.queue_table.insert(), args)
+        try:
+            args = [{'address':unicode(u)} for u in urls]
+            result = self.connection.execute(self.queue_table.insert(), args)
+        except:
+            return False
         if result:
             return True
         return False
@@ -81,15 +84,17 @@
         return False
 
     def checkCrawled(self, url):
-        s = select([self.crawl_table]).where(self.crawl_table.c.address == unicode(url))
-        result = self.connection.execute(s)
-        if len(result.fetchall()) > 0:
-            result.close()
-            return True
-        else:
-            result.close()
+        try:
+            s = select([self.crawl_table]).where(self.crawl_table.c.address == unicode(url))
+            result = self.connection.execute(s)
+            if len(result.fetchall()) > 0:
+                result.close()
+                return True
+            else:
+                result.close()
+                return False
+        except:
             return False
-
     # Data should be a dictionary containing the following
     # key : desc
     # address : the url of the page
@@ -116,4 +121,4 @@
         return True
 
     def close(self):
-        self.connection.close()
\ No newline at end of file
+        self.connection.close()
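checkCrawled() now swallows database errors, but the lookup itself is an ordinary SQLAlchemy select against the crawl table. A self-contained sketch of that pattern, written against the legacy `select([...])` style that query.py uses; the engine URL, table name, and columns below are made up for illustration:

```
from sqlalchemy import create_engine, MetaData, Table, Column, Integer, Unicode, select

engine = create_engine('sqlite:///:memory:')
metadata = MetaData()
crawl_table = Table('crawl_index', metadata,
                    Column('id', Integer, primary_key=True),
                    Column('address', Unicode(500)))
metadata.create_all(engine)

conn = engine.connect()
conn.execute(crawl_table.insert(), {'address': u'http://example.com/'})

# Same shape as the lookup in checkCrawled(): any row back means "already crawled".
s = select([crawl_table]).where(crawl_table.c.address == u'http://example.com/')
result = conn.execute(s)
print len(result.fetchall()) > 0  # -> True
result.close()
```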
--- a/ready_queue.py  Fri Oct 12 15:23:33 2012 +0200
+++ b/ready_queue.py  Mon Oct 15 10:17:44 2012 +0200
@@ -1,12 +1,16 @@
 import re, urlparse
+import logging
+logger = logging.getLogger("crawler_logger")
 
 linkregex = re.compile('<a\s(?:.*?\s)*?href=[\'"](.*?)[\'"].*?>')
 
 def ready_queue(address, html):
+    logger.info("ready_queue:"+str(address))
     url = urlparse.urlparse(str(address))
     links = linkregex.findall(html)
     queue = []
     for link in links:
+        logger.info(link)
         if link.startswith("/"):
             queue.append('http://'+url[1]+link)
         elif link.startswith("http") or link.startswith("https"):
@@ -20,4 +24,4 @@
         else:
             queue.append(urlparse.urljoin(url.geturl(),link))
     return queue
-
\ No newline at end of file
+
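For reference, this is roughly what ready_queue() produces for a small page. The example.com URLs are made up, and links that hit the branches elided from this hunk may be handled differently:

```
from ready_queue import ready_queue

page = '<a href="/about">About</a> <a href="docs/intro.html">Intro</a>'
print ready_queue('http://example.com/index.html', page)
# '/about'          -> 'http://example.com/about'              (leading-slash branch)
# 'docs/intro.html' -> 'http://example.com/docs/intro.html'    (urljoin fallback)
```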