changeset 1:768cb7284374
lots of debug statements
| author | dwinter |
|---|---|
| date | Mon, 15 Oct 2012 10:17:44 +0200 |
| parents | 57e2aa489383 |
| children | 6d8b6a689b2b |
| files | ColorStreamHandler.py PyCrawler.db PyCrawler.py content_processor.py query.py ready_queue.py |
| diffstat | 6 files changed, 56 insertions(+), 26 deletions(-) |
--- a/ColorStreamHandler.py  Fri Oct 12 15:23:33 2012 +0200
+++ b/ColorStreamHandler.py  Mon Oct 15 10:17:44 2012 +0200
@@ -35,17 +35,19 @@
         return msg
 
     def emit(self, record):
-        record.msg = record.msg.encode('utf-8', 'ignore')
-        msg = self.format(record)
+        try:
+            record.msg = record.msg.encode('utf-8', 'ignore')
+            msg = self.format(record)
 
-        # This just removes the date and milliseconds from asctime
-        temp = msg.split(']')
-        msg = '[' + temp[0].split(' ')[1].split(',')[0] + ']' + temp[1]
+            # This just removes the date and milliseconds from asctime
+            temp = msg.split(']')
+            msg = '[' + temp[0].split(' ')[1].split(',')[0] + ']' + temp[1]
 
-        if self.use_colors:
+            if self.use_colors:
             msg = self.color(msg, record.levelname)
-        print msg
-
+            print msg
+        except:
+            pass
 # 'record' has the following attributes:
 # threadName
 # name
@@ -65,4 +67,4 @@
 # funcName
 # relativeCreated
 # levelname
-# msecs
\ No newline at end of file
+# msecs
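The new try/except keeps a bad log record from crashing the crawler, but the bare `except: pass` also hides why the record failed. A minimal sketch of an alternative (a hypothetical `SafeColorStreamHandler`, not code from this repository) that gets the same protection while still surfacing failures through logging's standard error hook:

```
import logging

class SafeColorStreamHandler(logging.StreamHandler):
    """Hypothetical variant of the handler above: still refuses to let a
    bad record crash the caller, but reports the failure through logging's
    own error hook instead of discarding it."""

    def emit(self, record):
        try:
            msg = self.format(record)
            self.stream.write(msg + "\n")
            self.flush()
        except Exception:
            # handleError() prints a traceback when logging.raiseExceptions
            # is set, so the cause of the failure is not silently lost.
            self.handleError(record)
```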
--- a/PyCrawler.py  Fri Oct 12 15:23:33 2012 +0200
+++ b/PyCrawler.py  Mon Oct 15 10:17:44 2012 +0200
@@ -31,9 +31,12 @@
 
 def crawl():
     logger.info("Starting (%s)..." % sys.argv[1])
+    logger.info(":::")
     while True:
+        logger.info("XXXX")
         url = cdb.dequeue()
         u = urlparse.urlparse(url)
+        logger.info(url)
         robot.set_url('http://'+u[1]+"/robots.txt")
         if not robot.can_fetch('PyCrawler', url.encode('ascii', 'replace')):
             logger.warning("Url disallowed by robots.txt: %s " % url)
@@ -43,14 +46,19 @@
             continue
 
         if cdb.checkCrawled(url):
+            logger.info("already done")
             continue
         if url is False:
             break
         status = 0
-        req = urllib2.Request(str(url))
+
+        try:
+            req = urllib2.Request(str(url))
+        except:
+            continue;
         req.add_header('User-Agent', 'PyCrawler 0.2.0')
         request = None
-
+        logger.info("now opening")
         try:
             request = urllib2.urlopen(req)
         except urllib2.URLError, e:
@@ -61,12 +69,16 @@
         if status == 0:
             status = 200
         data = request.read()
+        logger.info("read")
         processor.setInfo(str(url), status, data)
+        logger.info("read2")
         ret = processor.process()
+        logger.info("read3")
         if status != 200:
             continue
         add_queue = []
         for q in ret:
+            logger.debug(".")
             if not cdb.checkCrawled(q):
                 add_queue.append(q)
 
@@ -78,7 +90,7 @@
         cdb.enqueue(add_queue)
         cdb.addPage(processor.getDataDict())
         processor.reset()
-
+        logger.info("done")
     logger.info("Finishing...")
     cdb.close()
     logger.info("Done! Goodbye!")
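Taken together, the changes in crawl() guard the two fragile steps of the fetch: building the `urllib2.Request` and opening it, with `URLError`/`HTTPError` mapped to a status code. A compressed sketch of that step, using a hypothetical `fetch()` helper that is not part of PyCrawler itself:

```
import urllib2

def fetch(url, user_agent='PyCrawler 0.2.0'):
    """Sketch of the fetch step in crawl(): returns (status, body)."""
    try:
        req = urllib2.Request(str(url))
        req.add_header('User-Agent', user_agent)
        response = urllib2.urlopen(req)
    except urllib2.HTTPError, e:
        # The server answered, but with an error status.
        return e.code, ''
    except (urllib2.URLError, ValueError):
        # Unreachable host, or a URL urllib2 cannot parse.
        return 0, ''
    return 200, response.read()
```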
--- a/content_processor.py  Fri Oct 12 15:23:33 2012 +0200
+++ b/content_processor.py  Mon Oct 15 10:17:44 2012 +0200
@@ -76,8 +76,10 @@
 
     # returns links to queue
     def processBody(self):
+        logger.info("body1")
         queue = ready_queue(self.url, self.body)
-        #print "found %i links to queue" % len(queue)
+        logger.info("body2")
+        print "found %i links to queue" % len(queue)
         self.text = stripPunctuation(self.remove_html_tags(stripScript(self.body)))
         if len(self.text) > 5000:
             offset = 0
@@ -123,17 +125,22 @@
 
     # returns the queue from processBody
     def process(self):
+
+        logger.info("process1")
         text_lower = self.text.lower()
         self.title = self.text[text_lower.find('<title')+6:text_lower.find('</title>')]
         self.head = self.text[text_lower.find('<head')+5:text_lower.find('</head>')]
         self.processHead()
+        logger.info("process2")
         self.body = self.text[text_lower.find('<body'):text_lower.find('</body>')]
         queue = self.processBody()
+        logger.info("process3")
         self.combineKeywordLists()
+        logger.info("process4")
         return queue
 
     def getDataDict(self):
         for k,v in self.keywords.items():
             if v < 3:
                 del self.keywords[k]
-        return {"address":self.url, "title":self.title, "status":self.status, "size":self.size, "keywords":self.keywords}
\ No newline at end of file
+        return {"address":self.url, "title":self.title, "status":self.status, "size":self.size, "keywords":self.keywords}
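process() extracts the title, head, and body by searching a lowercased copy of the page and slicing the original text. A stripped-down illustration of that idea (not project code; it skips the full `'<title>'` tag, whereas the `+6` offset above leaves the closing `'>'` of a plain tag in the slice):

```
# Search a lowercased copy so '<TITLE>' is matched too; slice the original
# so the page's own casing is kept.
html = '<html><head><TITLE>PyCrawler demo</TITLE></head><body>...</body></html>'
lower = html.lower()
title = html[lower.find('<title>') + len('<title>'):lower.find('</title>')]
print title  # -> PyCrawler demo
```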
--- a/query.py  Fri Oct 12 15:23:33 2012 +0200
+++ b/query.py  Mon Oct 15 10:17:44 2012 +0200
@@ -56,8 +56,11 @@
             return False
         if len(urls) == 0:
             return True
-        args = [{'address':unicode(u)} for u in urls]
-        result = self.connection.execute(self.queue_table.insert(), args)
+        try:
+            args = [{'address':unicode(u)} for u in urls]
+            result = self.connection.execute(self.queue_table.insert(), args)
+        except:
+            return False
         if result:
             return True
         return False
@@ -81,15 +84,17 @@
         return False
 
     def checkCrawled(self, url):
-        s = select([self.crawl_table]).where(self.crawl_table.c.address == unicode(url))
-        result = self.connection.execute(s)
-        if len(result.fetchall()) > 0:
-            result.close()
-            return True
-        else:
-            result.close()
+        try:
+            s = select([self.crawl_table]).where(self.crawl_table.c.address == unicode(url))
+            result = self.connection.execute(s)
+            if len(result.fetchall()) > 0:
+                result.close()
+                return True
+            else:
+                result.close()
+                return False
+        except:
             return False
-
     # Data should be a dictionary containing the following
     # key : desc
     # address : the url of the page
@@ -116,4 +121,4 @@
         return True
 
     def close(self):
-        self.connection.close()
\ No newline at end of file
+        self.connection.close()
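checkCrawled() now swallows database errors, but the lookup itself is an ordinary SQLAlchemy select against the crawl table. A self-contained sketch of that pattern, written against the legacy `select([...])` style that query.py uses; the engine URL, table name, and columns below are made up for illustration:

```
from sqlalchemy import create_engine, MetaData, Table, Column, Integer, Unicode, select

engine = create_engine('sqlite:///:memory:')
metadata = MetaData()
crawl_table = Table('crawl_index', metadata,
                    Column('id', Integer, primary_key=True),
                    Column('address', Unicode(500)))
metadata.create_all(engine)

conn = engine.connect()
conn.execute(crawl_table.insert(), {'address': u'http://example.com/'})

# Same shape as the lookup in checkCrawled(): any row back means "already crawled".
s = select([crawl_table]).where(crawl_table.c.address == u'http://example.com/')
result = conn.execute(s)
print len(result.fetchall()) > 0  # -> True
result.close()
```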
--- a/ready_queue.py  Fri Oct 12 15:23:33 2012 +0200
+++ b/ready_queue.py  Mon Oct 15 10:17:44 2012 +0200
@@ -1,12 +1,16 @@
 import re, urlparse
+import logging
+logger = logging.getLogger("crawler_logger")
 
 linkregex = re.compile('<a\s(?:.*?\s)*?href=[\'"](.*?)[\'"].*?>')
 
 def ready_queue(address, html):
+    logger.info("ready_queue:"+str(address))
     url = urlparse.urlparse(str(address))
     links = linkregex.findall(html)
     queue = []
     for link in links:
+        logger.info(link)
         if link.startswith("/"):
             queue.append('http://'+url[1]+link)
         elif link.startswith("http") or link.startswith("https"):
@@ -20,4 +24,4 @@
         else:
             queue.append(urlparse.urljoin(url.geturl(),link))
     return queue
-
\ No newline at end of file
+
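For reference, this is roughly what ready_queue() produces for a small page. The example.com URLs are made up, and links that hit the branches elided from this hunk may be handled differently:

```
from ready_queue import ready_queue

page = '<a href="/about">About</a> <a href="docs/intro.html">Intro</a>'
print ready_queue('http://example.com/index.html', page)
# '/about'          -> 'http://example.com/about'              (leading-slash branch)
# 'docs/intro.html' -> 'http://example.com/docs/intro.html'    (urljoin fallback)
```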