PyCrawler.py @ 2:6d8b6a689b2b (default, tip)
changed to bs4
author: dwinter
date: Mon, 15 Oct 2012 15:09:35 +0200
parent: 768cb7284374

from query import CrawlerDb
from content_processor import ContentProcessor
from settings import LOGGING
import sys, urlparse, urllib2, shutil, glob, robotparser
import logging, logging.config
import traceback
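
# Overall flow: the seed URLs passed on the command line are enqueued in the
# crawler database; crawl() then repeatedly dequeues a URL, checks robots.txt,
# fetches the page, lets the ContentProcessor extract links and page data,
# enqueues the newly found links and stores the page record.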

# ===== Init stuff =====

# db init
cdb = CrawlerDb()
cdb.connect()
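# the database provides both the URL queue (enqueue/dequeue) and the record of
# pages already crawled (checkCrawled/addPage)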

# content processor init
processor = ContentProcessor(None, None, None)
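# the constructor arguments are filled in per page later via setInfo() in the crawl loop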

# logging setup
logging.config.dictConfig(LOGGING)
logger = logging.getLogger("crawler_logger")

# robot parser init
robot = robotparser.RobotFileParser()
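# pointed at each host's robots.txt (set_url/read) inside the crawl loop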

if len(sys.argv) < 2:
	logger.error("Error: no start URL was passed")
	sys.exit(1)

l = sys.argv[1:]
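# every command-line argument is treated as a seed URL and enqueued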

cdb.enqueue(l)

def crawl():
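	"""Pull URLs from the queue, fetch and process them until the queue is empty."""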
	logger.info("Starting (%s)..." % sys.argv[1])
	logger.info(":::")
	while True:
		url = cdb.dequeue()
		# dequeue() returns False once the queue is empty
		if url is False:
			break
		logger.info(url)
		if cdb.checkCrawled(url):
			logger.info("Already crawled: %s" % url)
			continue
		if not url.startswith('http'):
			logger.warning("Unfollowable link found at %s " % url)
			continue
		# fetch and parse robots.txt for this host before asking can_fetch()
		u = urlparse.urlparse(url)
		robot.set_url('http://' + u[1] + "/robots.txt")
		try:
			robot.read()
		except IOError:
			pass
		if not robot.can_fetch('PyCrawler', url.encode('ascii', 'replace')):
			logger.warning("Url disallowed by robots.txt: %s " % url)
			continue
		status = 0

		try:
			req = urllib2.Request(str(url))
		except Exception, e:
			logger.warning("Could not build request for %s: %s" % (url, e))
			continue
		req.add_header('User-Agent', 'PyCrawler 0.2.0')
		request = None
		logger.info("Opening %s" % url)
		try:
			request = urllib2.urlopen(req)
		except urllib2.HTTPError, e:
			# HTTPError is a subclass of URLError, so it has to be caught first;
			# the error response is still readable, so keep it and record its code
			status = e.code
			request = e
		except urllib2.URLError, e:
			logger.error("Exception at url: %s\n%s" % (url, e))
			continue
		if status == 0:
			status = 200
		data = request.read()
		processor.setInfo(str(url), status, data)
		ret = processor.process()
		if status != 200:
			continue
		# only enqueue links that have not been crawled yet
		add_queue = []
		for q in ret:
			if not cdb.checkCrawled(q):
				add_queue.append(q)

		l = len(add_queue)
		logger.info("Got %s status from %s (Found %i links)" % (status, url, l))
		if l > 0:
			cdb.enqueue(add_queue)
		cdb.addPage(processor.getDataDict())
		processor.reset()
		logger.info("done")
	logger.info("Finishing...")
	cdb.close()
	logger.info("Done! Goodbye!")

if __name__ == "__main__":
	try:
		crawl()
	except KeyboardInterrupt:
		logger.error("Stopping (KeyboardInterrupt)")
		sys.exit()
	except Exception, e:
		logger.error("EXCEPTION: %s " % e)
		traceback.print_exc()