view content_processor.py @ 2:6d8b6a689b2b default tip

changed to bs4
author dwinter
date Mon, 15 Oct 2012 15:09:35 +0200
parents 768cb7284374
children
line wrap: on
line source

from multiprocessing import Pool
import re, sys, logging

from ready_queue import ready_queue

logger = logging.getLogger("crawler_logger")

def rankKeywords(text):
	"""Count how often each keyword occurs in ``text``.

	The text is split on any run of whitespace (spaces, tabs, newlines), so
	words separated by a line break are counted individually instead of being
	fused into a single bogus keyword. Common stop words are ignored.
	Matching is case-sensitive.

	Args:
		text: plain text, normally already stripped of markup and punctuation.

	Returns:
		dict mapping each remaining word to its number of occurrences.
	"""
	# Empty tokens cannot occur with a whitespace split, so only real stop
	# words need to be listed; a frozenset gives O(1) membership tests.
	invalid_keywords = frozenset(["i", "a", "an", "and", "the", "for", "be", "to", "or", "too", "also"])
	ranks = {}
	for word in text.split():
		if word in invalid_keywords:
			continue
		# dict.get works on Python 2 and 3 (dict.has_key was removed in 3).
		ranks[word] = ranks.get(word, 0) + 1
	return ranks

def stripPunctuation(text):
	"""Return ``text`` with each punctuation character replaced by a space.

	A character counts as punctuation when it is neither a word character
	(``\\w``) nor whitespace (``\\s``). The replacement is one space per
	character, so the length of the text does not change.
	"""
	return re.sub(r'[^\w\s]', ' ', text)

def stripScript(text):
	"""Replace every ``<script ...>...</script>`` element in ``text`` by a space.

	The match is non-greedy, case-insensitive and spans line breaks, so
	multi-line script blocks and upper-case tags are removed as well.

	Args:
		text: HTML markup.

	Returns:
		The markup with each script element replaced by a single space.
	"""
	# DOTALL lets '.' cross newlines (scripts are rarely on one line);
	# IGNORECASE covers <SCRIPT> as tag names are case-insensitive in HTML.
	pattern = re.compile(r'<script.*?\/script>', re.DOTALL | re.IGNORECASE)
	return pattern.sub(' ', text)

class ContentProcessor:
	"""Extract title, head, body and keyword counts from one fetched page.

	Typical use: construct (or ``reset()`` + ``setInfo()``), call
	``process()`` to obtain the links found in the body, then call
	``getDataDict()`` for the page summary.
	"""

	def __init__(self, url, status, text):
		"""Store the page data.

		Args:
			url: address of the page.
			status: fetch status of the page.
			text: raw page markup, or None if not yet available.
		"""
		# One keyword-count dict per processed chunk of body text.
		self.keyword_dicts = []
		self.invalid_keywords = ['', ' ', "i", "a", "an", "and", "the", "for", "be", "to", "or", "too", "also"]
		# Combined keyword counts, filled by combineKeywordLists().
		self.keywords = {}
		self.text = text
		# Keep size consistent with setText()/setInfo().
		self.size = len(text) if text is not None else 0
		self.url = url
		self.status = status
		# Defined up front so getDataDict() works before process() was run.
		self.head = None
		self.body = None
		self.title = None

	def setText(self, text):
		"""Set the page markup and update the recorded size."""
		self.text = text
		self.size = len(text)

	def setUrl(self, url):
		"""Set the page address."""
		self.url = url

	def setStatus(self, status):
		"""Set the fetch status."""
		self.status = status

	def setInfo(self, url, status, text):
		"""Set address, status and markup in one call and update the size."""
		self.url = url
		self.status = status
		self.text = text
		self.size = len(text)

	def reset(self):
		"""Clear all per-page state except the url."""
		self.keyword_dicts = []
		self.keywords = {}
		self.text = None
		self.head = None
		self.body = None
		self.title = None
		self.size = 0
		self.status = None

	def combineKeywordLists(self):
		"""Merge all dicts in ``keyword_dicts`` into a fresh ``keywords`` dict.

		A new dict is built on every call, so calling this twice does not
		double the counts and later edits to ``keywords`` do not modify the
		per-chunk dicts.
		"""
		combined = {}
		for counts in self.keyword_dicts:
			for word, count in counts.items():
				combined[word] = combined.get(word, 0) + count
		self.keywords = combined

	def _splitIntoChunks(self, words_per_chunk):
		"""Split ``self.text`` at every ``words_per_chunk``-th space.

		Returns a list of strings that together contain every
		space-separated token of the text exactly once, including the
		final, possibly shorter, remainder.
		"""
		words = self.text.split(' ')
		chunks = []
		for start in range(0, len(words), words_per_chunk):
			chunks.append(' '.join(words[start:start + words_per_chunk]))
		return chunks

	def processBody(self):
		"""Collect links from the body and rank the keywords of its text.

		Replaces ``self.text`` by the body text with scripts, tags and
		punctuation removed, and fills ``self.keyword_dicts``. Texts longer
		than 5000 characters are ranked in parallel in chunks of 500 words.

		Returns:
			The value returned by ``ready_queue(self.url, self.body)``
			(the links to queue).
		"""
		# Local import: the module itself only imports Pool.
		from multiprocessing import cpu_count

		logger.info("body1")
		queue = ready_queue(self.url, self.body)
		logger.info("body2")
		# Parenthesised form prints the same on Python 2 and Python 3.
		print("found %i links to queue" % len(queue))
		self.text = stripPunctuation(self.remove_html_tags(stripScript(self.body)))
		if len(self.text) > 5000:
			chunks = self._splitIntoChunks(500)
			# Never start more workers than there are CPUs or chunks; one
			# process per chunk can exhaust the machine on large pages.
			workers = min(len(chunks), cpu_count())
			logger.debug("processing with %i threads" % workers)
			pool = Pool(processes=workers)
			try:
				self.keyword_dicts = pool.map(rankKeywords, chunks)
			except KeyboardInterrupt:
				pool.terminate()
				pool.join()
				sys.exit()
			except Exception:
				# Do not leave worker processes behind when ranking fails.
				pool.terminate()
				pool.join()
				raise
			else:
				pool.close()
				pool.join()
			logger.debug("processed, returned %i dicts" % len(self.keyword_dicts))
		else:
			self.keyword_dicts = [rankKeywords(self.text)]
		return queue

	def processHead(self):
		"""Placeholder for head processing; currently does nothing."""
		pass

	def remove_html_tags(self, data):
		"""Return ``data`` with everything between ``<`` and ``>`` removed."""
		# DOTALL so that tags whose attributes span several lines are
		# removed as well.
		p = re.compile(r'<.*?>', re.DOTALL)
		return p.sub('', data)

	def findnth(self, haystack, needle, n):
		"""Return the index of the n-th occurrence of ``needle``.

		Returns -1 when ``haystack`` contains fewer than ``n`` occurrences.
		"""
		parts = haystack.split(needle, n)
		if len(parts) <= n:
			return -1
		return len(haystack)-len(parts[-1])-len(needle)

	def _elementContent(self, text_lower, tag):
		"""Return the text between ``<tag ...>`` and ``</tag>`` in ``self.text``.

		``text_lower`` is the lower-cased copy of ``self.text`` used for
		case-insensitive searching. Returns '' when the opening or the
		closing tag is missing.
		"""
		open_at = text_lower.find('<' + tag)
		if open_at == -1:
			return ''
		# Skip to the end of the opening tag so neither its attributes nor
		# its '>' end up in the content.
		content_at = text_lower.find('>', open_at)
		if content_at == -1:
			return ''
		content_at += 1
		close_at = text_lower.find('</' + tag + '>', content_at)
		if close_at == -1:
			return ''
		return self.text[content_at:close_at]

	def process(self):
		"""Split the page into title, head and body and process them.

		Returns:
			The queue returned by processBody().
		"""
		logger.info("process1")
		text_lower = self.text.lower()
		self.title = self._elementContent(text_lower, 'title')
		self.head = self._elementContent(text_lower, 'head')
		self.processHead()
		logger.info("process2")
		# The body keeps its opening tag (it is removed later together
		# with all other tags) and runs to the end of the text when the
		# closing tag is missing, e.g. on truncated pages.
		body_at = text_lower.find('<body')
		if body_at == -1:
			self.body = ''
		else:
			body_end = text_lower.find('</body>', body_at)
			if body_end == -1:
				body_end = len(self.text)
			self.body = self.text[body_at:body_end]
		queue = self.processBody()
		logger.info("process3")
		self.combineKeywordLists()
		logger.info("process4")
		return queue

	def getDataDict(self):
		"""Return the page summary, dropping keywords seen fewer than 3 times.

		The rare keywords are removed from ``self.keywords`` itself.

		Returns:
			dict with the keys "address", "title", "status", "size" and
			"keywords".
		"""
		# Iterate over a snapshot: deleting from a dict while iterating its
		# live view raises RuntimeError on Python 3.
		for k, v in list(self.keywords.items()):
			if v < 3:
				del self.keywords[k]
		return {"address":self.url, "title":self.title, "status":self.status, "size":self.size, "keywords":self.keywords}