view ready_queue.py @ 1:768cb7284374

lots of debug statements
author dwinter
date Mon, 15 Oct 2012 10:17:44 +0200
parents 57e2aa489383
children 6d8b6a689b2b
line wrap: on
line source

import re, urlparse
import logging

# Logger used by this module, looked up under the fixed name "crawler_logger".
logger = logging.getLogger("crawler_logger")

# Captures the quoted href value of an <a ...> tag (single capture group, so
# findall() returns a plain list of strings).
#
# * Raw string: "\s" must reach the regex engine as-is instead of being an
#   invalid string escape.
# * Attributes in front of href are skipped with ONE lazy optional group
#   limited to "[^>]", i.e. to the inside of the current tag.  Nesting two
#   lazy quantifiers that can both consume whitespace would let the engine
#   split the same text in exponentially many ways when a tag has no href,
#   and an unrestricted "." would pick up the href of a later tag.
# * "??" keeps the first href of a tag when several are present.
#
# Known limitation: a literal ">" inside an attribute value that precedes
# href ends the tag as far as this pattern is concerned.
linkregex = re.compile(r'<a\s(?:[^>]*?\s)??href=[\'"](.*?)[\'"][^>]*>')

def ready_queue(address, html, allowed_prefix="http://www.mpiwg-berlin.mpg.de"):
    """Extract the links of a fetched page and return the URLs to crawl next.

    Every href captured by the module-level ``linkregex`` is handled as
    follows:

    * pure fragment links (``#...``) are skipped;
    * links that name a host themselves (``http://``, ``https://`` or
      scheme-relative ``//host/...``) are queued only if the absolute URL
      starts with *allowed_prefix*;
    * every other link is resolved against *address* and queued.

    Args:
        address: URL of the page *html* was fetched from; it is converted
            with ``str()`` before being parsed.
        html: Page source that is scanned for ``<a ... href=...>`` tags.
        allowed_prefix: Prefix a host-naming link must start with in order
            to be queued.  Defaults to the MPIWG web site; ``None`` disables
            the filter.

    Returns:
        list: Absolute URLs in document order.  Duplicates are not removed.
    """
    logger.info("ready_queue:%s", address)
    base = urlparse.urlparse(str(address)).geturl()
    queue = []
    for link in linkregex.findall(html):
        logger.info(link)
        if link.startswith("#"):
            # Same-page anchor: nothing new to fetch.
            continue
        if link.startswith(("http://", "https://")):
            # Already absolute: queue it unchanged if it passes the filter.
            candidate, names_host = link, True
        else:
            # urljoin handles root-relative ("/x"), document-relative ("x")
            # and scheme-relative ("//host/x") links and keeps the scheme of
            # the page, instead of forcing "http://" + host.
            candidate = urlparse.urljoin(base, link)
            names_host = link.startswith("//")
        # DW: only MPIWG -- links pointing at an explicit host must stay
        # below the allowed prefix.
        if (names_host and allowed_prefix is not None
                and not candidate.startswith(allowed_prefix)):
            continue
        # NOTE(review): non-HTTP schemes such as "mailto:" come out of
        # urljoin unchanged and are still queued -- confirm that the caller
        # copes with them.
        queue.append(candidate)
    return queue