# Mercurial > hg > PyCrawler

import re, urlparse
import logging
from bs4 import BeautifulSoup

# Shared logger of the crawler; handlers/levels are configured elsewhere.
logger = logging.getLogger("crawler_logger")
# Captures the href value of an <a ...> tag. Written as a raw string so that
# "\s" reaches the regex engine untouched instead of being an invalid string escape.
linkregex = re.compile(r"""<a\s(?:.*?\s)*?href=['"](.*?)['"].*?>""")

def ready_queue(address, html):
	"""Collect the link targets of an HTML page as absolute URLs.

	Every ``<a>`` tag in ``html`` is inspected and its ``href`` handled as follows:

	* tags without ``href`` and pure fragment links (``#...``) are skipped;
	* root-relative links (``/path``) are placed on the host of ``address``,
	  keeping the scheme of ``address`` (``http`` if it has none);
	* protocol-relative links (``//host/path``) and absolute ``http://`` /
	  ``https://`` links are kept only if they start with
	  ``http://www.mpiwg-berlin.mpg.de``;
	* everything else is resolved against ``address`` with ``urljoin``.

	Parameters:
		address: URL of the page ``html`` belongs to; converted with ``str()``.
		html: markup of the page, handed to ``BeautifulSoup``.

	Returns:
		List of URL strings in document order; duplicates are not removed.
	"""
	logger.info("ready_queue:"+str(address))

	url = urlparse.urlparse(str(address))
	# Reuse the scheme of the page itself instead of forcing http.
	scheme = url[0] or 'http'
	# DW: only MPIWG -- absolute links are followed only below this prefix.
	allowed_prefix = "http://www.mpiwg-berlin.mpg.de"

	logger.info("ready_START  parse BS")
	parser = BeautifulSoup(html)
	logger.info("ready_END  parse BS")

	queue = []

	for linkTag in parser.find_all('a'):
		link = linkTag.get("href")

		# No href at all, or a jump inside the current page: nothing to crawl.
		if link is None or link.startswith("#"):
			continue

		if link.startswith("//"):
			# Protocol-relative link: it names its own host, so it must pass
			# the same host restriction as a fully absolute link.
			absolute = scheme + ':' + link
			if absolute.startswith(allowed_prefix):
				queue.append(absolute)
		elif link.startswith("/"):
			# Root-relative link: same host as the page it was found on.
			queue.append(scheme + '://' + url[1] + link)
		elif link.startswith(("http://", "https://")):
			if link.startswith(allowed_prefix):
				queue.append(link)
		else:
			# Document-relative link (this includes file names that merely
			# begin with "http").
			# NOTE(review): hrefs with other schemes (mailto:, javascript:)
			# also land here and are queued -- confirm the caller filters them.
			queue.append(urlparse.urljoin(url.geturl(), link))
	return queue
# author	dwinter
# date	Mon, 15 Oct 2012 15:09:35 +0200
# parents	768cb7284374
# children