Mercurial > hg > PyCrawler
view ready_queue.py @ 2:6d8b6a689b2b default tip
changed to bs4
author | dwinter |
---|---|
date | Mon, 15 Oct 2012 15:09:35 +0200 |
parents | 768cb7284374 |
children |
line wrap: on
line source
import logging
import re

try:
    import urlparse  # Python 2 module name
except ImportError:
    # Python 3 moved the same functions to urllib.parse.
    from urllib import parse as urlparse

from bs4 import BeautifulSoup

logger = logging.getLogger("crawler_logger")

# Regex-based anchor matcher. ready_queue() itself extracts links with
# BeautifulSoup and does not use this pattern; it stays defined at module
# level in case other code imports it.
linkregex = re.compile(r'''<a\s(?:.*?\s)*?href=['"](.*?)['"].*?>''')

# DW: only MPIWG -- absolute links are followed only below this prefix.
_ALLOWED_ABSOLUTE_PREFIX = "http://www.mpiwg-berlin.mpg.de"


def ready_queue(address, html):
    """Return the list of URLs to crawl next, extracted from a page.

    Every ``<a>`` tag in ``html`` is inspected and its ``href`` handled as
    follows:

    * missing ``href`` or fragment-only link (``#...``): skipped;
    * protocol-relative link (``//host/path``): given the page's scheme and
      then treated like any other absolute link;
    * root-relative link (``/path``): prefixed with the page's scheme and
      network location;
    * link starting with ``http`` (this includes ``https``): kept only if it
      starts with ``_ALLOWED_ABSOLUTE_PREFIX``;
    * link with any other explicit scheme (``mailto:``, ``javascript:``,
      ...): skipped, since it is not a crawlable page;
    * anything else: resolved against ``address`` with ``urljoin``.

    Args:
        address: URL of the page ``html`` was fetched from; converted with
            ``str()`` before parsing.
        html: markup of the page, passed to ``BeautifulSoup``.

    Returns:
        A list of URL strings in document order. Duplicates are not removed.
    """
    logger.info("ready_queue:%s", address)
    url = urlparse.urlparse(str(address))

    logger.info("ready_START parse BS")
    # NOTE(review): no parser is named, so bs4 chooses among the installed
    # ones; results may differ between environments -- consider pinning one.
    parser = BeautifulSoup(html)
    logger.info("ready_END parse BS")

    # Fall back to plain http when the page address carries no scheme.
    scheme = url.scheme or "http"
    base = url.geturl()

    queue = []
    for link_tag in parser.find_all('a'):
        link = link_tag.get("href")
        if link is None:
            continue

        if link.startswith("//"):
            # Protocol-relative reference: it names another host, so it must
            # not be glued onto this page's host as a path. Make it absolute
            # and let the absolute-link filter below decide.
            link = scheme + ":" + link

        if link.startswith("/"):
            queue.append(scheme + "://" + url[1] + link)
        elif link.startswith("http"):
            if link.startswith(_ALLOWED_ABSOLUTE_PREFIX):
                queue.append(link)
        elif link.startswith("#"):
            continue
        elif urlparse.urlparse(link).scheme:
            # urljoin would hand such links back unchanged, putting
            # non-crawlable targets (mailto:, javascript:, ...) in the queue.
            continue
        else:
            queue.append(urlparse.urljoin(base, link))
    return queue