Mercurial > hg > PyCrawler
view ready_queue.py @ 1:768cb7284374
lots of debug statements
author | dwinter |
---|---|
date | Mon, 15 Oct 2012 10:17:44 +0200 |
parents | 57e2aa489383 |
children | 6d8b6a689b2b |
line wrap: on
line source
import re
import logging

try:
    import urlparse  # Python 2 module name
except ImportError:  # Python 3: the same functions live in urllib.parse
    from urllib import parse as urlparse

logger = logging.getLogger("crawler_logger")

# Captures the quoted href value of every <a ...> tag. The href attribute
# must be preceded by whitespace, and "." does not cross newlines.
# NOTE(review): the nested lazy quantifiers may backtrack heavily on very
# long tags that carry no href -- confirm against real crawl input.
linkregex = re.compile(r'''<a\s(?:.*?\s)*?href=['"](.*?)['"].*?>''')

# DW: only MPIWG -- absolute links are followed only when they point here.
_ALLOWED_HOST = "www.mpiwg-berlin.mpg.de"


def ready_queue(address, html):
    """Return the list of URLs found in *html* that should be crawled next.

    Every ``<a href=...>`` value matched by ``linkregex`` is logged and then
    classified:

    * ``#fragment`` links are skipped.
    * Root-relative links (``/path``) are attached to the host of *address*
      using the ``http`` scheme.
    * Protocol-relative links (``//host/path``) inherit the scheme of
      *address* (``http`` if it has none) and are then treated as absolute.
    * Absolute links are kept only when their scheme is ``http`` and their
      host is ``www.mpiwg-berlin.mpg.de``.
    * Links with any other scheme (``mailto:``, ``javascript:``, ``https:``,
      ...) are dropped.
    * Everything else is resolved relative to *address* with ``urljoin``.

    Args:
        address: URL of the page *html* was fetched from; converted with
            ``str()`` before parsing.
        html: Markup of the page as a string.

    Returns:
        A list of URL strings in document order. Duplicates are not removed.
    """
    logger.info("ready_queue:%s", address)
    base = urlparse.urlparse(str(address))
    base_url = base.geturl()

    queue = []
    for link in linkregex.findall(html):
        logger.info(link)

        if link.startswith("#"):
            # Same-page anchor: nothing new to fetch.
            continue

        if link.startswith("//"):
            # Protocol-relative link: it names its own host, so it must go
            # through the host filter below instead of being glued onto the
            # current host.
            link = (base.scheme or "http") + ":" + link
        elif link.startswith("/"):
            # Root-relative link on the current host. The scheme is forced
            # to http, matching the http-only host filter below.
            queue.append("http://" + base.netloc + link)
            continue

        try:
            target = urlparse.urlparse(link)
        except ValueError:
            # Malformed href (e.g. a broken IPv6 literal): skip it rather
            # than abort the whole page.
            continue

        if not target.scheme:
            # Plain relative reference.
            queue.append(urlparse.urljoin(base_url, link))
        elif target.scheme == "http" and target.hostname == _ALLOWED_HOST:
            # Compare the parsed host, not a string prefix, so look-alike
            # hosts such as "www.mpiwg-berlin.mpg.de.example.com" are
            # rejected.
            queue.append(link)
        # Anything else (foreign hosts, https, mailto:, javascript:, ...)
        # is intentionally not queued.

    return queue