Mercurial > hg > PyCrawler
comparison ready_queue.py @ 0:57e2aa489383
initial
author | dwinter |
---|---|
date | Fri, 12 Oct 2012 15:23:33 +0200 |
parents | |
children | 768cb7284374 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:57e2aa489383 |
---|---|
1 import re, urlparse | |
2 | |
# Captures the href value of every <a ...> tag. Written as a raw string so the
# regex escape \s is not treated as a (invalid) string escape sequence; the
# resulting pattern text is identical to the previous non-raw literal.
linkregex = re.compile(r"""<a\s(?:.*?\s)*?href=['"](.*?)['"].*?>""")
4 | |
def ready_queue(address, html):
    """Build the list of URLs to crawl next from the links found in a page.

    address -- URL of the page that ``html`` was fetched from; relative
               links are resolved against it.
    html    -- page source to scan with ``linkregex``.

    Returns a list of URL strings, in document order:
      * root-relative links ("/path") are attached to the page's host,
      * absolute http(s) links are kept only if they point at
        http://www.mpiwg-berlin.mpg.de,
      * pure fragment links ("#...") are skipped,
      * everything else is resolved with ``urlparse.urljoin``.
    """
    url = urlparse.urlparse(str(address))
    base = url.geturl()
    # Keep the page's own scheme for host-relative links instead of forcing
    # "http"; fall back to "http" when the address carries no scheme.
    scheme = url.scheme or 'http'
    queue = []
    for link in linkregex.findall(html):
        if link.startswith("//"):
            # Protocol-relative link: it names its own host, so complete it
            # with the scheme and let the absolute-link filter below decide,
            # rather than gluing it onto this page's host.
            link = scheme + ':' + link
        if link.startswith("/"):
            queue.append(scheme + '://' + url.netloc + link)
        elif link.startswith("http"):
            # DW: only MPIWG -- absolute links to any other site are dropped.
            if link.startswith("http://www.mpiwg-berlin.mpg.de"):
                queue.append(link)
        elif link.startswith("#"):
            # Same-page anchor, nothing new to fetch.
            continue
        else:
            queue.append(urlparse.urljoin(base, link))
    return queue
23 |