0
|
1 import re, urlparse
|
|
2
|
|
# Captures the href value of every <a ...> tag.
# The attributes in front of ``href`` are matched by one run of ``[^>]*?``
# that cannot leave the tag.  The previous nested ``(?:.*?\s)*?`` repetition
# backtracked exponentially on anchors with many whitespace-separated tokens
# and no href, and could also pick up an href belonging to a later tag.
# Trade-off: an attribute value containing a literal ">" before href is no
# longer matched.
linkregex = re.compile(r'<a\s(?:[^>]*?\s)?href=[\'"](.*?)[\'"].*?>')


def ready_queue(address, html):
    """Collect the URLs to crawl next from one fetched page.

    Args:
        address: URL of the page ``html`` came from; it is passed through
            ``str()`` before parsing.
        html: markup that is scanned with ``linkregex``.

    Returns:
        A list of URL strings in the order the links appear in ``html``.
        Duplicates are kept.  Fragment-only links (``#...``) are skipped,
        absolute http(s) links are kept only when they start with the MPIWG
        site prefix, and every other link is resolved against ``address``.
    """
    url = urlparse.urlparse(str(address))
    base = url.geturl()
    # Fall back to http when the address carries no scheme, which is what
    # the root-relative branch has always produced.
    scheme = url.scheme or 'http'
    # DW: only MPIWG
    # NOTE(review): https:// links to the same host do not match this prefix
    # and are therefore dropped -- confirm that this is intended.
    site_prefix = "http://www.mpiwg-berlin.mpg.de"

    queue = []
    for link in linkregex.findall(html):
        if link.startswith("#"):
            # In-page anchor: nothing new to fetch.
            continue

        if link.startswith("//"):
            # Protocol-relative link: it names its own host, so give it the
            # page's scheme and let the site filter below decide.
            link = scheme + ':' + link

        if link.startswith("http://") or link.startswith("https://"):
            if link.startswith(site_prefix):
                queue.append(link)
        elif link.startswith("/"):
            # Root-relative link: keep the scheme of the page it was found on.
            queue.append(scheme + '://' + url.netloc + link)
        else:
            # Path-relative link (this includes file names that merely begin
            # with "http", e.g. "https-guide.html").
            queue.append(urlparse.urljoin(base, link))
    return queue
|
|
23 |