annotate ready_queue.py @ 0:57e2aa489383

initial
author dwinter
date Fri, 12 Oct 2012 15:23:33 +0200
parents
children 768cb7284374
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
57e2aa489383 initial
dwinter
parents:
diff changeset
1 import re, urlparse
57e2aa489383 initial
dwinter
parents:
diff changeset
2
57e2aa489383 initial
dwinter
parents:
diff changeset
# Captures the href value of every <a ...> tag.  Written as a raw string so
# that \s reaches the regex engine verbatim instead of relying on Python
# passing an unrecognised string escape through unchanged.
linkregex = re.compile(r'''<a\s(?:.*?\s)*?href=['"](.*?)['"].*?>''')

# DW: only MPIWG -- absolute links are queued only when they start with this.
_MPIWG_PREFIX = "http://www.mpiwg-berlin.mpg.de"


def ready_queue(address, html):
    """Return the list of URLs to visit next, extracted from one page.

    Every ``href`` found in *html* by ``linkregex`` is classified by prefix:

    * ``#...`` (same-page fragment) is skipped.
    * ``//host/path`` (protocol-relative) is completed with the scheme of
      *address* and then handled like any other absolute link.
    * Links starting with ``http`` are queued only if they start with
      ``http://www.mpiwg-berlin.mpg.de``; all others are dropped.
    * ``/path`` (root-relative) is attached to the scheme and host of
      *address*.
    * Anything else is resolved against *address* with ``urljoin``.

    :param address: URL of the page *html* came from; it is passed through
        ``str()`` before parsing.
    :param html: markup of the page as a string.
    :returns: list of URL strings in document order; duplicates are kept.
    """
    base = urlparse.urlparse(str(address))
    base_url = base.geturl()
    # Keep "http" as the fallback so scheme-less addresses behave as before.
    scheme = base.scheme or "http"
    site_root = scheme + "://" + base.netloc

    queue = []
    for link in linkregex.findall(html):
        if link.startswith("#"):
            continue

        # startswith("http") also covers "https", so one check is enough.
        is_absolute = link.startswith("http")
        if link.startswith("//"):
            # Protocol-relative links name their own host; gluing them onto
            # the current host would yield "http://host//other/path".
            link = scheme + ":" + link
            is_absolute = True

        if is_absolute:
            if link.startswith(_MPIWG_PREFIX):
                queue.append(link)
        elif link.startswith("/"):
            # Use the page's own scheme rather than a hard-coded "http://",
            # so links found on https pages are not silently downgraded.
            queue.append(site_root + link)
        else:
            queue.append(urlparse.urljoin(base_url, link))
    return queue
57e2aa489383 initial
dwinter
parents:
diff changeset
23