Mercurial > hg > PyCrawler
comparison ready_queue.py @ 2:6d8b6a689b2b default tip
changed to bs4
author | dwinter |
---|---|
date | Mon, 15 Oct 2012 15:09:35 +0200 |
parents | 768cb7284374 |
children |
comparison
equal
deleted
inserted
replaced
1:768cb7284374 | 2:6d8b6a689b2b |
---|---|
1 import re, urlparse | 1 import re, urlparse |
2 import logging | 2 import logging |
| | 3 from bs4 import BeautifulSoup |
3 | 4 |
4 logger = logging.getLogger("crawler_logger") | 5 logger = logging.getLogger("crawler_logger") |
5 linkregex = re.compile('<a\s(?:.*?\s)*?href=[\'"](.*?)[\'"].*?>') | 6 linkregex = re.compile('<a\s(?:.*?\s)*?href=[\'"](.*?)[\'"].*?>') |
6 | 7 |
7 def ready_queue(address, html): | 8 def ready_queue(address, html): |
| | 9 |
8 logger.info("ready_queue:"+str(address)) | 10 logger.info("ready_queue:"+str(address)) |
| | 11 |
9 url = urlparse.urlparse(str(address)) | 12 url = urlparse.urlparse(str(address)) |
10 links = linkregex.findall(html) | 13 logger.info("ready_START parse BS") |
| | 14 |
| | 15 |
| | 16 |
| | 17 #links = linkregex.findall(html) |
| | 18 |
| | 19 parser=BeautifulSoup(html) |
| | 20 logger.info("ready_END parse BS") |
| | 21 #tags = parser.findAll(name="a") |
| | 22 |
| | 23 |
| | 24 |
11 queue = [] | 25 queue = [] |
12 for link in links: | 26 |
13 logger.info(link) | 27 for linkTag in parser.find_all('a'): |
| | 28 link = linkTag.get("href") |
| | 29 |
| | 30 if link == None: |
| | 31 continue |
| | 32 |
14 if link.startswith("/"): | 33 if link.startswith("/"): |
15 queue.append('http://'+url[1]+link) | 34 queue.append('http://'+url[1]+link) |
16 elif link.startswith("http") or link.startswith("https"): | 35 elif link.startswith("http") or link.startswith("https"): |
17 | 36 |
18 #DW: only MPIWG | 37 #DW: only MPIWG |