comparison ready_queue.py @ 2:6d8b6a689b2b default tip

changed to bs4
author dwinter
date Mon, 15 Oct 2012 15:09:35 +0200
parents 768cb7284374
children
comparison
equal deleted inserted replaced
1:768cb7284374 2:6d8b6a689b2b
1 import re, urlparse 1 import re, urlparse
2 import logging 2 import logging
3 from bs4 import BeautifulSoup
3 4
4 logger = logging.getLogger("crawler_logger") 5 logger = logging.getLogger("crawler_logger")
5 linkregex = re.compile('<a\s(?:.*?\s)*?href=[\'"](.*?)[\'"].*?>') 6 linkregex = re.compile('<a\s(?:.*?\s)*?href=[\'"](.*?)[\'"].*?>')
6 7
7 def ready_queue(address, html): 8 def ready_queue(address, html):
9
8 logger.info("ready_queue:"+str(address)) 10 logger.info("ready_queue:"+str(address))
11
9 url = urlparse.urlparse(str(address)) 12 url = urlparse.urlparse(str(address))
10 links = linkregex.findall(html) 13 logger.info("ready_START parse BS")
14
15
16
17 #links = linkregex.findall(html)
18
19 parser=BeautifulSoup(html)
20 logger.info("ready_END parse BS")
21 #tags = parser.findAll(name="a")
22
23
24
11 queue = [] 25 queue = []
12 for link in links: 26
13 logger.info(link) 27 for linkTag in parser.find_all('a'):
28 link = linkTag.get("href")
29
30 if link == None:
31 continue
32
14 if link.startswith("/"): 33 if link.startswith("/"):
15 queue.append('http://'+url[1]+link) 34 queue.append('http://'+url[1]+link)
16 elif link.startswith("http") or link.startswith("https"): 35 elif link.startswith("http") or link.startswith("https"):
17 36
18 #DW: only MPIWG 37 #DW: only MPIWG