PyCrawler: ready_queue.py comparison

Changed link extraction from the regex to BeautifulSoup (bs4)

comparison

Legend: equal | deleted | inserted | replaced

-:768cb7284374
+:6d8b6a689b2b
 import re, urlparse
 import logging
+from bs4 import BeautifulSoup
 logger = logging.getLogger("crawler_logger")
 linkregex = re.compile('<a\s(?:.*?\s)*?href=[\'"](.*?)[\'"].*?>')
 def ready_queue(address, html):
 	logger.info("ready_queue:"+str(address))
 	url = urlparse.urlparse(str(address))
-	links = linkregex.findall(html)
+	logger.info("ready_START  parse BS")
+	#links = linkregex.findall(html)
+	parser=BeautifulSoup(html)
+	logger.info("ready_END  parse BS")
+	#tags = parser.findAll(name="a")
 	queue = []
-	for link in links:
-		logger.info(link)
+	for linkTag in parser.find_all('a'):
+		link = linkTag.get("href")
+		if link == None:
+			continue
 		if link.startswith("/"):
 			queue.append('http://'+url[1]+link)
 		elif link.startswith("http") or link.startswith("https"):
 			#DW: only MPIWG

Mercurial > hg > PyCrawler