Mercurial > hg > PyCrawler
comparison Search.py @ 0:57e2aa489383
initial
| author | dwinter |
|---|---|
| date | Fri, 12 Oct 2012 15:23:33 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:57e2aa489383 |
|---|---|
| 1 ''' | |
| 2 Created on 12.10.2012 | |
| 3 | |
| 4 @author: dwinter | |
| 5 ''' | |
| 6 from sqlalchemy import create_engine, Table, Column, Integer, String, MetaData, ForeignKey, DateTime, select | |
| 7 from query import CrawlerDb | |
| 8 import re | |
| 9 import logging | |
| 10 from BeautifulSoup import BeautifulSoup, Comment | |
| 11 import urllib2 | |
try:
    from Products.PageTemplates.PageTemplateFile import PageTemplateFile
except ImportError:
    # Running outside Zope: provide a stand-in so the name exists.
    # BUG FIX: the original fallback was misspelled "PageTemplatefile"
    # (lowercase f), so the later reference to PageTemplateFile raised
    # NameError whenever the Zope import failed.
    class PageTemplateFile:
        pass
| 17 | |
try:
    from OFS.SimpleItem import SimpleItem
except ImportError:
    # Running outside Zope: minimal stand-in base class so Search can
    # still be defined and used from the command line (see __main__).
    # Narrowed from a bare except: only a failed import is expected here.
    class SimpleItem:
        pass
| 23 class Search(SimpleItem): | |
| 24 meta_type = 'SearchCrawler' | |
| 25 def readPage(self,url): | |
| 26 | |
| 27 req = urllib2.Request(str(url)) | |
| 28 req.add_header('User-Agent', 'PyCrawler 0.2.0') | |
| 29 request = None | |
| 30 status = 0 | |
| 31 try: | |
| 32 request = urllib2.urlopen(req) | |
| 33 except urllib2.URLError, e: | |
| 34 logging.error("Exception at url: %s\n%s" % (url, e)) | |
| 35 return None | |
| 36 except urllib2.HTTPError, e: | |
| 37 status = e.code | |
| 38 if status == 0: | |
| 39 status = 200 | |
| 40 data = request.read() | |
| 41 | |
| 42 return data | |
| 43 def __init__(self): | |
| 44 self._v_db=CrawlerDb() | |
| 45 self._v_db.connect() | |
| 46 | |
| 47 | |
| 48 def search(self,keyword): | |
| 49 | |
| 50 if getattr(self, "_v_db",None)==None: | |
| 51 self.__init__(); | |
| 52 s = select([self._v_db.keyword_table]).where(self._v_db.keyword_table.c.keyword==keyword) | |
| 53 ret=[] | |
| 54 result = self._v_db.connection.execute(s) | |
| 55 logging.debug(s) | |
| 56 logging.debug(keyword) | |
| 57 logging.debug(result) | |
| 58 results= result.fetchall() | |
| 59 logging.debug(results) | |
| 60 | |
| 61 for r in results: | |
| 62 pid= r.page_id | |
| 63 logging.debug(pid) | |
| 64 s2 =select([self._v_db.crawl_table]).where(self._v_db.crawl_table.c.id==pid) | |
| 65 resultPage = self._v_db.connection.execute(s2) | |
| 66 for rp in resultPage.fetchall(): | |
| 67 ret.append((rp.address,rp.title)) | |
| 68 | |
| 69 print self.getGetNeighbourhood(rp.address, keyword) | |
| 70 | |
| 71 | |
| 72 return ret | |
| 73 | |
    def getGetNeighbourhood(self,url, wordStr, length=100,tagging=True):
        """Find the neighbourhoods of the words in wordStr; returns an array
        with the surroundings of each match.  All tags are removed from the
        page text; matches are tagged with <span class="found">XX</span>,
        and the neighbourhoods are searched case-insensitively.

        @param wordStr: string of words separated by blanks; phrases are
            quoted with ", "*" marks a wildcard and is ignored
        @param length: optional, default 100; 2*length is the size of a
            neighbourhood
        @param tagging: optional, default True; no span tag is generated
            if tagging=False
        """

        ret=[] # collects the array that is returned at the end
        ranges=[] # array of tuples (x, y): x = start, y = end of the i-th neighbourhood

        wordStr=wordStr.lstrip().rstrip()

        def isInRanges(nr,length):
            """Test whether position nr already lies inside one of the
            neighbourhoods; returns the index of the first matching entry
            of ranges, or -1 if there is no hit.

            @param nr: position to check
            @param length: length of the word being checked
            """
            for x in ranges:
                if (x[0]<=nr) and (nr < (x[1]-length)):
                    return ranges.index(x)
            return -1

        # deal with phrases: inside a phrase, blanks are replaced by "_"
        def rep_empty(str):
            x= re.sub(" ","_",str.group(0))
            return re.sub("\"","",x)

        wordStr=re.sub("\".*?\"", rep_empty,wordStr)# replace blanks inside "..." by "_" and drop the quotes

        #deal with wildcards, for our purposes it is enough to delete the wildcard
        wordStr=wordStr.replace("*","")

        words=wordStr.split(" ")
        #if not words is ListType:
        #    words=[words]

        # NOTE(review): self.en is not defined anywhere in this file —
        # presumably acquired from the Zope context; confirm, otherwise
        # this line raises AttributeError.
        txtCache = self.en.getHarvestCache();
        txt= txtCache.get(url,None)

        # NOTE(review): the cached value fetched above is immediately
        # discarded here, so the page is always re-fetched — confirm
        # whether disabling the cache is intentional.
        txt=None
        if txt==None:

            logging.debug("NO CACHE for: "+url)

            txt=self.readPage(url)

        # page could not be fetched: return the (empty) result array
        if not txt:
            return ret

        soup = BeautifulSoup(txt)

        # strip HTML comments before extracting the text
        comments = soup.findAll(text=lambda text:isinstance(text, Comment))
        [comment.extract() for comment in comments]

        # concatenate all remaining text nodes: plain text without tags
        txt = ''.join(soup.findAll(text=True))

        #txt=re.sub("<.*?>", "", txt) # delete all tags
        for word in words:
            word=re.sub("_"," ",word) # turn "_" back into " " (phrases)
            pos=0

            n=txt.lower().count(word.lower()) # how often does the word occur

            for i in range(n):
                pos=txt.lower().find(word.lower(),pos)

                # NOTE(review): a match at offset 0 is skipped here
                # (find() returns 0, which fails this test) — looks like
                # an off-by-one; verify intended behaviour.
                if pos > 0:
                    x=max(0,pos-length)
                    y=min(len(txt),pos+length)

                    #is word already in one of the results
                    nr=isInRanges(pos,len(word))
                    if nr >=0:# word is already in a found neighbourhood: enlarge it
                        x=min(ranges[nr][0],x)
                        y=max(ranges[nr][1],y)

                    # NOTE: shadows the builtin str inside this loop body
                    str=txt[x:y]
                    if x!=0: #add dots if in the middle of text
                        str="..."+str

                    if y!=len(txt): #add dots if in the middle of text
                        str=str+"..."

                    if nr >=0: # word is already in a found neighbourhood
                        ranges[nr]=(x,y) # new position of the neighbourhood

                        ret[nr]=str # new neighbourhood text
                    else: # otherwise append a new neighbourhood
                        ranges.append((x,y))

                        ret.append(str)

                    pos=pos+len(word)
                else:
                    break;

        # now highlight everything
        if tagging:
            for x in range(len(ret)):
                for word in words:
                    repl=re.compile(word,re.IGNORECASE)
                    ret[x]=repl.sub(""" <span class="found">%s</span>"""%word.upper(),ret[x])

        return ret
| 191 | |
def manage_addSearch(self, id, REQUEST):
    """Create a new Search object inside the current folder.

    Zope constructor: *self* is the destination folder (bound via
    acquisition), *id* the object id, *REQUEST* the publishing request.
    Redirects the browser back to the management screen.
    """
    newinst = Search()
    self._setObject(id, newinst)

    REQUEST.RESPONSE.redirect('manage_main')

try:
    manage_addSearchForm = PageTemplateFile('zpt/addSearch.pt', globals())
except Exception:
    # Outside Zope the page-template machinery is unavailable; the add
    # form is optional, so failing to build it is tolerated (narrowed
    # from a bare except).
    pass
| 203 | |
if __name__ == '__main__':

    # Ad-hoc smoke test: run a search against the crawler DB from the
    # command line with full debug logging.
    logging.getLogger().setLevel(logging.DEBUG)
    s = Search()
    print s.search("Kuhn")
| 209 |
