annotate Search.py @ 2:6d8b6a689b2b default tip

changed to bs4
author dwinter
date Mon, 15 Oct 2012 15:09:35 +0200
parents 57e2aa489383
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
57e2aa489383 initial
dwinter
parents:
diff changeset
1 '''
57e2aa489383 initial
dwinter
parents:
diff changeset
2 Created on 12.10.2012
57e2aa489383 initial
dwinter
parents:
diff changeset
3
57e2aa489383 initial
dwinter
parents:
diff changeset
4 @author: dwinter
57e2aa489383 initial
dwinter
parents:
diff changeset
5 '''
57e2aa489383 initial
dwinter
parents:
diff changeset
6 from sqlalchemy import create_engine, Table, Column, Integer, String, MetaData, ForeignKey, DateTime, select
57e2aa489383 initial
dwinter
parents:
diff changeset
7 from query import CrawlerDb
57e2aa489383 initial
dwinter
parents:
diff changeset
8 import re
57e2aa489383 initial
dwinter
parents:
diff changeset
9 import logging
57e2aa489383 initial
dwinter
parents:
diff changeset
10 from BeautifulSoup import BeautifulSoup, Comment
57e2aa489383 initial
dwinter
parents:
diff changeset
11 import urllib2
57e2aa489383 initial
dwinter
parents:
diff changeset
try:
    from Products.PageTemplates.PageTemplateFile import PageTemplateFile
except ImportError:
    # The Zope page-template package is not importable: define an empty
    # stand-in under the name the rest of this module refers to.
    class PageTemplateFile:
        pass

    # The stand-in was originally spelled ``PageTemplatefile``; keep that
    # spelling available in case anything refers to it.
    PageTemplatefile = PageTemplateFile
57e2aa489383 initial
dwinter
parents:
diff changeset
17
57e2aa489383 initial
dwinter
parents:
diff changeset
try:
    from OFS.SimpleItem import SimpleItem
except ImportError:
    # OFS is not importable: fall back to an empty base class so that the
    # Search class below can still be defined.
    class SimpleItem:
        pass
57e2aa489383 initial
dwinter
parents:
diff changeset
# Search front end for the crawler database: looks a keyword up through
# CrawlerDb and cuts (optionally highlighted) text snippets out of the
# pages that match.
#
# NOTE(review): the class and the methods that had no docstring are
# documented with comments on purpose.  Zope's publisher is understood to
# treat a docstring as the marker for URL-publishable objects -- confirm
# before turning any of these comments into docstrings.
class Search(SimpleItem):
    meta_type = 'SearchCrawler'

    # Create the CrawlerDb handle and open its connection.  search() calls
    # this again whenever the ``_v_db`` attribute is missing.
    # NOTE(review): the ``_v_`` prefix presumably marks a Zope volatile
    # attribute -- confirm.
    def __init__(self):
        self._v_db = CrawlerDb()
        self._v_db.connect()

    # Download ``url`` (converted with str()) and return the response body
    # exactly as read() delivers it.  Returns None, after logging the error,
    # when the request fails.
    def readPage(self, url):
        req = urllib2.Request(str(url))
        req.add_header('User-Agent', 'PyCrawler 0.2.0')
        try:
            response = urllib2.urlopen(req)
        except urllib2.URLError as e:
            # urllib2.HTTPError is a subclass of URLError, so HTTP error
            # statuses are handled here as well.
            logging.error("Exception at url: %s\n%s" % (url, e))
            return None
        try:
            return response.read()
        finally:
            # Release the connection even if read() fails.
            response.close()

    # Look ``keyword`` up in keyword_table (equality on the ``keyword``
    # column) and return a list of (address, title) tuples, one for every
    # crawl_table row whose id equals the page_id of a hit.
    def search(self, keyword):
        if getattr(self, "_v_db", None) is None:
            self.__init__()
        db = self._v_db

        s = select([db.keyword_table]).where(db.keyword_table.c.keyword == keyword)
        ret = []
        result = db.connection.execute(s)
        logging.debug(s)
        logging.debug(keyword)
        logging.debug(result)
        results = result.fetchall()
        logging.debug(results)

        for r in results:
            pid = r.page_id
            logging.debug(pid)
            s2 = select([db.crawl_table]).where(db.crawl_table.c.id == pid)
            resultPage = db.connection.execute(s2)
            for rp in resultPage.fetchall():
                ret.append((rp.address, rp.title))
                # The snippets are only written to stdout; they are not part
                # of the return value.  This downloads the page again.
                print(self.getGetNeighbourhood(rp.address, keyword))

        return ret

    def getGetNeighbourhood(self, url, wordStr, length=100, tagging=True):
        """Find the text surrounding the words of wordStr in the page at url.

        The page is downloaded with readPage(), comments and all tags are
        removed, and the remaining text is searched case-insensitively.
        Overlapping surroundings are merged into one.  Hits are replaced by
        ' <span class="found">XX</span>', XX being the upper-cased hit.

        @param url: address of the page to download and search
        @param wordStr: string of words separated by blanks; phrases are
                        enclosed in double quotes ("a phrase"); "*" denotes a
                        wildcard and is ignored
        @param length: optional, default 100; a surrounding reaches length
                       characters to either side of the start of a hit
        @param tagging: optional, default True; no span tag is generated if
                        false
        @return: list of surroundings (strings), "..." marking a cut; empty
                 if there is nothing to search for or the page is unreadable
        """
        ret = []     # the surroundings that are returned
        ranges = []  # ranges[i] = (start, end) of ret[i] inside the page text

        def isInRanges(nr, length):
            """Return the index of the first range containing nr, else -1.

            @param nr: position to be checked
            @param length: length of the word to be checked
            """
            for idx, (start, end) in enumerate(ranges):
                if start <= nr < end - length:
                    return idx
            return -1

        def rep_empty(match):
            """Make a quoted phrase one token: blanks -> "_", quotes removed."""
            return match.group(0).replace(" ", "_").replace("\"", "")

        wordStr = re.sub("\".*?\"", rep_empty, wordStr.strip())

        # Deal with wildcards: for our purposes deleting them is enough.
        wordStr = wordStr.replace("*", "")

        # Restore the blanks inside phrases once, so that searching and
        # highlighting use the same words, and drop the empty tokens that
        # consecutive blanks produce.
        words = []
        for token in wordStr.split(" "):
            word = token.replace("_", " ")
            if word.strip():
                words.append(word)
        if not words:
            return ret

        # The page is always downloaded.  An earlier lookup in
        # self.en.getHarvestCache() was removed: its result was reset to None
        # right away, and Search itself never defines ``en``.
        logging.debug("NO CACHE for: " + url)
        txt = self.readPage(url)
        if not txt:
            return ret

        soup = BeautifulSoup(txt)
        for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
            comment.extract()
        txt = ''.join(soup.findAll(text=True))
        lowered = txt.lower()

        for word in words:
            needle = word.lower()
            pos = lowered.find(needle)
            # find() returns -1 when there is no further hit; 0 is a valid
            # hit at the very beginning of the text.
            while pos >= 0:
                x = max(0, pos - length)
                y = min(len(txt), pos + length)

                # Is the hit already inside one of the surroundings?
                nr = isInRanges(pos, len(word))
                if nr >= 0:
                    # Yes: enlarge that surrounding.
                    x = min(ranges[nr][0], x)
                    y = max(ranges[nr][1], y)

                snippet = txt[x:y]
                if x != 0:         # cut at the front: add dots
                    snippet = "..." + snippet
                if y != len(txt):  # cut at the end: add dots
                    snippet = snippet + "..."

                if nr >= 0:
                    ranges[nr] = (x, y)
                    ret[nr] = snippet
                else:
                    ranges.append((x, y))
                    ret.append(snippet)

                pos = lowered.find(needle, pos + len(needle))

        # Now highlight everything, in a single pass so that a word can never
        # match inside markup inserted for another word.
        if tagging and ret:
            # Longest alternative first, so a phrase wins over a word that it
            # starts with.  Words are escaped: they are text, not patterns.
            alternatives = sorted(words, key=len, reverse=True)
            pattern = re.compile("|".join(re.escape(w) for w in alternatives),
                                 re.IGNORECASE)

            def mark(match):
                return """ <span class="found">%s</span>""" % match.group(0).upper()

            ret = [pattern.sub(mark, snippet) for snippet in ret]

        return ret
57e2aa489383 initial
dwinter
parents:
diff changeset
191
57e2aa489383 initial
dwinter
parents:
diff changeset
def manage_addSearch(self, id, REQUEST=None):
    """Create a new Search object and store it in the container ``self``.

    @param id: id passed to ``self._setObject`` for the new object
    @param REQUEST: optional; if given, its response is redirected to
                    'manage_main' after the object has been added
    """
    newinst = Search()
    self._setObject(id, newinst)

    # Without a request (call from code) there is nothing to redirect.
    if REQUEST is not None:
        REQUEST.RESPONSE.redirect('manage_main')
57e2aa489383 initial
dwinter
parents:
diff changeset
198
57e2aa489383 initial
dwinter
parents:
diff changeset
try:
    manage_addSearchForm = PageTemplateFile('zpt/addSearch.pt', globals())
except Exception:
    # Best effort: if the template object cannot be created, the module
    # simply does not define manage_addSearchForm.  Only ordinary errors are
    # swallowed; KeyboardInterrupt and SystemExit still propagate.
    pass
57e2aa489383 initial
dwinter
parents:
diff changeset
203
57e2aa489383 initial
dwinter
parents:
diff changeset
if __name__ == '__main__':
    # Manual check when run as a script: switch the root logger to DEBUG,
    # run one fixed query and print the list that search() returns.
    logging.getLogger().setLevel(logging.DEBUG)
    searcher = Search()
    print(searcher.search("Kuhn"))
57e2aa489383 initial
dwinter
parents:
diff changeset
209