annotate MPIWGRoot_deleted_methods.py @ 1:1f2760ed3efe

indices geloescht
author dwinter
date Wed, 30 Jan 2013 11:47:21 +0100
parents
children 01b5265264b6
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
1f2760ed3efe indices geloescht
dwinter
parents:
diff changeset
def removeStopWords(self, xo):
    """Return the text of ``xo`` with all stop words removed.

    The stop word list is read from ``self.stopwords_en.data`` (one word
    per line) on first use and cached on the instance as ``_v_stopWords``.

    @param xo: object whose ``str()`` representation is filtered
    @return: the remaining words, joined by single spaces
    """
    if not hasattr(self, '_v_stopWords'):
        self._v_stopWords = self.stopwords_en.data.split("\n")

    stop_words = self._v_stopWords

    # Build a new list instead of deleting from the list being iterated:
    # deleting in place skips the element that follows every removed word,
    # so consecutive stop words were only partially removed.
    kept = [token for token in str(xo).split(" ")
            if token.lower() not in stop_words]

    return " ".join(kept)
1f2760ed3efe indices geloescht
dwinter
parents:
diff changeset
16
1f2760ed3efe indices geloescht
dwinter
parents:
diff changeset
17
1f2760ed3efe indices geloescht
dwinter
parents:
diff changeset
def getGetNeighbourhood(self, obj, wordStr, length=100, tagging=True):
    """Find the text surrounding each occurrence of the words in wordStr.

    The page text of ``obj`` is taken from the harvest cache (or harvested
    in "slim" mode when it is not cached), stripped of all markup and HTML
    comments, and searched case-insensitively. Hits that fall into an
    already collected neighbourhood extend that neighbourhood instead of
    creating a new one.

    @param obj: object providing ``absolute_url()`` and ``harvest_page()``
    @param wordStr: words separated by spaces; phrases are enclosed in
                    double quotes ("a phrase"); "*" marks a wildcard and
                    is ignored. Underscores are treated as spaces because
                    phrases are encoded with "_" internally.
    @param length: optional, default 100; a neighbourhood spans up to
                   ``length`` characters on either side of the hit start
    @param tagging: optional, default True; when false no span tag is
                    generated
    @return: list of text snippets; with tagging every hit is replaced by
             `` <span class="found">HIT</span>`` (hit text upper-cased)
    """
    ret = []     # snippets that are returned at the end
    ranges = []  # (start, end) text offsets of the i-th snippet in ret

    def isInRanges(nr, wordLength):
        """Return the index of the first range containing position nr,
        or -1 when no collected range contains it.

        @param nr: position to check
        @param wordLength: length of the word starting at that position
        """
        for index, (start, end) in enumerate(ranges):
            if start <= nr < end - wordLength:
                return index
        return -1

    def rep_empty(match):
        """Encode a quoted phrase: spaces become "_", quotes are dropped."""
        return match.group(0).replace(" ", "_").replace('"', "")

    # Phrases must survive the split on spaces below.
    wordStr = re.sub(r'".*?"', rep_empty, wordStr.strip())

    # Wildcards: for our purposes it is enough to delete them.
    wordStr = wordStr.replace("*", "")

    # Decode the phrases once, so that searching and highlighting use the
    # same text. Blank words (double spaces, a lone "*") are dropped: they
    # would match at every position.
    words = [word.replace("_", " ") for word in wordStr.split(" ")]
    words = [word for word in words if word.strip()]

    txtCache = self.en.getHarvestCache()
    url = obj.absolute_url()
    txt = txtCache.get(url, None)

    if txt is None:
        logging.debug("NO CACHE for: " + url)
        txt = obj.harvest_page(mode="slim")

    if not txt:
        return ret

    soup = BeautifulSoup(txt)

    # Drop HTML comments, then keep only the text nodes.
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()

    txt = ''.join(soup.findAll(text=True))
    lowered = txt.lower()  # hoisted: lower-case the text only once
    textLength = len(txt)

    for word in words:
        needle = word.lower()
        pos = lowered.find(needle)

        # find() returns -1 on a miss; offset 0 is a valid hit.
        while pos >= 0:
            start = max(0, pos - length)
            end = min(textLength, pos + length)

            # Is the word already inside one of the collected snippets?
            nr = isInRanges(pos, len(word))
            if nr >= 0:
                # Yes: grow that snippet so it covers this hit as well.
                start = min(ranges[nr][0], start)
                end = max(ranges[nr][1], end)

            snippet = txt[start:end]
            if start != 0:  # add dots if in the middle of the text
                snippet = "..." + snippet
            if end != textLength:  # add dots if in the middle of the text
                snippet = snippet + "..."

            if nr >= 0:
                ranges[nr] = (start, end)
                ret[nr] = snippet
            else:
                ranges.append((start, end))
                ret.append(snippet)

            pos = lowered.find(needle, pos + len(needle))

    # Now highlight everything.
    if tagging and words:
        # One pass over all words at once, so that a later word can never
        # match inside the markup inserted for an earlier one. Words are
        # escaped because they are literal text, not regular expressions;
        # longer alternatives come first so they win over their prefixes.
        alternatives = sorted(words, key=len, reverse=True)
        pattern = re.compile("|".join(re.escape(word) for word in alternatives),
                             re.IGNORECASE)

        def highlight(match):
            return ' <span class="found">%s</span>' % match.group(0).upper()

        ret = [pattern.sub(highlight, snippet) for snippet in ret]

    return ret
1f2760ed3efe indices geloescht
dwinter
parents:
diff changeset
def copyAllImagesToMargin(self):
    """Transform all images into the margins.

    Calls ``copyImageToMargin()`` on the fourth element of every entry
    returned by ``self.getTree()``. A project that fails is logged and
    skipped so that the remaining projects are still processed.
    """
    for project in self.getTree():
        proj = project[3]
        try:
            proj.copyImageToMargin()
        except Exception:
            # Best effort per project; unlike a bare except this lets
            # KeyboardInterrupt/SystemExit through, and the traceback is
            # logged so the cause of the failure is not lost.
            logging.exception("Cannnot do: %s", repr(project))
1f2760ed3efe indices geloescht
dwinter
parents:
diff changeset
141
1f2760ed3efe indices geloescht
dwinter
parents:
diff changeset
def transformProjectsToId(self):
    """Transform person names to IDs.

    Helper that analyses the old templates and attaches the new list of
    responsible persons to every project in ``self.getTree()``: the names
    identified in the project's ``xdata_01`` content are appended to
    ``proj.responsibleScientistsList`` as ``(name, key)`` tuples.

    A name with exactly one match is stored with the key of the matched
    object, a name without any match is stored with an empty key, and a
    name with several matches is not stored at all.

    @return: report text with one line per name that had no match or was
             ambiguous
    """
    report = []
    for entry in self.getTree():
        proj = entry[3]
        persons = proj.identifyNames(proj.getContent('xdata_01'))
        if not hasattr(proj, 'responsibleScientistsList'):
            proj.responsibleScientistsList = []

        for name, matches in persons.items():
            hits = len(matches)
            if hits == 1:
                key = matches[0].getObject().getKey()
                proj.responsibleScientistsList.append((name, key))
            elif hits == 0:  # no entry
                report.append("kein eintrag--- %s: %s\n" % (proj.getId(), name))
                proj.responsibleScientistsList.append((name, ""))
            else:  # ambiguous
                report.append("nicht eindeutig --- %s: %s\n" % (proj.getId(), name))

    return "".join(report)
1f2760ed3efe indices geloescht
dwinter
parents:
diff changeset
166
1f2760ed3efe indices geloescht
dwinter
parents:
diff changeset
167
1f2760ed3efe indices geloescht
dwinter
parents:
diff changeset
def harvestProjects(self):
    """Harvest all projects into files below /tmp/harvest_MPIWG.

    Every object of meta type 'MPIWGProject' found below
    ``self.aq_parent.projects`` is harvested with ``harvest_page()``.
    Non-empty results are written to a file named after the found id,
    with "/" replaced by "_".
    """
    try:
        os.mkdir("/tmp/harvest_MPIWG")
    except OSError:
        # Best effort: the directory usually exists already; any other
        # problem surfaces when the first file is opened below.
        pass

    founds = self.ZopeFind(self.aq_parent.projects,
                           obj_metatypes=['MPIWGProject'], search_sub=1)
    for found in founds:
        txt = found[1].harvest_page()
        if not txt:
            continue

        name = found[0].replace("/", "_")
        # open() instead of the Python 2-only file() builtin; the with
        # block closes the handle even when write() raises.
        with open("/tmp/harvest_MPIWG/" + name, "w") as fh:
            fh.write(txt)
1f2760ed3efe indices geloescht
dwinter
parents:
diff changeset
184