Mercurial > hg > djangoSolrSearchProxy
comparison ttools/views.py @ 0:af2f8fe486f6 default tip
initial
author | Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 17 Feb 2015 12:44:40 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:af2f8fe486f6 |
---|---|
import json
import token  # NOTE(review): shadowed by a loop variable in get(); appears unused
import urllib
import urllib.parse
import urllib.request
from cgitb import text  # NOTE(review): shadowed by the local `text` in get(); appears unused

import mergedict
from lxml import etree

from django.http.response import HttpResponse, JsonResponse, StreamingHttpResponse
from django.shortcuts import redirect, render
from django.template.base import Token
from django.template.context import RequestContext
from django.views.decorators.csrf import csrf_protect
from django.views.decorators.http import require_http_methods
from django.views.generic.base import View

from proxy.models import Server
18 | |
class TextToolAnalyse(View):
    """Tokenise a text with the MPIWG MPDL language service and return one
    JSON annotation per token (lemma label, dictionary link, text offsets).

    Request parameters (GET or POST):
        text -- the text to analyse; a GET without it redirects to ./tt/api,
                a POST without it raises (as in the original code)
        lang -- language code, default "lat" ("ita" and "grc" also supported)
    """

    # The tokenizer service is queried in slices of this many characters.
    CHUNK_SIZE = 300
    # Hard cap on chunks per request to bound run time.  (The original
    # comment claimed "maximal 5" but the code has always used 10.)
    MAX_CHUNKS = 10

    # lang -> (preferred dictionary name, XPath of its translation node)
    _DICTIONARIES = {
        "lat": ("cooper", "./entries/entry/content/sense/trans"),
        "ita": ("baretti", "./entries/entry/content/i"),
        "grc": ("lsj", "./entries/entry/content/tr"),
    }

    def getLabel(self, token, lang):
        """Return the lemma for *token* in *lang* from the MPDL morphology
        service.

        Performs one HTTP request per call (so one request per token when
        used from get()).  Raises IndexError when the response contains no
        <lemma><name> element.
        """
        morphUrl = ("http://mpdl-service.mpiwg-berlin.mpg.de/mpiwg-mpdl-lt-web"
                    "/lt/GetDictionaryEntries?language=%s&outputFormat=xml"
                    "&outputType=morphCompact" % lang)
        morphUrl += "&query=%s" % urllib.parse.quote(token)

        response = urllib.request.urlopen(morphUrl)
        root = etree.parse(response).getroot()
        return root.xpath("//lemma/name")[0].text

    def post(self, request):
        """POST entry point; identical handling, reading request.POST."""
        return self.get(request, method="post")

    def get(self, request, method="get"):
        """Analyse the submitted text; return a JsonResponse with keys
        'annotations', 'lang' and 'time' (summed service time in ms)."""
        if method == "get":
            try:
                text = request.GET['text']
            except KeyError:  # was a bare except; only a missing key is expected
                return redirect("./tt/api")
            lang = request.GET.get('lang', 'lat')
        else:
            # A POST without 'text' raises, exactly as the original did.
            text = request.POST['text']
            lang = request.POST.get('lang', 'lat')

        tokenizeUrl = ("http://mpdl-service.mpiwg-berlin.mpg.de"
                       "/mpiwg-mpdl-lt-web/text/Tokenize")
        dictServUrl = ("http://mpdl-service.mpiwg-berlin.mpg.de"
                       "/mpiwg-mpdl-lt-web/lt/GetDictionaryEntries?language=%s"
                       "&outputFormat=html&outputType=morphCompact"
                       "&outputType=dictFull" % lang)

        chunks = len(text) // self.CHUNK_SIZE + 1
        ret = {}
        annotations = []

        for chunk in range(min(self.MAX_CHUNKS, chunks)):
            # BUG FIX: the original end index was (chunk+1)*300 - 1, which
            # silently dropped the last character of every chunk.
            currentText = text[chunk * self.CHUNK_SIZE:
                               (chunk + 1) * self.CHUNK_SIZE]

            root = self._tokenize(tokenizeUrl, currentText, lang)

            elapsed = str(root.xpath("./elapsed-time-ms")[0].text)
            print(elapsed)
            ret["time"] = ret.get("time", 0) + int(elapsed)

            for tokenEl in root.xpath(".//token"):
                annot = {"spot": tokenEl.xpath("./name")[0].text}
                try:
                    # One morphology call instead of the original's two
                    # identical calls for 'label' and 'title'.
                    label = self.getLabel(annot["spot"], lang)
                    annot["label"] = label
                    annot["title"] = label
                    annot["uri"] = dictServUrl + "&query=%s" % annot["spot"]
                    # Placeholder offsets; real values are assigned by
                    # _assign_offsets() below.
                    annot["start"] = 3
                    annot["end"] = 5
                    annot["confidence"] = 0.8
                    annot["image"] = {}
                    annot["abstract"] = self._extract_abstract(tokenEl, lang)
                    annot["lod"] = {"word": "http://purl.org/linguistics/gold/OrthographicWord"}
                    annot["type"] = ["http://purl.org/linguistics/gold/OrthographicWord"]
                    annot["types"] = ["http://purl.org/linguistics/gold/OrthographicWord"]
                    annotations.append(annot)
                except (RuntimeError, IndexError) as err:
                    # IndexError: getLabel found no lemma.  The original only
                    # caught RuntimeError and crashed on unknown tokens; skip
                    # the token instead, keeping the best-effort behaviour.
                    print(err)

        self._assign_offsets(text, annotations)

        ret["annotations"] = annotations
        ret["lang"] = lang
        return JsonResponse(ret)

    def _tokenize(self, url, currentText, lang):
        """Call the Tokenize service for one chunk; return the XML root."""
        params = {
            "inputString": currentText,
            "language": lang,
            "outputFormat": "xml",
        }
        searchUrl = url + "?" + urllib.parse.urlencode(params, True)
        print(searchUrl)

        search = urllib.request.urlopen(searchUrl)
        try:
            return etree.parse(search).getroot()
        except etree.XMLSyntaxError:
            # Work around a service bug: URLs inside the response contain
            # raw '&' characters that are not XML-escaped.  (Was a bare
            # except; only a parse failure is expected here.)
            search = urllib.request.urlopen(searchUrl)
            txt = search.read().decode("utf-8")
            return etree.fromstring(txt.replace("&", "&amp;"))

    def _extract_abstract(self, tokenEl, lang):
        """Return a short translation from the preferred dictionary for
        *lang* ('' when the language, dictionary or entry is unavailable).

        Replaces three copy-pasted lat/ita/grc branches of the original.
        """
        preferred = self._DICTIONARIES.get(lang)
        if preferred is None:
            return ""
        wantedName, transXPath = preferred
        for dictionary in tokenEl.xpath(".//dictionary"):
            if dictionary.xpath("./name")[0].text == wantedName:
                try:
                    return dictionary.xpath(transXPath)[0].text
                except IndexError:  # entry present but no translation node
                    return ""
        return ""

    @staticmethod
    def _assign_offsets(text, annotations):
        """Fill in real 'start'/'end' character offsets for every annotation
        by scanning *text* left to right.

        NOTE(review): if a spot is not found, str.find returns -1 and the
        offsets drift -- same behaviour as the original code; confirm spots
        always occur in the analysed text.
        """
        consumed = 0
        remainder = text
        for an in annotations:
            spot = an["spot"]
            pos = remainder.find(spot)
            an["start"] = pos + consumed
            an["end"] = pos + consumed + len(spot)
            consumed = pos + consumed + len(spot)
            remainder = remainder[pos + len(spot):]

        print("Length wordlist:")  # fixed typo "Lenght" in console output
        print(len(annotations))
244 | |
245 | |
246 | |
247 |