ttools/views.py @ 0:af2f8fe486f6 (default tip)

initial
author Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
date Tue, 17 Feb 2015 12:44:40 +0100
from django.shortcuts import render, redirect
from django.views.generic.base import View
from proxy.models import Server
import urllib.parse
import urllib.request
from django.http.response import StreamingHttpResponse, HttpResponse, JsonResponse

import json
import mergedict
from lxml import etree
from django.views.decorators.http import require_http_methods
from django.template.context import RequestContext

from django.views.decorators.csrf import csrf_protect

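# TextToolAnalyse tokenizes a submitted text with the MPDL language-technology
# service (mpdl-service.mpiwg-berlin.mpg.de), looks up a lemma and a short
# dictionary abstract for every token, and returns the result as a JSON list of
# annotations (spot, label, title, uri, start/end offsets, confidence).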
class TextToolAnalyse(View):

    def getLabel(self, token, lang):
        # ask the MPDL dictionary service for the morphological entry of a token
        # and return the lemma (normalized form) it reports
        morphUrl = "http://mpdl-service.mpiwg-berlin.mpg.de/mpiwg-mpdl-lt-web/lt/GetDictionaryEntries?language=%s&outputFormat=xml&outputType=morphCompact" % lang
        morphUrl += "&query=%s" % urllib.parse.quote(token)

        search = urllib.request.urlopen(morphUrl)

        print(search)
        dom = etree.parse(search).getroot()

        lem = dom.xpath("//lemma/name")[0].text
        return lem

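    # Illustrative call (the token value is just an example): getLabel("arma", "lat")
    # fetches the GetDictionaryEntries XML for "arma" and returns the text of the
    # first <lemma><name> element in the response.
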
    def post(self, request):
        return self.get(request, method="post")

    def get(self, request, method="get"):

        #params_full = dict(request.REQUEST.dicts[1])

        if method == "get":
            try:
                text = request.GET['text']
            except KeyError:
                return redirect("./tt/api")
            try:
                lang = request.GET['lang']
            except KeyError:
                lang = "lat"

        else:
            text = request.POST['text']

            try:
                lang = request.POST['lang']
            except KeyError:
                lang = "lat"

        url = "http://mpdl-service.mpiwg-berlin.mpg.de/mpiwg-mpdl-lt-web/text/Tokenize"
        morphUrl = "http://mpdl-service.mpiwg-berlin.mpg.de/mpiwg-mpdl-lt-web/lt/GetDictionaryEntries?language=%s&outputFormat=xml&outputType=morphCompact" % lang
        dictServUrl = "http://mpdl-service.mpiwg-berlin.mpg.de/mpiwg-mpdl-lt-web/lt/GetDictionaryEntries?language=%s&outputFormat=html&outputType=morphCompact&outputType=dictFull" % lang

        # example query string of the dictionary service:
        # query=preface&queryDisplay=Preface&language=en&outputFormat=html&outputType=morphCompact&outputType=dictFull

        params = {}

        #text = text[0:300]  # dw: for the time being

        chunks = int(len(text) / 300) + 1  # split the text into chunks of 300 characters
        ret = {}
        annotations = []

        for chunk in range(min(10, chunks)):  # at most 10 chunks for now, otherwise the request takes too long

            currentText = text[chunk * 300:min(len(text), (chunk + 1) * 300)]

            params["inputString"] = currentText
            params["language"] = lang
            params["outputFormat"] = "xml"
            #params["dictionary"] = "Yes"

            searchUrl = url + "?" + urllib.parse.urlencode(params, True)

            search = urllib.request.urlopen(searchUrl)

            print(searchUrl)

            try:
                dom = etree.parse(search)
                root = dom.getroot()

            except etree.XMLSyntaxError:
                # work around a bug in the MPIWG service: URLs in the response are
                # not properly encoded, so escape stray ampersands and re-parse
                search = urllib.request.urlopen(searchUrl)
                txt = search.read().decode("utf-8")
                txt = txt.replace("&", "&amp;")
                root = etree.fromstring(txt)

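            # Rough shape of the Tokenize XML response, inferred from the XPath
            # expressions used below (only these elements are actually relied on):
            #   <elapsed-time-ms>...</elapsed-time-ms>
            #   <token>
            #     <name>word form</name>
            #     <dictionary>
            #       <name>lsj | cooper | baretti | ...</name>
            #       <entries><entry><content>...</content></entry></entries>
            #     </dictionary>
            #   </token>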
            time = str(root.xpath("./elapsed-time-ms")[0].text)

            print(time)

            ret["time"] = ret.get("time", 0) + int(time)

            for token in root.xpath(".//token"):
                annot = {}

                annot["spot"] = token.xpath("./name")[0].text

                #annot['title'] = token.xpath(".//dictionary/entries/entry/form")[0].text
                #annot['label'] = token.xpath(".//dictionary/entries/entry/form")[0].text
                #annot['uri'] = token.xpath(".//dictionary/entries/entry/remoteUrl")[0].text
                try:
                    label = self.getLabel(annot['spot'], lang)
                    annot['label'] = label
                    annot['title'] = label

                    annot['uri'] = dictServUrl + "&query=%s" % annot['spot']
                    annot['start'] = 3   # placeholder, recalculated below
                    annot['end'] = 5     # placeholder, recalculated below
                    annot['confidence'] = 0.8
                    annot['image'] = {}
                    #annot['type'] = "Web page"

                    annot['abstract'] = ''
                    if lang == "lat":
                        for dicts in token.xpath(".//dictionary"):
                            dictName = dicts.xpath("./name")[0].text
                            if dictName == "cooper":  # use the Cooper dictionary for Latin
                                try:
                                    annot['abstract'] = dicts.xpath("./entries/entry/content/sense/trans")[0].text
                                except IndexError:
                                    annot['abstract'] = ""

                    elif lang == "ita":
                        for dicts in token.xpath(".//dictionary"):
                            dictName = dicts.xpath("./name")[0].text
                            if dictName == "baretti":  # use the Baretti dictionary for Italian
                                try:
                                    #annot['abstract'] = ""
                                    annot['abstract'] = dicts.xpath("./entries/entry/content/i")[0].text
                                except IndexError:
                                    annot['abstract'] = ""

                    elif lang == "grc":
                        for dicts in token.xpath(".//dictionary"):
                            dictName = dicts.xpath("./name")[0].text
                            if dictName == "lsj":  # use Liddell-Scott (LSJ) for Greek
                                try:
                                    #annot['abstract'] = ""
                                    annot['abstract'] = dicts.xpath("./entries/entry/content/tr")[0].text
                                except IndexError:
                                    annot['abstract'] = ""

                    annot['lod'] = {"word": "http://purl.org/linguistics/gold/OrthographicWord"}
                    annot['type'] = ["http://purl.org/linguistics/gold/OrthographicWord"]
                    annot['types'] = ["http://purl.org/linguistics/gold/OrthographicWord"]
                    annotations.append(annot)

                except Exception as err:
                    # skip tokens for which the lemma lookup or dictionary parsing fails
                    print(err)
                    pass

        # assign character offsets: locate each annotated token in the original text
        # in order, so start/end refer to positions in the full submitted text
        cn = 0
        cs = text
        wps = []
        for an in annotations:
            t = an['spot']
            ps = cs.find(t)
            wps.append((t, ps + cn))
            an['start'] = ps + cn
            an['end'] = ps + cn + len(t)

            cn = ps + cn + len(t)
            cs = cs[ps + len(t):]

        #print(wps)
        print("Length of wordlist:")
        print(len(wps))

        ret['annotations'] = annotations
        ret['lang'] = lang

        return JsonResponse(ret)

        #for token in dom.getroot().result.tokens:
        #    print(token)
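
# Example exchange (sketch only): the exact URL depends on the project's URL
# configuration; the redirect above suggests the view is reachable under ./tt/api.
# The values below are illustrative, not actual service output.
#
#   GET /tt/api?text=arma+virumque+cano&lang=lat
#
#   {"time": 123,
#    "lang": "lat",
#    "annotations": [
#      {"spot": "arma", "label": "arma", "title": "arma",
#       "uri": "http://mpdl-service.mpiwg-berlin.mpg.de/mpiwg-mpdl-lt-web/lt/GetDictionaryEntries?language=lat&outputFormat=html&outputType=morphCompact&outputType=dictFull&query=arma",
#       "start": 0, "end": 4, "confidence": 0.8, "image": {}, "abstract": "...",
#       "lod": {"word": "http://purl.org/linguistics/gold/OrthographicWord"},
#       "type": ["http://purl.org/linguistics/gold/OrthographicWord"],
#       "types": ["http://purl.org/linguistics/gold/OrthographicWord"]},
#      ...
#    ]}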