Mercurial > hg > djangoSolrSearchProxy
diff ttools/views.py @ 0:af2f8fe486f6 default tip
initial
author | Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 17 Feb 2015 12:44:40 +0100 |
parents | |
children |
line wrap: on
line diff
# Imports reconstructed from the original file; unused ones are kept because
# other parts of the project may rely on their import side effects.
from django.shortcuts import render, redirect
from django.views.generic.base import View
from proxy.models import Server
import re
import urllib
import urllib.error
import urllib.parse
import urllib.request  # `import urllib` alone does not expose the submodules in Python 3
from django.http.response import StreamingHttpResponse, HttpResponse, JsonResponse

import json
import mergedict
from cgitb import text  # NOTE(review): shadows the module-level name `text`; kept for compatibility
from lxml import etree
import token
from django.template.base import Token
from django.views.decorators.http import require_http_methods
from django.template.context import RequestContext

from django.views.decorators.csrf import csrf_protect


# Base URL of the MPIWG MPDL language-technology service.
_MPDL_BASE = "http://mpdl-service.mpiwg-berlin.mpg.de/mpiwg-mpdl-lt-web"

# Size (in characters) of the chunks sent to the tokenizer, and the maximum
# number of chunks analysed per request (keeps response times bounded).
_CHUNK_SIZE = 300
_MAX_CHUNKS = 10


class TextToolAnalyse(View):
    """Tokenize a text with the MPDL language-technology service and return
    per-word annotations (lemma, dictionary gloss, text offsets) as JSON.

    The text is sent to the service in chunks of ``_CHUNK_SIZE`` characters;
    at most ``_MAX_CHUNKS`` chunks are processed.
    """

    # Per-language dictionary used for the short abstract:
    # language code -> (dictionary name, XPath of the translation element)
    _ABSTRACT_SOURCES = {
        "lat": ("cooper", "./entries/entry/content/sense/trans"),
        "ita": ("baretti", "./entries/entry/content/i"),
        "grc": ("lsj", "./entries/entry/content/tr"),
    }

    def getLabel(self, token, lang):
        """Look up *token* in the morphology service and return its lemma.

        :param token: surface form of the word to look up
        :param lang:  language code understood by the service (e.g. "lat")
        :raises IndexError: if the service answer contains no ``lemma/name``
        """
        morphUrl = (
            _MPDL_BASE
            + "/lt/GetDictionaryEntries?language=%s&outputFormat=xml&outputType=morphCompact" % lang
            + "&query=%s" % urllib.parse.quote(token)
        )
        answer = urllib.request.urlopen(morphUrl)
        root = etree.parse(answer).getroot()
        # First lemma entry of the morphology answer.
        return root.xpath("//lemma/name")[0].text

    def post(self, request):
        """POST entry point; identical handling, parameters read from request.POST."""
        return self.get(request, method="post")

    def _extract_abstract(self, tokenNode, lang):
        """Return a short dictionary gloss for a ``<token>`` element, or "".

        Chooses the dictionary configured for *lang* in ``_ABSTRACT_SOURCES``;
        languages without a configured dictionary yield "".
        """
        source = self._ABSTRACT_SOURCES.get(lang)
        if source is None:
            return ""
        dictName, transPath = source
        for dictionary in tokenNode.xpath(".//dictionary"):
            if dictionary.xpath("./name")[0].text == dictName:
                try:
                    return dictionary.xpath(transPath)[0].text
                except IndexError:
                    return ""
        return ""

    def _assign_offsets(self, annotations, text):
        """Fill in ``start``/``end`` character offsets for each annotation.

        Tokens are located left to right; the scan position advances past each
        match so repeated words receive increasing offsets. Returns the list of
        (token, offset) pairs for debugging.
        NOTE(review): a token that is not found (`find` returning -1) still
        produces bogus offsets, as in the original — confirm whether the
        tokenizer can emit tokens absent from the input.
        """
        consumed = 0
        remainder = text
        positions = []
        for annot in annotations:
            spot = annot['spot']
            pos = remainder.find(spot)
            positions.append((spot, pos + consumed))
            annot['start'] = pos + consumed
            annot['end'] = pos + consumed + len(spot)
            consumed = pos + consumed + len(spot)
            remainder = remainder[pos + len(spot):]
        return positions

    def get(self, request, method="get"):
        """Analyse a text and return token annotations as JSON.

        Request parameters (GET or POST depending on *method*):
          text -- the text to analyse; GET requests without it are
                  redirected to ``./tt/api``
          lang -- language code, defaults to "lat"
        """
        if method == "get":
            try:
                text = request.GET['text']
            except KeyError:
                return redirect("./tt/api")
            lang = request.GET.get('lang', "lat")
        else:
            text = request.POST['text']
            lang = request.POST.get('lang', "lat")

        tokenizeUrl = _MPDL_BASE + "/text/Tokenize"
        dictServUrl = (
            _MPDL_BASE
            + "/lt/GetDictionaryEntries?language=%s&outputFormat=html&outputType=morphCompact&outputType=dictFull" % lang
        )

        # Split the text into _CHUNK_SIZE-character chunks.
        chunks = int(len(text) / _CHUNK_SIZE) + 1
        ret = {}
        annotations = []

        # At most _MAX_CHUNKS chunks, otherwise the request takes too long.
        for chunk in range(min(_MAX_CHUNKS, chunks)):
            # FIX: the original sliced to (chunk+1)*300-1 and silently dropped
            # one character per chunk — the slice end is already exclusive.
            currentText = text[chunk * _CHUNK_SIZE:(chunk + 1) * _CHUNK_SIZE]
            params = {
                "inputString": currentText,
                "language": lang,
                "outputFormat": "xml",
            }
            searchUrl = tokenizeUrl + "?" + urllib.parse.urlencode(params, True)
            print(searchUrl)
            answer = urllib.request.urlopen(searchUrl)

            try:
                root = etree.parse(answer).getroot()
            except etree.XMLSyntaxError:
                # Work around a bug in the MPIWG service where URLs in the
                # answer are not properly encoded: escape bare "&" characters
                # (those not already starting an entity) and re-parse.
                answer = urllib.request.urlopen(searchUrl)
                txt = answer.read().decode("utf-8")
                txt = re.sub(r"&(?![a-zA-Z]+;|#\d+;)", "&amp;", txt)
                root = etree.fromstring(txt)

            elapsed = str(root.xpath("./elapsed-time-ms")[0].text)
            print(elapsed)
            ret["time"] = ret.get("time", 0) + int(elapsed)

            for tokenNode in root.xpath(".//token"):
                annot = {"spot": tokenNode.xpath("./name")[0].text}
                try:
                    # FIX: one morphology round trip instead of the original
                    # two — label and title are always identical.
                    label = self.getLabel(annot['spot'], lang)
                    annot['label'] = label
                    annot['title'] = label
                    annot['uri'] = dictServUrl + "&query=%s" % annot['spot']
                    # Placeholder offsets; real values assigned below.
                    annot['start'] = 3
                    annot['end'] = 5
                    annot['confidence'] = 0.8
                    annot['image'] = {}
                    annot['abstract'] = self._extract_abstract(tokenNode, lang)
                    annot['lod'] = {"word": "http://purl.org/linguistics/gold/OrthographicWord"}
                    annot['type'] = ["http://purl.org/linguistics/gold/OrthographicWord"]
                    annot['types'] = ["http://purl.org/linguistics/gold/OrthographicWord"]
                    annotations.append(annot)
                except (RuntimeError, IndexError, urllib.error.URLError) as err:
                    # FIX: the original caught only RuntimeError, so a token
                    # without a lemma (IndexError) or a network hiccup aborted
                    # the whole request; skip just that token instead.
                    print(err)

        wps = self._assign_offsets(annotations, text)
        print("Length wordlist:")
        print(len(wps))

        ret['annotations'] = annotations
        ret['lang'] = lang
        return JsonResponse(ret)