Mercurial > hg > djangoSolrSearchProxy
view ttools/views.py @ 0:af2f8fe486f6 default tip
initial
author:   Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
date:     Tue, 17 Feb 2015 12:44:40 +0100
parents:  (none)
children: (none)
line wrap: on
line source
from django.shortcuts import render, redirect
from django.views.generic.base import View
from proxy.models import Server
# BUGFIX: was "import urllib" — the code below uses urllib.request and
# urllib.parse, which a bare "import urllib" does not load in Python 3.
import urllib.request
import urllib.parse
from django.http.response import StreamingHttpResponse, HttpResponse, JsonResponse
import json
import mergedict
from cgitb import text
from lxml import etree
import token
from django.template.base import Token
from django.views.decorators.http import require_http_methods
from django.template.context import RequestContext
from django.views.decorators.csrf import csrf_protect


class TextToolAnalyse(View):
    """Tokenize a text via the MPIWG MPDL language-technology services and
    return per-word annotations (lemma, dictionary link, translation snippet,
    character offsets) as JSON.
    """

    # Maps language code -> (preferred dictionary name, xpath of the
    # translation text inside that dictionary's entry). Used by _abstract().
    _ABSTRACT_SOURCES = {
        "lat": ("cooper", "./entries/entry/content/sense/trans"),
        "ita": ("baretti", "./entries/entry/content/i"),
        "grc": ("lsj", "./entries/entry/content/tr"),
    }

    def getLabel(self, token, lang):
        """Return the lemma of *token* from the MPDL morphology service.

        Raises IndexError if the service returns no <lemma><name> element,
        and urllib.error.URLError/HTTPError on network failure.
        """
        morphUrl = ("http://mpdl-service.mpiwg-berlin.mpg.de/mpiwg-mpdl-lt-web"
                    "/lt/GetDictionaryEntries"
                    "?language=%s&outputFormat=xml&outputType=morphCompact" % lang)
        morphUrl += "&query=%s" % urllib.request.quote(token)
        # BUGFIX: close the HTTP response when done (the original leaked it).
        with urllib.request.urlopen(morphUrl) as search:
            dom = etree.parse(search).getroot()
        lem = dom.xpath("//lemma/name")[0].text
        return lem

    def _abstract(self, tokenNode, lang):
        """Extract a short translation ('abstract') for a <token> element from
        the preferred dictionary for *lang*; return "" when none applies.
        """
        source = self._ABSTRACT_SOURCES.get(lang)
        if source is None:
            return ""
        dictName, transXPath = source
        for dictNode in tokenNode.xpath(".//dictionary"):
            if dictNode.xpath("./name")[0].text == dictName:
                try:
                    return dictNode.xpath(transXPath)[0].text
                except IndexError:
                    return ""
        return ""

    def post(self, request):
        """POST entry point — delegates to get() with the POST parameter set."""
        return self.get(request, method="post")

    def get(self, request, method="get"):
        """Analyse the submitted text and return annotations as JSON.

        Parameters (GET or POST depending on *method*):
            text -- the text to analyse; GET requests without it are
                    redirected to ./tt/api
            lang -- language code ("lat", "ita", "grc"); default "lat"
        """
        if method == "get":
            try:
                text = request.GET['text']
            except KeyError:
                return redirect("./tt/api")
            try:
                lang = request.GET['lang']
            except KeyError:
                lang = "lat"
        else:
            text = request.POST['text']
            try:
                lang = request.POST['lang']
            except KeyError:
                lang = "lat"

        base = "http://mpdl-service.mpiwg-berlin.mpg.de/mpiwg-mpdl-lt-web"
        url = base + "/text/Tokenize"
        dictServUrl = (base + "/lt/GetDictionaryEntries?language=%s"
                       "&outputFormat=html&outputType=morphCompact"
                       "&outputType=dictFull" % lang)

        CHUNK = 300
        chunks = int(len(text) / CHUNK) + 1  # number of 300-char slices
        ret = {}
        annotations = []
        params = {}
        # Process at most 10 chunks for now, otherwise the request takes
        # too long (one tokenizer call plus one morphology call per word).
        for chunk in range(min(10, chunks)):
            # BUGFIX: the original sliced up to (chunk+1)*300-1 and therefore
            # silently dropped the last character of every chunk; Python
            # slices clamp at the end of the string, so no min() is needed.
            currentText = text[chunk * CHUNK:(chunk + 1) * CHUNK]
            params["inputString"] = currentText
            params["language"] = lang
            params["outputFormat"] = "xml"
            searchUrl = url + "?" + urllib.parse.urlencode(params, True)
            print(searchUrl)
            search = urllib.request.urlopen(searchUrl)
            try:
                root = etree.parse(search).getroot()
            except etree.XMLSyntaxError:
                # Work around a bug in the MPIWG service where URLs in the
                # response contain raw, unescaped ampersands: escape them and
                # re-parse. (The original's replace("&","&") was a no-op,
                # presumably mangled from "&amp;" in rendering.)
                search = urllib.request.urlopen(searchUrl)
                txt = search.read().decode("utf-8")
                txt = txt.replace("&", "&amp;")
                root = etree.fromstring(txt)
            elapsed = str(root.xpath("./elapsed-time-ms")[0].text)
            print(elapsed)
            ret["time"] = ret.get("time", 0) + int(elapsed)
            for tokenNode in root.xpath(".//token"):
                annot = {}
                annot["spot"] = tokenNode.xpath("./name")[0].text
                try:
                    # BUGFIX: getLabel was called twice per token (two
                    # identical HTTP round trips); call it once.
                    label = self.getLabel(annot['spot'], lang)
                    annot['label'] = label
                    annot['title'] = label
                    annot['uri'] = dictServUrl + "&query=%s" % annot['spot']
                    # start/end are placeholders; real character offsets are
                    # computed in the scan loop below.
                    annot['start'] = 3
                    annot['end'] = 5
                    annot['confidence'] = 0.8
                    annot['image'] = {}
                    annot['abstract'] = self._abstract(tokenNode, lang)
                    annot['lod'] = {"word": "http://purl.org/linguistics/gold/OrthographicWord"}
                    annot['type'] = ["http://purl.org/linguistics/gold/OrthographicWord"]
                    annot['types'] = ["http://purl.org/linguistics/gold/OrthographicWord"]
                    annotations.append(annot)
                except Exception as err:
                    # Best effort per token. BUGFIX: the original caught only
                    # RuntimeError, so the IndexError / HTTP errors that
                    # getLabel actually raises aborted the whole request.
                    print(err)

        # Recompute the character offsets of each annotated word inside the
        # original text by scanning it left to right.
        cn = 0      # absolute offset consumed so far
        cs = text   # unscanned remainder of the text
        wps = []
        for an in annotations:
            t = an['spot']
            ps = cs.find(t)
            # NOTE(review): if the token does not occur verbatim, find()
            # returns -1 and all following offsets drift — presumably the
            # tokenizer only emits substrings of the input; confirm.
            wps.append((t, ps + cn))
            an['start'] = ps + cn
            an['end'] = ps + cn + len(t)
            cn = ps + cn + len(t)
            cs = cs[ps + len(t):]
        print("Length wordlist:")
        print(len(wps))
        ret['annotations'] = annotations
        ret['lang'] = lang
        return JsonResponse(ret)