view ttools/views.py @ 0:af2f8fe486f6 default tip

initial
author Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
date Tue, 17 Feb 2015 12:44:40 +0100

from django.shortcuts import redirect
from django.views.generic.base import View
from django.http.response import JsonResponse

import urllib.parse
import urllib.request

from lxml import etree

class TextToolAnalyse(View):
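    """Tokenize a text with the MPDL language-technology service and annotate each
    token with its lemma, a dictionary link and a short gloss, returned as JSON.

    Sketch of a request, assuming the view is mounted at ./tt/api (as the redirect
    below suggests):

        GET ./tt/api?text=...&lang=lat

    The response then has the form
        {"time": <ms>, "lang": "lat",
         "annotations": [{"spot": ..., "label": ..., "title": ..., "uri": ...,
                          "start": ..., "end": ..., "confidence": ..., "abstract": ..., ...}, ...]}
    """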
    
    def getLabel(self, token, lang):
        """Look up the lemma for a single token via the MPDL morphology service."""
        morphUrl = "http://mpdl-service.mpiwg-berlin.mpg.de/mpiwg-mpdl-lt-web/lt/GetDictionaryEntries?language=%s&outputFormat=xml&outputType=morphCompact" % lang
        morphUrl += "&query=%s" % urllib.parse.quote(token)

        search = urllib.request.urlopen(morphUrl)

        dom = etree.parse(search).getroot()

        # take the first lemma reported by the service
        lem = dom.xpath("//lemma/name")[0].text
        return lem
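    # Note: getLabel raises IndexError when the morphology service returns no <lemma>
    # element; the annotation loop below catches lookup failures and skips the token.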
        
    
    
    
    def post(self, request):
        return self.get(request, method="post")

    def get(self, request, method="get"):

        # read text and language either from the query string or from the POST body
        if method == "get":
            try:
                text = request.GET['text']
            except KeyError:
                return redirect("./tt/api")
            try:
                lang = request.GET['lang']
            except KeyError:
                lang = "lat"

        else:
            text = request.POST['text']

            try:
                lang = request.POST['lang']
            except KeyError:
                lang = "lat"
                
        url="http://mpdl-service.mpiwg-berlin.mpg.de/mpiwg-mpdl-lt-web/text/Tokenize"
        morphUrl="http://mpdl-service.mpiwg-berlin.mpg.de/mpiwg-mpdl-lt-web/lt/GetDictionaryEntries?language=%s&outputFormat=xml&outputType=morphCompact"%lang
        dictServUrl="http://mpdl-service.mpiwg-berlin.mpg.de/mpiwg-mpdl-lt-web/lt/GetDictionaryEntries?language=%s&outputFormat=html&outputType=morphCompact&outputType=dictFull"%lang
        
        


       # query=preface&queryDisplay=Preface&language=en&outputFormat=html&outputType=morphCompact&outputType=dictFull
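        # (illustrative only) the per-token dictionary link built below has the form
        #   dictServUrl + "&query=<token>", e.g. "...&outputType=dictFull&query=amor",
        # where "amor" is just a made-up example token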
        
        
        
        
        
        params = {}

        chunks = int(len(text) / 300) + 1  # split the text into chunks of 300 characters
        ret = {}
        annotations = []

        for chunk in range(min(10, chunks)):  # process at most 10 chunks for now, otherwise the request takes too long

            currentText = text[chunk * 300:(chunk + 1) * 300]  # slicing clamps to the end of the string
        
            params["inputString"]=currentText
            params["language"]=lang
            params["outputFormat"]="xml"
            #params["dictionary"]="Yes"
            
            
            searchUrl =url+"?"+urllib.parse.urlencode(params,True)
                        
                        
                       
            search = urllib.request.urlopen(searchUrl)
                       
            #txt = urllib.request.urlopen(searchUrl).read().decode('utf-8')
            
            
            
            print(searchUrl)
            
            try:
                dom = etree.parse(search)
                root = dom.getroot()
           
            except: #go around a bug in mpiwg service where ulrs are not properly encoded
                search = urllib.request.urlopen(searchUrl)
                txt=search.read().decode("utf-8")
                txt=txt.replace("&","&amp;")
                root =etree.fromstring(txt)
                
            
            # accumulate the elapsed time reported by the service over all chunks
            time = str(root.xpath("./elapsed-time-ms")[0].text)

            print(time)

            ret["time"] = ret.get("time", 0) + int(time)
            
          
            for token in root.xpath(".//token"):
                annot = {}

                # surface form of the token as returned by the tokenizer
                annot["spot"] = token.xpath("./name")[0].text

                try:
                    # look the lemma up once and reuse it for label and title
                    lemma = self.getLabel(annot['spot'], lang)
                    annot['label'] = lemma
                    annot['title'] = lemma

                    # link to the full HTML dictionary entry for this token
                    annot['uri'] = dictServUrl + "&query=%s" % annot['spot']
                    # placeholder offsets; the real start/end positions are computed further below
                    annot['start'] = 3
                    annot['end'] = 5
                    annot['confidence'] = 0.8
                    annot['image'] = {}

                    annot['abstract'] = ''
                    if lang=="lat":
                        
                      
                        for dicts in token.xpath(".//dictionary"):
                            dictName=dicts.xpath("./name")[0].text
                            if dictName == "cooper": #choose liddle sctt
                                ctsStr=[]
                               
                                try:
                                    annot['abstract']=dicts.xpath("./entries/entry/content/sense/trans")[0].text
                                except:
                                
                                    annot['abstract']=""    
                                    
                    elif lang=="ita":
                        
                      
                        for dicts in token.xpath(".//dictionary"):
                            dictName=dicts.xpath("./name")[0].text
                            if dictName == "baretti": #choose liddle sctt
                                ctsStr=[]
                               
                                try:
                                    #annot['abstract']=""    
                                    annot['abstract']=dicts.xpath("./entries/entry/content/i")[0].text
                                except:
                                
                                    annot['abstract']=""    
                    elif lang=="grc":
                        
                      
                        for dicts in token.xpath(".//dictionary"):
                            dictName=dicts.xpath("./name")[0].text
                            if dictName == "lsj": #choose liddle sctt
                                ctsStr=[]
                               
                                try:
                                    #annot['abstract']=""    
                                    annot['abstract']=dicts.xpath("./entries/entry/content/tr")[0].text
                                except:
                                
                                    annot['abstract']=""    
                    
                    
                   
                    # type the annotation as an orthographic word (GOLD linguistics ontology)
                    annot['lod'] = {"word": "http://purl.org/linguistics/gold/OrthographicWord"}
                    annot['type'] = ["http://purl.org/linguistics/gold/OrthographicWord"]
                    annot['types'] = ["http://purl.org/linguistics/gold/OrthographicWord"]
                    annotations.append(annot)

                except Exception as err:
                    # lemma lookup or annotation failed for this token (e.g. no <lemma> element
                    # in the morphology response); skip it instead of aborting the whole request
                    print(err)
                    
            
        # compute the character offsets of each annotated token in the original text;
        # cs is the not-yet-consumed tail of the text, cn the absolute offset of that tail
        cn = 0
        cs = text
        wps = []
        for an in annotations:
            t = an['spot']
            ps = cs.find(t)
            if ps < 0:
                # token not found in the remaining text (e.g. because of normalisation);
                # keep the placeholder offsets rather than corrupting the following ones
                continue
            wps.append((t, ps + cn))
            an['start'] = ps + cn
            an['end'] = ps + cn + len(t)

            cn = ps + cn + len(t)
            cs = cs[ps + len(t):]

        print("Length of word list:")
        print(len(wps))
        
            
        
        
                
        ret['annotations'] = annotations
        ret['lang'] = lang

        return JsonResponse(ret)
   
        #for token in dom.getroot().result.tokens:
        #    print (token)