view ttools/views.py @ 0:af2f8fe486f6 default tip

initial
author Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
date Tue, 17 Feb 2015 12:44:40 +0100

from django.shortcuts import redirect
from django.views.generic.base import View
from django.http.response import JsonResponse

import urllib.parse
import urllib.request

from lxml import etree

class TextToolAnalyse(View):
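    """Tokenize a text with the MPDL language-technology service and annotate each
    token with its lemma, a dictionary link and a short gloss, returned as JSON.

    Sketch of a request, assuming the view is mounted at ./tt/api (as the redirect
    below suggests):

        GET ./tt/api?text=...&lang=lat

    The response then has the form
        {"time": <ms>, "lang": "lat",
         "annotations": [{"spot": ..., "label": ..., "title": ..., "uri": ...,
                          "start": ..., "end": ..., "confidence": ..., "abstract": ..., ...}, ...]}
    """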
    
    def getLabel(self, token, lang):
        """Look up the lemma for a single token via the MPDL morphology service."""
        morphUrl = "http://mpdl-service.mpiwg-berlin.mpg.de/mpiwg-mpdl-lt-web/lt/GetDictionaryEntries?language=%s&outputFormat=xml&outputType=morphCompact" % lang
        morphUrl += "&query=%s" % urllib.parse.quote(token)

        search = urllib.request.urlopen(morphUrl)

        dom = etree.parse(search).getroot()

        # take the first lemma reported by the service
        lem = dom.xpath("//lemma/name")[0].text
        return lem
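    # Note: getLabel raises IndexError when the morphology service returns no <lemma>
    # element; the annotation loop below catches lookup failures and skips the token.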
        
    
    
    
    def post(self, request):
        return self.get(request, method="post")

    def get(self, request, method="get"):

        # read text and language either from the query string or from the POST body
        if method == "get":
            try:
                text = request.GET['text']
            except KeyError:
                return redirect("./tt/api")
            try:
                lang = request.GET['lang']
            except KeyError:
                lang = "lat"

        else:
            text = request.POST['text']

            try:
                lang = request.POST['lang']
            except KeyError:
                lang = "lat"
                
        url="http://mpdl-service.mpiwg-berlin.mpg.de/mpiwg-mpdl-lt-web/text/Tokenize"
        morphUrl="http://mpdl-service.mpiwg-berlin.mpg.de/mpiwg-mpdl-lt-web/lt/GetDictionaryEntries?language=%s&outputFormat=xml&outputType=morphCompact"%lang
        dictServUrl="http://mpdl-service.mpiwg-berlin.mpg.de/mpiwg-mpdl-lt-web/lt/GetDictionaryEntries?language=%s&outputFormat=html&outputType=morphCompact&outputType=dictFull"%lang
        
        


       # query=preface&queryDisplay=Preface&language=en&outputFormat=html&outputType=morphCompact&outputType=dictFull
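        # (illustrative only) the per-token dictionary link built below has the form
        #   dictServUrl + "&query=<token>", e.g. "...&outputType=dictFull&query=amor",
        # where "amor" is just a made-up example token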
        
        
        
        
        
        params = {}

        chunks = int(len(text) / 300) + 1  # split the text into chunks of 300 characters
        ret = {}
        annotations = []

        for chunk in range(min(10, chunks)):  # process at most 10 chunks for now, otherwise the request takes too long

            currentText = text[chunk * 300:(chunk + 1) * 300]  # slicing clamps to the end of the string
        
            params["inputString"]=currentText
            params["language"]=lang
            params["outputFormat"]="xml"
            #params["dictionary"]="Yes"
            
            
            searchUrl =url+"?"+urllib.parse.urlencode(params,True)
                        
                        
                       
            search = urllib.request.urlopen(searchUrl)
                       
            #txt = urllib.request.urlopen(searchUrl).read().decode('utf-8')
            
            
            
            print(searchUrl)
            
            try:
                dom = etree.parse(search)
                root = dom.getroot()
           
            except: #go around a bug in mpiwg service where ulrs are not properly encoded
                search = urllib.request.urlopen(searchUrl)
                txt=search.read().decode("utf-8")
                txt=txt.replace("&","&amp;")
                root =etree.fromstring(txt)
                
            
            # accumulate the elapsed time reported by the service over all chunks
            time = str(root.xpath("./elapsed-time-ms")[0].text)

            print(time)

            ret["time"] = ret.get("time", 0) + int(time)
            
          
            for token in root.xpath(".//token"):
                annot = {}

                # surface form of the token as returned by the tokenizer
                annot["spot"] = token.xpath("./name")[0].text

                try:
                    # look the lemma up once and reuse it for label and title
                    lemma = self.getLabel(annot['spot'], lang)
                    annot['label'] = lemma
                    annot['title'] = lemma

                    # link to the full HTML dictionary entry for this token
                    annot['uri'] = dictServUrl + "&query=%s" % annot['spot']
                    # placeholder offsets; the real start/end positions are computed further below
                    annot['start'] = 3
                    annot['end'] = 5
                    annot['confidence'] = 0.8
                    annot['image'] = {}

                    annot['abstract'] = ''
                    if lang=="lat":
                        
                      
                        for dicts in token.xpath(".//dictionary"):
                            dictName=dicts.xpath("./name")[0].text
                            if dictName == "cooper": #choose liddle sctt
                                ctsStr=[]
                               
                                try:
                                    annot['abstract']=dicts.xpath("./entries/entry/content/sense/trans")[0].text
                                except:
                                
                                    annot['abstract']=""    
                                    
                    elif lang=="ita":
                        
                      
                        for dicts in token.xpath(".//dictionary"):
                            dictName=dicts.xpath("./name")[0].text
                            if dictName == "baretti": #choose liddle sctt
                                ctsStr=[]
                               
                                try:
                                    #annot['abstract']=""    
                                    annot['abstract']=dicts.xpath("./entries/entry/content/i")[0].text
                                except:
                                
                                    annot['abstract']=""    
                    elif lang=="grc":
                        
                      
                        for dicts in token.xpath(".//dictionary"):
                            dictName=dicts.xpath("./name")[0].text
                            if dictName == "lsj": #choose liddle sctt
                                ctsStr=[]
                               
                                try:
                                    #annot['abstract']=""    
                                    annot['abstract']=dicts.xpath("./entries/entry/content/tr")[0].text
                                except:
                                
                                    annot['abstract']=""    
                    
                    
                   
                    # type the annotation as an orthographic word (GOLD linguistics ontology)
                    annot['lod'] = {"word": "http://purl.org/linguistics/gold/OrthographicWord"}
                    annot['type'] = ["http://purl.org/linguistics/gold/OrthographicWord"]
                    annot['types'] = ["http://purl.org/linguistics/gold/OrthographicWord"]
                    annotations.append(annot)

                except Exception as err:
                    # lemma lookup or annotation failed for this token (e.g. no <lemma> element
                    # in the morphology response); skip it instead of aborting the whole request
                    print(err)
                    
            
        # compute the character offsets of each annotated token in the original text;
        # cs is the not-yet-consumed tail of the text, cn the absolute offset of that tail
        cn = 0
        cs = text
        wps = []
        for an in annotations:
            t = an['spot']
            ps = cs.find(t)
            if ps < 0:
                # token not found in the remaining text (e.g. because of normalisation);
                # keep the placeholder offsets rather than corrupting the following ones
                continue
            wps.append((t, ps + cn))
            an['start'] = ps + cn
            an['end'] = ps + cn + len(t)

            cn = ps + cn + len(t)
            cs = cs[ps + len(t):]

        print("Length of word list:")
        print(len(wps))
        
            
        
        
                
        ret['annotations'] = annotations
        ret['lang'] = lang

        return JsonResponse(ret)
   
        #for token in dom.getroot().result.tokens:
        #    print (token)