diff ttools/views.py @ 0:af2f8fe486f6 default tip

initial
author Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de>
date Tue, 17 Feb 2015 12:44:40 +0100
parents
children
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ttools/views.py	Tue Feb 17 12:44:40 2015 +0100
@@ -0,0 +1,247 @@
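+# Django view that tokenizes a text with the MPDL language-technology
+# services and annotates every word with lemma and dictionary information.
+# Hypothetical example request (the exact mount point depends on the
+# project's urls.py):
+#   GET .../tt/api?text=arma+virumque+cano&lang=lat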
+from django.shortcuts import redirect
+from django.views.generic.base import View
+from django.http.response import JsonResponse
+
+import urllib.error
+import urllib.parse
+import urllib.request
+
+from lxml import etree
+
+
+class TextToolAnalyse(View):
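+    """View that annotates a text with lemma information.
+
+    The text is tokenized by the MPDL service, each token is looked up
+    in the morphology service, and the result is returned as a JSON
+    list of annotations."""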
+    
+    def getLabel(self,token,lang):
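+        """Return the first lemma name the morphology service reports for
+        `token` in language `lang`; raises IndexError when the response
+        contains no lemma."""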
+        morphUrl = "http://mpdl-service.mpiwg-berlin.mpg.de/mpiwg-mpdl-lt-web/lt/GetDictionaryEntries?language=%s&outputFormat=xml&outputType=morphCompact" % lang
+        morphUrl += "&query=%s" % urllib.parse.quote(token)
+
+        search = urllib.request.urlopen(morphUrl)
+        dom = etree.parse(search).getroot()
+
+        lem = dom.xpath("//lemma/name")[0].text
+        return lem
+
+    def post(self,request):
+        return self.get(request,method="post")
+    
+    def get(self,request,method="get"):
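+        """Annotate the submitted text and return the annotations as JSON.
+
+        Parameters (GET or POST): `text` (required) and `lang` (optional,
+        defaults to "lat"; "ita" and "grc" get dedicated dictionaries).
+        The response has the shape
+        {"annotations": [...], "lang": ..., "time": ...}."""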
+        # Read the request parameters from the query string or form body.
+        params_in = request.GET if method == "get" else request.POST
+        try:
+            text = params_in['text']
+        except KeyError:
+            return redirect("./tt/api")
+        lang = params_in.get('lang', "lat")
+                
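+        # MPDL service endpoints: the tokenizer and the HTML dictionary
+        # view linked from each annotation's `uri` (the XML morphology
+        # lookup is built inside getLabel).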
+        url = "http://mpdl-service.mpiwg-berlin.mpg.de/mpiwg-mpdl-lt-web/text/Tokenize"
+        dictServUrl = "http://mpdl-service.mpiwg-berlin.mpg.de/mpiwg-mpdl-lt-web/lt/GetDictionaryEntries?language=%s&outputFormat=html&outputType=morphCompact&outputType=dictFull" % lang
+
+        # Example query string understood by the dictionary service:
+        # query=preface&queryDisplay=Preface&language=en&outputFormat=html&outputType=morphCompact&outputType=dictFull
+
+        params = {}
+
+        chunks = len(text) // 300 + 1  # split the text into 300-character chunks
+        ret = {}
+        annotations = []
+
+        for chunk in range(min(10, chunks)):  # at most 10 chunks for now, otherwise the request takes too long
+            currentText = text[chunk * 300:(chunk + 1) * 300]
+        
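+            # Parameters for the Tokenize request, rebuilt for each chunk.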
+            params["inputString"] = currentText
+            params["language"] = lang
+            params["outputFormat"] = "xml"
+            #params["dictionary"]="Yes"
+
+            searchUrl = url + "?" + urllib.parse.urlencode(params, doseq=True)
+
+            search = urllib.request.urlopen(searchUrl)
+
+            try:
+                dom = etree.parse(search)
+                root = dom.getroot()
+            except etree.XMLSyntaxError:
+                # Work around a bug in the MPIWG service where URLs in the
+                # response are not properly encoded: re-fetch and escape the
+                # bare ampersands before parsing.
+                search = urllib.request.urlopen(searchUrl)
+                txt = search.read().decode("utf-8")
+                txt = txt.replace("&", "&amp;")
+                root = etree.fromstring(txt)
+
+            # Accumulate the processing time reported by the service.
+            time = root.xpath("./elapsed-time-ms")[0].text
+            ret["time"] = ret.get("time", 0) + int(time)
+
+            for token in root.xpath(".//token"):
+                annot = {}
+                annot["spot"] = token.xpath("./name")[0].text
+                try:
+                    label = self.getLabel(annot['spot'], lang)
+                    annot['label'] = label
+                    annot['title'] = label
+                    annot['uri'] = dictServUrl + "&query=%s" % annot['spot']
+                    # Placeholder offsets; the real start/end positions are
+                    # computed after the loop.
+                    annot['start'] = 3
+                    annot['end'] = 5
+                    annot['confidence'] = 0.8
+                    annot['image'] = {}
+                    annot['abstract'] = ''
+                    # For each language, the preferred dictionary and the
+                    # XPath to its translation text: cooper (Latin),
+                    # baretti (Italian), lsj = Liddell-Scott-Jones (Greek).
+                    dictsForLang = {
+                        "lat": ("cooper", "./entries/entry/content/sense/trans"),
+                        "ita": ("baretti", "./entries/entry/content/i"),
+                        "grc": ("lsj", "./entries/entry/content/tr"),
+                    }
+                    if lang in dictsForLang:
+                        dictName, transPath = dictsForLang[lang]
+                        for dicts in token.xpath(".//dictionary"):
+                            if dicts.xpath("./name")[0].text == dictName:
+                                try:
+                                    annot['abstract'] = dicts.xpath(transPath)[0].text
+                                except IndexError:
+                                    annot['abstract'] = ""
+
+                    annot['lod'] = {"word": "http://purl.org/linguistics/gold/OrthographicWord"}
+                    annot['type'] = ["http://purl.org/linguistics/gold/OrthographicWord"]
+                    annot['types'] = ["http://purl.org/linguistics/gold/OrthographicWord"]
+                    annotations.append(annot)
+
+                except (IndexError, urllib.error.URLError, etree.XMLSyntaxError) as err:
+                    # Skip tokens for which no dictionary entry could be
+                    # retrieved or parsed.
+                    print(err)
+
+        # Compute the character offsets of each annotation by scanning
+        # forward through the not-yet-consumed remainder of the text.
+        cn = 0      # absolute offset of the consumed prefix
+        cs = text   # remainder still to be scanned
+        for an in annotations:
+            t = an['spot']
+            ps = cs.find(t)
+            if ps < 0:
+                continue  # token not found verbatim; keep placeholder offsets
+            an['start'] = cn + ps
+            an['end'] = cn + ps + len(t)
+            cn = cn + ps + len(t)
+            cs = cs[ps + len(t):]
+
+        ret['annotations'] = annotations
+        ret['lang'] = lang
+
+        return JsonResponse(ret)