changeset 16:70110fb915a9

searchlines
author dwinter
date Fri, 16 Nov 2012 12:30:30 +0100
parents 5bab6e95980e
children 64d6ac1a1354
files restService/restService.py searchService/__init__.py searchService/searchLines.py
diffstat 2 files changed, 191 insertions(+), 1 deletions(-) [+]
line wrap: on
line diff
--- a/restService/restService.py	Thu Nov 15 17:23:31 2012 +0100
+++ b/restService/restService.py	Fri Nov 16 12:30:30 2012 +0100
@@ -8,13 +8,15 @@
 from redirector import redirector 
 import logging
 from searcher import searcher
+from searchService.searchLines import searchLines
 
 urls = (
     '/purl/(.+)','purl',
     '/docuview/(.+)','redirector',
     '/search','searcher',
     '/indexMeta/(.+)','indexMeta',
-    '/','serviceDescription'
+    '/','serviceDescription',
+    '/searchLines','searchLines'
 )
 
 app = web.application(urls, globals())
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/searchService/searchLines.py	Fri Nov 16 12:30:30 2012 +0100
@@ -0,0 +1,188 @@
+'''
+Created on 16.11.2012
+
+@author: dwinter
+'''
+
+import solr
+import web
+import urllib
+import os.path
+
+SOLR_SERVER="http://localhost:8983/solr"
+DRI_SERVER="http://localhost:8080/purl/"
+
+class searchLines:
+    
+    def __init__(self):
+        """Open the Solr connection and create the two search handlers."""
+        connection = solr.SolrConnection(SOLR_SERVER)
+        self.con = connection
+
+        # handler bound to the "/fulltexts-line/select" path (used by GET)
+        self.search = solr.SearchHandler(connection, "/fulltexts-line/select")
+        # handler bound to the "/fulltexts/select" path (used by getPageSize)
+        self.searchText = solr.SearchHandler(connection, "/fulltexts/select")
+    
+    def GET(self):
+        """Search the line index and return one row per hit.
+
+        Request parameters (at least one is required):
+
+        * ``query`` - Solr query text the restrictions below are appended to.
+        * ``doc``   - document folder; added as ``archive-path-folder``.
+        * ``dri``   - identifier looked up at DRI_SERVER; the answer is
+                      added as ``archive-path-indexMeta``.
+        * ``pf``    - page file name; added as ``pf``.
+
+        Returns:
+            list of dicts, one per hit, as built by generateRowForJson().
+
+        Raises:
+            web.badrequest: if no parameter was sent or the DRI lookup
+                answers with HTTP status 404.
+        """
+        paras = web.input()
+
+        if not paras:
+            raise web.badrequest("Needs at least one of the parameters: query, doc, dri, pf")
+
+        # NOTE(review): without ``query`` the literal text "q=" becomes the
+        # start of the Solr query. That looks unintended, but it is kept
+        # unchanged here - confirm what the index expects.
+        queryString = paras.get("query") or "q="
+
+        docPath = paras.get("doc")
+        if docPath:
+            docPath = self._toArchivePath(docPath)
+            # make sure there is no index.meta at the end; normpath also
+            # removes a trailing "/"
+            docPath = docPath.replace("/index.meta", "")
+            docPath = os.path.normpath(docPath)
+            queryString += """ archive-path-folder:"%s" """ % self._escapePhrase(docPath)
+
+        dri = paras.get("dri")
+        if dri:
+            f = urllib.urlopen(DRI_SERVER + dri)
+            try:
+                status = f.getcode()
+                indexMeta = f.read()
+            finally:
+                # always release the HTTP connection
+                f.close()
+
+            if status == 404:
+                raise web.badrequest("DRI: %s not existing" % dri)
+
+            if indexMeta:
+                indexMeta = self._toArchivePath(indexMeta)
+            indexMeta = os.path.normpath(indexMeta)
+            queryString += ' archive-path-indexMeta:"%s"' % self._escapePhrase(indexMeta)
+
+        page = paras.get("pf")
+        if page:
+            # the index only holds the page name, not its path, so keep just
+            # the last path component in case "pageimg/xxx" was passed
+            name = os.path.basename(page)
+            name = os.path.splitext(name)[0]
+            # additionally cut off one more suffix if there is one
+            parts = name.split(".")
+            if len(parts) > 1:
+                name = ".".join(parts[:-1])
+            queryString += " pf:%s" % name
+
+        response = self.search(queryString, wt="json")
+        pageSize = self.getPageSize(queryString)
+
+        # NOTE(review): the rows are returned as Python objects, not as a
+        # serialized JSON string - confirm the caller handles that.
+        return [self.generateRowForJson(hitId, hit, queryString, pageSize)
+                for hitId, hit in enumerate(response)]
+
+    def _toArchivePath(self, path):
+        """Return path as an absolute archive path below "/mpiwg/".
+
+        A path already starting with "/mpiwg/" is returned unchanged.  A path
+        starting with "online/" only receives the "/mpiwg/" prefix; every
+        other path is placed below "/mpiwg/online/".
+        """
+        if path.startswith("/mpiwg/"):
+            return path
+        relative = path.lstrip("/")
+        if relative.startswith("online/"):
+            # avoid producing "/mpiwg/online/online/..."
+            return "/mpiwg/" + relative
+        return "/mpiwg/online/" + relative
+
+    def _escapePhrase(self, text):
+        """Escape backslash and double quote so text stays inside a quoted Solr phrase."""
+        return text.replace("\\", "\\\\").replace('"', '\\"')
+    
+    def generateRowForJson(self, hitID, hit, query, pageSize):
+        """Build the result row for one search hit.
+
+        Args:
+            hitID: running number of the hit; stored as a string under "id".
+            hit: mapping for the hit.  Its "bbox" value is expected in the
+                form "bbox x1 x2 y1 y2", where x is the top-left corner and
+                y the bottom-right corner.
+            query: query string; stored unchanged under "text".
+            pageSize: 2-tuple the bounding box is scaled by.
+
+        Returns:
+            dict with the keys "id" and "text".  "areas" (a list with one
+            area dict) is only added when the bbox can be parsed and
+            pageSize has no zero component.
+        """
+        ret = {"id": str(hitID), "text": query}
+
+        bbox = hit.get("bbox")
+        if not bbox:
+            # hit carries no bounding box at all
+            return ret
+
+        # split() without argument also copes with repeated blanks
+        parts = bbox.split()
+        try:
+            x = (int(parts[1]), int(parts[2]))
+            y = (int(parts[3]), int(parts[4]))
+        except (ValueError, IndexError):
+            return ret
+
+        # getPageSize() reports an unknown size as (0, 0); relative
+        # coordinates cannot be computed then, so return the row without area
+        if not pageSize or not pageSize[0] or not pageSize[1]:
+            return ret
+
+        x, y = self.calculateRelBoundingBox(x, y, pageSize)
+        ret["areas"] = [self.generateAreaForJson(x, y)]
+
+        return ret
+    
+    def generateAreaForJson(self, x, y):
+        """Build the area dict for a bounding box given by two corner points.
+
+        Args:
+            x: top-left corner; x[0] is emitted as "x", x[1] as "y".
+            y: bottom-right corner, with the same component order as x.
+
+        Returns:
+            dict with the keys "x", "y", "width" and "height"; all values
+            are converted with str().
+        """
+        return {
+            "x": str(x[0]),
+            "y": str(x[1]),
+            # component 0 is the axis "x" is taken from, so its extent is
+            # the width; component 1 accordingly gives the height
+            "width": str(y[0] - x[0]),
+            "height": str(y[1] - x[1]),
+        }
+        
+    def calculateRelBoundingBox(self, x, y, pageSize):
+        """Scale the points x and y by pageSize.
+
+        Each point is a 2-sequence; component i is converted to float and
+        divided by pageSize[i].
+
+        Returns:
+            tuple (scaled x, scaled y), each a 2-tuple of floats.
+
+        Raises:
+            ZeroDivisionError: if a component of pageSize is 0.
+        """
+        def scale(point):
+            return (float(point[0]) / pageSize[0], float(point[1]) / pageSize[1])
+
+        return scale(x), scale(y)
+        
+    def getPageSize(self, queryString):
+        """Look up the page size for queryString via the searchText handler.
+
+        Only the first hit is evaluated (there should be just one).  Its
+        "ocr_page" value is expected in the form "x1 x2 y1 y2", where x is
+        the top-left corner and y the bottom-right corner.
+
+        Returns:
+            tuple (y1 - x1, y2 - x2), or (0, 0) when there is no hit, the
+            hit has no "ocr_page" value or the value cannot be parsed.
+        """
+        response = self.searchText(queryString, wt="json")
+
+        for hit in response:  # there should really be only one
+            ocrPage = hit.get("ocr_page")
+            if not ocrPage:
+                return 0, 0
+
+            try:
+                # too few numbers make the unpacking fail with ValueError, too
+                x1, x2, y1, y2 = [int(part) for part in ocrPage.split()[:4]]
+            except ValueError:
+                return 0, 0
+
+            return y1 - x1, y2 - x2
+
+        return 0, 0
+        
+        
+