Mercurial > hg > purlService

--- a/restService/restService.py	Thu Nov 15 17:23:31 2012 +0100
+++ b/restService/restService.py	Fri Nov 16 12:30:30 2012 +0100
@@ -8,13 +8,15 @@
 from redirector import redirector
 import logging
 from searcher import searcher
+from searchService.searchLines import searchLines

 urls = (
     '/purl/(.+)','purl',
     '/docuview/(.+)','redirector',
     '/search','searcher',
     '/indexMeta/(.+)','indexMeta',
-    '/','serviceDescription'
+    '/','serviceDescription',
+    '/searchLines','searchLines'
 )

 app = web.application(urls, globals())
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/searchService/searchLines.py	Fri Nov 16 12:30:30 2012 +0100
@@ -0,0 +1,188 @@
+'''
+Created on 16.11.2012
+
+@author: dwinter
+'''
+
+import solr
+import web
+import urllib
+import os.path
+
+SOLR_SERVER="http://localhost:8983/solr"  # Solr base URL handed to solr.SolrConnection
+DRI_SERVER="http://localhost:8080/purl/"  # prefix of the URL fetched to resolve a "dri" parameter
+
+class searchLines:
+
+    def __init__(self):
+        """Create the Solr connection and the handlers for /fulltexts-line/select and /fulltexts/select."""
+        connection = self.con = solr.SolrConnection(SOLR_SERVER)
+        self.searchText = solr.SearchHandler(connection, "/fulltexts/select")
+        self.search = solr.SearchHandler(connection, "/fulltexts-line/select")
+
+    def GET(self):
+        """Handle GET /searchLines: query the line index, return one row per hit.
+
+        Request parameters (at least one must be present):
+          query -- Solr query string; the literal "q=" is used when it is missing
+          doc   -- document folder, matched against the field archive-path-folder
+          dri   -- identifier fetched from DRI_SERVER; the response body is taken
+                   as a path and matched against the field archive-path-indexMeta
+          pf    -- page file name; directory part and suffixes are stripped
+                   (e.g. "pageimg/0001.tif" is searched as pf:0001)
+
+        Returns the list of row dicts built by generateRowForJson.
+        Raises web.badrequest when no parameter is given or DRI_SERVER answers 404.
+        """
+        paras = web.input()
+        if not paras:
+            raise web.badrequest("Needs at minimum one of the parameters: query,doc,dri,pf")
+
+        # NOTE(review): the fallback "q=" ends up inside the Solr q value itself;
+        # kept unchanged here -- confirm that this is what the index expects.
+        queryString = paras.get("query") or "q="
+
+        docPath = paras.get("doc")
+        if docPath:
+            # Folder of the document: no index.meta component, no trailing slash.
+            docPath = self._toArchivePath(docPath).replace("/index.meta", "")
+            docPath = os.path.normpath(docPath)
+
+            queryString += """ archive-path-folder:"%s" """ % self._escapeQuoted(docPath)
+
+        dri = paras.get("dri")
+        if dri:
+            f = urllib.urlopen(DRI_SERVER + dri)
+            try:
+                # Close the connection on every path, including the 404 one.
+                notFound = f.getcode() == 404
+                indexMeta = f.read()
+            finally:
+                f.close()
+
+            if notFound:
+                raise web.badrequest("DRI: %s not existing" % dri)
+
+            # Unlike "doc", the index.meta file name stays part of this path.
+            indexMeta = self._toArchivePath(indexMeta)
+
+            queryString += ' archive-path-indexMeta:"%s"' % self._escapeQuoted(indexMeta)
+
+        page = paras.get("pf")
+        if page:
+            # The index holds only the page name, not its path, so only the last
+            # path component is used in case something like "pageimg/xxx" is passed.
+            name = os.path.splitext(os.path.basename(page))[0]
+            # Additionally cut off one more dot-separated suffix if there is one;
+            # rsplit leaves a name without a dot untouched.
+            name = name.rsplit(".", 1)[0]
+
+            queryString += " pf:%s" % name
+
+        response = self.search(queryString, wt="json")
+        pageSize = self.getPageSize(queryString)
+
+        # NOTE(review): looks like web.py will stringify this list rather than
+        # emit JSON; the return type is left as it was -- confirm with clients.
+        return [self.generateRowForJson(hitId, hit, queryString, pageSize)
+                for hitId, hit in enumerate(response)]
+
+    @staticmethod
+    def _toArchivePath(path):
+        """Return path normalised and rooted at /mpiwg/.
+
+        Paths already below /mpiwg/ are only normalised. Other paths lose their
+        leading slashes; those starting with "online/" get the prefix /mpiwg/,
+        all others /mpiwg/online/. An empty path goes to normpath unchanged.
+
+        Examples: "permanent/x"  -> "/mpiwg/online/permanent/x"
+                  "online/x/"    -> "/mpiwg/online/x"
+                  "/mpiwg/ext/x" -> "/mpiwg/ext/x"
+        """
+        if path and not path.startswith("/mpiwg/"):
+            relative = path.lstrip("/")
+            # The inner test must look for "online/": a path that reaches this
+            # branch can never start with "/mpiwg/online/".
+            if relative.startswith("online/"):
+                path = "/mpiwg/" + relative
+            else:
+                path = "/mpiwg/online/" + relative
+        return os.path.normpath(path)
+
+    @staticmethod
+    def _escapeQuoted(value):
+        """Escape value for use between double quotes in a Solr query.
+
+        Backslashes and double quotes are prefixed with a backslash so that
+        request input cannot end the quoted term early.
+        """
+        return value.replace("\\", "\\\\").replace('"', '\\"')
+
+    def generateRowForJson(self,hitID,hit,query,pageSize):
+        """Build the result row for one hit of the line index.
+
+        The row always carries "id" (hitID as a string) and "text" (the query).
+        "areas" is added only when the hit has a parsable bbox value and both
+        entries of pageSize are non-zero.
+        """
+        ret = {"id": str(hitID), "text": query}
+
+        # Format is "bbox x1 x2 y1 y2": x is the upper left corner,
+        # y the lower right corner.
+        splitted = (hit.get("bbox") or "").strip().split(" ")
+        try:
+            x = (int(splitted[1]), int(splitted[2]))
+            y = (int(splitted[3]), int(splitted[4]))
+        except (ValueError, IndexError):
+            return ret
+
+        # getPageSize reports (0, 0) when the page size is unknown; scaling
+        # by it would divide by zero, so the row is returned without areas.
+        if not pageSize[0] or not pageSize[1]:
+            return ret
+        x, y = self.calculateRelBoundingBox(x, y, pageSize)
+        ret["areas"] = [self.generateAreaForJson(x, y)]
+        return ret
+
+    def generateAreaForJson(self,x,y):
+        """Return the area dict (all values as strings) of a bounding box.
+
+        x is the upper left corner, y the lower right corner, each given as
+        (horizontal, vertical); width is the horizontal extent, height the vertical.
+        """
+        left, top = x[0], x[1]
+        right, bottom = y[0], y[1]
+        return {"x": str(left), "y": str(top), "width": str(right - left), "height": str(bottom - top)}
+
+    def calculateRelBoundingBox(self,x,y,pageSize):
+        """Return corners x and y with each component divided by the matching pageSize entry."""
+        relative = []
+        for corner in (x, y):
+            relative.append((float(corner[0])/pageSize[0], float(corner[1])/pageSize[1]))
+        return relative[0], relative[1]
+
+    def getPageSize(self,queryString):
+        """Return the page extent (y1-x1, y2-x2) found for queryString via searchText.
+
+        Only the first hit is evaluated (there should be just one). Its ocr_page
+        value has the format "x1 x2 y1 y2": x is the upper left corner, y the
+        lower right corner. Returns (0, 0) when there is no hit or when the
+        value is missing or cannot be parsed.
+        """
+        response = self.searchText(queryString,wt="json")
+
+        for hit in response:
+            splitted = (hit.get("ocr_page") or "").strip().split(" ")
+            try:
+                x1, x2, y1, y2 = [int(value) for value in splitted[:4]]
+            except ValueError:
+                # Raised for non-numeric values and for fewer than four values.
+                return 0,0
+
+            # Lower right corner minus upper left corner.
+            return y1-x1,y2-x2
+
+        return 0,0
+
+
+