view searchService/searchLines.py @ 16:70110fb915a9

searchlines
author dwinter
date Fri, 16 Nov 2012 12:30:30 +0100
parents
children 64d6ac1a1354
line wrap: on
line source

'''
Created on 16.11.2012

@author: dwinter
'''

import solr
import web
import urllib
import os.path

# Base URL of the Solr instance; handed to solr.SolrConnection in searchLines.__init__.
SOLR_SERVER="http://localhost:8983/solr"
# URL prefix of the DRI resolver; the `dri` request parameter is appended to it and the result is fetched.
DRI_SERVER="http://localhost:8080/purl/"

class searchLines:
    '''
    web.py style handler (exposes GET) that searches Solr for text lines and
    returns, per hit, the bounding box of the hit relative to the page size.

    Two Solr search handlers are used: "/fulltexts-line/select", whose hits are
    read for their "bbox" field, and "/fulltexts/select", whose hits are read
    for their "ocr_page" field (the page extent).
    '''

    def __init__(self):
        '''Open the Solr connection and create both search handlers.'''
        self.con = solr.SolrConnection(SOLR_SERVER)

        # hits of this handler carry the "bbox" field
        self.search = solr.SearchHandler(self.con, "/fulltexts-line/select")
        # hits of this handler carry the "ocr_page" field
        self.searchText = solr.SearchHandler(self.con, "/fulltexts/select")

    def GET(self):
        '''
        Handle a GET request.

        Request parameters (all optional, but at least one is required):
          query -- Solr query text, used verbatim
          doc   -- archive path of the document folder
          dri   -- identifier that is resolved to an index.meta path via DRI_SERVER
          pf    -- page file name (a path and file suffix are stripped)

        Returns a list with one dict per hit (see generateRowForJson).
        Raises web.badrequest if no usable parameter was given or the DRI
        cannot be resolved.
        '''
        # NOTE(review): the message says "page" but the parameter read is "pf".
        missingParameters = "Needs at minimum one of the parametrs: query,doc,dri,page"

        paras = web.input()
        if len(paras.keys()) == 0:
            raise web.badrequest(missingParameters)

        queryString = self._buildQuery(paras)
        if not queryString:
            # only unknown or empty parameters were sent; there is nothing to search for
            raise web.badrequest(missingParameters)

        response = self.search(queryString, wt="json")
        pageSize = self.getPageSize(queryString)

        # NOTE(review): a plain Python list is returned here; presumably it is
        # serialised to JSON elsewhere before it reaches the client -- confirm.
        return [self.generateRowForJson(hitId, hit, queryString, pageSize)
                for hitId, hit in enumerate(response)]

    def _buildQuery(self, paras):
        '''
        Assemble the Solr query from the request parameters.

        paras -- mapping with a get() method (the result of web.input()).
        Returns the query clauses joined by single blanks, or "" if none of
        query/doc/dri/pf was supplied.
        Raises web.badrequest for an unresolvable DRI or a pf value without a name.
        '''
        clauses = []

        query = paras.get("query")
        if query:
            clauses.append(query)

        docPath = paras.get("doc")
        if docPath:
            docPath = self._prefixArchivePath(docPath)
            # the folder field holds the directory, so drop a trailing index.meta;
            # normpath also removes a trailing "/" and doubled slashes
            docPath = os.path.normpath(docPath.replace("/index.meta", ""))
            clauses.append('archive-path-folder:%s' % self._quotePhrase(docPath))

        dri = paras.get('dri')
        if dri:
            clauses.append('archive-path-indexMeta:%s' % self._quotePhrase(self._resolveDri(dri)))

        page = paras.get("pf")
        if page:
            name = self._pageName(page)
            if not name:
                # an empty "pf:" clause would be a Solr syntax error
                raise web.badrequest("pf: %s contains no page name" % page)
            clauses.append("pf:%s" % name)

        return " ".join(clauses)

    def _prefixArchivePath(self, path):
        '''
        Make sure path starts with "/mpiwg/online/" style prefix.

        Paths already below "/mpiwg/" are returned unchanged, paths starting
        with "online/" only get "/mpiwg/" prepended, everything else is placed
        below "/mpiwg/online/".
        '''
        if path.startswith("/mpiwg/"):
            return path
        relative = path.lstrip("/")
        if relative.startswith("online/"):
            return "/mpiwg/" + relative
        return "/mpiwg/online/" + relative

    def _normalizeArchivePath(self, path):
        '''Return path with the archive prefix added and normalised by os.path.normpath.'''
        return os.path.normpath(self._prefixArchivePath(path))

    def _quotePhrase(self, value):
        '''Return value as a double-quoted Solr phrase with backslash and quote escaped.'''
        return '"%s"' % value.replace('\\', '\\\\').replace('"', '\\"')

    def _resolveDri(self, dri):
        '''
        Fetch DRI_SERVER+dri and return the response body as a normalised
        archive path.

        Raises web.badrequest if the server answers 404 or with an empty body.
        '''
        f = urllib.urlopen(DRI_SERVER + dri)
        try:
            status = f.getcode()
            # surrounding whitespace (e.g. a final newline) would end up inside the quoted phrase
            indexMeta = f.read().strip()
        finally:
            f.close()

        if status == 404 or not indexMeta:
            raise web.badrequest("DRI: %s not existing" % dri)

        return self._normalizeArchivePath(indexMeta)

    def _pageName(self, page):
        '''
        Reduce a pf value to the bare page name.

        The index only stores the page name, not its path, so only the last
        path component is kept when something like "pageimg/xxx" is passed.
        The file extension is removed and, if the remaining name still
        contains a dot, the part after the last dot is removed as well.
        '''
        name = os.path.splitext(os.path.split(page)[1])[0]
        parts = name.split(".")
        if len(parts) > 1:
            name = ".".join(parts[:-1])
        return name

    def generateRowForJson(self, hitID, hit, query, pageSize):
        '''
        Build the result row for one hit.

        hitID    -- running number of the hit, stored as string under "id"
        hit      -- mapping with a get() method; its "bbox" field has the form
                    "bbox x1 x2 y1 y2", x being the upper left corner and y
                    the lower right corner
        query    -- stored unchanged under "text"
        pageSize -- pair as returned by getPageSize

        Returns a dict with "id" and "text"; "areas" (a one-element list, see
        generateAreaForJson) is only added if the bbox can be parsed and both
        page dimensions are non-zero.
        '''
        ret = {"id": str(hitID), "text": query}

        tokens = (hit.get("bbox") or "").split()
        try:
            x = (int(tokens[1]), int(tokens[2]))
            y = (int(tokens[3]), int(tokens[4]))
        except (ValueError, IndexError):
            return ret

        if not pageSize[0] or not pageSize[1]:
            # unknown page size (getPageSize returned 0): a relative box cannot be computed
            return ret

        x, y = self.calculateRelBoundingBox(x, y, pageSize)
        ret["areas"] = [self.generateAreaForJson(x, y)]
        return ret

    def generateAreaForJson(self, x, y):
        '''
        Describe the rectangle between corner x (upper left) and corner y
        (lower right) as a dict of strings with the keys "x", "y", "width"
        and "height". Index 0 of each corner is the horizontal coordinate,
        index 1 the vertical one.
        '''
        return {
            "x": str(x[0]),
            "y": str(x[1]),
            "width": str(y[0] - x[0]),
            "height": str(y[1] - x[1]),
        }

    def calculateRelBoundingBox(self, x, y, pageSize):
        '''
        Scale both corners by the page size: component 0 is divided by
        pageSize[0], component 1 by pageSize[1].

        Returns the two scaled corners as a pair of float pairs.
        Raises ZeroDivisionError if a page dimension is 0.
        '''
        xneu = (float(x[0]) / pageSize[0], float(x[1]) / pageSize[1])
        yneu = (float(y[0]) / pageSize[0], float(y[1]) / pageSize[1])
        return xneu, yneu

    def getPageSize(self, queryString):
        '''
        Run queryString against the "/fulltexts/select" handler and derive the
        page size from the "ocr_page" field of the first hit.

        The field has the form "x1 x2 y1 y2", x being the upper left corner
        and y the lower right corner.

        Returns (y1-x1, y2-x2), or (0, 0) if there is no hit or the field is
        missing or malformed.
        '''
        response = self.searchText(queryString, wt="json")

        for hit in response:  # there should really be only one
            tokens = (hit.get("ocr_page") or "").split()
            try:
                # too few tokens make the unpacking fail with ValueError as well
                x1, x2, y1, y2 = [int(token) for token in tokens[:4]]
            except ValueError:
                return 0, 0
            return y1 - x1, y2 - x2

        return 0, 0