view searchService/searchLines.py @ 19:cce127a28fc9

added getpurls
author dwinter
date Wed, 21 Nov 2012 15:39:08 +0100
parents 1eb5e3f6444b
children
line wrap: on
line source

'''
Created on 16.11.2012

@author: dwinter
'''

import solr
import web
import urllib
import os.path
import json
import urllib2
import logging

# Base URL of the Solr instance that searchLines connects to.
SOLR_SERVER="https://md.mpiwg-berlin.mpg.de/solr"
# Base URL to which a "dri" request parameter is appended in order to
# fetch the index.meta path it stands for.
DRI_SERVER="http://md.mpiwg-berlin.mpg.de/"

class searchLines:
    """web.py handler that looks up text lines in Solr and returns them as JSON.

    Every hit of the "/fulltexts-line/select" handler is turned into a row
    whose bounding box is expressed relative to the page extent obtained
    from the "/fulltexts/select" handler.
    """

    def __init__(self):
        """Open the Solr connection and create the two search handlers."""
        #logging.basicConfig(filename='/tmp/solr.log',level=logging.DEBUG)
        self.con = solr.SolrConnection(SOLR_SERVER,debug=False)

        # Hits of this handler are read for their "bbox" field.
        self.search = solr.SearchHandler(self.con,"/fulltexts-line/select")
        # Hits of this handler are read for their "ocr_page" field.
        self.searchText = solr.SearchHandler(self.con,"/fulltexts/select")

    def GET(self):
        """Answer a GET request with the matching lines as a JSON string.

        Request parameters (at least one must be present):
            query -- URL-quoted Solr query
            uri   -- archive path of the document folder
            dri   -- identifier that DRI_SERVER resolves to an index.meta path
            pf    -- page file name; directory part and extension are dropped

        Returns:
            JSON text of the form {"rows": [...], "total": <number of rows>}.

        Raises:
            web.badrequest: if no parameter was sent or the DRI lookup
                answers with HTTP 404.
        """
        paras = web.input()

        if not paras:
            # The message lists the parameter names this handler really reads.
            raise web.badrequest("Needs at minimum one of the parameters: query,uri,dri,pf")

        queryString = self._buildQuery(paras)

        response = self.search(queryString)
        pageSize = self.getPageSize(queryString)

        rows = [self.generateRowForJson(hitId, hit, queryString, pageSize)
                for hitId, hit in enumerate(response)]

        returnJSON = {'rows': rows, 'total': len(rows)}

        web.header('Content-Type', 'application/json')
        # NOTE(review): browsers reject a wildcard origin combined with
        # Allow-Credentials: true -- confirm which of the two is intended.
        web.header('Access-Control-Allow-Origin', '*')
        web.header('Access-Control-Allow-Credentials', 'true')

        return json.dumps(returnJSON)

    def _buildQuery(self, paras):
        """Assemble the Solr query string from the request parameters."""
        rawQuery = paras.get("query")
        # Unquote only an existing value: unquote(None) raises, which made
        # requests without "query" fail before the default could apply.
        queryString = urllib2.unquote(rawQuery) if rawQuery else ""
        if not queryString:
            # NOTE(review): "q=" is kept from the original code; it looks like
            # a placeholder rather than a match-all query -- confirm.
            queryString = "q="

        docPath = paras.get("uri")
        if docPath:
            docPath = self._normalizeDocPath(urllib2.unquote(docPath))
            queryString += ' AND archive-path-folder:"%s" ' % self._escapePhrase(docPath)

        dri = paras.get('dri')
        if dri:
            indexMeta = self._lookupIndexMeta(dri)
            queryString += ' AND archive-path-indexMeta:"%s"' % self._escapePhrase(indexMeta)

        page = paras.get("pf")
        if page:
            # The index holds only the page name, not its path, so keep just
            # the last component (e.g. when "pageimg/xxx" is passed) and drop
            # the file extension.
            name = os.path.splitext(os.path.basename(page))[0]
            # NOTE(review): the name is inserted as an unquoted term, so
            # query-syntax characters in "pf" reach Solr unescaped.
            queryString += " AND pf:%s" % name

        return queryString

    @staticmethod
    def _escapePhrase(value):
        """Escape backslashes and double quotes for use inside a quoted Solr phrase."""
        return value.replace("\\", "\\\\").replace('"', '\\"')

    @staticmethod
    def _normalizeDocPath(docPath):
        """Return docPath as a normalised folder path below "/mpiwg/".

        Paths that do not start with "/mpiwg/" get "/mpiwg/online/" prepended;
        "/index.meta" and a trailing slash are removed.
        """
        # The original nested check for "/mpiwg/online/" could never take its
        # else branch, so a single test is equivalent.
        if not docPath.startswith("/mpiwg/"):
            docPath = "/mpiwg/online/" + docPath

        docPath = docPath.replace("/index.meta", "")
        if docPath.endswith("/"):
            docPath = docPath[:-1]

        return os.path.normpath(docPath)

    def _lookupIndexMeta(self, dri):
        """Fetch the index.meta path for dri from DRI_SERVER and normalise it.

        Raises:
            web.badrequest: if the server answers with HTTP 404.
        """
        f = urllib.urlopen(DRI_SERVER + dri)
        try:
            if f.getcode() == 404:
                raise web.badrequest("DRI: %s not existing" % dri)
            indexMeta = f.read()
        finally:
            # Always release the connection, including on the 404 path.
            f.close()

        if indexMeta and not indexMeta.startswith("/mpiwg/"):
            indexMeta = "/mpiwg/online/" + indexMeta

        return os.path.normpath(indexMeta)

    def generateRowForJson(self,hitID,hit,query,pageSize):
        """Build the JSON row for one hit.

        Args:
            hitID: running number of the hit; stored as a string under "id".
            hit: mapping with a "bbox" entry of the form "bbox x1 x2 y1 y2",
                where (x1, x2) is the upper-left and (y1, y2) the lower-right
                corner.
            query: query string, stored under "text".
            pageSize: (width, height) used to scale the bounding box.

        Returns:
            A dict with "id" and "text". "areas" is added only when the
            bounding box can be parsed and pageSize has no zero component.
        """
        ret = {"id": str(hitID), "text": query}

        # getPageSize reports (0, 0) on failure; relative coordinates are
        # undefined then, so return the row without areas instead of
        # dividing by zero.
        if not pageSize[0] or not pageSize[1]:
            return ret

        fields = (hit.get("bbox") or "").split()

        try:
            # fields[0] is the literal "bbox" tag.
            x = (int(fields[1]), int(fields[2]))
            y = (int(fields[3]), int(fields[4]))
        except (IndexError, ValueError):
            return ret

        x, y = self.calculateRelBoundingBox(x, y, pageSize)

        ret["areas"] = [self.generateAreaForJson(x, y)]

        return ret

    def generateAreaForJson(self,x,y):
        """Return the area dict for the corners x (upper left) and y (lower right).

        All four values ("x", "y", "width", "height") are stored as strings.
        """
        return {
            "width": str(y[0] - x[0]),
            "height": str(y[1] - x[1]),
            "y": str(x[1]),
            "x": str(x[0]),
        }

    def calculateRelBoundingBox(self,x,y,pageSize):
        """Scale both corners by the page extent.

        Each corner is divided component-wise by pageSize (width, height).
        Raises ZeroDivisionError if a pageSize component is zero.
        """
        width, height = pageSize[0], pageSize[1]

        xneu = (float(x[0]) / width, float(x[1]) / height)
        yneu = (float(y[0]) / width, float(y[1]) / height)

        return xneu, yneu

    def getPageSize(self,queryString):
        """Return (width, height) of the page matched by queryString.

        Only the first hit is evaluated (there should be just one). Its
        "ocr_page" entry has the form "x1 x2 y1 y2", where (x1, x2) is the
        upper-left and (y1, y2) the lower-right corner.

        Returns:
            (y1 - x1, y2 - x2), or (0, 0) when there is no hit or the entry
            is missing or malformed.
        """
        response = self.searchText(queryString,wt="json")

        for hit in response:
            fields = (hit.get("ocr_page") or "").split()

            try:
                # Too few fields fail the unpacking with ValueError as well.
                x1, x2, y1, y2 = [int(value) for value in fields[:4]]
            except ValueError:
                return 0,0

            return y1-x1,y2-x2

        return 0,0