Mercurial > hg > purlService

'''
Created on 16.11.2012

@author: dwinter
'''

import solr
import web
import urllib
import os.path
import json
import urllib2
import logging

# Base URL handed to solr.SolrConnection by searchLines.
SOLR_SERVER = "https://md.mpiwg-berlin.mpg.de/solr"

# Base URL the "dri" request parameter is appended to when resolving a DRI.
DRI_SERVER = "http://md.mpiwg-berlin.mpg.de/"

class searchLines:
    '''
    web.py request handler: runs a Solr query and returns the bounding
    boxes of the hits, relative to the page size, as JSON.
    '''

    def __init__(self):
        '''Open the Solr connection and create the two search handlers.'''
        #logging.basicConfig(filename='/tmp/solr.log',level=logging.DEBUG)
        self.con = solr.SolrConnection(SOLR_SERVER, debug=False)

        # Hits of this handler are read for their "bbox" field.
        self.search = solr.SearchHandler(self.con, "/fulltexts-line/select")
        # Hits of this handler are read for their "ocr_page" field (page size).
        self.searchText = solr.SearchHandler(self.con, "/fulltexts/select")

    def GET(self):
        '''
        Handle a GET request and return the result as a JSON string.

        Request parameters (at least one has to be given):
          query -- URL-quoted Solr query
          uri   -- archive path of the document folder; a missing
                   "/mpiwg/online/" prefix is added and a trailing
                   "/index.meta" or "/" is removed
          dri   -- identifier appended to DRI_SERVER; the response body is
                   used as the path of the document's index.meta
          pf    -- page file; only the base name without extension is used

        Returns a JSON object with "rows" (one entry per hit, see
        generateRowForJson) and "total" (number of rows).

        Raises web.badrequest if no parameter is given or if the DRI server
        answers with 404.
        '''
        paras = web.input()

        if not paras:
            raise web.badrequest("Needs at minimum one of the parameters: query,uri,dri,pf")

        # Only unquote when the parameter is present: unquoting None fails.
        queryString = paras.get("query")
        if queryString:
            queryString = urllib2.unquote(queryString)

        if not queryString:
            # NOTE(review): "q=" is sent to Solr as part of the query text;
            # a match-all query ("*:*") may be what was intended - confirm.
            queryString = "q="

        docPath = paras.get("uri")
        if docPath:
            docPath = self._ensureArchivePrefix(urllib2.unquote(docPath))
            # No index.meta at the end; normpath also drops a trailing "/".
            docPath = os.path.normpath(docPath.replace("/index.meta", ""))

            queryString += """ AND archive-path-folder:"%s" """ % docPath

        dri = paras.get('dri')
        if dri:
            f = urllib.urlopen(DRI_SERVER + dri)
            try:
                status = f.getcode()
                indexMeta = f.read()
            finally:
                # Always release the connection, also when reading fails.
                f.close()

            if status == 404:
                raise web.badrequest("DRI: %s not existing" % dri)

            indexMeta = os.path.normpath(self._ensureArchivePrefix(indexMeta))

            queryString += ' AND archive-path-indexMeta:"%s"' % indexMeta

        page = paras.get("pf")
        if page:
            # The index only stores the page name, not its path, so keep just
            # the last component in case something like "pageimg/xxx" is given.
            name = os.path.splitext(os.path.basename(page))[0]

            queryString += " AND pf:%s" % name

        response = self.search(queryString)
        pageSize = self.getPageSize(queryString)

        rows = [self.generateRowForJson(hitId, hit, queryString, pageSize)
                for hitId, hit in enumerate(response)]

        returnJSON = {'rows': rows, 'total': len(rows)}

        web.header('Content-Type', 'application/json')
        # NOTE(review): browsers do not accept credentialed requests when
        # Allow-Origin is the wildcard "*" - confirm whether
        # Allow-Credentials is really needed here.
        web.header('Access-Control-Allow-Origin', '*')
        web.header('Access-Control-Allow-Credentials', 'true')

        return json.dumps(returnJSON)

    def _ensureArchivePrefix(self, path):
        '''Return path prefixed with "/mpiwg/online/" unless it is empty or already starts with "/mpiwg/".'''
        if path and not path.startswith("/mpiwg/"):
            return "/mpiwg/online/" + path
        return path

    def generateRowForJson(self, hitID, hit, query, pageSize):
        '''
        Build the JSON row for one hit.

        hitID    -- running number of the hit, stored as string under "id"
        hit      -- mapping with a "bbox" entry of the form "bbox x1 x2 y1 y2",
                    where (x1, x2) is the upper left and (y1, y2) the lower
                    right corner
        query    -- query string, stored under "text"
        pageSize -- (width, height) of the page

        Returns a dict with "id" and "text". "areas" (a list with one area,
        see generateAreaForJson) is only added when the bbox can be parsed
        and both page dimensions are non-zero.
        '''
        ret = {"id": str(hitID), "text": query}

        bbox = hit.get("bbox")
        if not bbox:
            return ret

        # splitted[0] is the literal "bbox", the numbers follow.
        splitted = bbox.split()

        try:
            x = (int(splitted[1]), int(splitted[2]))
            y = (int(splitted[3]), int(splitted[4]))
        except (ValueError, IndexError):
            return ret

        # getPageSize reports (0, 0) when the size is unknown; a relative
        # box cannot be computed then, so return the row without areas.
        if not (pageSize[0] and pageSize[1]):
            return ret

        x, y = self.calculateRelBoundingBox(x, y, pageSize)

        ret["areas"] = [self.generateAreaForJson(x, y)]

        return ret

    def generateAreaForJson(self, x, y):
        '''
        Convert two corner points into an area dict.

        x -- upper left corner, y -- lower right corner (both 2-tuples).
        Returns a dict with the string values "x", "y", "width", "height".
        '''
        area = {}
        area["width"] = str(y[0] - x[0])
        area["height"] = str(y[1] - x[1])
        area["y"] = str(x[1])
        area["x"] = str(x[0])

        return area

    def calculateRelBoundingBox(self, x, y, pageSize):
        '''
        Scale the corner points x and y by the page size.

        Returns both points with each coordinate divided by the matching
        entry of pageSize. Raises ZeroDivisionError if a page dimension is 0.
        '''
        xneu = (float(x[0]) / pageSize[0], float(x[1]) / pageSize[1])
        yneu = (float(y[0]) / pageSize[0], float(y[1]) / pageSize[1])

        return xneu, yneu

    def getPageSize(self, queryString):
        '''
        Determine the page size for the given query.

        Runs the query against the searchText handler and reads "ocr_page"
        ("x1 x2 y1 y2", upper left and lower right corner) of the first hit.

        Returns (y1 - x1, y2 - x2), or (0, 0) if there is no hit or the
        value is missing or cannot be parsed.
        '''
        response = self.searchText(queryString, wt="json")

        for hit in response:  # there should actually be only one hit
            ocrPage = hit.get("ocr_page")
            if not ocrPage:
                return 0, 0

            splitted = ocrPage.split()

            try:
                x1 = int(splitted[0])
                x2 = int(splitted[1])
                y1 = int(splitted[2])
                y2 = int(splitted[3])
            except (ValueError, IndexError):
                return 0, 0

            return y1 - x1, y2 - x2

        return 0, 0
author	dwinter
date	Wed, 21 Nov 2012 15:39:08 +0100
parents	1eb5e3f6444b
children