Mercurial > hg > purlService
view searchService/searchLines.py @ 19:cce127a28fc9
added getpurls
author | dwinter |
---|---|
date | Wed, 21 Nov 2012 15:39:08 +0100 |
parents | 1eb5e3f6444b |
children |
line wrap: on
line source
'''
Created on 16.11.2012

@author: dwinter

web.py handler that searches the line-level full text index in Solr and
returns the bounding boxes of the matching lines as JSON, with coordinates
expressed relative to the page size.
'''
import json
import logging
import os.path
import urllib
import urllib2

import solr
import web

SOLR_SERVER="https://md.mpiwg-berlin.mpg.de/solr"
DRI_SERVER="http://md.mpiwg-berlin.mpg.de/"

# Archive paths are expected below _ARCHIVE_ROOT; anything else is treated as
# relative to _ARCHIVE_ONLINE_ROOT.
_ARCHIVE_ROOT="/mpiwg/"
_ARCHIVE_ONLINE_ROOT="/mpiwg/online/"


def _ensureArchivePrefix(path):
    '''Return path unchanged if it starts with /mpiwg/, else prefixed with /mpiwg/online/.'''
    # A path that does not start with "/mpiwg/" can never start with
    # "/mpiwg/online/", so one check covers every case.
    if path.startswith(_ARCHIVE_ROOT):
        return path
    return _ARCHIVE_ONLINE_ROOT+path


class searchLines:
    '''web.py resource: GET runs a Solr line search and returns hit areas as JSON.'''

    def __init__(self):
        '''Open the Solr connection and create the two search handlers used by GET.'''
        #logging.basicConfig(filename='/tmp/solr.log',level=logging.DEBUG)
        self.con = solr.SolrConnection(SOLR_SERVER,debug=False)
        # line-level index: one document per text line, carries "bbox"
        self.search = solr.SearchHandler(self.con,"/fulltexts-line/select")
        # page-level index: carries "ocr_page" (the page extent)
        self.searchText = solr.SearchHandler(self.con,"/fulltexts/select")

    def GET(self):
        '''
        Handle a search request.

        Request parameters (all optional, but at least one must be given):
          query -- URL-quoted Solr query
          uri   -- URL-quoted archive path of the document folder
          dri   -- identifier that is resolved to an index.meta path via DRI_SERVER
          pf    -- page file name (a path or extension is stripped)

        Returns a JSON string {"rows": [...], "total": n} and sets the
        JSON content type and CORS headers.
        Raises web.badrequest if no parameter is given or the DRI is unknown.
        '''
        paras = web.input()
        if not paras:
            raise web.badrequest("Needs at minimum one of the parameters: query,uri,dri,pf")

        queryString = self._buildQuery(paras)

        response = self.search(queryString)
        pageSize = self.getPageSize(queryString)
        rows = [self.generateRowForJson(hitId, hit, queryString, pageSize)
                for hitId, hit in enumerate(response)]

        returnJSON = {'rows': rows, 'total': len(rows)}

        web.header('Content-Type', 'application/json')
        # NOTE(review): browsers do not honour a wildcard origin together with
        # credentials; confirm whether the credentials header is needed at all.
        web.header('Access-Control-Allow-Origin', '*')
        web.header('Access-Control-Allow-Credentials', 'true')
        return json.dumps(returnJSON)

    def _buildQuery(self, paras):
        '''Assemble the Solr query string from the request parameters in paras.'''
        queryString = paras.get("query")
        if queryString:
            queryString = urllib2.unquote(queryString)
        else:
            # Only unquote when a query was actually sent; unquoting None fails.
            # NOTE(review): "q=" is kept as the historical fallback query text;
            # confirm that Solr handles it as intended when clauses are appended.
            queryString = "q="

        docPath = paras.get("uri")
        if docPath:
            docPath = _ensureArchivePrefix(urllib2.unquote(docPath))
            # make sure there is no index.meta and no trailing / at the end
            docPath = docPath.replace("/index.meta","")
            if docPath.endswith("/"):
                docPath = docPath[0:-1]
            docPath = os.path.normpath(docPath)
            queryString += """ AND archive-path-folder:"%s" """%docPath

        dri = paras.get('dri')
        if dri:
            queryString += ' AND archive-path-indexMeta:"%s"'%self._resolveDri(dri)

        page = paras.get("pf")
        if page:
            # The index stores only the page name, not its path, so only the
            # last path component (without extension) is used when something
            # like "pageimg/xxx" is passed.
            name = os.path.splitext(os.path.basename(page))[0]
            queryString += " AND pf:%s"%name

        return queryString

    def _resolveDri(self, dri):
        '''
        Resolve dri to a normalized archive path by asking DRI_SERVER.

        Raises web.badrequest if the resolver answers with HTTP 404.
        '''
        f = urllib.urlopen(DRI_SERVER+dri)
        try:
            if f.getcode()==404:
                raise web.badrequest("DRI: %s not existing"%dri)
            # surrounding whitespace would end up inside the quoted Solr phrase
            indexMeta = f.read().strip()
        finally:
            f.close()

        if indexMeta:
            indexMeta = _ensureArchivePrefix(indexMeta)
        return os.path.normpath(indexMeta)

    def generateRowForJson(self,hitID,hit,query,pageSize):
        '''
        Build the JSON row for one hit.

        The row always has "id" (hitID as string) and "text" (the query).
        "areas" is added only if the hit has a parsable "bbox" value and
        pageSize has no zero dimension.
        '''
        ret = {"id": str(hitID), "text": query}

        bbox = hit.get("bbox")
        if not bbox:
            return ret

        # format is "bbox x1 x2 y1 y2" with x the upper left corner and
        # y the lower right corner; token 0 is the literal label
        splitted = bbox.split()
        try:
            x = (int(splitted[1]),int(splitted[2]))
            y = (int(splitted[3]),int(splitted[4]))
        except (ValueError, IndexError):
            return ret

        # getPageSize reports (0, 0) when the page size is unknown; relative
        # coordinates cannot be computed then.
        if pageSize[0]==0 or pageSize[1]==0:
            return ret

        x,y = self.calculateRelBoundingBox(x, y, pageSize)
        ret["areas"] = [self.generateAreaForJson(x, y)]
        return ret

    def generateAreaForJson(self,x,y):
        '''
        Convert two corner points into an area dict.

        x is the upper left corner, y the lower right corner. All values of
        the returned dict ("x", "y", "width", "height") are strings.
        '''
        return {
            "width": str(y[0]-x[0]),
            "height": str(y[1]-x[1]),
            "y": str(x[1]),
            "x": str(x[0]),
        }

    def calculateRelBoundingBox(self,x,y,pageSize):
        '''
        Scale the corner points x and y by pageSize (width, height).

        Returns the two scaled points as tuples of floats.
        Raises ZeroDivisionError if a dimension of pageSize is 0.
        '''
        width, height = pageSize[0], pageSize[1]
        xneu = (float(x[0])/width, float(x[1])/height)
        yneu = (float(y[0])/width, float(y[1])/height)
        return xneu,yneu

    def getPageSize(self,queryString):
        '''
        Look up the page extent for queryString in the page-level index.

        Returns (width, height) computed from the "ocr_page" value of the
        first hit, or (0, 0) if there is no hit or the value is missing or
        unparsable.
        '''
        response = self.searchText(queryString,wt="json")
        for hit in response: # there should really be only one
            ocrPage = hit.get("ocr_page")
            if not ocrPage:
                return 0,0
            # format is "x1 x2 y1 y2" with x the upper left corner and
            # y the lower right corner
            try:
                x1,x2,y1,y2 = [int(value) for value in ocrPage.split()[:4]]
            except ValueError:
                # non-numeric token or fewer than four values
                return 0,0
            return y1-x1,y2-x2
        return 0,0