Mercurial > hg > purlService
view searchService/searchLines.py @ 16:70110fb915a9
searchlines
author | dwinter |
---|---|
date | Fri, 16 Nov 2012 12:30:30 +0100 |
parents | |
children | 64d6ac1a1354 |
line wrap: on
line source
'''
Created on 16.11.2012

@author: dwinter

Request handler that searches OCR text lines in Solr and reports the
bounding box of every hit relative to the size of its page.
'''

import logging
import os.path
import urllib

import solr
import web

SOLR_SERVER = "http://localhost:8983/solr"
DRI_SERVER = "http://localhost:8080/purl/"

_LOG = logging.getLogger(__name__)


def _withArchivePrefix(path):
    """Return *path* prefixed with "/mpiwg/online/" unless it starts with "/mpiwg/".

    The original code had a nested check for "/mpiwg/online/" whose else
    branch could never run (a path that does not start with "/mpiwg/" cannot
    start with "/mpiwg/online/"), so only the effective behaviour is kept.
    """
    if path.startswith("/mpiwg/"):
        return path
    return "/mpiwg/online/" + path


def _normalizeDocPath(docPath):
    """Turn the ``doc`` request parameter into a normalized archive folder path.

    Adds the archive prefix, removes "/index.meta" and normalizes the result;
    os.path.normpath also drops a trailing "/".
    """
    docPath = _withArchivePrefix(docPath).replace("/index.meta", "")
    return os.path.normpath(docPath)


def _resolveDri(dri):
    """Ask DRI_SERVER for *dri* and return the normalized path it answers with.

    Raises web.badrequest when the resolver replies with HTTP status 404.
    An empty answer is returned unchanged.
    """
    resolver = urllib.urlopen(DRI_SERVER + dri)
    try:
        indexMeta = resolver.read()
        status = resolver.getcode()
    finally:
        # The original never closed the connection.
        resolver.close()

    if status == 404:
        raise web.badrequest("DRI: %s not existing" % dri)
    if indexMeta:
        indexMeta = os.path.normpath(_withArchivePrefix(indexMeta))
    return indexMeta


def _pageName(page):
    """Reduce the ``pf`` request parameter to the bare page name.

    Only the last path component is used (so "pageimg/xxx" becomes "xxx"),
    its extension is removed, and if the remaining name still contains a
    dot the part after the last dot is removed as well.
    """
    name = os.path.splitext(os.path.basename(page))[0]
    parts = name.split(".")
    if len(parts) > 1:
        # Was ``splitted[0,-1]``, which raises TypeError (tuple used as index).
        name = ".".join(parts[:-1])
    return name


class searchLines:
    """Handler whose GET builds a Solr query from the request parameters."""

    def __init__(self):
        self.con = solr.SolrConnection(SOLR_SERVER)
        # Line-level index: one hit per text line, carrying a "bbox" field.
        self.search = solr.SearchHandler(self.con, "/fulltexts-line/select")
        # Page-level index: hits carry the page extent in "ocr_page".
        self.searchText = solr.SearchHandler(self.con, "/fulltexts/select")

    def GET(self):
        """Run the line search described by the request parameters.

        Recognized parameters: ``query`` (query text), ``doc`` (document
        path), ``dri`` (identifier resolved through DRI_SERVER) and ``pf``
        (page file name).

        Returns:
            A list with one dict per hit, see generateRowForJson.

        Raises:
            web.badrequest: if no parameter was sent at all, or if the DRI
                resolver answers with 404.
        """
        paras = web.input()
        if not paras:
            raise web.badrequest(
                "Needs at minimum one of the parameters: query,doc,dri,pf")

        queryString = paras.get("query")
        if not queryString:
            # NOTE(review): this literal "q=" ends up inside the query text
            # handed to Solr; looks unintended - confirm before changing.
            queryString = "q="

        docPath = paras.get("doc")
        if docPath:
            queryString += """ archive-path-folder:"%s" """ % _normalizeDocPath(docPath)

        dri = paras.get("dri")
        if dri:
            queryString += ' archive-path-indexMeta:"%s"' % _resolveDri(dri)

        page = paras.get("pf")
        if page:
            # The index stores only the page name, not its path.
            queryString += " pf:%s" % _pageName(page)

        response = self.search(queryString, wt="json")
        pageSize = self.getPageSize(queryString)

        rows = []
        for hitId, hit in enumerate(response):
            _LOG.debug("hit: %s", hit)
            rows.append(self.generateRowForJson(hitId, hit, queryString, pageSize))
        return rows

    def generateRowForJson(self, hitID, hit, query, pageSize):
        """Build the result row for one hit.

        Args:
            hitID: running number of the hit, stored as string under "id".
            hit: mapping with a "bbox" field of the form "bbox x1 x2 y1 y2",
                where x is the upper-left and y the lower-right corner.
            query: query text, stored under "text".
            pageSize: pair of page extents used to scale the box.

        Returns:
            A dict with "id" and "text"; "areas" (a one-element list) is
            added only when the bbox can be parsed and the page size is known.
        """
        ret = {"id": str(hitID), "text": query}

        # A missing bbox is treated like a malformed one.
        fields = (hit.get("bbox") or "").strip().split(" ")
        try:
            x = (int(fields[1]), int(fields[2]))
            y = (int(fields[3]), int(fields[4]))
        except (IndexError, ValueError):
            return ret

        # getPageSize reports (0, 0) when the size is unknown; scaling would
        # divide by zero, so the row is returned without areas instead.
        if not pageSize[0] or not pageSize[1]:
            return ret

        x, y = self.calculateRelBoundingBox(x, y, pageSize)
        ret["areas"] = [self.generateAreaForJson(x, y)]
        return ret

    def generateAreaForJson(self, x, y):
        """Return the area dict (string values) for corner points *x* and *y*.

        NOTE(review): "x" is filled from x[0] and "y" from x[1], yet the
        extent along the first coordinate (y[0]-x[0]) is stored under
        "height" and the other one under "width" - this looks swapped.
        Kept unchanged because consumers may rely on it; TODO confirm.
        """
        return {
            "height": str(y[0] - x[0]),
            "width": str(y[1] - x[1]),
            "y": str(x[1]),
            "x": str(x[0]),
        }

    def calculateRelBoundingBox(self, x, y, pageSize):
        """Scale the corner points *x* and *y* by the page size.

        Each coordinate is divided by the page extent with the same index.

        Raises:
            ZeroDivisionError: if an entry of *pageSize* is 0.
        """
        xneu = (float(x[0]) / pageSize[0], float(x[1]) / pageSize[1])
        yneu = (float(y[0]) / pageSize[0], float(y[1]) / pageSize[1])
        return xneu, yneu

    def getPageSize(self, queryString):
        """Look up the page extents for *queryString* in the page-level index.

        Only the first hit is used (there should be just one). Its
        "ocr_page" field has the form "x1 x2 y1 y2", with x the upper-left
        and y the lower-right corner.

        Returns:
            The pair (y1 - x1, y2 - x2), or (0, 0) if there is no hit or the
            field is missing or malformed.
        """
        _LOG.debug("page size query: %s", queryString)
        response = self.searchText(queryString, wt="json")
        for hit in response:
            fields = (hit.get("ocr_page") or "").strip().split(" ")
            try:
                x1 = int(fields[0])
                x2 = int(fields[1])
                y1 = int(fields[2])
                y2 = int(fields[3])
            except (IndexError, ValueError):
                return 0, 0
            return y1 - x1, y2 - x2
        return 0, 0