Mercurial > hg > purlService
changeset 16:70110fb915a9
searchlines
author | dwinter |
---|---|
date | Fri, 16 Nov 2012 12:30:30 +0100 |
parents | 5bab6e95980e |
children | 64d6ac1a1354 |
files | restService/restService.py searchService/__init__.py searchService/searchLines.py |
diffstat | 2 files changed, 191 insertions(+), 1 deletions(-) [+] |
line wrap: on
line diff
--- a/restService/restService.py Thu Nov 15 17:23:31 2012 +0100 +++ b/restService/restService.py Fri Nov 16 12:30:30 2012 +0100 @@ -8,13 +8,15 @@ from redirector import redirector import logging from searcher import searcher +from searchService.searchLines import searchLines urls = ( '/purl/(.+)','purl', '/docuview/(.+)','redirector', '/search','searcher', '/indexMeta/(.+)','indexMeta', - '/','serviceDescription' + '/','serviceDescription', + '/searchLines','searchLines' ) app = web.application(urls, globals())
# searchService/searchLines.py -- new file in this changeset (diff against
# /dev/null), restored here from the flattened diff view.
'''
Created on 16.11.2012

Line-level full-text search handler for the web.py REST service.

@author: dwinter
'''

import logging
import os.path
import urllib

import solr
import web

SOLR_SERVER = "http://localhost:8983/solr"
DRI_SERVER = "http://localhost:8080/purl/"

ARCHIVE_ROOT = "/mpiwg/"
ONLINE_ROOT = "/mpiwg/online/"


def _toArchivePath(path):
    """Return *path* anchored below ``/mpiwg/``.

    * paths already starting with ``/mpiwg/`` are returned unchanged;
    * paths starting with ``online/`` (with or without leading slashes)
      only get ``/mpiwg/`` prepended;
    * everything else gets ``/mpiwg/online/`` prepended.

    The original code nested the ``/mpiwg/online/`` test inside the
    ``not startswith("/mpiwg/")`` branch, which made the ``/mpiwg/`` + path
    case unreachable and turned ``online/x`` into ``/mpiwg/online/online/x``.
    """
    if path.startswith(ARCHIVE_ROOT):
        return path
    relative = path.lstrip("/")
    if relative.startswith("online/"):
        return ARCHIVE_ROOT + relative
    return ONLINE_ROOT + path


def _normalizeDocPath(docPath):
    """Return the archive folder for the request parameter ``doc``.

    The path is anchored below ``/mpiwg/``, any ``/index.meta`` component is
    removed, a trailing slash is dropped and the result is normalized with
    ``os.path.normpath``.
    """
    docPath = _toArchivePath(docPath).replace("/index.meta", "")
    if docPath.endswith("/"):
        docPath = docPath[:-1]
    return os.path.normpath(docPath)


def _fetchIndexMetaPath(dri):
    """Resolve *dri* via ``DRI_SERVER`` and return the normalized path.

    Raises ``web.badrequest`` when the resolver answers with HTTP 404.
    """
    f = urllib.urlopen(DRI_SERVER + dri)
    try:
        indexMeta = f.read()
        status = f.getcode()
    finally:
        # the original never closed the connection
        f.close()

    if status == 404:
        raise web.badrequest("DRI: %s not existing" % dri)

    if indexMeta:
        indexMeta = _toArchivePath(indexMeta)
    return os.path.normpath(indexMeta)


def _pageName(page):
    """Return the bare page name for the request parameter ``pf``.

    The index only stores the page name, not its path, so only the last path
    component is kept (e.g. when ``pageimg/xxx`` is passed).  The file
    extension is removed and, if the remaining name still contains a dot,
    the last dotted suffix is cut off as well.
    """
    name = os.path.splitext(os.path.basename(page))[0]
    parts = name.split(".")
    if len(parts) > 1:
        # was ``splitted[0,-1]``, which raises TypeError (tuple index)
        name = ".".join(parts[:-1])
    return name


def _escapePhrase(value):
    """Escape backslashes and double quotes for a quoted Solr phrase."""
    return value.replace("\\", "\\\\").replace('"', '\\"')


class searchLines:
    """web.py handler that searches text lines and returns their areas.

    Lines are looked up through the ``/fulltexts-line/select`` Solr handler;
    the page size needed to make the bounding boxes relative is looked up
    through ``/fulltexts/select``.
    """

    def __init__(self):
        self.con = solr.SolrConnection(SOLR_SERVER)

        self.search = solr.SearchHandler(self.con, "/fulltexts-line/select")
        self.searchText = solr.SearchHandler(self.con, "/fulltexts/select")

    def GET(self):
        """Build the Solr query from the request parameters and run it.

        Request parameters read: ``query``, ``doc``, ``dri`` and ``pf``.

        Returns a list with one dict per hit (see ``generateRowForJson``).
        Raises ``web.badrequest`` when no parameter is given or when the
        DRI cannot be resolved.
        """
        paras = web.input()

        if not paras:
            # NOTE(review): the message advertises "page", but the parameter
            # actually read below is "pf" -- confirm which name is intended.
            raise web.badrequest("Needs at minimum one of the parametrs: query,doc,dri,page")

        queryString = paras.get("query")
        if not queryString:
            # NOTE(review): "q=" ends up as literal text inside the Solr
            # query; kept as in the original -- confirm this is intended.
            queryString = "q="

        docPath = paras.get("doc")
        if docPath:
            queryString += ' archive-path-folder:"%s" ' % _escapePhrase(
                _normalizeDocPath(docPath))

        dri = paras.get("dri")
        if dri:
            queryString += ' archive-path-indexMeta:"%s"' % _escapePhrase(
                _fetchIndexMetaPath(dri))

        page = paras.get("pf")
        if page:
            queryString += " pf:%s" % _pageName(page)

        response = self.search(queryString, wt="json")
        pageSize = self.getPageSize(queryString)

        rows = []
        for hitId, hit in enumerate(response):
            rows.append(self.generateRowForJson(hitId, hit, queryString, pageSize))
            logging.debug("searchLines hit: %s", hit)

        # NOTE(review): the rows are returned as a Python list, not as a
        # serialized string -- confirm how the caller turns this into the
        # HTTP response body.
        return rows

    def generateRowForJson(self, hitID, hit, query, pageSize):
        """Return the result row for one hit.

        The row always contains ``id`` (the hit number as string) and
        ``text`` (the query).  ``areas`` is only added when the hit carries
        a parsable ``bbox`` value and *pageSize* has no zero component.
        """
        ret = {"id": str(hitID), "text": query}

        # format is "bbox x1 x2 y1 y2" with x = upper left corner and
        # y = lower right corner; token 0 is skipped
        tokens = (hit.get("bbox") or "").strip().split(" ")

        try:
            x = (int(tokens[1]), int(tokens[2]))
            y = (int(tokens[3]), int(tokens[4]))
        except (ValueError, IndexError):
            return ret

        if not pageSize[0] or not pageSize[1]:
            # unknown page size: relative coordinates cannot be computed
            return ret

        x, y = self.calculateRelBoundingBox(x, y, pageSize)
        ret["areas"] = [self.generateAreaForJson(x, y)]
        return ret

    def generateAreaForJson(self, x, y):
        """Return the area dict for the corners *x* (upper left) and *y* (lower right).

        All values are strings.  ``x``/``y`` are the coordinates of the
        upper left corner; ``width`` is the extent along the first
        coordinate and ``height`` the extent along the second one (the
        original had the two swapped).
        """
        return {
            "x": str(x[0]),
            "y": str(x[1]),
            "width": str(y[0] - x[0]),
            "height": str(y[1] - x[1]),
        }

    def calculateRelBoundingBox(self, x, y, pageSize):
        """Scale both corners by *pageSize* and return them as float tuples.

        Raises ``ZeroDivisionError`` if a component of *pageSize* is zero.
        """
        xneu = (float(x[0]) / pageSize[0], float(x[1]) / pageSize[1])
        yneu = (float(y[0]) / pageSize[0], float(y[1]) / pageSize[1])

        return xneu, yneu

    def getPageSize(self, queryString):
        """Return the page size for *queryString* as a 2-tuple.

        The size is derived from the ``ocr_page`` field of the first hit of
        the ``/fulltexts/select`` handler.  ``(0, 0)`` is returned when
        there is no hit or the field cannot be parsed.
        """
        logging.debug("searchLines page size query: %s", queryString)

        response = self.searchText(queryString, wt="json")

        for hit in response:  # there should really be only one
            # format is "x1 x2 y1 y2" with x = upper left corner and
            # y = lower right corner
            tokens = (hit.get("ocr_page") or "").strip().split(" ")

            try:
                x1, x2, y1, y2 = [int(token) for token in tokens[:4]]
            except ValueError:
                # non-numeric token or fewer than four tokens
                return 0, 0

            return y1 - x1, y2 - x2

        return 0, 0