annotate searchService/searchLines.py @ 17:64d6ac1a1354

parameter for search changed
author dwinter
date Fri, 16 Nov 2012 15:26:05 +0100
parents 70110fb915a9
children 1eb5e3f6444b
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
16
70110fb915a9 searchlines
dwinter
parents:
diff changeset
1 '''
70110fb915a9 searchlines
dwinter
parents:
diff changeset
2 Created on 16.11.2012
70110fb915a9 searchlines
dwinter
parents:
diff changeset
3
70110fb915a9 searchlines
dwinter
parents:
diff changeset
4 @author: dwinter
70110fb915a9 searchlines
dwinter
parents:
diff changeset
5 '''
70110fb915a9 searchlines
dwinter
parents:
diff changeset
6
70110fb915a9 searchlines
dwinter
parents:
diff changeset
7 import solr
70110fb915a9 searchlines
dwinter
parents:
diff changeset
8 import web
70110fb915a9 searchlines
dwinter
parents:
diff changeset
9 import urllib
70110fb915a9 searchlines
dwinter
parents:
diff changeset
10 import os.path
17
64d6ac1a1354 parameter for search changed
dwinter
parents: 16
diff changeset
11 import json
16
70110fb915a9 searchlines
dwinter
parents:
diff changeset
12
70110fb915a9 searchlines
dwinter
parents:
diff changeset
13 SOLR_SERVER="http://localhost:8983/solr"
70110fb915a9 searchlines
dwinter
parents:
diff changeset
14 DRI_SERVER="http://localhost:8080/purl/"
70110fb915a9 searchlines
dwinter
parents:
diff changeset
15
70110fb915a9 searchlines
dwinter
parents:
diff changeset
class searchLines:
    """web.py-style request handler that searches Solr and answers with JSON.

    The line-level handler (``self.search``) delivers hits whose ``bbox``
    field is turned into an area relative to the page size; the page size
    itself is read from the ``ocr_page`` field of the page-level handler
    (``self.searchText``).
    """

    def __init__(self):
        """Open the Solr connection and create both search handlers."""
        self.con = solr.SolrConnection(SOLR_SERVER)

        # Hits of this handler are read for their "bbox" field.
        self.search = solr.SearchHandler(self.con, "/fulltexts-line/select")
        # Hits of this handler are read for their "ocr_page" field.
        self.searchText = solr.SearchHandler(self.con, "/fulltexts/select")

    def GET(self):
        """Answer a GET request with the matching lines as a JSON string.

        Request parameters (at least one must be present):
          query -- query text handed to Solr
          uri   -- document folder, matched against archive-path-folder
          dri   -- identifier resolved via DRI_SERVER, matched against
                   archive-path-indexMeta
          pf    -- page file name or path, matched against pf

        Returns a JSON object with the keys "rows" (list of rows as built by
        generateRowForJson) and "total" (number of rows).

        Raises web.badrequest when no parameter is given or when the DRI
        server answers 404 for the given dri.
        """
        paras = web.input()

        if len(paras.keys()) == 0:
            raise web.badrequest("Needs at minimum one of the parameters: query,uri,dri,pf")

        queryString = paras.get("query")
        if not queryString:
            # NOTE(review): the literal "q=" becomes part of the Solr query
            # text; a match-all query may have been intended -- confirm.
            queryString = "q="

        docPath = paras.get("uri")
        if docPath:
            docPath = self._normalizeDocPath(docPath)
            queryString += """ AND archive-path-folder:"%s" """ % self._escapePhrase(docPath)

        dri = paras.get('dri')
        if dri:
            indexMeta = self._resolveDri(dri)
            queryString += ' AND archive-path-indexMeta:"%s"' % self._escapePhrase(indexMeta)

        page = paras.get("pf")
        if page:
            queryString += " AND pf:%s" % self._pageNameFromPath(page)

        response = self.search(queryString, wt="json")

        pageSize = self.getPageSize(queryString)
        rows = [self.generateRowForJson(hitId, hit, queryString, pageSize)
                for hitId, hit in enumerate(response)]

        returnJSON = {}
        returnJSON['rows'] = rows
        returnJSON['total'] = len(rows)

        web.header('Content-Type', 'application/json')
        # NOTE(review): browsers do not accept a wildcard origin together
        # with credentials; kept unchanged, confirm what clients need.
        web.header('Access-Control-Allow-Origin', '*')
        web.header('Access-Control-Allow-Credentials', 'true')

        return json.dumps(returnJSON)

    def _withArchivePrefix(self, path):
        """Prefix a non-empty path with "/mpiwg/online/" unless it starts with "/mpiwg/"."""
        if path and not path.startswith("/mpiwg/"):
            return "/mpiwg/online/" + path
        return path

    def _normalizeDocPath(self, docPath):
        """Return the archive folder for docPath without "/index.meta" and trailing slash."""
        docPath = self._withArchivePrefix(docPath)
        docPath = docPath.replace("/index.meta", "")
        if docPath.endswith("/"):
            docPath = docPath[:-1]
        return os.path.normpath(docPath)

    def _resolveDri(self, dri):
        """Fetch the path stored for dri from DRI_SERVER and normalise it.

        Raises web.badrequest when the server answers 404.
        """
        f = urllib.urlopen(DRI_SERVER + dri)
        try:
            status = f.getcode()
            indexMeta = f.read()
        finally:
            # Always release the connection, also when reading fails.
            f.close()

        if status == 404:
            raise web.badrequest("DRI: %s not existing" % dri)

        return os.path.normpath(self._withArchivePrefix(indexMeta))

    def _pageNameFromPath(self, page):
        """Return the bare page name of page.

        The index only stores the page name, not its path, so only the last
        path component is used when something like "pageimg/xxx" is passed.
        The file extension and one further dotted suffix are cut off.
        """
        name = os.path.splitext(os.path.basename(page))[0]
        parts = name.split(".")
        if len(parts) > 1:
            name = ".".join(parts[0:-1])
        return name

    def _escapePhrase(self, value):
        """Escape backslash and double quote so value stays inside a quoted Solr phrase."""
        return value.replace("\\", "\\\\").replace('"', '\\"')

    def generateRowForJson(self, hitID, hit, query, pageSize):
        """Build the JSON row for one hit.

        hitID    -- running number of the hit, stored as string under "id"
        hit      -- mapping with a "bbox" entry of the form
                    "bbox x1 x2 y1 y2" (x: upper left corner,
                    y: lower right corner)
        query    -- query string, stored under "text"
        pageSize -- (width, height) as returned by getPageSize

        Returns a dict with "id" and "text"; "areas" (a one-element list, see
        generateAreaForJson) is only added when the bounding box can be
        parsed and the page size is non-zero.
        """
        ret = {}
        ret["id"] = str(hitID)
        ret["text"] = query

        try:
            splitted = hit.get("bbox").strip().split(" ")
            x = (int(splitted[1]), int(splitted[2]))
            y = (int(splitted[3]), int(splitted[4]))
        except (AttributeError, IndexError, TypeError, ValueError):
            # Missing or malformed bbox: deliver the row without an area.
            return ret

        if not pageSize[0] or not pageSize[1]:
            # getPageSize reports (0, 0) when the size is unknown; a relative
            # box cannot be computed then.
            return ret

        x, y = self.calculateRelBoundingBox(x, y, pageSize)

        ret["areas"] = [self.generateAreaForJson(x, y)]

        return ret

    def generateAreaForJson(self, x, y):
        """Return the area spanned by the corners x and y as a dict of strings.

        x is the upper left corner, y the lower right corner; the result has
        the keys "width", "height", "y" and "x".
        """
        area = {}
        area["width"] = str(y[0] - x[0])
        area["height"] = str(y[1] - x[1])
        area["y"] = str(x[1])
        area["x"] = str(x[0])

        return area

    def calculateRelBoundingBox(self, x, y, pageSize):
        """Scale the corners x and y by pageSize and return them as float pairs.

        Raises ZeroDivisionError when a component of pageSize is 0.
        """
        xneu = (float(x[0]) / pageSize[0], float(x[1]) / pageSize[1])
        yneu = (float(y[0]) / pageSize[0], float(y[1]) / pageSize[1])

        return xneu, yneu

    def getPageSize(self, queryString):
        """Return (width, height) of the page found for queryString.

        The size is computed from the "ocr_page" field of the first hit of
        the page-level handler; its format is "x1 x2 y1 y2" (x: upper left
        corner, y: lower right corner). Returns (0, 0) when there is no hit
        or the field is missing or malformed.
        """
        response = self.searchText(queryString, wt="json")

        for hit in response:  # there should really be only one
            try:
                splitted = hit.get("ocr_page").strip().split(" ")
                x1 = int(splitted[0])
                x2 = int(splitted[1])
                y1 = int(splitted[2])
                y2 = int(splitted[3])
            except (AttributeError, IndexError, TypeError, ValueError):
                return 0, 0

            return y1 - x1, y2 - x2

        return 0, 0
70110fb915a9 searchlines
dwinter
parents:
diff changeset
197
70110fb915a9 searchlines
dwinter
parents:
diff changeset
198
70110fb915a9 searchlines
dwinter
parents:
diff changeset
199