annotate searchService/searchLines.py @ 25:6776aeff118c

added imageURL to the services
author dwinter
date Wed, 13 Mar 2013 10:37:32 +0100
parents 1eb5e3f6444b
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
16
70110fb915a9 searchlines
dwinter
parents:
diff changeset
1 '''
70110fb915a9 searchlines
dwinter
parents:
diff changeset
2 Created on 16.11.2012
70110fb915a9 searchlines
dwinter
parents:
diff changeset
3
70110fb915a9 searchlines
dwinter
parents:
diff changeset
4 @author: dwinter
70110fb915a9 searchlines
dwinter
parents:
diff changeset
5 '''
70110fb915a9 searchlines
dwinter
parents:
diff changeset
6
70110fb915a9 searchlines
dwinter
parents:
diff changeset
7 import solr
70110fb915a9 searchlines
dwinter
parents:
diff changeset
8 import web
70110fb915a9 searchlines
dwinter
parents:
diff changeset
9 import urllib
70110fb915a9 searchlines
dwinter
parents:
diff changeset
10 import os.path
17
64d6ac1a1354 parameter for search changed
dwinter
parents: 16
diff changeset
11 import json
18
1eb5e3f6444b encoding problems with the redirector solved
dwinter
parents: 17
diff changeset
12 import urllib2
1eb5e3f6444b encoding problems with the redirector solved
dwinter
parents: 17
diff changeset
13 import logging
16
70110fb915a9 searchlines
dwinter
parents:
diff changeset
14
18
1eb5e3f6444b encoding problems with the redirector solved
dwinter
parents: 17
diff changeset
# Base URL of the Solr server; handed to solr.SolrConnection by the searchLines handler.
15 SOLR_SERVER="https://md.mpiwg-berlin.mpg.de/solr"
1eb5e3f6444b encoding problems with the redirector solved
dwinter
parents: 17
diff changeset
# Base URL the "dri" request parameter is appended to when the searchLines handler fetches it.
16 DRI_SERVER="http://md.mpiwg-berlin.mpg.de/"
16
70110fb915a9 searchlines
dwinter
parents:
diff changeset
17
70110fb915a9 searchlines
dwinter
parents:
diff changeset
class searchLines:
    """web.py handler that searches text lines in Solr and returns hit areas as JSON.

    ``GET`` builds a Solr query from the request parameters ``query``, ``uri``,
    ``dri`` and ``pf``, runs it against the ``/fulltexts-line/select`` handler
    and returns one row per hit. Each row carries the hit's bounding box
    expressed relative to the page size, which is looked up through the
    ``/fulltexts/select`` handler.
    """

    def __init__(self):
        """Open the Solr connection and create the two search handlers."""
        self.con = solr.SolrConnection(SOLR_SERVER, debug=False)

        self.search = solr.SearchHandler(self.con, "/fulltexts-line/select")
        self.searchText = solr.SearchHandler(self.con, "/fulltexts/select")

    def GET(self):
        """Handle a search request and return the hits as a JSON string.

        Request parameters (all optional, but at least one must be given):
            query: URL-quoted Solr query string.
            uri:   URL-quoted document folder; restricts ``archive-path-folder``.
            dri:   identifier fetched from DRI_SERVER; the response body
                   restricts ``archive-path-indexMeta``.
            pf:    page file name (a path and extension are stripped); restricts ``pf``.

        Returns:
            JSON text ``{"rows": [...], "total": <number of rows>}``.

        Raises:
            web.badrequest: if no parameter is given or the DRI lookup returns 404.
        """
        paras = web.input()

        if not paras:
            # The message lists the parameter names this method actually reads.
            raise web.badrequest("Needs at minimum one of the parameters: query,uri,dri,pf")

        queryString = paras.get("query")
        if queryString:
            # Only unquote when the parameter is present; unquoting None raises.
            queryString = urllib2.unquote(queryString)
        if not queryString:
            # NOTE(review): "q=" is sent to Solr literally as the query text;
            # a match-all query may have been intended -- confirm.
            queryString = "q="

        docPath = paras.get("uri")
        if docPath:
            docPath = self._normalizeFolderPath(urllib2.unquote(docPath))
            queryString += """ AND archive-path-folder:"%s" """ % self._escapePhrase(docPath)

        dri = paras.get('dri')
        if dri:
            indexMeta = self._fetchIndexMetaPath(dri)
            queryString += ' AND archive-path-indexMeta:"%s"' % self._escapePhrase(indexMeta)

        page = paras.get("pf")
        if page:
            # The index stores only the page name, not its path, so keep just
            # the last path component (e.g. when "pageimg/xxx" is passed) and
            # drop the file extension.
            name = os.path.splitext(os.path.split(page)[1])[0]
            # NOTE(review): name is inserted as a bare term without escaping.
            queryString += " AND pf:%s" % name

        response = self.search(queryString)
        pageSize = self.getPageSize(queryString)

        rows = [self.generateRowForJson(hitId, hit, queryString, pageSize)
                for hitId, hit in enumerate(response)]

        returnJSON = {'rows': rows, 'total': len(rows)}

        web.header('Content-Type', 'application/json')
        # NOTE(review): browsers ignore Allow-Credentials when the allowed
        # origin is the wildcard -- confirm which of the two is wanted.
        web.header('Access-Control-Allow-Origin', '*')
        web.header('Access-Control-Allow-Credentials', 'true')

        return json.dumps(returnJSON)

    def _escapePhrase(self, value):
        """Escape backslashes and double quotes so value can sit inside a quoted Solr phrase."""
        return value.replace("\\", "\\\\").replace('"', '\\"')

    def _withArchivePrefix(self, path):
        """Return path, prefixed with "/mpiwg/online/" unless it already starts with "/mpiwg/"."""
        if path.startswith("/mpiwg/"):
            return path
        return "/mpiwg/online/" + path

    def _normalizeFolderPath(self, docPath):
        """Turn a requested document path into the folder path used in the index.

        Adds the archive prefix, removes every "/index.meta", drops one
        trailing slash and normalizes the result with os.path.normpath.
        """
        docPath = self._withArchivePrefix(docPath)
        docPath = docPath.replace("/index.meta", "")
        if docPath.endswith("/"):
            docPath = docPath[:-1]
        return os.path.normpath(docPath)

    def _fetchIndexMetaPath(self, dri):
        """Fetch DRI_SERVER + dri and return the response body as a normalized archive path.

        Raises:
            web.badrequest: if the server answers with HTTP status 404.
        """
        f = urllib.urlopen(DRI_SERVER + dri)
        try:
            if f.getcode() == 404:
                raise web.badrequest("DRI: %s not existing" % dri)
            indexMeta = f.read()
        finally:
            # Always release the connection, also on the 404 path.
            f.close()

        if indexMeta:
            indexMeta = self._withArchivePrefix(indexMeta)
        # An empty response is left unprefixed, as before; normpath maps it to ".".
        return os.path.normpath(indexMeta)

    def generateRowForJson(self, hitID, hit, query, pageSize):
        """Build the JSON row for one hit.

        Args:
            hitID:    running number of the hit; stored as a string under "id".
            hit:      mapping with a "bbox" entry of the form "bbox x1 x2 y1 y2",
                      where (x1, x2) is the upper-left and (y1, y2) the
                      lower-right corner.
            query:    query string; stored under "text".
            pageSize: (width, height) of the page, as returned by getPageSize.

        Returns:
            dict with "id" and "text", plus "areas" (a one-element list) when
            the bounding box can be parsed and the page size is non-zero.
        """
        ret = {"id": str(hitID), "text": query}

        bbox = hit.get("bbox")
        if not bbox:
            # A hit without a bounding box yields a row without areas.
            return ret

        splitted = bbox.split()  # tolerant of leading/trailing/repeated whitespace
        try:
            x = (int(splitted[1]), int(splitted[2]))
            y = (int(splitted[3]), int(splitted[4]))
        except (IndexError, ValueError):
            return ret

        if not pageSize[0] or not pageSize[1]:
            # getPageSize reports (0, 0) when the size is unknown; a relative
            # box cannot be computed then (division by zero).
            return ret

        x, y = self.calculateRelBoundingBox(x, y, pageSize)
        ret["areas"] = [self.generateAreaForJson(x, y)]

        return ret

    def generateAreaForJson(self, x, y):
        """Return the area dict for a box with upper-left corner x and lower-right corner y.

        All values ("width", "height", "x", "y") are converted with str().
        """
        return {
            "width": str(y[0] - x[0]),
            "height": str(y[1] - x[1]),
            "y": str(x[1]),
            "x": str(x[0]),
        }

    def calculateRelBoundingBox(self, x, y, pageSize):
        """Scale the corner points x and y by the page size.

        Each coordinate is divided by the matching pageSize component.

        Raises:
            ZeroDivisionError: if a pageSize component is 0.
        """
        width, height = pageSize[0], pageSize[1]
        xneu = (float(x[0]) / width, float(x[1]) / height)
        yneu = (float(y[0]) / width, float(y[1]) / height)

        return xneu, yneu

    def getPageSize(self, queryString):
        """Return (width, height) of the page taken from the first hit's "ocr_page" field.

        The field has the form "x1 x2 y1 y2", where (x1, x2) is the upper-left
        and (y1, y2) the lower-right corner. Returns (0, 0) when there is no
        hit or the field is missing or cannot be parsed.
        """
        response = self.searchText(queryString, wt="json")

        for hit in response:  # there should really be only one hit
            ocrPage = hit.get("ocr_page")
            if not ocrPage:
                return 0, 0

            splitted = ocrPage.split()
            try:
                x1 = int(splitted[0])
                x2 = int(splitted[1])
                y1 = int(splitted[2])
                y2 = int(splitted[3])
            except (IndexError, ValueError):
                return 0, 0

            return y1 - x1, y2 - x2

        return 0, 0
70110fb915a9 searchlines
dwinter
parents:
diff changeset
203
70110fb915a9 searchlines
dwinter
parents:
diff changeset
204
70110fb915a9 searchlines
dwinter
parents:
diff changeset
205