16
|
1 '''
|
|
2 Created on 16.11.2012
|
|
3
|
|
4 @author: dwinter
|
|
5 '''
|
|
6
|
|
7 import solr
|
|
8 import web
|
|
9 import urllib
|
|
10 import os.path
|
17
|
11 import json
|
18
|
12 import urllib2
|
|
13 import logging
|
16
|
14
|
18
|
# Base URL handed to solr.SolrConnection; the search handlers used by
# searchLines are resolved relative to it.
SOLR_SERVER = "https://md.mpiwg-berlin.mpg.de/solr"

# Base URL of the DRI lookup; the "dri" request parameter is appended to it
# and the response body is used as the index.meta path.
DRI_SERVER = "http://md.mpiwg-berlin.mpg.de/"
|
16
|
17
|
|
class searchLines:
    """web.py handler that searches text lines in Solr and returns JSON.

    ``GET`` builds a Solr query from the request parameters, runs it against
    the ``/fulltexts-line/select`` handler and returns one JSON row per hit.
    Each row carries the hit's bounding box scaled by the page size that
    ``getPageSize`` reads from the ``/fulltexts/select`` handler.
    """

    def __init__(self):
        """Open the Solr connection and create the two search handlers."""
        #logging.basicConfig(filename='/tmp/solr.log',level=logging.DEBUG)
        self.con = solr.SolrConnection(SOLR_SERVER, debug=False)

        # Hits from this handler are read for their "bbox" field.
        self.search = solr.SearchHandler(self.con, "/fulltexts-line/select")
        # Hits from this handler are read for their "ocr_page" field.
        self.searchText = solr.SearchHandler(self.con, "/fulltexts/select")

    def GET(self):
        """Run the line search and return the result as a JSON string.

        Request parameters (at least one must be present):

        * ``query`` -- URL-quoted query text.
        * ``uri``   -- URL-quoted document path; restricts
          ``archive-path-folder``.
        * ``dri``   -- identifier looked up at ``DRI_SERVER``; the response
          body restricts ``archive-path-indexMeta``.
        * ``pf``    -- page file name or path; restricts ``pf``.

        Returns:
            A JSON object ``{"rows": [...], "total": <number of rows>}``.

        Raises:
            web.badrequest: if no parameter was sent or the DRI lookup
                answered with status 404.
        """
        paras = web.input()
        if not paras:
            # The message names the parameters this method actually reads.
            raise web.badrequest(
                "Needs at minimum one of the parameters: query,uri,dri,pf")

        queryString = self._buildQuery(paras)

        response = self.search(queryString)
        pageSize = self.getPageSize(queryString)
        rows = [self.generateRowForJson(hitId, hit, queryString, pageSize)
                for hitId, hit in enumerate(response)]

        returnJSON = {'rows': rows, 'total': len(rows)}

        web.header('Content-Type', 'application/json')
        # NOTE(review): the CORS rules do not honour a wildcard origin
        # together with credentials -- confirm whether the credentials
        # header is really needed.
        web.header('Access-Control-Allow-Origin', '*')
        web.header('Access-Control-Allow-Credentials', 'true')

        return json.dumps(returnJSON)

    def _buildQuery(self, paras):
        """Translate the request parameters into one Solr query string.

        Args:
            paras: mapping of request parameters (``query``, ``uri``,
                ``dri``, ``pf``); missing keys are skipped.

        Returns:
            The query text, with one `` AND field:value`` clause appended
            per restricting parameter that was supplied.

        Raises:
            web.badrequest: if the DRI lookup answered with status 404.
        """
        queryString = paras.get("query")
        if queryString:
            # Unquote only when the parameter exists: unquote(None) raises.
            queryString = urllib2.unquote(queryString)
        if not queryString:
            # NOTE(review): "q=" is kept from the original code; it looks
            # like a placeholder rather than a match-all query -- confirm.
            queryString = "q="

        docPath = paras.get("uri")
        if docPath:
            docPath = self._ensureArchivePrefix(urllib2.unquote(docPath))
            # Make sure there is no index.meta and no trailing slash.
            docPath = docPath.replace("/index.meta", "")
            if docPath.endswith("/"):
                docPath = docPath[:-1]
            docPath = os.path.normpath(docPath)
            queryString += (""" AND archive-path-folder:"%s" """
                            % self._escapePhrase(docPath))

        dri = paras.get('dri')
        if dri:
            f = urllib.urlopen(DRI_SERVER + dri)
            try:
                indexMeta = f.read()
                status = f.getcode()
            finally:
                # Release the connection even if reading fails.
                f.close()

            if status == 404:
                raise web.badrequest("DRI: %s not existing" % dri)

            if indexMeta:
                indexMeta = self._ensureArchivePrefix(indexMeta)
            indexMeta = os.path.normpath(indexMeta)
            queryString += (' AND archive-path-indexMeta:"%s"'
                            % self._escapePhrase(indexMeta))

        page = paras.get("pf")
        if page:
            # Only the last path component is used, without its extension,
            # so that a value like "pageimg/xxx" still matches.
            name = os.path.splitext(os.path.basename(page))[0]
            # NOTE(review): this term is inserted unquoted and unescaped.
            queryString += " AND pf:%s" % name

        return queryString

    def _ensureArchivePrefix(self, path):
        """Return *path* unchanged if it starts with ``/mpiwg/``, otherwise
        prefixed with ``/mpiwg/online/``.

        NOTE(review): the original code had an extra, unreachable branch
        that would have prepended only ``/mpiwg/``; it may have been meant
        for paths starting with ``online/`` -- confirm.
        """
        if path.startswith("/mpiwg/"):
            return path
        return "/mpiwg/online/" + path

    def _escapePhrase(self, value):
        """Escape backslashes and double quotes so *value* cannot terminate
        the quoted Solr phrase it is inserted into."""
        return value.replace("\\", "\\\\").replace('"', '\\"')

    def generateRowForJson(self, hitID, hit, query, pageSize):
        """Build the JSON row for one search hit.

        Args:
            hitID: running number of the hit; stored as a string in ``id``.
            hit: mapping for one Solr hit; its ``bbox`` field is expected in
                the form ``bbox x1 x2 y1 y2`` where (x1, x2) is the upper
                left and (y1, y2) the lower right corner.
            query: query string, stored in ``text``.
            pageSize: ``(width, height)`` of the page.

        Returns:
            A dict with ``id`` and ``text``. ``areas`` (a one-element list)
            is added only when the bounding box can be parsed and both page
            dimensions are non-zero.
        """
        ret = {"id": str(hitID), "text": query}

        bbox = hit.get("bbox")
        if not bbox:
            return ret

        splitted = bbox.split()
        try:
            # Token 0 is the literal "bbox"; the coordinates follow it.
            x = (int(splitted[1]), int(splitted[2]))
            y = (int(splitted[3]), int(splitted[4]))
        except (ValueError, IndexError):
            return ret

        # getPageSize reports (0, 0) when the size is unknown; a relative
        # box cannot be computed then, so the row is returned without areas.
        if not pageSize[0] or not pageSize[1]:
            return ret

        x, y = self.calculateRelBoundingBox(x, y, pageSize)
        ret["areas"] = [self.generateAreaForJson(x, y)]
        return ret

    def generateAreaForJson(self, x, y):
        """Return the area dict for the box spanned by corners *x* and *y*.

        *x* is the upper left corner and *y* the lower right corner; all
        four values (``width``, ``height``, ``x``, ``y``) are strings.
        """
        return {
            "width": str(y[0] - x[0]),
            "height": str(y[1] - x[1]),
            "y": str(x[1]),
            "x": str(x[0]),
        }

    def calculateRelBoundingBox(self, x, y, pageSize):
        """Scale both corners by the page size.

        Returns the pair ``(x, y)`` with each coordinate divided by the
        matching entry of *pageSize*. Raises ``ZeroDivisionError`` if a page
        dimension is zero.
        """
        xneu = (float(x[0]) / pageSize[0], float(x[1]) / pageSize[1])
        yneu = (float(y[0]) / pageSize[0], float(y[1]) / pageSize[1])
        return xneu, yneu

    def getPageSize(self, queryString):
        """Look up the page size for *queryString*.

        Only the first hit of the ``/fulltexts/select`` response is used
        (there should be only one). Its ``ocr_page`` field is expected in
        the form ``x1 x2 y1 y2`` where (x1, x2) is the upper left and
        (y1, y2) the lower right corner.

        Returns:
            ``(width, height)``, or ``(0, 0)`` when there is no hit or the
            field is missing or cannot be parsed.
        """
        response = self.searchText(queryString, wt="json")

        for hit in response:
            ocrPage = hit.get("ocr_page")
            if not ocrPage:
                return 0, 0

            splitted = ocrPage.split()
            try:
                x1 = int(splitted[0])
                x2 = int(splitted[1])
                y1 = int(splitted[2])
                y2 = int(splitted[3])
            except (ValueError, IndexError):
                return 0, 0

            return y1 - x1, y2 - x2

        return 0, 0
|
|
203
|
|
204
|
|
205
|