'''
Created on 16.11.2012

@author: dwinter
'''

import json
import os.path
import urllib

import solr
import web

# Base URL of the Solr instance the search handlers connect to.
SOLR_SERVER = "http://localhost:8983/solr"
# Base URL of the DRI resolver; the requested DRI is appended to it.
DRI_SERVER = "http://localhost:8080/purl/"


class searchLines:
    """web.py handler: search text lines in Solr and return the hits as JSON.

    A GET request is turned into a Solr query built from the request
    parameters ``query``, ``uri``, ``dri`` and ``pf``.  Each hit becomes one
    row carrying the hit's bounding box relative to the page size.
    """

    def __init__(self):
        """Open the Solr connection and create the two select handlers."""
        self.con = solr.SolrConnection(SOLR_SERVER)

        # Handler whose hits are read for their "bbox" field.
        self.search = solr.SearchHandler(self.con, "/fulltexts-line/select")
        # Handler whose hits are read for their "ocr_page" field (page extent).
        self.searchText = solr.SearchHandler(self.con, "/fulltexts/select")

    def GET(self):
        """Answer a search request.

        Returns:
            A JSON string ``{"rows": [...], "total": <number of rows>}``.

        Raises:
            web.badrequest: if no request parameter was given at all, or if
                the given DRI cannot be resolved (HTTP 404 from the resolver).
        """
        paras = web.input()

        if not paras:
            raise web.badrequest(
                "Needs at least one of the parameters: query, uri, dri, pf")

        queryString = self._buildQuery(paras)

        response = self.search(queryString, wt="json")
        pageSize = self.getPageSize(queryString)

        rows = [self.generateRowForJson(hitId, hit, queryString, pageSize)
                for hitId, hit in enumerate(response)]

        returnJSON = {'rows': rows, 'total': len(rows)}

        web.header('Content-Type', 'application/json')
        # NOTE(review): browsers do not accept a wildcard origin together
        # with credentials; confirm which of the two headers is intended.
        web.header('Access-Control-Allow-Origin', '*')
        web.header('Access-Control-Allow-Credentials', 'true')

        return json.dumps(returnJSON)

    def _buildQuery(self, paras):
        """Assemble the Solr query string from the request parameters.

        Args:
            paras: mapping of request parameters; the keys ``query``,
                ``uri``, ``dri`` and ``pf`` are read with ``get``.

        Returns:
            The query text, with one ``AND`` clause appended per filter
            parameter that is present.
        """
        queryString = paras.get("query")
        if not queryString:
            # NOTE(review): the literal "q=" is used as the query text when
            # no "query" parameter is given; confirm the handler expects it.
            queryString = "q="

        docPath = paras.get("uri")
        if docPath:
            # Make sure the path has the form "/mpiwg/online/...".
            docPath = self._withArchivePrefix(docPath)
            # No "index.meta" at the end; normpath also drops a trailing "/".
            docPath = os.path.normpath(docPath.replace("/index.meta", ""))
            queryString += (' AND archive-path-folder:"%s" '
                            % self._escapePhrase(docPath))

        dri = paras.get('dri')
        if dri:
            indexMeta = self._resolveDri(dri)
            queryString += (' AND archive-path-indexMeta:"%s"'
                            % self._escapePhrase(indexMeta))

        page = paras.get("pf")
        if page:
            # NOTE(review): the page name is inserted unquoted.
            queryString += " AND pf:%s" % self._pageName(page)

        return queryString

    def _withArchivePrefix(self, path):
        """Prepend "/mpiwg/online/" unless *path* already starts with "/mpiwg/"."""
        if path.startswith("/mpiwg/"):
            return path
        return "/mpiwg/online/" + path

    def _escapePhrase(self, value):
        """Escape backslashes and double quotes for use inside a quoted phrase."""
        return value.replace('\\', '\\\\').replace('"', '\\"')

    def _resolveDri(self, dri):
        """Fetch the index.meta path the DRI resolver returns for *dri*.

        Raises:
            web.badrequest: if the resolver answers with HTTP 404.
        """
        f = urllib.urlopen(DRI_SERVER + dri)
        try:
            if f.getcode() == 404:
                raise web.badrequest("DRI: %s not existing" % dri)
            indexMeta = f.read()
        finally:
            # Always release the connection, also on the error path.
            f.close()

        if indexMeta:
            indexMeta = self._withArchivePrefix(indexMeta)
        return os.path.normpath(indexMeta)

    def _pageName(self, page):
        """Reduce a page reference to the bare page name used in the query.

        Only the page name is stored, not its path, so just the last path
        component is kept in case something like "pageimg/xxx" is passed.
        The file extension is removed, and if the remaining name still
        contains dots its last dotted part is cut off as well.
        """
        name = os.path.split(page)[1]
        name = os.path.splitext(name)[0]
        parts = name.split(".")
        if len(parts) > 1:
            name = ".".join(parts[:-1])
        return name

    def generateRowForJson(self, hitID, hit, query, pageSize):
        """Build the JSON row for one hit.

        Args:
            hitID: running number of the hit; stored as a string under "id".
            hit: mapping for one hit; its "bbox" value is expected in the
                form "bbox x1 x2 y1 y2", where (x1, x2) is the upper left
                and (y1, y2) the lower right corner.
            query: the query text; stored under "text".
            pageSize: (width, height) of the page.

        Returns:
            A dict with "id" and "text".  "areas" (a one-element list) is
            added only if the bounding box can be parsed and both page
            dimensions are non-zero.
        """
        ret = {"id": str(hitID), "text": query}

        bbox = hit.get("bbox")
        if not bbox:
            return ret

        splitted = bbox.split()
        try:
            # splitted[0] is the leading "bbox" token.
            x = (int(splitted[1]), int(splitted[2]))
            y = (int(splitted[3]), int(splitted[4]))
        except (ValueError, IndexError):
            return ret

        # getPageSize reports an unknown size as (0, 0); a relative box
        # cannot be computed then, so the row is returned without areas.
        if not (pageSize[0] and pageSize[1]):
            return ret

        x, y = self.calculateRelBoundingBox(x, y, pageSize)
        ret["areas"] = [self.generateAreaForJson(x, y)]
        return ret

    def generateAreaForJson(self, x, y):
        """Describe the box spanned by corners *x* and *y* as strings.

        Args:
            x: (horizontal, vertical) position of the upper left corner.
            y: (horizontal, vertical) position of the lower right corner.

        Returns:
            A dict with the keys "width", "height", "y" and "x".
        """
        return {
            "width": str(y[0] - x[0]),
            "height": str(y[1] - x[1]),
            "y": str(x[1]),
            "x": str(x[0]),
        }

    def calculateRelBoundingBox(self, x, y, pageSize):
        """Scale both corner points by the page size.

        Args:
            x: first corner as a pair of numbers.
            y: second corner as a pair of numbers.
            pageSize: (width, height); each must be non-zero.

        Returns:
            The two corners with the first component divided by the width
            and the second by the height.

        Raises:
            ZeroDivisionError: if a page dimension is zero.
        """
        width, height = pageSize[0], pageSize[1]
        xneu = (float(x[0]) / width, float(x[1]) / height)
        yneu = (float(y[0]) / width, float(y[1]) / height)
        return xneu, yneu

    def getPageSize(self, queryString):
        """Look up the page size for the given query.

        Only the first hit is used (there should be just one).  Its
        "ocr_page" value is expected in the form "x1 x2 y1 y2", where
        (x1, x2) is the upper left and (y1, y2) the lower right corner.

        Returns:
            ``(y1 - x1, y2 - x2)``, or ``(0, 0)`` if there is no hit or the
            value is missing or cannot be parsed.
        """
        response = self.searchText(queryString, wt="json")

        for hit in response:
            ocrPage = hit.get("ocr_page")
            if not ocrPage:
                return 0, 0

            try:
                # Too few fields also end up here: unpacking raises ValueError.
                x1, x2, y1, y2 = [int(v) for v in ocrPage.split()[:4]]
            except ValueError:
                return 0, 0

            return y1 - x1, y2 - x2

        return 0, 0