Changeset 508:d5a47f82e755 in documentViewer for MpdlXmlTextServer.py
- Timestamp: Feb 27, 2012, 8:26:52 PM (12 years ago)
- Branch: elementtree
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
MpdlXmlTextServer.py
r506 r508 127 127 def getTextPage(self, mode="text", pn=1, docinfo=None, pageinfo=None): 128 128 """returns single page from fulltext""" 129 129 130 logging.debug("getTextPage mode=%s, pn=%s"%(mode,pn)) 130 131 # check for cached text -- but ideally this shouldn't be called twice … … 139 140 140 141 # stuff for constructing full urls 141 url = docinfo['url']142 urlmode = docinfo['mode']143 sn = pageinfo.get('sn', None)144 highlightQuery = pageinfo.get('highlightQuery', None)145 tocMode = pageinfo.get('tocMode', None)146 tocPN = pageinfo.get('tocPN',None)147 142 characterNormalization = pageinfo.get('characterNormalization', None) 148 143 moreTextParam = '' 149 144 selfurl = docinfo['viewerUrl'] 150 145 151 if mode == "dict" or mode == "text_dict": 146 if not mode: 147 # default is dict 148 mode = 'text' 149 150 modes = mode.split(',') 151 # check for multiple layers 152 if len(modes) > 1: 153 logging.debug("getTextPage: more than one mode=%s"%mode) 154 155 # search mode 156 if 'search' in modes: 157 # add highlighting 158 highlightQuery = pageinfo.get('highlightQuery', None) 159 sn = pageinfo.get('sn', None) 160 if highlightQuery and sn: 161 moreTextParam +="&highlightQuery=%s&sn=%s"%(urllib.quote(highlightQuery),sn) 162 163 # remove mode 164 modes.remove('search') 165 166 # other modes don't combine 167 if 'dict' in modes: 152 168 # dict is called textPollux in the backend 153 textmode = "textPollux" 154 elif not mode: 155 # default is text 156 mode = "text" 157 textmode = "text" 169 textmode = 'textPollux' 170 elif len(modes) == 0: 171 # text is default mode 172 textmode = 'text' 158 173 else: 159 textmode = mode 174 # just take first mode 175 textmode = modes[0] 160 176 161 177 textParam = "document=%s&mode=%s&pn=%s&characterNormalization=%s"%(docpath,textmode,pn,characterNormalization) 162 if highlightQuery: 163 textParam +="&highlightQuery=%s&sn=%s"%(urllib.quote(highlightQuery),sn) 178 textParam += moreTextParam 164 179 165 180 # fetch the page … … 181 196 
182 197 # plain text mode 183 if mode == "text":198 if textmode == "text": 184 199 # get full url assuming documentViewer is parent 185 200 selfurl = self.getLink() … … 195 210 196 211 # text-with-links mode 197 elif mode == "dict":212 elif textmode == "textPollux": 198 213 if pagediv is not None: 199 214 viewerurl = docinfo['viewerUrl'] … … 227 242 228 243 # xml mode 229 elif mode == "xml":244 elif textmode == "xml": 230 245 if pagediv is not None: 231 246 return serialize(pagediv) 232 247 233 248 # pureXml mode 234 elif mode == "pureXml":249 elif textmode == "pureXml": 235 250 if pagediv is not None: 236 251 return serialize(pagediv) 237 252 238 253 # gis mode 239 elif mode == "gis":254 elif textmode == "gis": 240 255 if pagediv is not None: 241 256 # check all a-tags … … 255 270 return None 256 271 272 273 def getSearchResults(self, mode, query=None, docinfo=None): 274 """loads list of search results and stores XML in docinfo""" 275 logging.debug("getSearchResults mode=%s query=%s"%(mode, query)) 276 if mode == "none": 277 return docinfo 278 279 if 'resultSize_%s_%s'%(mode,query) in docinfo: 280 # cached result 281 return docinfo 282 283 docpath = docinfo['textURLPath'] 284 # we need to set a result set size 285 pagesize = 1000 286 pn = 1 287 # fetch full results 288 params = {'document': docpath, 289 'mode': 'text', 290 'queryType': mode, 291 'query': query, 292 'queryResultPageSize': 1000, 293 'queryResultPN': 1} 294 pagexml = self.getServerData("doc-query.xql",urllib.urlencode(params)) 295 #pagexml = self.getServerData("doc-query.xql","document=%s&mode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s&s=%s&viewMode=%s&characterNormalization=%s&highlightElementPos=%s&highlightElement=%s&highlightQuery=%s"%(docpath, 'text', queryType, urllib.quote(query), pagesize, pn, s, viewMode,characterNormalization, highlightElementPos, highlightElement, urllib.quote(highlightQuery))) 296 dom = ET.fromstring(pagexml) 297 # page content is in <div 
class="queryResultPage"> 298 pagediv = None 299 # ElementTree 1.2 in Python 2.6 can't do div[@class='queryResultPage'] 300 alldivs = dom.findall("div") 301 for div in alldivs: 302 dc = div.get('class') 303 # page content div 304 if dc == 'queryResultPage': 305 pagediv = div 306 307 elif dc == 'queryResultHits': 308 docinfo['resultSize_%s_%s'%(mode,query)] = getInt(div.text) 309 310 if pagediv: 311 # store XML in docinfo 312 docinfo['resultXML_%s_%s'%(mode,query)] = ET.tostring(pagediv, 'UTF-8') 313 314 logging.debug("getSearchResults: pagediv=%s"%pagediv) 315 return docinfo 316 317 318 def getResultsPage(self, mode="text", query=None, pn=None, start=None, size=None, pageinfo=None, docinfo=None): 319 """returns single page from the table of contents""" 320 logging.debug("getResultsPage mode=%s, pn=%s"%(mode,pn)) 321 # check for cached TOC 322 #TODO: cache only one search 323 if not docinfo.has_key('resultXML_%s_%s'%(mode,query)): 324 self.getSearchResults(mode=mode, query=query, docinfo=docinfo) 325 326 resultxml = docinfo.get('resultXML_%s_%s'%(mode,query), None) 327 if not resultxml: 328 logging.error("getResultPage: unable to find resultXML") 329 return "Error: no result!" 
330 331 if size is None: 332 size = pageinfo.get('searchResultPageSize', 20) 333 334 if start is None: 335 start = (pn - 1) * size 336 337 fullresult = ET.fromstring(resultxml) 338 339 if fullresult: 340 # paginate 341 first = start 342 len = size 343 del fullresult[:first] 344 del fullresult[len:] 345 tocdivs = fullresult 346 347 # check all a-tags 348 links = tocdivs.findall(".//a") 349 for l in links: 350 href = l.get('href') 351 if href: 352 # take pn from href 353 m = re.match(r'page-fragment\.xql.*pn=(\d+)', href) 354 if m is not None: 355 # and create new url (assuming parent is documentViewer) 356 #TODO: add highlighting params 357 url = self.getLink('pn', m.group(1)) 358 l.set('href', url) 359 else: 360 logging.warning("getResultPage: Problem with link=%s"%href) 361 362 # fix two-divs-per-row with containing div 363 # newtoc = ET.Element('div', {'class':'queryResultPage'}) 364 # for (d1,d2) in zip(tocdivs[::2],tocdivs[1::2]): 365 # e = ET.Element('div',{'class':'tocline'}) 366 # e.append(d1) 367 # e.append(d2) 368 # newtoc.append(e) 369 370 return serialize(tocdivs) 371 372 return "ERROR: no results!" 373 257 374 258 375 def getToc(self, mode="text", docinfo=None):
Note: See TracChangeset for help on using the changeset viewer.