Mercurial > hg > documentViewer
annotate MpiwgXmlTextServer.py @ 626:7dafe8283312
make sslify accessible for templates.
author | casties |
---|---|
date | Mon, 15 Dec 2014 17:31:08 +0100 |
parents | c57d80a649ea |
children | 4a75a760def2 |
rev | line source |
---|---|
564 | 1 from OFS.SimpleItem import SimpleItem |
2 from Products.PageTemplates.PageTemplateFile import PageTemplateFile | |
3 | |
4 import xml.etree.ElementTree as ET | |
5 | |
6 import re | |
7 import logging | |
8 import urllib | |
9 import urlparse | |
10 import base64 | |
11 | |
576 | 12 from datetime import datetime |
13 | |
613
c57d80a649ea
CLOSED - # 281: List of thumbnails verschluckt Seite, wenn odd-scan-position gesetzt ist
casties
parents:
610
diff
changeset
|
14 from SrvTxtUtils import getInt, getText, getHttpData, serialize |
564 | 15 |
# Mapping of field names in the output of /mpiwg-mpdl-cms-web/query/GetDocInfo
# (keys) to the keys used in the documentViewer docinfo dict (values).
# Used by getTextInfo() to copy counts from the server response.
textinfoFieldMap = {
    'countPages' : 'numTextPages',
    'countFigures' : 'numFigureEntries',
    'countNotesHandwritten' : 'numHandwritten',
    'countNotes' : 'numNotes',
    'countPlaces' : 'numPlaces',
    'countTocEntries' : 'numTocEntries'
    }
25 | |
564 | 26 |
class MpiwgXmlTextServer(SimpleItem):
    """TextServer implementation for MPIWG-XML server.

    Fetches full-text pages, document info, table of contents and search
    results from an mpiwg-mpdl-cms-web backend via HTTP.
    """
    # Zope meta type shown in the ZMI
    meta_type = "MPIWG-XML TextServer"

    # ZMI management tabs: our config form plus the SimpleItem defaults
    manage_options = (
        {'label':'Config','action':'manage_changeMpiwgXmlTextServerForm'},
        )+SimpleItem.manage_options

    # page template rendering the configuration form
    manage_changeMpiwgXmlTextServerForm = PageTemplateFile("zpt/manage_changeMpiwgXmlTextServer", globals())
37 def __init__(self,id,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpiwg-mpdl-cms-web/", timeout=40, serverName=None, repositoryType='production'): | |
38 """constructor""" | |
39 self.id=id | |
40 self.title=title | |
41 self.timeout = timeout | |
42 self.repositoryType = repositoryType | |
43 if serverName is None: | |
44 self.serverUrl = serverUrl | |
45 else: | |
46 self.serverUrl = "http://%s/mpiwg-mpdl-cms-web/"%serverName | |
47 | |
48 def getHttpData(self, url, data=None): | |
49 """returns result from url+data HTTP request""" | |
50 return getHttpData(url,data,timeout=self.timeout) | |
51 | |
52 def getServerData(self, method, data=None): | |
53 """returns result from text server for method+data""" | |
54 url = self.serverUrl+method | |
55 return getHttpData(url,data,timeout=self.timeout) | |
56 | |
57 | |
    def getRepositoryType(self):
        """returns the repository type, e.g. 'production'"""
        # getattr with default: instances created before this attribute
        # existed may not have it persisted
        return getattr(self, 'repositoryType', None)
564 | 61 |
62 def getTextDownloadUrl(self, type='xml', docinfo=None): | |
63 """returns a URL to download the current text""" | |
64 docpath = docinfo.get('textURLPath', None) | |
65 if not docpath: | |
66 return None | |
67 | |
68 docpath = docpath.replace('.xml','.'+type) | |
69 url = '%sdoc/GetDocument?id=%s'%(self.serverUrl.replace('interface/',''), docpath) | |
70 return url | |
71 | |
72 | |
73 def getPlacesOnPage(self, docinfo=None, pn=None): | |
74 """Returns list of GIS places of page pn""" | |
610 | 75 logging.debug("getPlacesOnPage(pn=%s"%pn) |
76 if not 'places' in docinfo: | |
77 self.getTextInfo('places', docinfo) | |
78 | |
79 allplaces = docinfo.get('places', None) | |
80 if len(allplaces) == 0: | |
81 return [] | |
82 | |
83 # search for places on this page TODO: is there a better way? | |
84 places = [p for p in allplaces if p['pn'] == pn] | |
85 return places | |
86 """OLD: | |
564 | 87 docpath = docinfo.get('textURLPath',None) |
88 if not docpath: | |
89 return None | |
90 | |
91 places=[] | |
92 text=self.getServerData("xpath.xql", "document=%s&xpath=//place&pn=%s"%(docpath,pn)) | |
93 dom = ET.fromstring(text) | |
94 result = dom.findall(".//resultPage/place") | |
95 for l in result: | |
96 id = l.get("id") | |
97 name = l.text | |
98 place = {'id': id, 'name': name} | |
99 places.append(place) | |
100 | |
610 | 101 return places""" |
564 | 102 |
103 | |
    def getTextInfo(self, mode=None, docinfo=None):
        """reads document info, including page concordance, from text server

        mode selects an additional info list ('pages', 'toc', 'figures',
        'notes', 'handwritten', 'places'); any other value is treated as None
        and fetches only the general counts.  Results are cached in docinfo
        and the (updated) docinfo dict is returned.
        """
        logging.debug("getTextInfo mode=%s"%mode)

        field = ''
        if mode in ['pages', 'toc', 'figures', 'notes', 'handwritten', 'places']:
            # translate mode to field param
            if mode == 'handwritten':
                field = '&field=notesHandwritten'
            else:
                field = '&field=%s'%mode
        else:
            mode = None

        # check cached info
        if mode:
            # cached toc-request?
            if 'full_%s'%mode in docinfo:
                return docinfo

        else:
            # cached but no toc-request?
            if 'numTextPages' in docinfo:
                return docinfo

        docpath = docinfo.get('textURLPath', None)
        if docpath is None:
            logging.error("getTextInfo: no textURLPath!")
            return docinfo

        # fetch docinfo from the server; field is appended to the query
        pagexml = self.getServerData("query/GetDocInfo","docId=%s%s"%(docpath,field))
        dom = ET.fromstring(pagexml)
        # all info in tag <doc>
        doc = dom
        if doc is None:
            logging.error("getTextInfo: unable to find document-tag!")
        else:
            if mode is None:
                # get general info from system-tag
                sys = doc.find('system')
                if sys is not None:
                    for (k,v) in textinfoFieldMap.items():
                        # copy into docinfo (even if empty)
                        docinfo[v] = getInt(getText(sys.find(k)))

            else:
                # result is in list-tag
                l = doc.find('list')
                if l is not None:
                    # look for general info
                    for (k,v) in textinfoFieldMap.items():
                        # copy into docinfo (only if not empty)
                        s = doc.find(k)
                        if s is not None:
                            docinfo[v] = getInt(getText(s))

                    # list type determines how the items are parsed
                    lt = l.get('type')
                    #
                    # pageNumbers
                    #
                    if lt == 'pages':
                        # contains tags with page numbers
                        # <item n="14" o="2" o-norm="2" file="0014"/>
                        # n=scan number, o=original page no, on=normalized original page no
                        # pageNumbers is a dict indexed by scan number
                        pages = {}
                        for i in l:
                            page = {}
                            pn = getInt(i.get('n'))
                            page['pn'] = pn
                            no = i.get('o')
                            page['no'] = no
                            non = i.get('o-norm')
                            page['non'] = non

                            # skip items without a valid scan number
                            if pn > 0:
                                pages[pn] = page

                        docinfo['pageNumbers'] = pages

                    #
                    # toc
                    #
                    elif lt in ['toc', 'figures', 'notes', 'notesHandwritten']:
                        # contains tags with table of contents/figures
                        # <item n="2.1." lv="2">CAP.I. <ref o="119">132</ref></item>
                        tocs = []
                        for te in l:
                            if te.tag == 'item':
                                toc = {}
                                toc['level-string'] = te.get('n')
                                toc['level'] = te.get('lv')
                                toc['content'] = te.text.strip()
                                # the <ref> child carries the page numbers
                                ref = te.find('ref')
                                toc['pn'] = getInt(ref.text)
                                toc['no'] = ref.get('o')
                                toc['non'] = ref.get('o-norm')
                                tocs.append(toc)

                        # save as full_toc/full_figures
                        docinfo['full_%s'%mode] = tocs

                    #
                    # places
                    #
                    elif lt in ['places']:
                        # contains tags with place-ids
                        # <item id="N40004F-01"><ref>4</ref></item>
                        places = []
                        for p in l:
                            if p.tag == 'item':
                                place = {}
                                place['id'] = p.get('id')
                                ref = p.find('ref')
                                place['pn'] = getInt(ref.text)
                                places.append(place)

                        docinfo['places'] = places

        return docinfo
228 | |
229 | |
    def getTextPage(self, mode="text", pn=1, docinfo=None, pageinfo=None):
        """returns single page from fulltext

        mode is a comma-separated combination of 'text'/'dict'/'xml'/'gis'/
        'search'/'pundit'; pn is the page (scan) number.  Returns serialized
        HTML (or XML body), or None on error.
        """

        logging.debug("getTextPage mode=%s, pn=%s"%(mode,pn))
        startTime = datetime.now()
        # check for cached text -- but ideally this shouldn't be called twice
        if pageinfo.has_key('textPage'):
            logging.debug("getTextPage: using cached text")
            return pageinfo['textPage']

        docpath = docinfo.get('textURLPath', None)
        if not docpath:
            return None

        # stuff for constructing full urls
        selfurl = docinfo['viewerUrl']
        textParams = {'docId': docpath,
                      'page': pn}

        # character normalization mode requested by the UI
        normMode = pageinfo.get('characterNormalization', 'reg')
        # TODO: change values in form
        if normMode == 'regPlusNorm':
            normMode = 'norm'

        # TODO: this should not be necessary when the backend is fixed
        #textParams['normalization'] = normMode

        if not mode:
            # default is dict
            mode = 'text'

        modes = mode.split(',')
        # check for multiple layers
        if len(modes) > 1:
            logging.debug("getTextPage: more than one mode=%s"%mode)

        # mode defaults
        gisMode = False
        punditMode = False

        # search mode
        if 'search' in modes:
            # add highlighting
            highlightQuery = pageinfo.get('highlightQuery', None)
            if highlightQuery:
                textParams['highlightQuery'] = highlightQuery
                textParams['highlightElem'] = pageinfo.get('highlightElement', '')
                textParams['highlightElemPos'] = pageinfo.get('highlightElementPos', '')

            # ignore mode in the following
            modes.remove('search')

        # pundit mode
        if 'pundit' in modes:
            punditMode = True
            # ignore mode in the following
            modes.remove('pundit')

        # other modes don't combine
        if 'dict' in modes:
            textmode = 'dict'
            textParams['outputFormat'] = 'html'
        elif 'xml' in modes:
            textmode = 'xml'
            textParams['outputFormat'] = 'xmlDisplay'
            # xml display always shows the original text layer
            normMode = 'orig'
        elif 'gis' in modes:
            gisMode = True
            # gis mode uses plain text
            textmode = 'plain'
            textParams['outputFormat'] = 'html'
        else:
            # text is default mode
            textmode = 'plain'
            textParams['outputFormat'] = 'html'

        try:
            # fetch the page
            pagexml = self.getServerData("query/GetPage",urllib.urlencode(textParams))
            dom = ET.fromstring(pagexml)
        except Exception, e:
            logging.error("Error reading page: %s"%e)
            return None

        # plain text or text-with-links mode
        if textmode == 'plain' or textmode == 'dict':
            # the text is in div@class=text
            pagediv = dom.find(".//div[@class='text']")
            logging.debug("pagediv: %s"%repr(pagediv))
            if pagediv is not None:
                # add textmode and normMode classes
                #pagediv.set('class', 'text %s %s'%(textmode, normMode))
                # reduce w-spans to the requested text layer
                self._processWTags(textmode, normMode, pagediv)
                #self._processPbTag(pagediv, pageinfo)
                # rewrite figure images/links to use digilib
                self._processFigures(pagediv, docinfo)
                #self._fixEmptyDivs(pagediv)
                # get full url assuming documentViewer is parent
                selfurl = self.getLink()
                # check all a-tags
                links = pagediv.findall('.//a')
                for l in links:
                    href = l.get('href')
                    if href:
                        # is link with href
                        linkurl = urlparse.urlparse(href)
                        if linkurl.path.endswith('GetDictionaryEntries'):
                            #TODO: replace wordInfo page
                            # add target to open new page
                            l.set('target', '_blank')

                if punditMode:
                    self._addPunditAttributes(pagediv, pageinfo, docinfo)

                if gisMode:
                    self._addGisTags(pagediv, pageinfo, docinfo)

                s = serialize(pagediv)
                logging.debug("getTextPage done in %s"%(datetime.now()-startTime))
                return s

        # xml mode
        elif textmode == "xml":
            # the text is in body
            pagediv = dom.find(".//body")
            logging.debug("pagediv: %s"%repr(pagediv))
            if pagediv is not None:
                return serialize(pagediv)

        # fall-through: nothing matched above
        logging.error("getTextPage: error in text mode %s or in text!"%(textmode))
        return None
565 | 360 |
    def _processWTags(self, textMode, normMode, pagediv):
        """selects the necessary information from w-spans and removes the rest from pagediv

        Each word is wrapped in a <span class="w"> containing both a
        dictionary link and plain spans for each normalization layer
        ('orig'/'reg'/'norm').  Depending on textMode ('dict' or not) and
        normMode, all but the wanted child are removed and the remaining
        tag names are suppressed (tag=None renders as bare text).
        """
        logging.debug("processWTags(textMode=%s,norm=%s,pagediv"%(repr(textMode),repr(normMode)))
        startTime = datetime.now()
        wtags = pagediv.findall(".//span[@class='w']")
        for wtag in wtags:
            if textMode == 'dict':
                # delete non-a-tags
                wtag.remove(wtag.find("span[@class='nodictionary orig']"))
                wtag.remove(wtag.find("span[@class='nodictionary reg']"))
                wtag.remove(wtag.find("span[@class='nodictionary norm']"))
                # delete non-matching children of a-tag and suppress remaining tag name
                atag = wtag.find("*[@class='dictionary']")
                if normMode == 'orig':
                    atag.remove(atag.find("span[@class='reg']"))
                    atag.remove(atag.find("span[@class='norm']"))
                    atag.find("span[@class='orig']").tag = None
                elif normMode == 'reg':
                    atag.remove(atag.find("span[@class='orig']"))
                    atag.remove(atag.find("span[@class='norm']"))
                    atag.find("span[@class='reg']").tag = None
                elif normMode == 'norm':
                    atag.remove(atag.find("span[@class='orig']"))
                    atag.remove(atag.find("span[@class='reg']"))
                    atag.find("span[@class='norm']").tag = None

            else:
                # delete a-tag
                wtag.remove(wtag.find("*[@class='dictionary']"))
                # delete non-matching children and suppress remaining tag name
                if normMode == 'orig':
                    wtag.remove(wtag.find("span[@class='nodictionary reg']"))
                    wtag.remove(wtag.find("span[@class='nodictionary norm']"))
                    wtag.find("span[@class='nodictionary orig']").tag = None
                elif normMode == 'reg':
                    wtag.remove(wtag.find("span[@class='nodictionary orig']"))
                    wtag.remove(wtag.find("span[@class='nodictionary norm']"))
                    wtag.find("span[@class='nodictionary reg']").tag = None
                elif normMode == 'norm':
                    wtag.remove(wtag.find("span[@class='nodictionary orig']"))
                    wtag.remove(wtag.find("span[@class='nodictionary reg']"))
                    wtag.find("span[@class='nodictionary norm']").tag = None

            # suppress w-tag name
            wtag.tag = None

        logging.debug("processWTags in %s"%(datetime.now()-startTime))
        return pagediv
409 | |
566 | 410 def _processPbTag(self, pagediv, pageinfo): |
565 | 411 """extracts information from pb-tag and removes it from pagediv""" |
412 pbdiv = pagediv.find(".//span[@class='pb']") | |
413 if pbdiv is None: | |
414 logging.warning("getTextPage: no pb-span!") | |
415 return pagediv | |
416 | |
417 # extract running head | |
418 rh = pbdiv.find(".//span[@class='rhead']") | |
419 if rh is not None: | |
420 pageinfo['pageHeaderTitle'] = getText(rh) | |
421 | |
422 # remove pb-div from parent | |
423 ppdiv = pagediv.find(".//span[@class='pb']/..") | |
424 ppdiv.remove(pbdiv) | |
425 return pagediv | |
564 | 426 |
565 | 427 def _addPunditAttributes(self, pagediv, pageinfo, docinfo): |
610 | 428 """add about-attributes to divs for pundit annotation tool""" |
564 | 429 textid = docinfo.get('DRI', "fn=%s"%docinfo.get('documentPath', '???')) |
430 pn = pageinfo.get('pn', '1') | |
431 # check all div-tags | |
432 divs = pagediv.findall(".//div") | |
433 for d in divs: | |
434 id = d.get('id') | |
435 if id: | |
566 | 436 # TODO: check path (cf RFC2396) |
564 | 437 d.set('about', "http://echo.mpiwg-berlin.mpg.de/%s/pn=%s/#%s"%(textid,pn,id)) |
438 cls = d.get('class','') | |
439 cls += ' pundit-content' | |
440 d.set('class', cls.strip()) | |
441 | |
442 return pagediv | |
443 | |
610 | 444 def _addGisTags(self, pagediv, pageinfo, docinfo): |
445 """add links for gis places""" | |
446 # use last part of documentPath as db-id | |
447 docpath = docinfo.get('documentPath', '') | |
448 textid = docpath.split('/')[-1] | |
449 # add our URL as backlink | |
450 selfurl = self.getLink() | |
451 doc = base64.b64encode(selfurl) | |
452 # check all span@class=place | |
453 spans = pagediv.findall(".//span[@class='place']") | |
454 for s in spans: | |
455 id = s.get('id') | |
456 if id: | |
457 # make links like http://mappit.mpiwg-berlin.mpg.de/db/RESTdb/db/mpdl/songy_tiang_zh_1637?id=N400061-02&doc=aHR...&format=gis | |
458 s.tag = 'a' | |
459 # TODO: make links configurable | |
460 url = "http://mappit.mpiwg-berlin.mpg.de/db/RESTdb/db/mpdl/%s?id=%s&doc=%s&format=gis"%(textid,id,doc) | |
461 s.set('href', url) | |
462 s.set('target', '_blank') | |
463 | |
464 return pagediv | |
465 | |
566 | 466 def _processFigures(self, pagediv, docinfo): |
467 """processes figure-tags""" | |
576 | 468 # unfortunately etree can not select class.startswith('figure') |
469 divs = pagediv.findall(".//span[@class]") | |
566 | 470 scalerUrl = docinfo['digilibScalerUrl'] |
471 viewerUrl = docinfo['digilibViewerUrl'] | |
472 for d in divs: | |
576 | 473 if not d.get('class').startswith('figure'): |
474 continue | |
475 | |
566 | 476 try: |
477 a = d.find('a') | |
478 img = a.find('img') | |
479 imgsrc = img.get('src') | |
480 imgurl = urlparse.urlparse(imgsrc) | |
481 imgq = imgurl.query | |
482 imgparams = urlparse.parse_qs(imgq) | |
483 fn = imgparams.get('fn', None) | |
484 if fn is not None: | |
485 # parse_qs puts parameters in lists | |
486 fn = fn[0] | |
487 # TODO: check valid path | |
488 # fix img@src | |
489 newsrc = '%s?fn=%s&dw=200&dh=200'%(scalerUrl,fn) | |
490 img.set('src', newsrc) | |
491 # fix a@href | |
492 newlink = '%s?fn=%s'%(viewerUrl,fn) | |
493 a.set('href', newlink) | |
494 a.set('target', '_blank') | |
495 | |
496 except: | |
497 logging.warn("processFigures: strange figure!") | |
498 | |
583 | 499 |
500 def _cleanSearchResult(self, pagediv): | |
501 """fixes search result html (change pbs and figures)""" | |
502 # replace figure-tag with figureNumText | |
503 for fig in pagediv.findall(".//span[@class='figure']"): | |
504 txt = fig.findtext(".//span[@class='figureNumText']") | |
505 tail = fig.tail | |
506 fig.clear() | |
507 fig.set('class', 'figure') | |
508 fig.text = txt | |
509 fig.tail = tail | |
510 | |
511 # replace lb-tag with "//" | |
512 for lb in pagediv.findall(".//br[@class='lb']"): | |
513 lb.tag = 'span' | |
514 lb.text = '//' | |
515 | |
516 # replace pb-tag with "///" | |
517 for pb in pagediv.findall(".//span[@class='pb']"): | |
518 tail = pb.tail | |
519 pb.clear() | |
520 pb.set('class', 'pb') | |
521 pb.text = '///' | |
522 pb.tail = tail | |
523 | |
524 return pagediv | |
525 | |
526 def _cleanSearchResult2(self, pagediv): | |
527 """fixes search result html (change pbs and figures)""" | |
528 # unfortunately etree can not select class.startswith('figure') | |
529 divs = pagediv.findall(".//span[@class]") | |
530 for d in divs: | |
531 cls = d.get('class') | |
532 if cls.startswith('figure'): | |
533 # replace figure-tag with figureNumText | |
534 txt = d.findtext(".//span[@class='figureNumText']") | |
535 d.clear() | |
536 d.set('class', 'figure') | |
537 d.text = txt | |
538 | |
539 elif cls.startswith('pb'): | |
540 # replace pb-tag with "//" | |
541 d.clear() | |
542 d.set('class', 'pb') | |
543 d.text = '//' | |
544 | |
545 return pagediv | |
546 | |
547 | |
566 | 548 |
565 | 549 def _fixEmptyDivs(self, pagediv): |
550 """fixes empty div-tags by inserting a space""" | |
551 divs = pagediv.findall('.//div') | |
552 for d in divs: | |
553 if len(d) == 0 and not d.text: | |
554 # make empty divs non-empty | |
555 d.text = ' ' | |
556 | |
557 return pagediv | |
558 | |
559 | |
    def getSearchResults(self, mode, query=None, pageinfo=None, docinfo=None):
        """loads list of search results and stores XML in docinfo

        Results are cached in docinfo under 'results'/'resultSize', keyed by
        a 'cachedQuery' string of mode, query and normalization mode; a
        changed query invalidates the cache.  Returns the updated docinfo.
        """
        normMode = pageinfo.get('characterNormalization', 'reg')
        logging.debug("getSearchResults mode=%s query=%s norm=%s"%(mode, query, normMode))
        if mode == "none":
            return docinfo

        #TODO: put mode into query

        cachedQuery = docinfo.get('cachedQuery', None)
        if cachedQuery is not None:
            # cached search result
            if cachedQuery == '%s_%s_%s'%(mode,query,normMode):
                # same query
                return docinfo

            else:
                # different query - drop the stale cache entries
                del docinfo['resultSize']
                del docinfo['results']

        # cache query
        docinfo['cachedQuery'] = '%s_%s_%s'%(mode,query,normMode)

        # fetch full results (first 1000 hits in one go)
        docpath = docinfo['textURLPath']
        params = {'docId': docpath,
                  'query': query,
                  'pageSize': 1000,
                  'page': 1,
                  'outputFormat': 'html'}
        pagexml = self.getServerData("query/QueryDocument",urllib.urlencode(params))
        results = []
        try:
            dom = ET.fromstring(pagexml)
            # clean html output
            self._processWTags('plain', normMode, dom)
            self._cleanSearchResult(dom)
            # page content is currently in multiple <td align=left>
            alldivs = dom.findall(".//tr[@class='hit']")
            for div in alldivs:
                # change tr to div
                div.tag = 'div'
                # change td to span
                for d in div.findall('td'):
                    d.tag = 'span'

                # TODO: can we put etree in the session?
                results.append(div)

        except Exception, e:
            # parse error: keep whatever results were collected so far
            logging.error("GetSearchResults: Error parsing search result: %s"%e)

        # store results in docinfo
        docinfo['resultSize'] = len(results)
        docinfo['results'] = results

        return docinfo
618 | |
619 | |
620 def getResultsPage(self, mode="text", query=None, pn=None, start=None, size=None, pageinfo=None, docinfo=None): | |
583 | 621 """returns single page from the list of search results""" |
564 | 622 logging.debug("getResultsPage mode=%s, pn=%s"%(mode,pn)) |
623 # get (cached) result | |
624 self.getSearchResults(mode=mode, query=query, pageinfo=pageinfo, docinfo=docinfo) | |
625 | |
568 | 626 resultxml = docinfo.get('results', None) |
564 | 627 if not resultxml: |
568 | 628 logging.error("getResultPage: unable to find results") |
564 | 629 return "Error: no result!" |
630 | |
631 if size is None: | |
632 size = pageinfo.get('resultPageSize', 10) | |
633 | |
634 if start is None: | |
635 start = (pn - 1) * size | |
636 | |
576 | 637 if resultxml is not None: |
564 | 638 # paginate |
639 first = start-1 | |
576 | 640 last = first+size |
641 tocdivs = resultxml[first:last] | |
564 | 642 |
576 | 643 toc = ET.Element('div', attrib={'class':'queryResultPage'}) |
644 for div in tocdivs: | |
645 # check all a-tags | |
646 links = div.findall(".//a") | |
647 for l in links: | |
648 href = l.get('href') | |
649 if href: | |
650 # assume all links go to pages | |
651 linkUrl = urlparse.urlparse(href) | |
652 linkParams = urlparse.parse_qs(linkUrl.query) | |
653 # take some parameters (make sure it works even if the link was already parsed) | |
654 params = {'pn': linkParams.get('page',linkParams.get('pn', None)), | |
655 'highlightQuery': linkParams.get('highlightQuery',None), | |
656 'highlightElement': linkParams.get('highlightElem',linkParams.get('highlightElement',None)), | |
657 'highlightElementPos': linkParams.get('highlightElemPos',linkParams.get('highlightElementPos',None)) | |
658 } | |
659 if not params['pn']: | |
660 logging.warn("getResultsPage: link has no page: %s"%href) | |
661 | |
662 url = self.getLink(params=params) | |
663 l.set('href', url) | |
564 | 664 |
576 | 665 toc.append(div) |
666 | |
667 return serialize(toc) | |
564 | 668 |
669 return "ERROR: no results!" | |
670 | |
671 | |
672 def getToc(self, mode='text', docinfo=None): | |
673 """returns list of table of contents from docinfo""" | |
674 logging.debug("getToc mode=%s"%mode) | |
675 if mode == 'text': | |
676 queryType = 'toc' | |
677 else: | |
678 queryType = mode | |
679 | |
680 if not 'full_%s'%queryType in docinfo: | |
681 # get new toc | |
682 docinfo = self.getTextInfo(queryType, docinfo) | |
683 | |
684 return docinfo.get('full_%s'%queryType, []) | |
685 | |
568 | 686 |
564 | 687 def getTocPage(self, mode='text', pn=None, start=None, size=None, pageinfo=None, docinfo=None): |
688 """returns single page from the table of contents""" | |
689 logging.debug("getTocPage mode=%s, pn=%s start=%s size=%s"%(mode,repr(pn),repr(start),repr(size))) | |
690 fulltoc = self.getToc(mode=mode, docinfo=docinfo) | |
691 if len(fulltoc) < 1: | |
692 logging.error("getTocPage: unable to find toc!") | |
693 return "Error: no table of contents!" | |
694 | |
695 if size is None: | |
696 size = pageinfo.get('tocPageSize', 30) | |
697 | |
698 if start is None: | |
699 start = (pn - 1) * size | |
700 | |
701 # paginate | |
702 first = (start - 1) | |
703 last = first + size | |
704 tocs = fulltoc[first:last] | |
705 tp = '<div>' | |
609 | 706 label = {'figures': 'Figure', 'notes': 'Note', 'handwritten': 'Handwritten note'}.get(mode, 'Item') |
564 | 707 for toc in tocs: |
708 pageurl = self.getLink('pn', toc['pn']) | |
709 tp += '<div class="tocline">' | |
568 | 710 content = toc['content'] |
609 | 711 lvs = toc['level-string'] |
568 | 712 if content: |
609 | 713 tp += '<div class="toc name">[%s] %s</div>'%(lvs, toc['content']) |
714 elif lvs: | |
715 tp += '<div class="toc name">[%s %s]</div>'%(label, lvs) | |
568 | 716 else: |
609 | 717 tp += '<div class="toc name">[%s]</div>'%(label) |
568 | 718 |
719 if toc.get('no', None): | |
720 tp += '<div class="toc page"><a href="%s">Page: %s (%s)</a></div>'%(pageurl, toc['pn'], toc['no']) | |
721 else: | |
722 tp += '<div class="toc page"><a href="%s">Page: %s</a></div>'%(pageurl, toc['pn']) | |
723 | |
564 | 724 tp += '</div>\n' |
725 | |
726 tp += '</div>\n' | |
727 | |
728 return tp | |
729 | |
730 | |
731 def manage_changeMpiwgXmlTextServer(self,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/",timeout=40,repositoryType=None,RESPONSE=None): | |
732 """change settings""" | |
733 self.title=title | |
734 self.timeout = timeout | |
735 self.serverUrl = serverUrl | |
736 if repositoryType: | |
737 self.repositoryType = repositoryType | |
738 if RESPONSE is not None: | |
739 RESPONSE.redirect('manage_main') | |
740 | |
741 # management methods | |
def manage_addMpiwgXmlTextServerForm(self):
    """Form for adding"""
    # render the add-form template in the context of the caller
    pt = PageTemplateFile("zpt/manage_addMpiwgXmlTextServer", globals()).__of__(self)
    return pt()
746 | |
def manage_addMpiwgXmlTextServer(self,id,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/",timeout=40,RESPONSE=None):
    """add MpiwgXmlTextServer"""
    # create the object and register it in the destination folder
    newObj = MpiwgXmlTextServer(id=id, title=title, serverUrl=serverUrl, timeout=timeout)
    self.Destination()._setObject(id, newObj)
    # redirect back to the management screen when called via the ZMI
    if RESPONSE is not None:
        RESPONSE.redirect('manage_main')
753 | |
610 | 754 |