comparison MpdlXmlTextServer.py @ 455:0a53fea83df7 elementtree

more work renovating
author casties
date Fri, 15 Jul 2011 21:34:41 +0200
parents beb7ccb92564
children b27a7d2f06ff
comparison
equal deleted inserted replaced
454:73e3273c7624 455:0a53fea83df7
1 1
2 from OFS.SimpleItem import SimpleItem 2 from OFS.SimpleItem import SimpleItem
3 from Products.PageTemplates.PageTemplateFile import PageTemplateFile 3 from Products.PageTemplates.PageTemplateFile import PageTemplateFile
4
4 from Ft.Xml import EMPTY_NAMESPACE, Parse 5 from Ft.Xml import EMPTY_NAMESPACE, Parse
5 from Ft.Xml.Domlette import NonvalidatingReader 6 from Ft.Xml.Domlette import NonvalidatingReader
6 import Ft.Xml.Domlette 7 import Ft.Xml.Domlette
7 import cStringIO 8 import cStringIO
8 9
9 import xml.etree.ElementTree as ET 10 import xml.etree.ElementTree as ET
10 11
11 import md5 12 import re
12 import sys
13 import logging 13 import logging
14 import urllib 14 import urllib
15 import documentViewer 15 import documentViewer
16 #from documentViewer import getTextFromNode, serializeNode 16 #from documentViewer import getTextFromNode, serializeNode
17
18 def intOr0(s, default=0):
19 """convert s to int or return default"""
20 try:
21 return int(s)
22 except:
23 return default
17 24
18 def getText(node): 25 def getText(node):
19 """get the cdata content of a node""" 26 """get the cdata content of a node"""
20 if node is None: 27 if node is None:
21 return "" 28 return ""
42 def getTextFromNode(node): 49 def getTextFromNode(node):
43 """get the cdata content of a node""" 50 """get the cdata content of a node"""
44 if node is None: 51 if node is None:
45 return "" 52 return ""
46 # ET: 53 # ET:
47 #text = node.text or "" 54 # text = node.text or ""
48 #for e in node: 55 # for e in node:
49 # text += gettext(e) 56 # text += gettext(e)
50 # if e.tail: 57 # if e.tail:
51 # text += e.tail 58 # text += e.tail
52 59
53 # 4Suite: 60 # 4Suite:
54 nodelist=node.childNodes 61 nodelist=node.childNodes
55 text = "" 62 text = ""
56 for n in nodelist: 63 for n in nodelist:
80 {'label':'Config','action':'manage_changeMpdlXmlTextServerForm'}, 87 {'label':'Config','action':'manage_changeMpdlXmlTextServerForm'},
81 )+SimpleItem.manage_options 88 )+SimpleItem.manage_options
82 89
83 manage_changeMpdlXmlTextServerForm = PageTemplateFile("zpt/manage_changeMpdlXmlTextServer", globals()) 90 manage_changeMpdlXmlTextServerForm = PageTemplateFile("zpt/manage_changeMpdlXmlTextServer", globals())
84 91
85 def __init__(self,id,title="",serverUrl="http://mpdl-system.mpiwg-berlin.mpg.de/mpdl/interface/", serverName=None, timeout=40): 92 def __init__(self,id,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/", serverName=None, timeout=40):
86 #def __init__(self,id,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de:30030/mpdl/interface/", serverName=None, timeout=40):
87 93
88 """constructor""" 94 """constructor"""
89 self.id=id 95 self.id=id
90 self.title=title 96 self.title=title
91 self.timeout = timeout 97 self.timeout = timeout
101 def getServerData(self, method, data=None): 107 def getServerData(self, method, data=None):
102 """returns result from text server for method+data""" 108 """returns result from text server for method+data"""
103 url = self.serverUrl+method 109 url = self.serverUrl+method
104 return documentViewer.getHttpData(url,data,timeout=self.timeout) 110 return documentViewer.getHttpData(url,data,timeout=self.timeout)
105 111
112 # WTF: what does this really do? can it be integrated in getPage?
106 def getSearch(self, pageinfo=None, docinfo=None): 113 def getSearch(self, pageinfo=None, docinfo=None):
107 """get search list""" 114 """get search list"""
115 logging.debug("getSearch()")
108 docpath = docinfo['textURLPath'] 116 docpath = docinfo['textURLPath']
109 url = docinfo['url'] 117 url = docinfo['url']
110 pagesize = pageinfo['queryPageSize'] 118 pagesize = pageinfo['queryPageSize']
111 pn = pageinfo.get('searchPN',1) 119 pn = pageinfo.get('searchPN',1)
112 sn = pageinfo['sn'] 120 sn = pageinfo['sn']
205 selfurl = self.absolute_url() 213 selfurl = self.absolute_url()
206 pn = pageinfo['current'] 214 pn = pageinfo['current']
207 hrefList=[] 215 hrefList=[]
208 myList= "" 216 myList= ""
209 text=self.getServerData("xpath.xql", "document=%s&xpath=%s&pn=%s"%(docinfo['textURLPath'],xpath,pn)) 217 text=self.getServerData("xpath.xql", "document=%s&xpath=%s&pn=%s"%(docinfo['textURLPath'],xpath,pn))
210 dom = Parse(text) 218 dom = ET.fromstring(text)
211 result = dom.xpath("//result/resultPage/place") 219 result = dom.findall(".//result/resultPage/place")
212 for l in result: 220 for l in result:
213 hrefNode= l.getAttributeNodeNS(None, u"id") 221 href = l.get("id")
214 href= hrefNode.nodeValue
215 hrefList.append(href) 222 hrefList.append(href)
223 # WTF: what does this do?
216 myList = ",".join(hrefList) 224 myList = ",".join(hrefList)
217 #logging.debug("getGisPlaces :%s"%(myList)) 225 #logging.debug("getGisPlaces :%s"%(myList))
218 return myList 226 return myList
219 227
220 def getAllGisPlaces (self, docinfo=None, pageinfo=None): 228 def getAllGisPlaces (self, docinfo=None, pageinfo=None):
225 selfurl =self.absolute_url() 233 selfurl =self.absolute_url()
226 pn =pageinfo['current'] 234 pn =pageinfo['current']
227 hrefList=[] 235 hrefList=[]
228 myList="" 236 myList=""
229 text=self.getServerData("xpath.xql", "document=%s&xpath=%s"%(docinfo['textURLPath'],xpath)) 237 text=self.getServerData("xpath.xql", "document=%s&xpath=%s"%(docinfo['textURLPath'],xpath))
230 dom =Parse(text) 238 dom = ET.fromstring(text)
231 result = dom.xpath("//result/resultPage/place") 239 result = dom.findall(".//result/resultPage/place")
232 240
233 for l in result: 241 for l in result:
234 hrefNode = l.getAttributeNodeNS(None, u"id") 242 href = l.get("id")
235 href= hrefNode.nodeValue
236 hrefList.append(href) 243 hrefList.append(href)
244 # WTF: what does this do?
237 myList = ",".join(hrefList) 245 myList = ",".join(hrefList)
238 #logging.debug("getALLGisPlaces :%s"%(myList)) 246 #logging.debug("getALLGisPlaces :%s"%(myList))
239 return myList 247 return myList
240 248
241 249 def processPageInfo(self, dom, docinfo, pageinfo):
242 def getTextPage(self, mode="text_dict", pn=1, docinfo=None, pageinfo=None): 250 """processes page info divs from dom and stores in docinfo and pageinfo"""
243 """returns single page from fulltext""" 251 # process all toplevel divs
244 docpath = docinfo['textURLPath']
245 path = docinfo['textURLPath']
246 url = docinfo.get('url',None)
247 name = docinfo.get('name',None)
248 pn =pageinfo['current']
249 sn = pageinfo['sn']
250 #optionToggle =pageinfo ['optionToggle']
251 highlightQuery = pageinfo['highlightQuery']
252 #mode = pageinfo ['viewMode']
253 tocMode = pageinfo['tocMode']
254 characterNormalization=pageinfo['characterNormalization']
255 tocPN = pageinfo['tocPN']
256 selfurl = self.absolute_url()
257 if mode == "text_dict":
258 textmode = "textPollux"
259 else:
260 textmode = mode
261
262 textParam = "document=%s&mode=%s&pn=%s&characterNormalization=%s"%(docpath,textmode,pn,characterNormalization)
263 if highlightQuery is not None:
264 textParam +="&highlightQuery=%s&sn=%s"%(urllib.quote(highlightQuery),sn)
265
266 pagexml = self.getServerData("page-fragment.xql",textParam)
267 dom = ET.fromstring(pagexml)
268 #dom = NonvalidatingReader.parseStream(pagexml)
269
270 #original Pages
271 #pagedivs = dom.xpath("//div[@class='pageNumberOrig']")
272
273 """if pagedivs == dom.xpath("//div[@class='pageNumberOrig']"):
274 if len(pagedivs)>0:
275 docinfo['pageNumberOrig']= getTextFromNode(pagedivs[0])
276 logging.debug("ORIGINAL PAGE: %s"%(docinfo['pageNumberOrig']))
277
278 #original Pages Norm
279 pagedivs = dom.xpath("//div[@class='pageNumberOrigNorm']")
280 if pagedivs == dom.xpath("//div[@class='pageNumberOrigNorm']"):
281 if len(pagedivs)>0:
282 docinfo['pageNumberOrigNorm']= getTextFromNode(pagedivs[0])
283 logging.debug("ORIGINAL PAGE NORM: %s"%(docinfo['pageNumberOrigNorm']))
284 """
285 #figureEntries
286 # pagedivs = dom.xpath("//div[@class='countFigureEntries']")
287 # if pagedivs == dom.xpath("//div[@class='countFigureEntries']"):
288 # if len(pagedivs)>0:
289 # docinfo['countFigureEntries'] = getTextFromNode(pagedivs[0])
290 # s = getTextFromNode(pagedivs[0])
291 # if s=='0':
292 # try:
293 # docinfo['countFigureEntries'] = int(s)
294 # except:
295 # docinfo['countFigureEntries'] = 0
296 # else:
297 # s1 = int(s)/30+1
298 # try:
299 # docinfo['countFigureEntries'] = int(s1)
300 # except:
301 # docinfo['countFigureEntries'] = 0
302 #
303 # #allPlaces
304 # pagedivs = dom.xpath("//div[@class='countPlaces']")
305 # if pagedivs == dom.xpath("//div[@class='countPlaces']"):
306 # if len(pagedivs)>0:
307 # docinfo['countPlaces']= getTextFromNode(pagedivs[0])
308 # s = getTextFromNode(pagedivs[0])
309 # try:
310 # docinfo['countPlaces'] = int(s)
311 # except:
312 # docinfo['countPlaces'] = 0
313 #
314 # #tocEntries
315 # pagedivs = dom.xpath("//div[@class='countTocEntries']")
316 # if pagedivs == dom.xpath("//div[@class='countTocEntries']"):
317 # if len(pagedivs)>0:
318 # docinfo['countTocEntries'] = int(getTextFromNode(pagedivs[0]))
319 # s = getTextFromNode(pagedivs[0])
320 # if s=='0':
321 # try:
322 # docinfo['countTocEntries'] = int(s)
323 # except:
324 # docinfo['countTocEntries'] = 0
325 # else:
326 # s1 = int(s)/30+1
327 # try:
328 # docinfo['countTocEntries'] = int(s1)
329 # except:
330 # docinfo['countTocEntries'] = 0
331
332 #numTextPages
333 #pagedivs = dom.xpath("//div[@class='countPages']")
334 alldivs = dom.findall(".//div") 252 alldivs = dom.findall(".//div")
335 pagediv = None 253 pagediv = None
336 for div in alldivs: 254 for div in alldivs:
337 dc = div.get('class') 255 dc = div.get('class')
256
257 # page content div
338 if dc == 'pageContent': 258 if dc == 'pageContent':
339 pagediv = div 259 pagediv = div
260
261 # pageNumberOrig
262 elif dc == 'pageNumberOrig':
263 pageinfo['pageNumberOrig'] = div.text
340 264
341 if dc == 'countPages': 265 # pageNumberOrigNorm
342 try: 266 elif dc == 'pageNumberOrigNorm':
343 np = int(div.text) 267 pageinfo['pageNumberOrigNorm'] = div.text
344 docinfo['numPages'] = np 268
345 pageinfo['end'] = min(pageinfo['end'], np) 269 # pageNumberOrigNorm
346 pageinfo['numgroups'] = int(np / pageinfo['groupsize']) 270 elif dc == 'countFigureEntries':
347 if np % pageinfo['groupsize'] > 0: 271 docinfo['countFigureEntries'] = intOr0(div.text)
348 pageinfo['numgroups'] += 1 272
349 273 # pageNumberOrigNorm
350 except: 274 elif dc == 'countTocEntries':
351 docinfo['numPages'] = 0 275 # WTF: s1 = int(s)/30+1
352 276 docinfo['countTocEntries'] = intOr0(div.text)
277
278 # numTextPages
279 elif dc == 'countPages':
280 np = intOr0(div.text)
281 if np > 0:
282 docinfo['numTextPages'] = np
283 if docinfo.get('numPages', 0) == 0:
284 # seems to be text-only
285 docinfo['numTextPages'] = np
286 pageinfo['end'] = min(pageinfo['end'], np)
287 pageinfo['numgroups'] = int(np / pageinfo['groupsize'])
288 if np % pageinfo['groupsize'] > 0:
289 pageinfo['numgroups'] += 1
290
291 return
292
293
294 def getTextPage(self, mode="text_dict", pn=1, docinfo=None, pageinfo=None):
295 """returns single page from fulltext"""
296 logging.debug("getTextPage mode=%s, pn=%s"%(mode,pn))
297 # check for cached text -- but this shouldn't be called twice
298 if pageinfo.has_key('textPage'):
299 logging.debug("getTextPage: using cached text")
300 return pageinfo['textPage']
301
302 docpath = docinfo['textURLPath']
303 # just checking
304 if pageinfo['current'] != pn:
305 logging.warning("getTextPage: current!=pn!")
306
307 # stuff for constructing full urls
308 url = docinfo['url']
309 urlmode = docinfo['mode']
310 sn = pageinfo.get('sn', None)
311 highlightQuery = pageinfo.get('highlightQuery', None)
312 tocMode = pageinfo.get('tocMode', None)
313 tocPN = pageinfo.get('tocPN',None)
314 characterNormalization = pageinfo.get('characterNormalization', None)
315 selfurl = docinfo['viewerUrl']
316
317 if mode == "text_dict":
318 # text_dict is called textPollux in the backend
319 textmode = "textPollux"
320 else:
321 textmode = mode
322
323 textParam = "document=%s&mode=%s&pn=%s&characterNormalization=%s"%(docpath,textmode,pn,characterNormalization)
324 if highlightQuery:
325 textParam +="&highlightQuery=%s&sn=%s"%(urllib.quote(highlightQuery),sn)
326
327 # fetch the page
328 pagexml = self.getServerData("page-fragment.xql",textParam)
329 dom = ET.fromstring(pagexml)
330 # extract additional info
331 self.processPageInfo(dom, docinfo, pageinfo)
332 # page content is in <div class="pageContent">
333 pagediv = None
334 # ElementTree 1.2 in Python 2.6 can't do div[@class='pageContent']
335 alldivs = dom.findall(".//div")
336 for div in alldivs:
337 dc = div.get('class')
338 # page content div
339 if dc == 'pageContent':
340 pagediv = div
353 break 341 break
354
355 # ROC: why?
356 # else:
357 # #no full text -- init to 0
358 # docinfo['pageNumberOrig'] = 0
359 # docinfo['countFigureEntries'] = 0
360 # docinfo['countPlaces'] = 0
361 # docinfo['countTocEntries'] = 0
362 # docinfo['numPages'] = 0
363 # docinfo['pageNumberOrigNorm'] = 0
364 # #return docinfo
365 342
366 # plain text mode 343 # plain text mode
367 if mode == "text": 344 if mode == "text":
368 #pagedivs = dom.xpath("/div")
369 if pagediv: 345 if pagediv:
370 links = pagediv.findall(".//a") 346 links = pagediv.findall(".//a")
371 for l in links: 347 for l in links:
372 href = l.get('href') 348 href = l.get('href')
373 if href and href.startswith('#note-'): 349 if href and href.startswith('#note-'):
374 href = href.replace('#note-',"?url=%s&viewMode=text&tocMode=%s&tocPN=%s&pn=%s#note-"%(url,tocMode,tocPN,pn)) 350 href = href.replace('#note-',"?mode=%s&url=%s&viewMode=text&tocMode=%s&tocPN=%s&pn=%s#note-"%(urlmode,url,tocMode,tocPN,pn))
375 l.set('href', href) 351 l.set('href', href)
376 logging.debug("page=%s"%ET.tostring(pagediv, 'UTF-8')) 352
377 return serialize(pagediv) 353 return serialize(pagediv)
378 354
379 if mode == "xml":
380 if pagediv:
381 return serialize(pagediv)
382
383 if mode == "pureXml":
384 if pagediv:
385 return serialize(pagediv)
386
387 if mode == "gis":
388 if pagediv:
389 # check all a-tags
390 links = pagediv.findall(".//a")
391 for l in links:
392 href = l.get('href')
393 if href:
394 if href.startswith('http://chinagis.mpiwg-berlin.mpg.de'):
395 l.set('href', href.replace('chinagis_REST/REST/db/chgis/mpdl','chinagis/REST/db/mpdl/%s'%name))
396 l.set('target', '_blank')
397
398 return serialize(pagenode)
399
400 # text-with-links mode 355 # text-with-links mode
401 if mode == "text_dict": 356 elif mode == "text_dict":
402 if pagediv: 357 if pagediv:
403 # check all a-tags 358 # check all a-tags
404 links = pagediv.findall(".//a") 359 links = pagediv.findall(".//a")
405 for l in links: 360 for l in links:
406 href = l.get('href') 361 href = l.get('href')
421 l.set('target', '_blank') 376 l.set('target', '_blank')
422 l.set('onclick',"popupWin = window.open(this.href, 'InfoWindow', 'menubar=no, location,width=500,height=600,top=180, left=700, toolbar=no, scrollbars=1'); return false;") 377 l.set('onclick',"popupWin = window.open(this.href, 'InfoWindow', 'menubar=no, location,width=500,height=600,top=180, left=700, toolbar=no, scrollbars=1'); return false;")
423 l.set('ondblclick', 'popupWin.focus();') 378 l.set('ondblclick', 'popupWin.focus();')
424 379
425 if href.startswith('#note-'): 380 if href.startswith('#note-'):
426 l.set('href', href.replace('#note-',"?url=%s&viewMode=text_dict&tocMode=%s&tocPN=%s&pn=%s#note-"%(url,tocMode,tocPN,pn))) 381 l.set('href', href.replace('#note-',"?mode=%s&url=%s&viewMode=text_dict&tocMode=%s&tocPN=%s&pn=%s#note-"%(urlmode,url,tocMode,tocPN,pn)))
427 382
428 return serialize(pagediv) 383 return serialize(pagediv)
429 384
385 # xml mode
386 elif mode == "xml":
387 if pagediv:
388 return serialize(pagediv)
389
390 # pureXml mode
391 elif mode == "pureXml":
392 if pagediv:
393 return serialize(pagediv)
394
395 # gis mode
396 elif mode == "gis":
397 name = docinfo['name']
398 if pagediv:
399 # check all a-tags
400 links = pagediv.findall(".//a")
401 for l in links:
402 href = l.get('href')
403 if href:
404 if href.startswith('http://chinagis.mpiwg-berlin.mpg.de'):
405 l.set('href', href.replace('chinagis_REST/REST/db/chgis/mpdl','chinagis/REST/db/mpdl/%s'%name))
406 l.set('target', '_blank')
407
408 return serialize(pagediv)
409
430 return "no text here" 410 return "no text here"
431 411
412 # WTF: is this needed?
432 def getOrigPages(self, docinfo=None, pageinfo=None): 413 def getOrigPages(self, docinfo=None, pageinfo=None):
433 docpath = docinfo['textURLPath'] 414 logging.debug("CALLED: getOrigPages!")
434 pn =pageinfo['current'] 415 if not pageinfo.has_key('pageNumberOrig'):
435 selfurl = self.absolute_url() 416 logging.warning("getOrigPages: not in pageinfo!")
436 pagexml = self.getServerData("page-fragment.xql","document=%s&pn=%s"%(docpath, pn)) 417 return None
437 dom = Parse(pagexml) 418
438 pagedivs = dom.xpath("//div[@class='pageNumberOrig']") 419 return pageinfo['pageNumberOrig']
439 if pagedivs == dom.xpath("//div[@class='pageNumberOrig']"): 420
440 if len(pagedivs)>0: 421 # WTF: is this needed?
441 docinfo['pageNumberOrig']= getTextFromNode(pagedivs[0])
442 return docinfo['pageNumberOrig']
443
444 def getOrigPagesNorm(self, docinfo=None, pageinfo=None): 422 def getOrigPagesNorm(self, docinfo=None, pageinfo=None):
445 docpath = docinfo['textURLPath'] 423 logging.debug("CALLED: getOrigPagesNorm!")
446 pn =pageinfo['current'] 424 if not pageinfo.has_key('pageNumberOrigNorm'):
447 selfurl = self.absolute_url() 425 logging.warning("getOrigPagesNorm: not in pageinfo!")
448 pagexml = self.getServerData("page-fragment.xql","document=%s&pn=%s"%(docpath, pn)) 426 return None
449 dom = Parse(pagexml) 427
450 pagedivs = dom.xpath("//div[@class='pageNumberOrigNorm']") 428 return pageinfo['pageNumberOrigNorm']
451 if pagedivs == dom.xpath("//div[@class='pageNumberOrigNorm']"):
452 if len(pagedivs)>0:
453 docinfo['pageNumberOrigNorm']= getTextFromNode(pagedivs[0])
454 return docinfo['pageNumberOrigNorm']
455
456 429
430 # TODO: should be getWordInfo
457 def getTranslate(self, word=None, language=None): 431 def getTranslate(self, word=None, language=None):
458 """translate into another languages""" 432 """translate into another languages"""
459 data = self.getServerData("lt/wordInfo.xql","language="+str(language)+"&word="+urllib.quote(word)+"&output=html") 433 data = self.getServerData("lt/wordInfo.xql","language="+str(language)+"&word="+urllib.quote(word)+"&output=html")
460 #pagexml=self.template.fulltextclient.eval("/mpdl/interface/lt/lex.xql","document=&language="+str(language)+"&query="+url_quote(str(query)))
461 return data 434 return data
462 435
436 # WTF: what does this do?
463 def getLemma(self, lemma=None, language=None): 437 def getLemma(self, lemma=None, language=None):
464 """simular words lemma """ 438 """simular words lemma """
465 data = self.getServerData("lt/lemma.xql","language="+str(language)+"&lemma="+urllib.quote(lemma)+"&output=html") 439 data = self.getServerData("lt/lemma.xql","language="+str(language)+"&lemma="+urllib.quote(lemma)+"&output=html")
466 return data 440 return data
467 441
442 # WTF: what does this do?
468 def getLemmaQuery(self, query=None, language=None): 443 def getLemmaQuery(self, query=None, language=None):
469 """simular words lemma """ 444 """simular words lemma """
470 data = self.getServerData("lt/lemma.xql","language="+str(language)+"&query="+urllib.quote(query)+"&output=html") 445 data = self.getServerData("lt/lemma.xql","language="+str(language)+"&query="+urllib.quote(query)+"&output=html")
471 return data 446 return data
472 447
448 # WTF: what does this do?
473 def getLex(self, query=None, language=None): 449 def getLex(self, query=None, language=None):
474 #simular words lemma 450 #simular words lemma
475 data = self.getServerData("lt/lex.xql","document=&language="+str(language)+"&query="+urllib.quote(query)) 451 data = self.getServerData("lt/lex.xql","document=&language="+str(language)+"&query="+urllib.quote(query))
476 return data 452 return data
477 453
454 # WTF: what does this do?
478 def getQuery (self, docinfo=None, pageinfo=None, query=None, queryType=None, pn=1): 455 def getQuery (self, docinfo=None, pageinfo=None, query=None, queryType=None, pn=1):
479 #number of 456 #number of
480 docpath = docinfo['textURLPath'] 457 docpath = docinfo['textURLPath']
481 pagesize = pageinfo['queryPageSize'] 458 pagesize = pageinfo['queryPageSize']
482 pn = pageinfo['searchPN'] 459 pn = pageinfo['searchPN']
491 tocSearch = int(getTextFromNode(numdivs[0])) 468 tocSearch = int(getTextFromNode(numdivs[0]))
492 tc=int((tocSearch/10)+1) 469 tc=int((tocSearch/10)+1)
493 return tc 470 return tc
494 471
495 def getToc(self, mode="text", docinfo=None): 472 def getToc(self, mode="text", docinfo=None):
496 """loads table of contents and stores in docinfo""" 473 """loads table of contents and stores XML in docinfo"""
474 logging.debug("getToc mode=%s"%mode)
497 if mode == "none": 475 if mode == "none":
498 return docinfo 476 return docinfo
477
499 if 'tocSize_%s'%mode in docinfo: 478 if 'tocSize_%s'%mode in docinfo:
500 # cached toc 479 # cached toc
501 return docinfo 480 return docinfo
502 481
503 docpath = docinfo['textURLPath'] 482 docpath = docinfo['textURLPath']
509 else: 488 else:
510 queryType = mode 489 queryType = mode
511 # number of entries in toc 490 # number of entries in toc
512 tocSize = 0 491 tocSize = 0
513 tocDiv = None 492 tocDiv = None
514 493 # fetch full toc
515 pagexml = self.getServerData("doc-query.xql","document=%s&queryType=%s&queryResultPageSize=%s&queryResultPN=%s"%(docpath,queryType, pagesize, pn)) 494 pagexml = self.getServerData("doc-query.xql","document=%s&queryType=%s&queryResultPageSize=%s&queryResultPN=%s"%(docpath,queryType, pagesize, pn))
516 495 dom = ET.fromstring(pagexml)
517 # post-processing downloaded xml 496 # page content is in <div class="queryResultPage">
518 pagedom = Parse(pagexml) 497 pagediv = None
519 # get number of entries 498 # ElementTree 1.2 in Python 2.6 can't do div[@class='queryResultPage']
520 numdivs = pagedom.xpath("//div[@class='queryResultHits']") 499 alldivs = dom.findall("div")
521 if len(numdivs) > 0: 500 for div in alldivs:
522 tocSize = int(getTextFromNode(numdivs[0])) 501 dc = div.get('class')
523 docinfo['tocSize_%s'%mode] = tocSize 502 # page content div
503 if dc == 'queryResultPage':
504 pagediv = div
505
506 elif dc == 'queryResultHits':
507 docinfo['tocSize_%s'%mode] = intOr0(div.text)
508
509 if pagediv:
510 # # split xml in chunks
511 # tocs = []
512 # tocdivs = pagediv.findall('div')
513 # for p in zip(tocdivs[::2], tocdivs[1::2]):
514 # toc = serialize(p[0])
515 # toc += serialize(p[1])
516 # tocs.append(toc)
517 # logging.debug("pair: %s"%(toc))
518 # store XML in docinfo
519 docinfo['tocXML_%s'%mode] = ET.tostring(pagediv, 'UTF-8')
520
524 return docinfo 521 return docinfo
525 522
526 def getTocPage(self, mode="text", pn=1, pageinfo=None, docinfo=None): 523 def getTocPage(self, mode="text", pn=1, pageinfo=None, docinfo=None):
527 """returns single page from the table of contents""" 524 """returns single page from the table of contents"""
528 # TODO: this should use the cached TOC 525 logging.debug("getTocPage mode=%s, pn=%s"%(mode,pn))
529 if mode == "text": 526 if mode == "text":
530 queryType = "toc" 527 queryType = "toc"
531 else: 528 else:
532 queryType = mode 529 queryType = mode
533 docpath = docinfo['textURLPath'] 530
534 path = docinfo['textURLPath'] 531 # check for cached TOC
535 pagesize = pageinfo['tocPageSize'] 532 if not docinfo.has_key('tocXML_%s'%mode):
536 pn = pageinfo['tocPN'] 533 self.getToc(mode=mode, docinfo=docinfo)
534
535 tocxml = docinfo.get('tocXML_%s'%mode, None)
536 if not tocxml:
537 logging.error("getTocPage: unable to find tocXML")
538 return "No ToC"
539
540 pagesize = int(pageinfo['tocPageSize'])
537 url = docinfo['url'] 541 url = docinfo['url']
538 selfurl = self.absolute_url() 542 urlmode = docinfo['mode']
543 selfurl = docinfo['viewerUrl']
539 viewMode= pageinfo['viewMode'] 544 viewMode= pageinfo['viewMode']
540 characterNormalization = pageinfo ['characterNormalization']
541 #optionToggle =pageinfo ['optionToggle']
542 tocMode = pageinfo['tocMode'] 545 tocMode = pageinfo['tocMode']
543 tocPN = pageinfo['tocPN'] 546 tocPN = int(pageinfo['tocPN'])
544 547
545 data = self.getServerData("doc-query.xql","document=%s&queryType=%s&queryResultPageSize=%s&queryResultPN=%s&characterNormalization=regPlusNorm"%(docpath,queryType, pagesize, pn)) 548 fulltoc = ET.fromstring(tocxml)
546 page = data.replace('page-fragment.xql?document=%s'%str(path),'%s?url=%s&viewMode=%s&tocMode=%s&tocPN=%s'%(selfurl,url, viewMode, tocMode, tocPN)) 549
547 text = page.replace('mode=image','mode=texttool') 550 if fulltoc:
548 return text 551 # paginate
552 #start = (pn - 1) * pagesize * 2
553 #end = start + pagesize * 2
554 #tocdivs = fulltoc[start:end]
555 tocdivs = fulltoc
556
557 # check all a-tags
558 links = tocdivs.findall(".//a")
559 for l in links:
560 href = l.get('href')
561 if href:
562 # take pn from href
563 m = re.match(r'page-fragment\.xql.*pn=(\d+)', href)
564 if m is not None:
565 # and create new url
566 l.set('href', '%s?mode=%s&url=%s&viewMode=%s&pn=%s&tocMode=%s&tocPN=%s'%(selfurl, urlmode, url, viewMode, m.group(1), tocMode, tocPN))
567 else:
568 logging.warning("getTocPage: Problem with link=%s"%href)
569
570 return serialize(tocdivs)
571
549 572
550 def manage_changeMpdlXmlTextServer(self,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/",timeout=40,RESPONSE=None): 573 def manage_changeMpdlXmlTextServer(self,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/",timeout=40,RESPONSE=None):
551 #def manage_changeMpdlXmlTextServer(self,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de:30030/mpdl/interface/",timeout=40,RESPONSE=None):
552 """change settings""" 574 """change settings"""
553 self.title=title 575 self.title=title
554 self.timeout = timeout 576 self.timeout = timeout
555 self.serverUrl = serverUrl 577 self.serverUrl = serverUrl
556 if RESPONSE is not None: 578 if RESPONSE is not None:
567 """add zogiimage""" 589 """add zogiimage"""
568 newObj = MpdlXmlTextServer(id,title,serverUrl,timeout) 590 newObj = MpdlXmlTextServer(id,title,serverUrl,timeout)
569 self.Destination()._setObject(id, newObj) 591 self.Destination()._setObject(id, newObj)
570 if RESPONSE is not None: 592 if RESPONSE is not None:
571 RESPONSE.redirect('manage_main') 593 RESPONSE.redirect('manage_main')
594
595