--- documentViewer/MpdlXmlTextServer.py 2011/07/29 18:36:04 1.238.2.5
+++ documentViewer/MpdlXmlTextServer.py 2011/08/16 16:27:08 1.238.2.14
@@ -1,4 +1,3 @@
-
from OFS.SimpleItem import SimpleItem
from Products.PageTemplates.PageTemplateFile import PageTemplateFile
@@ -30,12 +29,6 @@ def getTextFromNode(node):
"""get the cdata content of a node"""
if node is None:
return ""
- # ET:
-# text = node.text or ""
-# for e in node:
-# text += gettext(e)
-# if e.tail:
-# text += e.tail
# 4Suite:
nodelist=node.childNodes
@@ -96,7 +89,7 @@ class MpdlXmlTextServer(SimpleItem):
url = docinfo['url']
pagesize = pageinfo['queryPageSize']
pn = pageinfo.get('searchPN',1)
- sn = pageinfo['sn']
+ sn = pageinfo.get('sn',None) #TODO: is this key called 's' now?
highlightQuery = pageinfo['highlightQuery']
query =pageinfo['query']
queryType =pageinfo['queryType']
@@ -188,8 +181,6 @@ class MpdlXmlTextServer(SimpleItem):
if not docpath:
return None
- url = docinfo['url']
- selfurl = self.absolute_url()
pn = pageinfo['current']
hrefList=[]
myList= ""
@@ -207,10 +198,6 @@ class MpdlXmlTextServer(SimpleItem):
def getAllGisPlaces (self, docinfo=None, pageinfo=None):
"""Show all Gis Places of whole Book """
xpath ='//echo:place'
- docpath =docinfo['textURLPath']
- url = docinfo['url']
- selfurl =self.absolute_url()
- pn =pageinfo['current']
hrefList=[]
myList=""
text=self.getServerData("xpath.xql", "document=%s&xpath=%s"%(docinfo['textURLPath'],xpath))
@@ -227,32 +214,40 @@ class MpdlXmlTextServer(SimpleItem):
def processPageInfo(self, dom, docinfo, pageinfo):
"""processes page info divs from dom and stores in docinfo and pageinfo"""
- # process all toplevel divs
- alldivs = dom.findall(".//div")
- pagediv = None
+ # assume the first second-level div is pageMeta
+ alldivs = dom.find("div")
+
+ if alldivs is None or alldivs.get('class', '') != 'pageMeta':
+ logging.error("processPageInfo: pageMeta div not found!")
+ return
+
for div in alldivs:
dc = div.get('class')
- # page content div
- if dc == 'pageContent':
- pagediv = div
-
# pageNumberOrig
- elif dc == 'pageNumberOrig':
+ if dc == 'pageNumberOrig':
pageinfo['pageNumberOrig'] = div.text
# pageNumberOrigNorm
elif dc == 'pageNumberOrigNorm':
pageinfo['pageNumberOrigNorm'] = div.text
- # pageNumberOrigNorm
+ # pageHeaderTitle
+ elif dc == 'pageHeaderTitle':
+ pageinfo['pageHeaderTitle'] = div.text
+
+ # numFigureEntries
elif dc == 'countFigureEntries':
- docinfo['countFigureEntries'] = getInt(div.text)
+ docinfo['numFigureEntries'] = getInt(div.text)
- # pageNumberOrigNorm
+ # numTocEntries
elif dc == 'countTocEntries':
# WTF: s1 = int(s)/30+1
- docinfo['countTocEntries'] = getInt(div.text)
+ docinfo['numTocEntries'] = getInt(div.text)
+
+ # numPlaces
+ elif dc == 'countPlaces':
+ docinfo['numPlaces'] = getInt(div.text)
# numTextPages
elif dc == 'countPages':
@@ -262,18 +257,19 @@ class MpdlXmlTextServer(SimpleItem):
if docinfo.get('numPages', 0) == 0:
# seems to be text-only - update page count
docinfo['numPages'] = np
- pageinfo['end'] = min(pageinfo['end'], np)
+ #pageinfo['end'] = min(pageinfo['end'], np)
pageinfo['numgroups'] = int(np / pageinfo['groupsize'])
if np % pageinfo['groupsize'] > 0:
pageinfo['numgroups'] += 1
-
+
+ #logging.debug("processPageInfo: pageinfo=%s"%repr(pageinfo))
return
- def getTextPage(self, mode="text_dict", pn=1, docinfo=None, pageinfo=None):
+ def getTextPage(self, mode="text", pn=1, docinfo=None, pageinfo=None):
"""returns single page from fulltext"""
logging.debug("getTextPage mode=%s, pn=%s"%(mode,pn))
- # check for cached text -- but this shouldn't be called twice
+ # check for cached text -- but ideally this shouldn't be called twice
if pageinfo.has_key('textPage'):
logging.debug("getTextPage: using cached text")
return pageinfo['textPage']
@@ -291,11 +287,16 @@ class MpdlXmlTextServer(SimpleItem):
tocMode = pageinfo.get('tocMode', None)
tocPN = pageinfo.get('tocPN',None)
characterNormalization = pageinfo.get('characterNormalization', None)
- selfurl = docinfo['viewerUrl']
- if mode == "text_dict":
- # text_dict is called textPollux in the backend
+ selfurl = docinfo['viewerUrl']
+
+ if mode == "dict" or mode == "text_dict":
+ # dict (formerly text_dict) is called textPollux in the backend
textmode = "textPollux"
+ elif not mode:
+ # default is text
+ mode = "text"
+ textmode = "text"
else:
textmode = mode
@@ -311,7 +312,8 @@ class MpdlXmlTextServer(SimpleItem):
# page content is in
pagediv = None
# ElementTree 1.2 in Python 2.6 can't do div[@class='pageContent']
- alldivs = dom.findall(".//div")
+ # so we look at the second-level divs
+ alldivs = dom.findall("div")
for div in alldivs:
dc = div.get('class')
# page content div
@@ -321,19 +323,23 @@ class MpdlXmlTextServer(SimpleItem):
# plain text mode
if mode == "text":
- if pagediv:
+ # get full url assuming documentViewer is parent
+ selfurl = self.getLink()
+ if pagediv is not None:
links = pagediv.findall(".//a")
for l in links:
href = l.get('href')
if href and href.startswith('#note-'):
- href = href.replace('#note-',"?mode=%s&url=%s&viewMode=text&tocMode=%s&tocPN=%s&pn=%s#note-"%(urlmode,url,tocMode,tocPN,pn))
+ href = href.replace('#note-',"%s#note-"%selfurl)
l.set('href', href)
return serialize(pagediv)
# text-with-links mode
- elif mode == "text_dict":
- if pagediv:
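+ elif mode == "dict": # NOTE(review): "text_dict" is still accepted when choosing textmode but is not matched here - confirm whether it should be normalized to "dict"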
+ if pagediv is not None:
+ viewerurl = docinfo['viewerUrl']
+ selfurl = self.getLink()
# check all a-tags
links = pagediv.findall(".//a")
for l in links:
@@ -342,14 +348,13 @@ class MpdlXmlTextServer(SimpleItem):
if href:
# is link with href
if href.startswith('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/lt/wordInfo.xql'):
- # is pollux link
- selfurl = self.absolute_url()
- # change href
- l.set('href', href.replace('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/lt/wordInfo.xql','%s/head_main_voc'%selfurl))
- # add target
+ # is dictionary link - change href (keeping parameters)
+ l.set('href', href.replace('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/lt/wordInfo.xql','%s/template/viewer_wordinfo'%viewerurl))
+ # add target to open new page
l.set('target', '_blank')
- if href.startswith('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/lt/lemma.xql'):
+ # TODO: is this needed?
+ if href.startswith('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/lt/lemma.xql'):
selfurl = self.absolute_url()
l.set('href', href.replace('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/lt/lemma.xql','%s/head_main_lemma'%selfurl))
l.set('target', '_blank')
@@ -357,24 +362,25 @@ class MpdlXmlTextServer(SimpleItem):
l.set('ondblclick', 'popupWin.focus();')
if href.startswith('#note-'):
- l.set('href', href.replace('#note-',"?mode=%s&url=%s&viewMode=text_dict&tocMode=%s&tocPN=%s&pn=%s#note-"%(urlmode,url,tocMode,tocPN,pn)))
+ # note link
+ l.set('href', href.replace('#note-',"%s#note-"%selfurl))
return serialize(pagediv)
# xml mode
elif mode == "xml":
- if pagediv:
+ if pagediv is not None:
return serialize(pagediv)
# pureXml mode
elif mode == "pureXml":
- if pagediv:
+ if pagediv is not None:
return serialize(pagediv)
# gis mode
elif mode == "gis":
name = docinfo['name']
- if pagediv:
+ if pagediv is not None:
# check all a-tags
links = pagediv.findall(".//a")
for l in links:
@@ -388,28 +394,10 @@ class MpdlXmlTextServer(SimpleItem):
return "no text here"
- # WTF: is this needed?
- def getOrigPages(self, docinfo=None, pageinfo=None):
- logging.debug("CALLED: getOrigPages!")
- if not pageinfo.has_key('pageNumberOrig'):
- logging.warning("getOrigPages: not in pageinfo!")
- return None
-
- return pageinfo['pageNumberOrig']
-
- # WTF: is this needed?
- def getOrigPagesNorm(self, docinfo=None, pageinfo=None):
- logging.debug("CALLED: getOrigPagesNorm!")
- if not pageinfo.has_key('pageNumberOrigNorm'):
- logging.warning("getOrigPagesNorm: not in pageinfo!")
- return None
-
- return pageinfo['pageNumberOrigNorm']
-
# TODO: should be getWordInfo
- def getTranslate(self, word=None, language=None):
- """translate into another languages"""
- data = self.getServerData("lt/wordInfo.xql","language="+str(language)+"&word="+urllib.quote(word)+"&output=html")
+ def getWordInfo(self, word='', language='', display=''):
+ """show information (like dictionaries) about word"""
+ data = self.getServerData("lt/wordInfo.xql","language=%s&word=%s&display=%s&output=html"%(language,urllib.quote(word),urllib.quote(display)))
return data
# WTF: what does this do?
@@ -491,7 +479,7 @@ class MpdlXmlTextServer(SimpleItem):
return docinfo
- def getTocPage(self, mode="text", pn=1, pageinfo=None, docinfo=None):
+ def getTocPage(self, mode="text", pn=None, start=None, size=None, pageinfo=None, docinfo=None):
"""returns single page from the table of contents"""
logging.debug("getTocPage mode=%s, pn=%s"%(mode,pn))
if mode == "text":
@@ -506,24 +494,21 @@ class MpdlXmlTextServer(SimpleItem):
tocxml = docinfo.get('tocXML_%s'%mode, None)
if not tocxml:
logging.error("getTocPage: unable to find tocXML")
- return "No ToC"
+ return "Error: no table of contents!"
- pagesize = int(pageinfo['tocPageSize'])
- url = docinfo['url']
- urlmode = docinfo['mode']
- selfurl = docinfo['viewerUrl']
- viewMode= pageinfo['viewMode']
- tocMode = pageinfo['tocMode']
- tocPN = int(pageinfo['tocPN'])
- pn = tocPN
+ if size is None:
+ size = pageinfo.get('tocPageSize', 30)
+
+ if start is None:
+ start = (pn - 1) * size
fulltoc = ET.fromstring(tocxml)
if fulltoc:
# paginate
- start = (pn - 1) * pagesize * 2
- len = pagesize * 2
- del fulltoc[:start]
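+ first = (start - 1) * 2 # NOTE(review): start defaults to (pn - 1) * size, i.e. 0 for pn=1, making first == -2 so the del below keeps only the last two divs - confirm start is meant to be 1-based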
+ len = size * 2
+ del fulltoc[:first]
del fulltoc[len:]
tocdivs = fulltoc
@@ -535,12 +520,23 @@ class MpdlXmlTextServer(SimpleItem):
# take pn from href
m = re.match(r'page-fragment\.xql.*pn=(\d+)', href)
if m is not None:
- # and create new url
- l.set('href', '%s?mode=%s&url=%s&viewMode=%s&pn=%s&tocMode=%s&tocPN=%s'%(selfurl, urlmode, url, viewMode, m.group(1), tocMode, tocPN))
+ # and create new url (assuming parent is documentViewer)
+ url = self.getLink('pn', m.group(1))
+ l.set('href', url)
else:
logging.warning("getTocPage: Problem with link=%s"%href)
- return serialize(tocdivs)
+ # fix two-divs-per-row with containing div
+ newtoc = ET.Element('div', {'class':'queryResultPage'})
+ for (d1,d2) in zip(tocdivs[::2],tocdivs[1::2]):
+ e = ET.Element('div',{'class':'tocline'})
+ e.append(d1)
+ e.append(d2)
+ newtoc.append(e)
+
+ return serialize(newtoc)
+
+ return "ERROR: no table of contents!"
def manage_changeMpdlXmlTextServer(self,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/",timeout=40,RESPONSE=None):