--- documentViewer/MpdlXmlTextServer.py 2011/07/15 19:34:41 1.238.2.2
+++ documentViewer/MpdlXmlTextServer.py 2011/08/16 16:27:08 1.238.2.14
@@ -1,4 +1,3 @@
-
from OFS.SimpleItem import SimpleItem
from Products.PageTemplates.PageTemplateFile import PageTemplateFile
@@ -12,28 +11,8 @@ import xml.etree.ElementTree as ET
import re
import logging
import urllib
-import documentViewer
-#from documentViewer import getTextFromNode, serializeNode
-
-def intOr0(s, default=0):
- """convert s to int or return default"""
- try:
- return int(s)
- except:
- return default
-def getText(node):
- """get the cdata content of a node"""
- if node is None:
- return ""
- # ET:
- text = node.text or ""
- for e in node:
- text += gettext(e)
- if e.tail:
- text += e.tail
-
- return text
+from SrvTxtUtils import getInt, getText, getHttpData
def serialize(node):
"""returns a string containing an XML snippet of node"""
@@ -50,12 +29,6 @@ def getTextFromNode(node):
"""get the cdata content of a node"""
if node is None:
return ""
- # ET:
-# text = node.text or ""
-# for e in node:
-# text += gettext(e)
-# if e.tail:
-# text += e.tail
# 4Suite:
nodelist=node.childNodes
@@ -90,7 +63,6 @@ class MpdlXmlTextServer(SimpleItem):
manage_changeMpdlXmlTextServerForm = PageTemplateFile("zpt/manage_changeMpdlXmlTextServer", globals())
def __init__(self,id,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/", serverName=None, timeout=40):
-
"""constructor"""
self.id=id
self.title=title
@@ -102,12 +74,12 @@ class MpdlXmlTextServer(SimpleItem):
def getHttpData(self, url, data=None):
"""returns result from url+data HTTP request"""
- return documentViewer.getHttpData(url,data,timeout=self.timeout)
+ return getHttpData(url,data,timeout=self.timeout)
def getServerData(self, method, data=None):
"""returns result from text server for method+data"""
url = self.serverUrl+method
- return documentViewer.getHttpData(url,data,timeout=self.timeout)
+ return getHttpData(url,data,timeout=self.timeout)
# WTF: what does this really do? can it be integrated in getPage?
def getSearch(self, pageinfo=None, docinfo=None):
@@ -117,7 +89,7 @@ class MpdlXmlTextServer(SimpleItem):
url = docinfo['url']
pagesize = pageinfo['queryPageSize']
pn = pageinfo.get('searchPN',1)
- sn = pageinfo['sn']
+ sn = pageinfo.get('sn',None) #TODO: is this s now?
highlightQuery = pageinfo['highlightQuery']
query =pageinfo['query']
queryType =pageinfo['queryType']
@@ -209,8 +181,6 @@ class MpdlXmlTextServer(SimpleItem):
if not docpath:
return None
- url = docinfo['url']
- selfurl = self.absolute_url()
pn = pageinfo['current']
hrefList=[]
myList= ""
@@ -228,10 +198,6 @@ class MpdlXmlTextServer(SimpleItem):
def getAllGisPlaces (self, docinfo=None, pageinfo=None):
"""Show all Gis Places of whole Book """
xpath ='//echo:place'
- docpath =docinfo['textURLPath']
- url = docinfo['url']
- selfurl =self.absolute_url()
- pn =pageinfo['current']
hrefList=[]
myList=""
text=self.getServerData("xpath.xql", "document=%s&xpath=%s"%(docinfo['textURLPath'],xpath))
@@ -248,53 +214,62 @@ class MpdlXmlTextServer(SimpleItem):
def processPageInfo(self, dom, docinfo, pageinfo):
"""processes page info divs from dom and stores in docinfo and pageinfo"""
- # process all toplevel divs
- alldivs = dom.findall(".//div")
- pagediv = None
+ # assume first second level div is pageMeta
+ alldivs = dom.find("div")
+
+ if alldivs is None or alldivs.get('class', '') != 'pageMeta':
+ logging.error("processPageInfo: pageMeta div not found!")
+ return
+
for div in alldivs:
dc = div.get('class')
- # page content div
- if dc == 'pageContent':
- pagediv = div
-
# pageNumberOrig
- elif dc == 'pageNumberOrig':
+ if dc == 'pageNumberOrig':
pageinfo['pageNumberOrig'] = div.text
# pageNumberOrigNorm
elif dc == 'pageNumberOrigNorm':
pageinfo['pageNumberOrigNorm'] = div.text
- # pageNumberOrigNorm
+ # pageHeaderTitle
+ elif dc == 'pageHeaderTitle':
+ pageinfo['pageHeaderTitle'] = div.text
+
+ # numFigureEntries
elif dc == 'countFigureEntries':
- docinfo['countFigureEntries'] = intOr0(div.text)
+ docinfo['numFigureEntries'] = getInt(div.text)
- # pageNumberOrigNorm
+ # numTocEntries
elif dc == 'countTocEntries':
# WTF: s1 = int(s)/30+1
- docinfo['countTocEntries'] = intOr0(div.text)
+ docinfo['numTocEntries'] = getInt(div.text)
+
+ # numPlaces
+ elif dc == 'countPlaces':
+ docinfo['numPlaces'] = getInt(div.text)
# numTextPages
elif dc == 'countPages':
- np = intOr0(div.text)
+ np = getInt(div.text)
if np > 0:
docinfo['numTextPages'] = np
if docinfo.get('numPages', 0) == 0:
- # seems to be text-only
- docinfo['numTextPages'] = np
- pageinfo['end'] = min(pageinfo['end'], np)
+ # seems to be text-only - update page count
+ docinfo['numPages'] = np
+ #pageinfo['end'] = min(pageinfo['end'], np)
pageinfo['numgroups'] = int(np / pageinfo['groupsize'])
if np % pageinfo['groupsize'] > 0:
pageinfo['numgroups'] += 1
-
+
+ #logging.debug("processPageInfo: pageinfo=%s"%repr(pageinfo))
return
- def getTextPage(self, mode="text_dict", pn=1, docinfo=None, pageinfo=None):
+ def getTextPage(self, mode="text", pn=1, docinfo=None, pageinfo=None):
"""returns single page from fulltext"""
logging.debug("getTextPage mode=%s, pn=%s"%(mode,pn))
- # check for cached text -- but this shouldn't be called twice
+ # check for cached text -- but ideally this shouldn't be called twice
if pageinfo.has_key('textPage'):
logging.debug("getTextPage: using cached text")
return pageinfo['textPage']
@@ -312,11 +287,16 @@ class MpdlXmlTextServer(SimpleItem):
tocMode = pageinfo.get('tocMode', None)
tocPN = pageinfo.get('tocPN',None)
characterNormalization = pageinfo.get('characterNormalization', None)
- selfurl = docinfo['viewerUrl']
- if mode == "text_dict":
- # text_dict is called textPollux in the backend
+ selfurl = docinfo['viewerUrl']
+
+ if mode == "dict" or mode == "text_dict":
+ # dict is called textPollux in the backend
textmode = "textPollux"
+ elif not mode:
+ # default is text
+ mode = "text"
+ textmode = "text"
else:
textmode = mode
@@ -332,7 +312,8 @@ class MpdlXmlTextServer(SimpleItem):
# page content is in
pagediv = None
# ElementTree 1.2 in Python 2.6 can't do div[@class='pageContent']
- alldivs = dom.findall(".//div")
+ # so we look at the second level divs
+ alldivs = dom.findall("div")
for div in alldivs:
dc = div.get('class')
# page content div
@@ -342,19 +323,23 @@ class MpdlXmlTextServer(SimpleItem):
# plain text mode
if mode == "text":
- if pagediv:
+ # get full url assuming documentViewer is parent
+ selfurl = self.getLink()
+ if pagediv is not None:
links = pagediv.findall(".//a")
for l in links:
href = l.get('href')
if href and href.startswith('#note-'):
- href = href.replace('#note-',"?mode=%s&url=%s&viewMode=text&tocMode=%s&tocPN=%s&pn=%s#note-"%(urlmode,url,tocMode,tocPN,pn))
+ href = href.replace('#note-',"%s#note-"%selfurl)
l.set('href', href)
return serialize(pagediv)
# text-with-links mode
- elif mode == "text_dict":
- if pagediv:
+ elif mode == "dict":
+ if pagediv is not None:
+ viewerurl = docinfo['viewerUrl']
+ selfurl = self.getLink()
# check all a-tags
links = pagediv.findall(".//a")
for l in links:
@@ -363,14 +348,13 @@ class MpdlXmlTextServer(SimpleItem):
if href:
# is link with href
if href.startswith('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/lt/wordInfo.xql'):
- # is pollux link
- selfurl = self.absolute_url()
- # change href
- l.set('href', href.replace('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/lt/wordInfo.xql','%s/head_main_voc'%selfurl))
- # add target
+ # is dictionary link - change href (keeping parameters)
+ l.set('href', href.replace('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/lt/wordInfo.xql','%s/template/viewer_wordinfo'%viewerurl))
+ # add target to open new page
l.set('target', '_blank')
- if href.startswith('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/lt/lemma.xql'):
+ # TODO: is this needed?
+ if href.startswith('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/lt/lemma.xql'):
selfurl = self.absolute_url()
l.set('href', href.replace('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/lt/lemma.xql','%s/head_main_lemma'%selfurl))
l.set('target', '_blank')
@@ -378,24 +362,25 @@ class MpdlXmlTextServer(SimpleItem):
l.set('ondblclick', 'popupWin.focus();')
if href.startswith('#note-'):
- l.set('href', href.replace('#note-',"?mode=%s&url=%s&viewMode=text_dict&tocMode=%s&tocPN=%s&pn=%s#note-"%(urlmode,url,tocMode,tocPN,pn)))
+ # note link
+ l.set('href', href.replace('#note-',"%s#note-"%selfurl))
return serialize(pagediv)
# xml mode
elif mode == "xml":
- if pagediv:
+ if pagediv is not None:
return serialize(pagediv)
# pureXml mode
elif mode == "pureXml":
- if pagediv:
+ if pagediv is not None:
return serialize(pagediv)
# gis mode
elif mode == "gis":
name = docinfo['name']
- if pagediv:
+ if pagediv is not None:
# check all a-tags
links = pagediv.findall(".//a")
for l in links:
@@ -409,28 +394,10 @@ class MpdlXmlTextServer(SimpleItem):
return "no text here"
- # WTF: is this needed?
- def getOrigPages(self, docinfo=None, pageinfo=None):
- logging.debug("CALLED: getOrigPages!")
- if not pageinfo.has_key('pageNumberOrig'):
- logging.warning("getOrigPages: not in pageinfo!")
- return None
-
- return pageinfo['pageNumberOrig']
-
- # WTF: is this needed?
- def getOrigPagesNorm(self, docinfo=None, pageinfo=None):
- logging.debug("CALLED: getOrigPagesNorm!")
- if not pageinfo.has_key('pageNumberOrigNorm'):
- logging.warning("getOrigPagesNorm: not in pageinfo!")
- return None
-
- return pageinfo['pageNumberOrigNorm']
-
# TODO: should be getWordInfo
- def getTranslate(self, word=None, language=None):
- """translate into another languages"""
- data = self.getServerData("lt/wordInfo.xql","language="+str(language)+"&word="+urllib.quote(word)+"&output=html")
+ def getWordInfo(self, word='', language='', display=''):
+ """show information (like dictionaries) about word"""
+ data = self.getServerData("lt/wordInfo.xql","language=%s&word=%s&display=%s&output=html"%(language,urllib.quote(word),urllib.quote(display)))
return data
# WTF: what does this do?
@@ -504,23 +471,15 @@ class MpdlXmlTextServer(SimpleItem):
pagediv = div
elif dc == 'queryResultHits':
- docinfo['tocSize_%s'%mode] = intOr0(div.text)
+ docinfo['tocSize_%s'%mode] = getInt(div.text)
if pagediv:
-# # split xml in chunks
-# tocs = []
-# tocdivs = pagediv.findall('div')
-# for p in zip(tocdivs[::2], tocdivs[1::2]):
-# toc = serialize(p[0])
-# toc += serialize(p[1])
-# tocs.append(toc)
-# logging.debug("pair: %s"%(toc))
# store XML in docinfo
docinfo['tocXML_%s'%mode] = ET.tostring(pagediv, 'UTF-8')
return docinfo
- def getTocPage(self, mode="text", pn=1, pageinfo=None, docinfo=None):
+ def getTocPage(self, mode="text", pn=None, start=None, size=None, pageinfo=None, docinfo=None):
"""returns single page from the table of contents"""
logging.debug("getTocPage mode=%s, pn=%s"%(mode,pn))
if mode == "text":
@@ -535,23 +494,22 @@ class MpdlXmlTextServer(SimpleItem):
tocxml = docinfo.get('tocXML_%s'%mode, None)
if not tocxml:
logging.error("getTocPage: unable to find tocXML")
- return "No ToC"
+ return "Error: no table of contents!"
- pagesize = int(pageinfo['tocPageSize'])
- url = docinfo['url']
- urlmode = docinfo['mode']
- selfurl = docinfo['viewerUrl']
- viewMode= pageinfo['viewMode']
- tocMode = pageinfo['tocMode']
- tocPN = int(pageinfo['tocPN'])
+ if size is None:
+ size = pageinfo.get('tocPageSize', 30)
+
+ if start is None:
+ start = (pn - 1) * size
fulltoc = ET.fromstring(tocxml)
if fulltoc:
# paginate
- #start = (pn - 1) * pagesize * 2
- #end = start + pagesize * 2
- #tocdivs = fulltoc[start:end]
+ first = (start - 1) * 2
+ len = size * 2
+ del fulltoc[:first]
+ del fulltoc[len:]
tocdivs = fulltoc
# check all a-tags
@@ -562,12 +520,23 @@ class MpdlXmlTextServer(SimpleItem):
# take pn from href
m = re.match(r'page-fragment\.xql.*pn=(\d+)', href)
if m is not None:
- # and create new url
- l.set('href', '%s?mode=%s&url=%s&viewMode=%s&pn=%s&tocMode=%s&tocPN=%s'%(selfurl, urlmode, url, viewMode, m.group(1), tocMode, tocPN))
+ # and create new url (assuming parent is documentViewer)
+ url = self.getLink('pn', m.group(1))
+ l.set('href', url)
else:
logging.warning("getTocPage: Problem with link=%s"%href)
- return serialize(tocdivs)
+ # fix two-divs-per-row with containing div
+ newtoc = ET.Element('div', {'class':'queryResultPage'})
+ for (d1,d2) in zip(tocdivs[::2],tocdivs[1::2]):
+ e = ET.Element('div',{'class':'tocline'})
+ e.append(d1)
+ e.append(d2)
+ newtoc.append(e)
+
+ return serialize(newtoc)
+
+ return "ERROR: no table of contents!"
def manage_changeMpdlXmlTextServer(self,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/",timeout=40,RESPONSE=None):