changeset 455:0a53fea83df7 elementtree
more work renovating
author:      casties
date:        Fri, 15 Jul 2011 21:34:41 +0200
parents:     73e3273c7624
children:    b27a7d2f06ff
files:       MpdlXmlTextServer.py MpdlXmlTextServer_old.py documentViewer.py documentViewer_old.py
diffstat:    4 files changed, 783 insertions(+), 248 deletions(-)
--- a/MpdlXmlTextServer.py	Fri Jul 15 11:02:26 2011 +0200
+++ b/MpdlXmlTextServer.py	Fri Jul 15 21:34:41 2011 +0200
@@ -1,6 +1,7 @@
 from OFS.SimpleItem import SimpleItem
 from Products.PageTemplates.PageTemplateFile import PageTemplateFile
+
 from Ft.Xml import EMPTY_NAMESPACE, Parse
 from Ft.Xml.Domlette import NonvalidatingReader
 import Ft.Xml.Domlette
@@ -8,13 +9,19 @@
 import xml.etree.ElementTree as ET
 
-import md5
-import sys
+import re
 import logging
 import urllib
 
 import documentViewer
 #from documentViewer import getTextFromNode, serializeNode
 
+def intOr0(s, default=0):
+    """convert s to int or return default"""
+    try:
+        return int(s)
+    except:
+        return default
+
 def getText(node):
     """get the cdata content of a node"""
     if node is None:
@@ -44,11 +51,11 @@
     if node is None:
         return ""
     # ET:
-    #text = node.text or ""
-    #for e in node:
-    #    text += gettext(e)
-    #    if e.tail:
-    #        text += e.tail
+#     text = node.text or ""
+#     for e in node:
+#         text += gettext(e)
+#         if e.tail:
+#             text += e.tail
 
     # 4Suite:
     nodelist=node.childNodes
@@ -82,8 +89,7 @@
 
     manage_changeMpdlXmlTextServerForm = PageTemplateFile("zpt/manage_changeMpdlXmlTextServer", globals())
 
-    def __init__(self,id,title="",serverUrl="http://mpdl-system.mpiwg-berlin.mpg.de/mpdl/interface/", serverName=None, timeout=40):
-    #def __init__(self,id,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de:30030/mpdl/interface/", serverName=None, timeout=40):
+    def __init__(self,id,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/", serverName=None, timeout=40):
         
         """constructor"""
         self.id=id
@@ -103,8 +109,10 @@
         url = self.serverUrl+method
         return documentViewer.getHttpData(url,data,timeout=self.timeout)
 
+    # WTF: what does this really do? can it be integrated in getPage?
     def getSearch(self, pageinfo=None, docinfo=None):
         """get search list"""
+        logging.debug("getSearch()")
         docpath = docinfo['textURLPath']
         url = docinfo['url']
         pagesize = pageinfo['queryPageSize']
@@ -207,12 +215,12 @@
         hrefList=[]
         myList= ""
         text=self.getServerData("xpath.xql", "document=%s&xpath=%s&pn=%s"%(docinfo['textURLPath'],xpath,pn))
-        dom = Parse(text)
-        result = dom.xpath("//result/resultPage/place")
+        dom = ET.fromstring(text)
+        result = dom.findall(".//result/resultPage/place")
         for l in result:
-            hrefNode= l.getAttributeNodeNS(None, u"id")
-            href= hrefNode.nodeValue
+            href = l.get("id")
             hrefList.append(href)
+            # WTF: what does this do?
             myList = ",".join(hrefList)
         #logging.debug("getGisPlaces :%s"%(myList))
         return myList
@@ -227,178 +235,125 @@
         hrefList=[]
         myList=""
         text=self.getServerData("xpath.xql", "document=%s&xpath=%s"%(docinfo['textURLPath'],xpath))
-        dom =Parse(text)
-        result = dom.xpath("//result/resultPage/place")
+        dom = ET.fromstring(text)
+        result = dom.findall(".//result/resultPage/place")
         for l in result:
-            hrefNode = l.getAttributeNodeNS(None, u"id")
-            href= hrefNode.nodeValue
+            href = l.get("id")
             hrefList.append(href)
+            # WTF: what does this do?
             myList = ",".join(hrefList)
         #logging.debug("getALLGisPlaces :%s"%(myList))
         return myList
 
+    def processPageInfo(self, dom, docinfo, pageinfo):
+        """processes page info divs from dom and stores in docinfo and pageinfo"""
+        # process all toplevel divs
+        alldivs = dom.findall(".//div")
+        pagediv = None
+        for div in alldivs:
+            dc = div.get('class')
+            
+            # page content div
+            if dc == 'pageContent':
+                pagediv = div
+            
+            # pageNumberOrig
+            elif dc == 'pageNumberOrig':
+                pageinfo['pageNumberOrig'] = div.text
+                
+            # pageNumberOrigNorm
+            elif dc == 'pageNumberOrigNorm':
+                pageinfo['pageNumberOrigNorm'] = div.text
+                
+            # pageNumberOrigNorm
+            elif dc == 'countFigureEntries':
+                docinfo['countFigureEntries'] = intOr0(div.text)
+                
+            # pageNumberOrigNorm
+            elif dc == 'countTocEntries':
+                # WTF: s1 = int(s)/30+1
+                docinfo['countTocEntries'] = intOr0(div.text)
+                
+            # numTextPages
+            elif dc == 'countPages':
+                np = intOr0(div.text)
+                if np > 0:
+                    docinfo['numTextPages'] = np
+                    if docinfo.get('numPages', 0) == 0:
+                        # seems to be text-only
+                        docinfo['numTextPages'] = np
+                        pageinfo['end'] = min(pageinfo['end'], np)
+                        pageinfo['numgroups'] = int(np / pageinfo['groupsize'])
+                        if np % pageinfo['groupsize'] > 0:
+                            pageinfo['numgroups'] += 1
+
+        return
+
     def getTextPage(self, mode="text_dict", pn=1, docinfo=None, pageinfo=None):
         """returns single page from fulltext"""
+        logging.debug("getTextPage mode=%s, pn=%s"%(mode,pn))
+        # check for cached text -- but this shouldn't be called twice
+        if pageinfo.has_key('textPage'):
+            logging.debug("getTextPage: using cached text")
+            return pageinfo['textPage']
+        
         docpath = docinfo['textURLPath']
-        path = docinfo['textURLPath']
-        url = docinfo.get('url',None)
-        name = docinfo.get('name',None)
-        pn =pageinfo['current']
-        sn = pageinfo['sn']
-        #optionToggle =pageinfo ['optionToggle']
-        highlightQuery = pageinfo['highlightQuery']
-        #mode = pageinfo ['viewMode']
-        tocMode = pageinfo['tocMode']
-        characterNormalization=pageinfo['characterNormalization']
-        tocPN = pageinfo['tocPN']
-        selfurl = self.absolute_url()
+        # just checking
+        if pageinfo['current'] != pn:
+            logging.warning("getTextPage: current!=pn!")
+            
+        # stuff for constructing full urls
+        url = docinfo['url']
+        urlmode = docinfo['mode']
+        sn = pageinfo.get('sn', None)
+        highlightQuery = pageinfo.get('highlightQuery', None)
+        tocMode = pageinfo.get('tocMode', None)
+        tocPN = pageinfo.get('tocPN',None)
+        characterNormalization = pageinfo.get('characterNormalization', None)
+        selfurl = docinfo['viewerUrl']
+        
         if mode == "text_dict":
+            # text_dict is called textPollux in the backend
             textmode = "textPollux"
         else:
             textmode = mode
         
         textParam = "document=%s&mode=%s&pn=%s&characterNormalization=%s"%(docpath,textmode,pn,characterNormalization)
-        if highlightQuery is not None:
+        if highlightQuery:
             textParam +="&highlightQuery=%s&sn=%s"%(urllib.quote(highlightQuery),sn)
         
+        # fetch the page
         pagexml = self.getServerData("page-fragment.xql",textParam)
         dom = ET.fromstring(pagexml)
-        #dom = NonvalidatingReader.parseStream(pagexml)
-        
-        #original Pages
-        #pagedivs = dom.xpath("//div[@class='pageNumberOrig']")
-        
-        """if pagedivs == dom.xpath("//div[@class='pageNumberOrig']"):
-            if len(pagedivs)>0:
-                docinfo['pageNumberOrig']= getTextFromNode(pagedivs[0])
-                logging.debug("ORIGINAL PAGE: %s"%(docinfo['pageNumberOrig']))
-        
-        #original Pages Norm
-        pagedivs = dom.xpath("//div[@class='pageNumberOrigNorm']")
-        if pagedivs == dom.xpath("//div[@class='pageNumberOrigNorm']"):
-            if len(pagedivs)>0:
-                docinfo['pageNumberOrigNorm']= getTextFromNode(pagedivs[0])
-                logging.debug("ORIGINAL PAGE NORM: %s"%(docinfo['pageNumberOrigNorm']))
-        """
-        #figureEntries
-#        pagedivs = dom.xpath("//div[@class='countFigureEntries']")
-#        if pagedivs == dom.xpath("//div[@class='countFigureEntries']"):
-#            if len(pagedivs)>0:
-#                docinfo['countFigureEntries'] = getTextFromNode(pagedivs[0])
-#                s = getTextFromNode(pagedivs[0])
-#                if s=='0':
-#                    try:
-#                        docinfo['countFigureEntries'] = int(s)
-#                    except:
-#                        docinfo['countFigureEntries'] = 0
-#                else:
-#                    s1 = int(s)/30+1
-#                    try:
-#                        docinfo['countFigureEntries'] = int(s1)
-#                    except:
-#                        docinfo['countFigureEntries'] = 0
-#
-#        #allPlaces
-#        pagedivs = dom.xpath("//div[@class='countPlaces']")
-#        if pagedivs == dom.xpath("//div[@class='countPlaces']"):
-#            if len(pagedivs)>0:
-#                docinfo['countPlaces']= getTextFromNode(pagedivs[0])
-#                s = getTextFromNode(pagedivs[0])
-#                try:
-#                    docinfo['countPlaces'] = int(s)
-#                except:
-#                    docinfo['countPlaces'] = 0
-#
-#        #tocEntries
-#        pagedivs = dom.xpath("//div[@class='countTocEntries']")
-#        if pagedivs == dom.xpath("//div[@class='countTocEntries']"):
-#            if len(pagedivs)>0:
-#                docinfo['countTocEntries'] = int(getTextFromNode(pagedivs[0]))
-#                s = getTextFromNode(pagedivs[0])
-#                if s=='0':
-#                    try:
-#                        docinfo['countTocEntries'] = int(s)
-#                    except:
-#                        docinfo['countTocEntries'] = 0
-#                else:
-#                    s1 = int(s)/30+1
-#                    try:
-#                        docinfo['countTocEntries'] = int(s1)
-#                    except:
-#                        docinfo['countTocEntries'] = 0
         
-        #numTextPages
-        #pagedivs = dom.xpath("//div[@class='countPages']")
+        # extract additional info
+        self.processPageInfo(dom, docinfo, pageinfo)
        
+        # page content is in <div class="pageContent">
+        pagediv = None
+        # ElementTree 1.2 in Python 2.6 can't do div[@class='pageContent']
         alldivs = dom.findall(".//div")
-        pagediv = None
         for div in alldivs:
             dc = div.get('class')
+            # page content div
             if dc == 'pageContent':
                 pagediv = div
-            
-            if dc == 'countPages':
-                try:
-                    np = int(div.text)
-                    docinfo['numPages'] = np
-                    pageinfo['end'] = min(pageinfo['end'], np)
-                    pageinfo['numgroups'] = int(np / pageinfo['groupsize'])
-                    if np % pageinfo['groupsize'] > 0:
-                        pageinfo['numgroups'] += 1
-
-                except:
-                    docinfo['numPages'] = 0
-                    
                 break
-
-# ROC: why?
-#        else:
-#            #no full text -- init to 0
-#            docinfo['pageNumberOrig'] = 0
-#            docinfo['countFigureEntries'] = 0
-#            docinfo['countPlaces'] = 0
-#            docinfo['countTocEntries'] = 0
-#            docinfo['numPages'] = 0
-#            docinfo['pageNumberOrigNorm'] = 0
-#            #return docinfo
         
         # plain text mode
         if mode == "text":
-            #pagedivs = dom.xpath("/div")
             if pagediv:
                 links = pagediv.findall(".//a")
                 for l in links:
                     href = l.get('href')
                     if href and href.startswith('#note-'):
-                        href = href.replace('#note-',"?url=%s&viewMode=text&tocMode=%s&tocPN=%s&pn=%s#note-"%(url,tocMode,tocPN,pn))
+                        href = href.replace('#note-',"?mode=%s&url=%s&viewMode=text&tocMode=%s&tocPN=%s&pn=%s#note-"%(urlmode,url,tocMode,tocPN,pn))
                         l.set('href', href)
-                logging.debug("page=%s"%ET.tostring(pagediv, 'UTF-8'))
-                return serialize(pagediv)
-            
-        if mode == "xml":
-            if pagediv:
+                return serialize(pagediv)
 
-        if mode == "pureXml":
-            if pagediv:
-                return serialize(pagediv)
-
-        if mode == "gis":
-            if pagediv:
-                # check all a-tags
-                links = pagediv.findall(".//a")
-                for l in links:
-                    href = l.get('href')
-                    if href:
-                        if href.startswith('http://chinagis.mpiwg-berlin.mpg.de'):
-                            l.set('href', href.replace('chinagis_REST/REST/db/chgis/mpdl','chinagis/REST/db/mpdl/%s'%name))
-                            l.set('target', '_blank')
-
-            return serialize(pagenode)
-        
         # text-with-links mode
-        if mode == "text_dict":
+        elif mode == "text_dict":
             if pagediv:
                 # check all a-tags
                 links = pagediv.findall(".//a")
@@ -423,58 +378,80 @@
                         l.set('ondblclick', 'popupWin.focus();')
 
                     if href.startswith('#note-'):
-                        l.set('href', href.replace('#note-',"?url=%s&viewMode=text_dict&tocMode=%s&tocPN=%s&pn=%s#note-"%(url,tocMode,tocPN,pn)))
+                        l.set('href', href.replace('#note-',"?mode=%s&url=%s&viewMode=text_dict&tocMode=%s&tocPN=%s&pn=%s#note-"%(urlmode,url,tocMode,tocPN,pn)))
 
                 return serialize(pagediv)
 
+        # xml mode
+        elif mode == "xml":
+            if pagediv:
+                return serialize(pagediv)
+
+        # pureXml mode
+        elif mode == "pureXml":
+            if pagediv:
+                return serialize(pagediv)
+
+        # gis mode
+        elif mode == "gis":
+            name = docinfo['name']
+            if pagediv:
+                # check all a-tags
+                links = pagediv.findall(".//a")
+                for l in links:
+                    href = l.get('href')
+                    if href:
+                        if href.startswith('http://chinagis.mpiwg-berlin.mpg.de'):
+                            l.set('href', href.replace('chinagis_REST/REST/db/chgis/mpdl','chinagis/REST/db/mpdl/%s'%name))
+                            l.set('target', '_blank')
+
+                return serialize(pagediv)
+
         return "no text here"
 
+    # WTF: is this needed?
     def getOrigPages(self, docinfo=None, pageinfo=None):
-        docpath = docinfo['textURLPath']
-        pn =pageinfo['current']
-        selfurl = self.absolute_url()
-        pagexml = self.getServerData("page-fragment.xql","document=%s&pn=%s"%(docpath, pn))
-        dom = Parse(pagexml)
-        pagedivs = dom.xpath("//div[@class='pageNumberOrig']")
-        if pagedivs == dom.xpath("//div[@class='pageNumberOrig']"):
-            if len(pagedivs)>0:
-                docinfo['pageNumberOrig']= getTextFromNode(pagedivs[0])
-                return docinfo['pageNumberOrig']
+        logging.debug("CALLED: getOrigPages!")
+        if not pageinfo.has_key('pageNumberOrig'):
+            logging.warning("getOrigPages: not in pageinfo!")
+            return None
+        
+        return pageinfo['pageNumberOrig']
     
+    # WTF: is this needed?
     def getOrigPagesNorm(self, docinfo=None, pageinfo=None):
-        docpath = docinfo['textURLPath']
-        pn =pageinfo['current']
-        selfurl = self.absolute_url()
-        pagexml = self.getServerData("page-fragment.xql","document=%s&pn=%s"%(docpath, pn))
-        dom = Parse(pagexml)
-        pagedivs = dom.xpath("//div[@class='pageNumberOrigNorm']")
-        if pagedivs == dom.xpath("//div[@class='pageNumberOrigNorm']"):
-            if len(pagedivs)>0:
-                docinfo['pageNumberOrigNorm']= getTextFromNode(pagedivs[0])
-                return docinfo['pageNumberOrigNorm']
-                
+        logging.debug("CALLED: getOrigPagesNorm!")
+        if not pageinfo.has_key('pageNumberOrigNorm'):
+            logging.warning("getOrigPagesNorm: not in pageinfo!")
+            return None
+        
+        return pageinfo['pageNumberOrigNorm']
    
+    # TODO: should be getWordInfo
     def getTranslate(self, word=None, language=None):
         """translate into another languages"""
         data = self.getServerData("lt/wordInfo.xql","language="+str(language)+"&word="+urllib.quote(word)+"&output=html")
-        #pagexml=self.template.fulltextclient.eval("/mpdl/interface/lt/lex.xql","document=&language="+str(language)+"&query="+url_quote(str(query)))
         return data
     
+    # WTF: what does this do?
     def getLemma(self, lemma=None, language=None):
         """simular words lemma """
         data = self.getServerData("lt/lemma.xql","language="+str(language)+"&lemma="+urllib.quote(lemma)+"&output=html")
         return data
     
+    # WTF: what does this do?
     def getLemmaQuery(self, query=None, language=None):
         """simular words lemma """
         data = self.getServerData("lt/lemma.xql","language="+str(language)+"&query="+urllib.quote(query)+"&output=html")
         return data
     
+    # WTF: what does this do?
     def getLex(self, query=None, language=None):
         #simular words lemma
         data = self.getServerData("lt/lex.xql","document=&language="+str(language)+"&query="+urllib.quote(query))
         return data
-    
+
+    # WTF: what does this do?
     def getQuery (self, docinfo=None, pageinfo=None, query=None, queryType=None, pn=1):
         #number of
         docpath = docinfo['textURLPath']
@@ -493,9 +470,11 @@
         return tc
     
     def getToc(self, mode="text", docinfo=None):
-        """loads table of contents and stores in docinfo"""
+        """loads table of contents and stores XML in docinfo"""
+        logging.debug("getToc mode=%s"%mode)
         if mode == "none":
-            return docinfo
+            return docinfo
+        
         if 'tocSize_%s'%mode in docinfo:
             # cached toc
             return docinfo
@@ -511,44 +490,87 @@
         # number of entries in toc
         tocSize = 0
         tocDiv = None
-        
+        # fetch full toc
         pagexml = self.getServerData("doc-query.xql","document=%s&queryType=%s&queryResultPageSize=%s&queryResultPN=%s"%(docpath,queryType, pagesize, pn))
-        
-        # post-processing downloaded xml
-        pagedom = Parse(pagexml)
-        # get number of entries
-        numdivs = pagedom.xpath("//div[@class='queryResultHits']")
-        if len(numdivs) > 0:
-            tocSize = int(getTextFromNode(numdivs[0]))
-            docinfo['tocSize_%s'%mode] = tocSize
+        dom = ET.fromstring(pagexml)
+        # page content is in <div class="queryResultPage">
+        pagediv = None
+        # ElementTree 1.2 in Python 2.6 can't do div[@class='queryResultPage']
+        alldivs = dom.findall("div")
+        for div in alldivs:
+            dc = div.get('class')
+            # page content div
+            if dc == 'queryResultPage':
+                pagediv = div
+                
+            elif dc == 'queryResultHits':
+                docinfo['tocSize_%s'%mode] = intOr0(div.text)
+
+        if pagediv:
+#            # split xml in chunks
+#            tocs = []
+#            tocdivs = pagediv.findall('div')
+#            for p in zip(tocdivs[::2], tocdivs[1::2]):
+#                toc = serialize(p[0])
+#                toc += serialize(p[1])
+#                tocs.append(toc)
+#                logging.debug("pair: %s"%(toc))
+            # store XML in docinfo
+            docinfo['tocXML_%s'%mode] = ET.tostring(pagediv, 'UTF-8')
+
         return docinfo
 
     def getTocPage(self, mode="text", pn=1, pageinfo=None, docinfo=None):
         """returns single page from the table of contents"""
-        # TODO: this should use the cached TOC
+        logging.debug("getTocPage mode=%s, pn=%s"%(mode,pn))
         if mode == "text":
             queryType = "toc"
         else:
             queryType = mode
-        docpath = docinfo['textURLPath']
-        path = docinfo['textURLPath']
-        pagesize = pageinfo['tocPageSize']
-        pn = pageinfo['tocPN']
+
+        # check for cached TOC
+        if not docinfo.has_key('tocXML_%s'%mode):
+            self.getToc(mode=mode, docinfo=docinfo)
+            
+        tocxml = docinfo.get('tocXML_%s'%mode, None)
+        if not tocxml:
+            logging.error("getTocPage: unable to find tocXML")
+            return "No ToC"
+        
+        pagesize = int(pageinfo['tocPageSize'])
         url = docinfo['url']
-        selfurl = self.absolute_url()
+        urlmode = docinfo['mode']
+        selfurl = docinfo['viewerUrl']
         viewMode= pageinfo['viewMode']
-        characterNormalization = pageinfo ['characterNormalization']
-        #optionToggle =pageinfo ['optionToggle']
         tocMode = pageinfo['tocMode']
-        tocPN = pageinfo['tocPN']
+        tocPN = int(pageinfo['tocPN'])
+        
+        fulltoc = ET.fromstring(tocxml)
         
-        data = self.getServerData("doc-query.xql","document=%s&queryType=%s&queryResultPageSize=%s&queryResultPN=%s&characterNormalization=regPlusNorm"%(docpath,queryType, pagesize, pn))
-        page = data.replace('page-fragment.xql?document=%s'%str(path),'%s?url=%s&viewMode=%s&tocMode=%s&tocPN=%s'%(selfurl,url, viewMode, tocMode, tocPN))
-        text = page.replace('mode=image','mode=texttool')
-        return text
+        if fulltoc:
+            # paginate
+            #start = (pn - 1) * pagesize * 2
+            #end = start + pagesize * 2
+            #tocdivs = fulltoc[start:end]
+            tocdivs = fulltoc
+            
+            # check all a-tags
+            links = tocdivs.findall(".//a")
+            for l in links:
+                href = l.get('href')
+                if href:
+                    # take pn from href
+                    m = re.match(r'page-fragment\.xql.*pn=(\d+)', href)
+                    if m is not None:
+                        # and create new url
+                        l.set('href', '%s?mode=%s&url=%s&viewMode=%s&pn=%s&tocMode=%s&tocPN=%s'%(selfurl, urlmode, url, viewMode, m.group(1), tocMode, tocPN))
+                    else:
+                        logging.warning("getTocPage: Problem with link=%s"%href)
+            
+            return serialize(tocdivs)
+        
     def manage_changeMpdlXmlTextServer(self,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/",timeout=40,RESPONSE=None):
-    #def manage_changeMpdlXmlTextServer(self,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de:30030/mpdl/interface/",timeout=40,RESPONSE=None):
         """change settings"""
         self.title=title
         self.timeout = timeout
@@ -568,4 +590,6 @@
     newObj = MpdlXmlTextServer(id,title,serverUrl,timeout)
     self.Destination()._setObject(id, newObj)
     if RESPONSE is not None:
-        RESPONSE.redirect('manage_main')
\ No newline at end of file
+        RESPONSE.redirect('manage_main')
+
+        
\ No newline at end of file
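
The recurring edit in this file replaces 4Suite XPath queries like //div[@class='pageContent'] with a scan over findall(".//div"), because the ElementTree 1.2 bundled with Python 2.6 cannot evaluate attribute predicates. A minimal, self-contained sketch of that pattern together with an intOr0-style helper (the sample XML is illustrative, not real page-fragment.xql output, and the except clause is narrowed here where the committed helper uses a bare except):

    import xml.etree.ElementTree as ET

    def int_or_0(s):
        """like the changeset's intOr0 helper, with a narrowed except clause"""
        try:
            return int(s)
        except (TypeError, ValueError):
            return 0

    # illustrative page fragment -- not actual server output
    pagexml = """<result>
      <div class="countPages">12</div>
      <div class="pageContent"><p>Page text here.</p></div>
    </result>"""

    dom = ET.fromstring(pagexml)
    pagediv = None
    numpages = 0
    # ElementTree 1.2 can't do .//div[@class='pageContent'],
    # so scan all divs and compare the class attribute by hand
    for div in dom.findall(".//div"):
        dc = div.get('class')
        if dc == 'pageContent':
            pagediv = div
        elif dc == 'countPages':
            numpages = int_or_0(div.text)

    print(numpages)               # -> 12
    print(ET.tostring(pagediv))   # -> serialized pageContent div

One pass over the divs can feed both the docinfo counters and the page content lookup, which is what the new processPageInfo method above does.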
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/MpdlXmlTextServer_old.py	Fri Jul 15 21:34:41 2011 +0200
@@ -0,0 +1,520 @@
+
+from OFS.SimpleItem import SimpleItem
+from Products.PageTemplates.PageTemplateFile import PageTemplateFile
+from Ft.Xml import EMPTY_NAMESPACE, Parse
+from Ft.Xml.Domlette import NonvalidatingReader
+
+import md5
+import sys
+import logging
+import urllib
+import documentViewer
+from documentViewer import getTextFromNode, serializeNode
+
+class MpdlXmlTextServer(SimpleItem):
+    """TextServer implementation for MPDL-XML eXist server"""
+    meta_type="MPDL-XML TextServer"
+
+    manage_options=(
+        {'label':'Config','action':'manage_changeMpdlXmlTextServerForm'},
+       )+SimpleItem.manage_options
+
+    manage_changeMpdlXmlTextServerForm = PageTemplateFile("zpt/manage_changeMpdlXmlTextServer", globals())
+
+    def __init__(self,id,title="",serverUrl="http://mpdl-system.mpiwg-berlin.mpg.de/mpdl/interface/", serverName=None, timeout=40):
+    #def __init__(self,id,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de:30030/mpdl/interface/", serverName=None, timeout=40):
+
+        """constructor"""
+        self.id=id
+        self.title=title
+        self.timeout = timeout
+        if serverName is None:
+            self.serverUrl = serverUrl
+        else:
+            self.serverUrl = "http://%s/mpdl/interface/"%serverName
+
+    def getHttpData(self, url, data=None):
+        """returns result from url+data HTTP request"""
+        return documentViewer.getHttpData(url,data,timeout=self.timeout)
+
+    def getServerData(self, method, data=None):
+        """returns result from text server for method+data"""
+        url = self.serverUrl+method
+        return documentViewer.getHttpData(url,data,timeout=self.timeout)
+
+    def getSearch(self, pageinfo=None, docinfo=None):
+        """get search list"""
+        docpath = docinfo['textURLPath']
+        url = docinfo['url']
+        pagesize = pageinfo['queryPageSize']
+        pn = pageinfo.get('searchPN',1)
+        sn = pageinfo['sn']
+        highlightQuery = pageinfo['highlightQuery']
+        query =pageinfo['query']
+        queryType =pageinfo['queryType']
+        viewMode= pageinfo['viewMode']
+        tocMode = pageinfo['tocMode']
+        characterNormalization = pageinfo['characterNormalization']
+        #optionToggle = pageinfo['optionToggle']
+        tocPN = pageinfo['tocPN']
+        selfurl = self.absolute_url()
+        data = self.getServerData("doc-query.xql","document=%s&mode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s&sn=%s&viewMode=%s&characterNormalization=%s&highlightQuery=%s"%(docpath, 'text', queryType, urllib.quote(query), pagesize, pn, sn, viewMode,characterNormalization, urllib.quote(highlightQuery)))
+        pagexml = data.replace('?document=%s'%str(docpath),'?url=%s'%url)
+        pagedom = Parse(pagexml)
+
+        """
+        pagedivs = pagedom.xpath("//div[@class='queryResultHits']")
+        if (pagedivs == pagedom.xpath("//div[@class='queryResultHits']")):
+            if len(pagedivs)>0:
+                docinfo['queryResultHits'] = int(getTextFromNode(pagedivs[0]))
+                s = getTextFromNode(pagedivs[0])
+                s1 = int(s)/10+1
+                try:
+                    docinfo['queryResultHits'] = int(s1)
+                    logging.debug("SEARCH ENTRIES: %s"%(s1))
+                except:
+                    docinfo['queryResultHits'] = 0
+        """
+        if (queryType=="fulltext")or(queryType=="xpath")or(queryType=="xquery")or(queryType=="fulltextMorphLemma"):
+            pagedivs = pagedom.xpath("//div[@class='queryResultPage']")
+            if len(pagedivs)>0:
+                pagenode=pagedivs[0]
+                links=pagenode.xpath("//a")
+                for l in links:
+                    hrefNode = l.getAttributeNodeNS(None, u"href")
+                    if hrefNode:
+                        href = hrefNode.nodeValue
+                        if href.startswith('page-fragment.xql'):
+                            selfurl = self.absolute_url()
+                            pagexml=href.replace('mode=text','mode=texttool&viewMode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s&tocMode=%s&searchPN=%s&tocPN=%s&characterNormalization=%s'%(viewMode,queryType,urllib.quote(query),pagesize,pn,tocMode,pn,tocPN, characterNormalization))
+                            hrefNode.nodeValue = pagexml.replace('page-fragment.xql','%s'%selfurl)
+            #logging.debug("PUREXML :%s"%(serializeNode(pagenode)))
+            return serializeNode(pagenode)
+        if (queryType=="fulltextMorph"):
+            pagedivs = pagedom.xpath("//div[@class='queryResult']")
+            if len(pagedivs)>0:
+                pagenode=pagedivs[0]
+                links=pagenode.xpath("//a")
+                for l in links:
+                    hrefNode = l.getAttributeNodeNS(None, u"href")
+                    if hrefNode:
+                        href = hrefNode.nodeValue
+                        if href.startswith('page-fragment.xql'):
+                            selfurl = self.absolute_url()
+                            pagexml=href.replace('mode=text','mode=texttool&viewMode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s&tocMode=%s&searchPN=%s&tocPN=%s&characterNormalization=%s'%(viewMode,queryType,urllib.quote(query),pagesize,pn,tocMode,pn,tocPN,characterNormalization))
+                            hrefNode.nodeValue = pagexml.replace('page-fragment.xql','%s'%selfurl)
+                        if href.startswith('../lt/lemma.xql'):
+                            hrefNode.nodeValue = href.replace('../lt/lemma.xql','%s/template/head_main_query'%(selfurl))
+                            l.setAttributeNS(None, 'target', '_blank')
+                            l.setAttributeNS(None, 'onClick',"popupWin = window.open(this.href, 'contacts', 'location,width=500,height=600,top=180, left=400, scrollbars=1'); return false;")
+                            l.setAttributeNS(None, 'onDblclick', 'popupWin.focus();')
+                pagedivs = pagedom.xpath("//div[@class='queryResultMorphExpansion']")
+            return serializeNode(pagenode)
+        if (queryType=="ftIndex")or(queryType=="ftIndexMorph"):
+            pagedivs= pagedom.xpath("//div[@class='queryResultPage']")
+            if len(pagedivs)>0:
+                pagenode=pagedivs[0]
+                links=pagenode.xpath("//a")
+                for l in links:
+                    hrefNode = l.getAttributeNodeNS(None, u"href")
+                    if hrefNode:
+                        href = hrefNode.nodeValue
+                        hrefNode.nodeValue=href.replace('mode=text','mode=texttool&viewMode=%s&tocMode=%s&tocPN=%s&pn=%s&characterNormalization=%s'%(viewMode,tocMode,tocPN,pn,characterNormalization))
+                        if href.startswith('../lt/lex.xql'):
+                            hrefNode.nodeValue = href.replace('../lt/lex.xql','%s/template/head_main_lex'%selfurl)
+                            l.setAttributeNS(None, 'target', '_blank')
+                            l.setAttributeNS(None, 'onClick',"popupWin = window.open(this.href, 'contacts', 'location,width=500,height=600,top=180, left=400, scrollbars=1'); return false;")
+                            l.setAttributeNS(None, 'onDblclick', 'popupWin.focus();')
+                        if href.startswith('../lt/lemma.xql'):
+                            hrefNode.nodeValue = href.replace('../lt/lemma.xql','%s/template/head_main_lemma'%(selfurl))
+                            l.setAttributeNS(None, 'target', '_blank')
+                            l.setAttributeNS(None, 'onClick',"popupWin = window.open(this.href, 'contacts', 'location,width=500,height=600,top=180, left=400, scrollbars=1'); return false;")
+                            l.setAttributeNS(None, 'onDblclick', 'popupWin.focus();')
+            return serializeNode(pagenode)
+        return "no text here"
+
+    def getGisPlaces(self, docinfo=None, pageinfo=None):
+        """ Show all Gis Places of whole Page"""
+        xpath='//place'
+        docpath = docinfo.get('textURLPath',None)
+        if not docpath:
+            return None
+
+        url = docinfo['url']
+        selfurl = self.absolute_url()
+        pn = pageinfo['current']
+        hrefList=[]
+        myList= ""
+        text=self.getServerData("xpath.xql", "document=%s&xpath=%s&pn=%s"%(docinfo['textURLPath'],xpath,pn))
+        dom = Parse(text)
+        result = dom.xpath("//result/resultPage/place")
+        for l in result:
+            hrefNode= l.getAttributeNodeNS(None, u"id")
+            href= hrefNode.nodeValue
+            hrefList.append(href)
+            myList = ",".join(hrefList)
+        #logging.debug("getGisPlaces :%s"%(myList))
+        return myList
+
+    def getAllGisPlaces (self, docinfo=None, pageinfo=None):
+        """Show all Gis Places of whole Book """
+        xpath ='//echo:place'
+        docpath =docinfo['textURLPath']
+        url = docinfo['url']
+        selfurl =self.absolute_url()
+        pn =pageinfo['current']
+        hrefList=[]
+        myList=""
+        text=self.getServerData("xpath.xql", "document=%s&xpath=%s"%(docinfo['textURLPath'],xpath))
+        dom =Parse(text)
+        result = dom.xpath("//result/resultPage/place")
+
+        for l in result:
+            hrefNode = l.getAttributeNodeNS(None, u"id")
+            href= hrefNode.nodeValue
+            hrefList.append(href)
+            myList = ",".join(hrefList)
+        #logging.debug("getALLGisPlaces :%s"%(myList))
+        return myList
+
+
+    def getTextPage(self, mode="text_dict", pn=1, docinfo=None, pageinfo=None):
+        """returns single page from fulltext"""
+        docpath = docinfo['textURLPath']
+        path = docinfo['textURLPath']
+        url = docinfo.get('url',None)
+        name = docinfo.get('name',None)
+        pn =pageinfo['current']
+        sn = pageinfo['sn']
+        #optionToggle =pageinfo ['optionToggle']
+        highlightQuery = pageinfo['highlightQuery']
+        #mode = pageinfo ['viewMode']
+        tocMode = pageinfo['tocMode']
+        characterNormalization=pageinfo['characterNormalization']
+        tocPN = pageinfo['tocPN']
+        selfurl = self.absolute_url()
+        if mode == "text_dict":
+            textmode = "textPollux"
+        else:
+            textmode = mode
+
+        textParam = "document=%s&mode=%s&pn=%s&characterNormalization=%s"%(docpath,textmode,pn,characterNormalization)
+        if highlightQuery is not None:
+            textParam +="&highlightQuery=%s&sn=%s"%(urllib.quote(highlightQuery),sn)
+
+        pagexml = self.getServerData("page-fragment.xql",textParam)
+        dom = Parse(pagexml)
+        #dom = NonvalidatingReader.parseStream(pagexml)
+
+        #original Pages
+        pagedivs = dom.xpath("//div[@class='pageNumberOrig']")
+
+        """if pagedivs == dom.xpath("//div[@class='pageNumberOrig']"):
+            if len(pagedivs)>0:
+                docinfo['pageNumberOrig']= getTextFromNode(pagedivs[0])
+                logging.debug("ORIGINAL PAGE: %s"%(docinfo['pageNumberOrig']))
+
+        #original Pages Norm
+        pagedivs = dom.xpath("//div[@class='pageNumberOrigNorm']")
+        if pagedivs == dom.xpath("//div[@class='pageNumberOrigNorm']"):
+            if len(pagedivs)>0:
+                docinfo['pageNumberOrigNorm']= getTextFromNode(pagedivs[0])
+                logging.debug("ORIGINAL PAGE NORM: %s"%(docinfo['pageNumberOrigNorm']))
+        """
+        #figureEntries
+        pagedivs = dom.xpath("//div[@class='countFigureEntries']")
+        if pagedivs == dom.xpath("//div[@class='countFigureEntries']"):
+            if len(pagedivs)>0:
+                docinfo['countFigureEntries'] = getTextFromNode(pagedivs[0])
+                s = getTextFromNode(pagedivs[0])
+                if s=='0':
+                    try:
+                        docinfo['countFigureEntries'] = int(s)
+                    except:
+                        docinfo['countFigureEntries'] = 0
+                else:
+                    s1 = int(s)/30+1
+                    try:
+                        docinfo['countFigureEntries'] = int(s1)
+                    except:
+                        docinfo['countFigureEntries'] = 0
+
+        #allPlaces
+        pagedivs = dom.xpath("//div[@class='countPlaces']")
+        if pagedivs == dom.xpath("//div[@class='countPlaces']"):
+            if len(pagedivs)>0:
+                docinfo['countPlaces']= getTextFromNode(pagedivs[0])
+                s = getTextFromNode(pagedivs[0])
+                try:
+                    docinfo['countPlaces'] = int(s)
+                except:
+                    docinfo['countPlaces'] = 0
+
+        #tocEntries
+        pagedivs = dom.xpath("//div[@class='countTocEntries']")
+        if pagedivs == dom.xpath("//div[@class='countTocEntries']"):
+            if len(pagedivs)>0:
+                docinfo['countTocEntries'] = int(getTextFromNode(pagedivs[0]))
+                s = getTextFromNode(pagedivs[0])
+                if s=='0':
+                    try:
+                        docinfo['countTocEntries'] = int(s)
+                    except:
+                        docinfo['countTocEntries'] = 0
+                else:
+                    s1 = int(s)/30+1
+                    try:
+                        docinfo['countTocEntries'] = int(s1)
+                    except:
+                        docinfo['countTocEntries'] = 0
+
+        #numTextPages
+        pagedivs = dom.xpath("//div[@class='countPages']")
+        if pagedivs == dom.xpath("//div[@class='countPages']"):
+            if len(pagedivs)>0:
+                docinfo['numPages'] = getTextFromNode(pagedivs[0])
+                s = getTextFromNode(pagedivs[0])
+
+                try:
+                    docinfo['numPages'] = int(s)
+                    #logging.debug("PAGE NUMBER: %s"%(s))
+
+                    np = docinfo['numPages']
+                    pageinfo['end'] = min(pageinfo['end'], np)
+                    pageinfo['numgroups'] = int(np / pageinfo['groupsize'])
+                    if np % pageinfo['groupsize'] > 0:
+                        pageinfo['numgroups'] += 1
+                except:
+                    docinfo['numPages'] = 0
+
+            else:
+                #no full text -- init to 0
+                docinfo['pageNumberOrig'] = 0
+                docinfo['countFigureEntries'] = 0
+                docinfo['countPlaces'] = 0
+                docinfo['countTocEntries'] = 0
+                docinfo['numPages'] = 0
+                docinfo['pageNumberOrigNorm'] = 0
+                #return docinfo
+
+        # plain text mode
+        if mode == "text":
+            # first div contains text
+            pagedivs = dom.xpath("/div")
+            if len(pagedivs) > 0:
+                pagenode = pagedivs[0]
+                links = pagenode.xpath("//a")
+                for l in links:
+                    hrefNode = l.getAttributeNodeNS(None, u"href")
+                    if hrefNode:
+                        href= hrefNode.nodeValue
+                        if href.startswith('#note-'):
+                            hrefNode.nodeValue = href.replace('#note-',"?url=%s&viewMode=text&tocMode=%s&tocPN=%s&pn=%s#note-"%(url,tocMode,tocPN,pn))
+                return serializeNode(pagenode)
+        if mode == "xml":
+            # first div contains text
+            pagedivs = dom.xpath("/div")
+            if len(pagedivs) > 0:
+                pagenode = pagedivs[0]
+                return serializeNode(pagenode)
+        if mode == "gis":
+            # first div contains text
+            pagedivs = dom.xpath("/div")
+            if len(pagedivs) > 0:
+                pagenode = pagedivs[0]
+                links =pagenode.xpath("//a")
+                for l in links:
+                    hrefNode =l.getAttributeNodeNS(None, u"href")
+                    if hrefNode:
+                        href=hrefNode.nodeValue
+                        if href.startswith('http://chinagis.mpiwg-berlin.mpg.de'):
+                            hrefNode.nodeValue =href.replace('chinagis_REST/REST/db/chgis/mpdl','chinagis/REST/db/mpdl/%s'%name)
+                            l.setAttributeNS(None, 'target', '_blank')
+                return serializeNode(pagenode)
+
+        if mode == "pureXml":
+            # first div contains text
+            pagedivs = dom.xpath("/div")
+            if len(pagedivs) > 0:
+                pagenode = pagedivs[0]
+                return serializeNode(pagenode)
+        # text-with-links mode
+        if mode == "text_dict":
+            # first div contains text
+            #mode = pageinfo ['viewMode']
+            pagedivs = dom.xpath("/div")
+            if len(pagedivs) > 0:
+                pagenode = pagedivs[0]
+                # check all a-tags
+                links = pagenode.xpath("//a")
+
+                for l in links:
+                    hrefNode = l.getAttributeNodeNS(None, u"href")
+
+                    if hrefNode:
+                        # is link with href
+                        href = hrefNode.nodeValue
+                        if href.startswith('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/lt/wordInfo.xql'):
+                            # is pollux link
+                            selfurl = self.absolute_url()
+                            # change href
+                            hrefNode.nodeValue = href.replace('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/lt/wordInfo.xql','%s/head_main_voc'%selfurl)
+                            # add target
+                            l.setAttributeNS(None, 'target', '_blank')
+                            #l.setAttributeNS(None, 'onclick',"popupWin = window.open(this.href, 'InfoWindow', 'menubar=no, location,width=500,height=600,top=180, left=700, toolbar=no, scrollbars=1'); return false;")
+                            #l.setAttributeNS(None, "ondblclick", "popupWin.focus();")
+                            #window.open("this.href, 'InfoWindow', 'menubar=no, location,width=500,height=600,top=180, left=700, toolbar=yes, scrollbars=1'"); return false;")
+
+                        if href.startswith('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/lt/lemma.xql'):
+                            selfurl = self.absolute_url()
+                            hrefNode.nodeValue = href.replace('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/lt/lemma.xql','%s/head_main_lemma'%selfurl)
+                            l.setAttributeNS(None, 'target', '_blank')
+                            l.setAttributeNS(None, 'onclick',"popupWin = window.open(this.href, 'InfoWindow', 'menubar=no, location,width=500,height=600,top=180, left=700, toolbar=no, scrollbars=1'); return false;")
+                            l.setAttributeNS(None, 'ondblclick', 'popupWin.focus();')
+
+                        if href.startswith('#note-'):
+                            hrefNode.nodeValue = href.replace('#note-',"?url=%s&viewMode=text_dict&tocMode=%s&tocPN=%s&pn=%s#note-"%(url,tocMode,tocPN,pn))
+
+                return serializeNode(pagenode)
+        return "no text here"
+
+    def getOrigPages(self, docinfo=None, pageinfo=None):
+        docpath = docinfo['textURLPath']
+        pn =pageinfo['current']
+        selfurl = self.absolute_url()
+        pagexml = self.getServerData("page-fragment.xql","document=%s&pn=%s"%(docpath, pn))
+        dom = Parse(pagexml)
+        pagedivs = dom.xpath("//div[@class='pageNumberOrig']")
+        if pagedivs == dom.xpath("//div[@class='pageNumberOrig']"):
+            if len(pagedivs)>0:
+                docinfo['pageNumberOrig']= getTextFromNode(pagedivs[0])
+                return docinfo['pageNumberOrig']
+
+    def getOrigPagesNorm(self, docinfo=None, pageinfo=None):
+        docpath = docinfo['textURLPath']
+        pn =pageinfo['current']
+        selfurl = self.absolute_url()
+        pagexml = self.getServerData("page-fragment.xql","document=%s&pn=%s"%(docpath, pn))
+        dom = Parse(pagexml)
+        pagedivs = dom.xpath("//div[@class='pageNumberOrigNorm']")
+        if pagedivs == dom.xpath("//div[@class='pageNumberOrigNorm']"):
+            if len(pagedivs)>0:
+                docinfo['pageNumberOrigNorm']= getTextFromNode(pagedivs[0])
+                return docinfo['pageNumberOrigNorm']
+
+
+    def getTranslate(self, word=None, language=None):
+        """translate into another languages"""
+        data = self.getServerData("lt/wordInfo.xql","language="+str(language)+"&word="+urllib.quote(word)+"&output=html")
+        #pagexml=self.template.fulltextclient.eval("/mpdl/interface/lt/lex.xql","document=&language="+str(language)+"&query="+url_quote(str(query)))
+        return data
+
+    def getLemma(self, lemma=None, language=None):
+        """simular words lemma """
+        data = self.getServerData("lt/lemma.xql","language="+str(language)+"&lemma="+urllib.quote(lemma)+"&output=html")
+        return data
+
+    def getLemmaQuery(self, query=None, language=None):
+        """simular words lemma """
+        data = self.getServerData("lt/lemma.xql","language="+str(language)+"&query="+urllib.quote(query)+"&output=html")
+        return data
+
+    def getLex(self, query=None, language=None):
+        #simular words lemma
+        data = self.getServerData("lt/lex.xql","document=&language="+str(language)+"&query="+urllib.quote(query))
+        return data
+
+    def getQuery (self, docinfo=None, pageinfo=None, query=None, queryType=None, pn=1):
+        #number of
+        docpath = docinfo['textURLPath']
+        pagesize = pageinfo['queryPageSize']
+        pn = pageinfo['searchPN']
+        query =pageinfo['query']
+        queryType =pageinfo['queryType']
+        tocSearch = 0
+        tocDiv = None
+
+        pagexml = self.getServerData("doc-query.xql","document=%s&mode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s"%(docpath, 'text', queryType, urllib.quote(query), pagesize, pn))
+        pagedom = Parse(pagexml)
+        numdivs = pagedom.xpath("//div[@class='queryResultHits']")
+        tocSearch = int(getTextFromNode(numdivs[0]))
+        tc=int((tocSearch/10)+1)
+        return tc
+
+    def getToc(self, mode="text", docinfo=None):
+        """loads table of contents and stores in docinfo"""
+        if mode == "none":
+            return docinfo
+        if 'tocSize_%s'%mode in docinfo:
+            # cached toc
+            return docinfo
+
+        docpath = docinfo['textURLPath']
+        # we need to set a result set size
+        pagesize = 1000
+        pn = 1
+        if mode == "text":
+            queryType = "toc"
+        else:
+            queryType = mode
+        # number of entries in toc
+        tocSize = 0
+        tocDiv = None
+
+        pagexml = self.getServerData("doc-query.xql","document=%s&queryType=%s&queryResultPageSize=%s&queryResultPN=%s"%(docpath,queryType, pagesize, pn))
+
+        # post-processing downloaded xml
+        pagedom = Parse(pagexml)
+        # get number of entries
+        numdivs = pagedom.xpath("//div[@class='queryResultHits']")
+        if len(numdivs) > 0:
+            tocSize = int(getTextFromNode(numdivs[0]))
+            docinfo['tocSize_%s'%mode] = tocSize
+        return docinfo
+
+    def getTocPage(self, mode="text", pn=1, pageinfo=None, docinfo=None):
+        """returns single page from the table of contents"""
+        # TODO: this should use the cached TOC
+        if mode == "text":
+            queryType = "toc"
+        else:
+            queryType = mode
+        docpath = docinfo['textURLPath']
+        path = docinfo['textURLPath']
+        pagesize = pageinfo['tocPageSize']
+        pn = pageinfo['tocPN']
+        url = docinfo['url']
+        selfurl = self.absolute_url()
+        viewMode= pageinfo['viewMode']
+        characterNormalization = pageinfo ['characterNormalization']
+        #optionToggle =pageinfo ['optionToggle']
+        tocMode = pageinfo['tocMode']
+        tocPN = pageinfo['tocPN']
+
+        data = self.getServerData("doc-query.xql","document=%s&queryType=%s&queryResultPageSize=%s&queryResultPN=%s&characterNormalization=regPlusNorm"%(docpath,queryType, pagesize, pn))
+        page = data.replace('page-fragment.xql?document=%s'%str(path),'%s?url=%s&viewMode=%s&tocMode=%s&tocPN=%s'%(selfurl,url, viewMode, tocMode, tocPN))
+        text = page.replace('mode=image','mode=texttool')
+        return text
+
+    def manage_changeMpdlXmlTextServer(self,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/",timeout=40,RESPONSE=None):
+    #def manage_changeMpdlXmlTextServer(self,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de:30030/mpdl/interface/",timeout=40,RESPONSE=None):
+        """change settings"""
+        self.title=title
+        self.timeout = timeout
+        self.serverUrl = serverUrl
+        if RESPONSE is not None:
+            RESPONSE.redirect('manage_main')
+
+# management methods
+def manage_addMpdlXmlTextServerForm(self):
+    """Form for adding"""
+    pt = PageTemplateFile("zpt/manage_addMpdlXmlTextServer", globals()).__of__(self)
+    return pt()
+
+def manage_addMpdlXmlTextServer(self,id,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/",timeout=40,RESPONSE=None):
+#def manage_addMpdlXmlTextServer(self,id,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de:30030/mpdl/interface/",timeout=40,RESPONSE=None):
+    """add zogiimage"""
+    newObj = MpdlXmlTextServer(id,title,serverUrl,timeout)
+    self.Destination()._setObject(id, newObj)
+    if RESPONSE is not None:
+        RESPONSE.redirect('manage_main')
\ No newline at end of file
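
The _old.py copy above preserves the 4Suite Domlette idiom that this renovation retires: document-level xpath() queries plus DOM-style attribute nodes, versus ElementTree's element-relative paths and plain-dict attributes. A hedged side-by-side sketch (the XML snippet is illustrative, not real xpath.xql output). Note that ET.fromstring() returns the root element itself, so ET paths are written relative to the root rather than to a document node as with Parse():

    import xml.etree.ElementTree as ET

    text = '<result><resultPage><place id="p1"/><place id="p2"/></resultPage></result>'

    # ElementTree idiom used by the new code: attributes are a plain dict
    dom = ET.fromstring(text)   # dom *is* the <result> element
    ids = [p.get("id") for p in dom.findall(".//resultPage/place")]
    assert ids == ["p1", "p2"]

    # The replaced 4Suite idiom, for comparison (needs 4Suite to run):
    #   dom = Parse(text)
    #   for p in dom.xpath("//result/resultPage/place"):
    #       href = p.getAttributeNodeNS(None, u"id").nodeValue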
--- a/documentViewer.py	Fri Jul 15 11:02:26 2011 +0200
+++ b/documentViewer.py	Fri Jul 15 21:34:41 2011 +0200
@@ -354,21 +354,26 @@
         if tocMode != "thumbs":
             # get table of contents
             docinfo = self.getToc(mode=tocMode, docinfo=docinfo)
-
-        if viewMode=="auto": # automodus gewaehlt
-            if docinfo.has_key('textURL') or docinfo.get('textURLPath',None): #texturl gesetzt und textViewer konfiguriert
+
+        # auto viewMode: text_dict if text else images
+        if viewMode=="auto":
+            if docinfo.get('textURL', None) or docinfo.get('textURLPath', None):
+                #texturl gesetzt und textViewer konfiguriert
                 viewMode="text_dict"
             else:
                 viewMode="images"
 
-        pageinfo = self.getPageinfo(start=start,current=pn, docinfo=docinfo,viewMode=viewMode,tocMode=tocMode)
+        pageinfo = self.getPageinfo(start=start, current=pn, docinfo=docinfo, viewMode=viewMode, tocMode=tocMode)
 
-        if (docinfo.get('textURLPath',None)):
-            page = self.getTextPage(mode=viewMode, docinfo=docinfo, pageinfo=pageinfo)
+        if viewMode != 'images' and docinfo.get('textURLPath', None):
+            # get full text page
+            page = self.getTextPage(mode=viewMode, pn=pn, docinfo=docinfo, pageinfo=pageinfo)
             pageinfo['textPage'] = page
-        tt = getattr(self, 'template')
-        pt = getattr(tt, 'viewer_main')
-        return pt(docinfo=docinfo,pageinfo=pageinfo,viewMode=viewMode,mk=self.generateMarks(mk))
+
+        # get template /template/viewer_main
+        pt = getattr(self.template, 'viewer_main')
+        # and execute with parameters
+        return pt(docinfo=docinfo, pageinfo=pageinfo, viewMode=viewMode, mk=self.generateMarks(mk))
 
     def generateMarks(self,mk):
         ret=""
@@ -866,15 +871,21 @@
             docinfo = self.REQUEST.SESSION['docinfo']
             # check if its still current
             if docinfo is not None and docinfo.get('mode') == mode and docinfo.get('url') == url:
-                logging.debug("documentViewer (getdocinfo) docinfo in session: %s"%docinfo)
+                logging.debug("documentViewer (getdocinfo) docinfo in session. keys=%s"%docinfo.keys())
                 return docinfo
+
+        # new docinfo
         docinfo = {'mode': mode, 'url': url}
-        if mode=="texttool": #index.meta with texttool information
+        # add self url
+        docinfo['viewerUrl'] = self.getDocumentViewerURL()
+        if mode=="texttool":
+            # index.meta with texttool information
             docinfo = self.getDocinfoFromTextTool(url, docinfo=docinfo)
         elif mode=="imagepath":
+            # folder with images, index.meta optional
             docinfo = self.getDocinfoFromImagePath(url, docinfo=docinfo)
         elif mode=="filepath":
+            # filename
             docinfo = self.getDocinfoFromImagePath(url, docinfo=docinfo,cut=1)
         else:
             logging.error("documentViewer (getdocinfo) unknown mode: %s!"%mode)
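
getdocinfo keeps its session cache but now reuses it only while both mode and url still match, and stamps viewerUrl into each fresh docinfo so the text server can build links without calling absolute_url(). A minimal sketch of that guard, with a plain dict standing in for REQUEST.SESSION (function and variable names here are illustrative):

    def get_docinfo_cached(session, mode, url):
        """reuse the cached docinfo only if it still describes the same document"""
        docinfo = session.get('docinfo')
        if docinfo is not None and docinfo.get('mode') == mode and docinfo.get('url') == url:
            # still current -- keep using it
            return docinfo
        # stale or missing -- build a new one and cache it
        docinfo = {'mode': mode, 'url': url}
        session['docinfo'] = docinfo
        return docinfo

    session = {}
    d1 = get_docinfo_cached(session, 'texttool', 'http://example.org/doc1')
    d2 = get_docinfo_cached(session, 'texttool', 'http://example.org/doc1')
    assert d1 is d2       # cache hit
    d3 = get_docinfo_cached(session, 'imagepath', 'http://example.org/doc2')
    assert d3 is not d1   # mode/url changed, cache invalidated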
--- a/documentViewer_old.py	Fri Jul 15 11:02:26 2011 +0200
+++ b/documentViewer_old.py	Fri Jul 15 21:34:41 2011 +0200
@@ -9,9 +9,6 @@
 
 from Ft.Xml import EMPTY_NAMESPACE, Parse
 import Ft.Xml.Domlette
-
-import xml.etree.ElementTree as ET
-
 import os.path
 import sys
 import urllib
@@ -35,35 +32,25 @@
     except:
         return int(default)
 
-def getTextFromNode(node):
+def getTextFromNode(nodename):
     """get the cdata content of a node"""
-    if node is None:
+    if nodename is None:
         return ""
-    # ET:
-    text = node.text or ""
-    for e in node:
-        text += gettext(e)
-        if e.tail:
-            text += e.tail
-
-    # 4Suite:
-    #nodelist=node.childNodes
-    #text = ""
-    #for n in nodelist:
-    #    if n.nodeType == node.TEXT_NODE:
-    #        text = text + n.data
-
-    return text
+    nodelist=nodename.childNodes
+    rc = ""
+    for node in nodelist:
+        if node.nodeType == node.TEXT_NODE:
+            rc = rc + node.data
+    return rc
 
 def serializeNode(node, encoding="utf-8"):
     """returns a string containing node as XML"""
-    s = ET.tostring(node)
-
-    # 4Suite:
-    # stream = cStringIO.StringIO()
-    # Ft.Xml.Domlette.Print(node, stream=stream, encoding=encoding)
-    # s = stream.getvalue()
-    # stream.close()
+    stream = cStringIO.StringIO()
+    #logging.debug("BUF: %s"%(stream))
+    Ft.Xml.Domlette.Print(node, stream=stream, encoding=encoding)
+    s = stream.getvalue()
+    #logging.debug("BUF: %s"%(s))
+    stream.close()
     return s
 
 def browserCheck(self):
@@ -509,10 +496,8 @@
         if txt is None:
             raise IOError("Unable to get dir-info from %s"%(infoUrl))
 
-        dom = ET.fromstring(txt).getroot()
-        #dom = Parse(txt)
-        sizes=dom.find("//dir/size")
-        #sizes=dom.xpath("//dir/size")
+        dom = Parse(txt)
+        sizes=dom.xpath("//dir/size")
         logging.debug("documentViewer (getparamfromdigilib) dirInfo:size"%sizes)
 
         if sizes:
@@ -561,8 +546,7 @@
         if txt is None:
             raise IOError("Unable to read index meta from %s"%(url))
 
-        dom = ET.fromstring(txt).getroot()
-        #dom = Parse(txt)
+        dom = Parse(txt)
         return dom
 
     def getPresentationInfoXML(self, url):
@@ -581,8 +565,7 @@
         if txt is None:
             raise IOError("Unable to read infoXMLfrom %s"%(url))
 
-        dom = ET.fromstring(txt).getroot()
-        #dom = Parse(txt)
+        dom = Parse(txt)
         return dom
 
 
@@ -600,8 +583,7 @@
             path=getParentDir(path)
 
         dom = self.getDomFromIndexMeta(path)
-        acctype = dom.find("//access-conditions/access/@type")
-        #acctype = dom.xpath("//access-conditions/access/@type")
+        acctype = dom.xpath("//access-conditions/access/@type")
         if acctype and (len(acctype)>0):
             access=acctype[0].value
             if access in ['group', 'institution']:
@@ -627,8 +609,7 @@
         logging.debug("documentViewer (getbibinfofromindexmeta cutted) path: %s"%(path))
 
         # put in all raw bib fields as dict "bib"
-        bib = dom.find("//bib/*")
-        #bib = dom.xpath("//bib/*")
+        bib = dom.xpath("//bib/*")
        if bib and len(bib)>0:
             bibinfo = {}
             for e in bib:
@@ -637,8 +618,7 @@
 
         # extract some fields (author, title, year) according to their mapping
         metaData=self.metadata.main.meta.bib
-        bibtype=dom.find("//bib/@type")
-        #bibtype=dom.xpath("//bib/@type")
+        bibtype=dom.xpath("//bib/@type")
         if bibtype and (len(bibtype)>0):
             bibtype=bibtype[0].value
         else:
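
documentViewer_old.py reverts getTextFromNode to the 4Suite childNodes walk, while the ElementTree variant left commented out in MpdlXmlTextServer.py calls an undefined gettext(e) where it evidently means to recurse. A corrected, self-contained sketch of the ET version (it avoids itertext(), which only arrived with the ElementTree 1.3 in Python 2.7, not the 1.2 targeted here):

    def getTextFromNode(node):
        """gather all text content of an ElementTree node, recursively"""
        if node is None:
            return ""
        text = node.text or ""
        for e in node:
            text += getTextFromNode(e)  # recurse into the child...
            if e.tail:
                text += e.tail          # ...then pick up text following it
        return text

    # quick check
    import xml.etree.ElementTree as ET
    node = ET.fromstring('<div>a<b>b</b>c<i>d</i>e</div>')
    assert getTextFromNode(node) == "abcde"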