--- documentViewer/documentViewer.py 2011/06/14 09:57:11 1.175 +++ documentViewer/documentViewer.py 2011/07/15 09:02:26 1.175.2.2 @@ -7,8 +7,11 @@ from AccessControl import getSecurityMan from Globals import package_home from Products.zogiLib.zogiLib import browserCheck -from Ft.Xml import EMPTY_NAMESPACE, Parse -import Ft.Xml.Domlette +#from Ft.Xml import EMPTY_NAMESPACE, Parse +#import Ft.Xml.Domlette + +import xml.etree.ElementTree as ET + import os.path import sys import urllib @@ -16,7 +19,6 @@ import urllib2 import logging import math import urlparse -import cStringIO import re import string @@ -32,25 +34,37 @@ def getInt(number, default=0): except: return int(default) -def getTextFromNode(nodename): +def getText(node): """get the cdata content of a node""" - if nodename is None: + if node is None: return "" - nodelist=nodename.childNodes - rc = "" - for node in nodelist: - if node.nodeType == node.TEXT_NODE: - rc = rc + node.data - return rc + # ET: + text = node.text or "" + for e in node: + text += gettext(e) + if e.tail: + text += e.tail + + # 4Suite: + #nodelist=node.childNodes + #text = "" + #for n in nodelist: + # if n.nodeType == node.TEXT_NODE: + # text = text + n.data + + return text + +getTextFromNode = getText def serializeNode(node, encoding="utf-8"): """returns a string containing node as XML""" - stream = cStringIO.StringIO() - #logging.debug("BUF: %s"%(stream)) - Ft.Xml.Domlette.Print(node, stream=stream, encoding=encoding) - s = stream.getvalue() - #logging.debug("BUF: %s"%(s)) - stream.close() + s = ET.tostring(node) + + # 4Suite: + # stream = cStringIO.StringIO() + # Ft.Xml.Domlette.Print(node, stream=stream, encoding=encoding) + # s = stream.getvalue() + # stream.close() return s def browserCheck(self): @@ -350,7 +364,7 @@ class documentViewer(Folder): pageinfo = self.getPageinfo(start=start,current=pn, docinfo=docinfo,viewMode=viewMode,tocMode=tocMode) if (docinfo.get('textURLPath',None)): - page = self.getTextPage(docinfo=docinfo, pageinfo=pageinfo) + page = self.getTextPage(mode=viewMode, docinfo=docinfo, pageinfo=pageinfo) pageinfo['textPage'] = page tt = getattr(self, 'template') pt = getattr(tt, 'viewer_main') @@ -485,8 +499,7 @@ class documentViewer(Folder): docinfo = {} for x in range(cut): - - path=getParentDir(path) + path=getParentDir(path) infoUrl=self.digilibBaseUrl+"/dirInfo-xml.jsp?mo=dir&fn="+path @@ -496,12 +509,14 @@ class documentViewer(Folder): if txt is None: raise IOError("Unable to get dir-info from %s"%(infoUrl)) - dom = Parse(txt) - sizes=dom.xpath("//dir/size") - logging.debug("documentViewer (getparamfromdigilib) dirInfo:size"%sizes) + dom = ET.fromstring(txt) + #dom = Parse(txt) + size=getText(dom.find("size")) + #sizes=dom.xpath("//dir/size") + logging.debug("documentViewer (getparamfromdigilib) dirInfo:size=%s"%size) - if sizes: - docinfo['numPages'] = int(getTextFromNode(sizes[0])) + if size: + docinfo['numPages'] = int(size) else: docinfo['numPages'] = 0 @@ -546,7 +561,8 @@ class documentViewer(Folder): if txt is None: raise IOError("Unable to read index meta from %s"%(url)) - dom = Parse(txt) + dom = ET.fromstring(txt) + #dom = Parse(txt) return dom def getPresentationInfoXML(self, url): @@ -565,7 +581,8 @@ class documentViewer(Folder): if txt is None: raise IOError("Unable to read infoXMLfrom %s"%(url)) - dom = Parse(txt) + dom = ET.fromstring(txt) + #dom = Parse(txt) return dom @@ -583,11 +600,14 @@ class documentViewer(Folder): path=getParentDir(path) dom = self.getDomFromIndexMeta(path) - acctype = dom.xpath("//access-conditions/access/@type") - if acctype and (len(acctype)>0): - access=acctype[0].value - if access in ['group', 'institution']: - access = getTextFromNode(dom.xpath("//access-conditions/access/name")[0]).lower() + acc = dom.find(".//access-conditions/access") + if acc is not None: + acctype = acc.get('type') + #acctype = dom.xpath("//access-conditions/access/@type") + if acctype: + access=acctype + if access in ['group', 'institution']: + access = dom.find(".//access-conditions/access/name").text.lower() docinfo['accessType'] = access return docinfo @@ -609,19 +629,20 @@ class documentViewer(Folder): logging.debug("documentViewer (getbibinfofromindexmeta cutted) path: %s"%(path)) # put in all raw bib fields as dict "bib" - bib = dom.xpath("//bib/*") - if bib and len(bib)>0: + bib = dom.find(".//bib") + #bib = dom.xpath("//bib/*") + if bib is not None: bibinfo = {} for e in bib: - bibinfo[e.localName] = getTextFromNode(e) + bibinfo[e.tag] = getText(e) + docinfo['bib'] = bibinfo # extract some fields (author, title, year) according to their mapping metaData=self.metadata.main.meta.bib - bibtype=dom.xpath("//bib/@type") - if bibtype and (len(bibtype)>0): - bibtype=bibtype[0].value - else: + bibtype=bib.get("type") + #bibtype=dom.xpath("//bib/@type") + if not bibtype: bibtype="generic" bibtype=bibtype.replace("-"," ") # wrong typesiin index meta "-" instead of " " (not wrong! ROC) @@ -630,64 +651,67 @@ class documentViewer(Folder): logging.debug("documentViewer (getbibinfofromindexmeta) bibmap:"+repr(bibmap)) logging.debug("documentViewer (getbibinfofromindexmeta) bibtype:"+repr(bibtype)) # if there is no mapping bibmap is empty (mapping sometimes has empty fields) - if len(bibmap) > 0 and len(bibmap['author'][0]) > 0: + if len(bibmap) > 0 and len(bibmap['author'][0]) > 0 or len(bibmap['title'][0]) > 0: try: - docinfo['author']=getTextFromNode(dom.xpath("//bib/%s"%bibmap['author'][0])[0]) + docinfo['author']=getText(bib.find(bibmap['author'][0])) except: pass try: - docinfo['title']=getTextFromNode(dom.xpath("//bib/%s"%bibmap['title'][0])[0]) + docinfo['title']=getText(bib.find(bibmap['title'][0])) except: pass try: - docinfo['year']=getTextFromNode(dom.xpath("//bib/%s"%bibmap['year'][0])[0]) + docinfo['year']=getText(bib.find(bibmap['year'][0])) except: pass - logging.debug("documentViewer (getbibinfofromindexmeta) using mapping for %s"%bibtype) - try: - docinfo['lang']=getTextFromNode(dom.xpath("//bib/lang")[0]) - except: - docinfo['lang']='' - try: - docinfo['city']=getTextFromNode(dom.xpath("//bib/city")[0]) - except: - docinfo['city']='' - try: - docinfo['number_of_pages']=getTextFromNode(dom.xpath("//bib/number_of_pages")[0]) - except: - docinfo['number_of_pages']='' - try: - docinfo['series_volume']=getTextFromNode(dom.xpath("//bib/series_volume")[0]) - except: - docinfo['series_volume']='' - try: - docinfo['number_of_volumes']=getTextFromNode(dom.xpath("//bib/number_of_volumes")[0]) - except: - docinfo['number_of_volumes']='' - try: - docinfo['translator']=getTextFromNode(dom.xpath("//bib/translator")[0]) - except: - docinfo['translator']='' - try: - docinfo['edition']=getTextFromNode(dom.xpath("//bib/edition")[0]) - except: - docinfo['edition']='' - try: - docinfo['series_author']=getTextFromNode(dom.xpath("//bib/series_author")[0]) - except: - docinfo['series_author']='' - try: - docinfo['publisher']=getTextFromNode(dom.xpath("//bib/publisher")[0]) - except: - docinfo['publisher']='' - try: - docinfo['series_title']=getTextFromNode(dom.xpath("//bib/series_title")[0]) - except: - docinfo['series_title']='' - try: - docinfo['isbn_issn']=getTextFromNode(dom.xpath("//bib/isbn_issn")[0]) - except: - docinfo['isbn_issn']='' + + # ROC: why is this here? + # logging.debug("documentViewer (getbibinfofromindexmeta) using mapping for %s"%bibtype) + # try: + # docinfo['lang']=getTextFromNode(dom.find(".//bib/lang")[0]) + # except: + # docinfo['lang']='' + # try: + # docinfo['city']=getTextFromNode(dom.find(".//bib/city")[0]) + # except: + # docinfo['city']='' + # try: + # docinfo['number_of_pages']=getTextFromNode(dom.find(".//bib/number_of_pages")[0]) + # except: + # docinfo['number_of_pages']='' + # try: + # docinfo['series_volume']=getTextFromNode(dom.find(".//bib/series_volume")[0]) + # except: + # docinfo['series_volume']='' + # try: + # docinfo['number_of_volumes']=getTextFromNode(dom.find(".//bib/number_of_volumes")[0]) + # except: + # docinfo['number_of_volumes']='' + # try: + # docinfo['translator']=getTextFromNode(dom.find(".//bib/translator")[0]) + # except: + # docinfo['translator']='' + # try: + # docinfo['edition']=getTextFromNode(dom.find(".//bib/edition")[0]) + # except: + # docinfo['edition']='' + # try: + # docinfo['series_author']=getTextFromNode(dom.find(".//bib/series_author")[0]) + # except: + # docinfo['series_author']='' + # try: + # docinfo['publisher']=getTextFromNode(dom.find(".//bib/publisher")[0]) + # except: + # docinfo['publisher']='' + # try: + # docinfo['series_title']=getTextFromNode(dom.find(".//bib/series_title")[0]) + # except: + # docinfo['series_title']='' + # try: + # docinfo['isbn_issn']=getTextFromNode(dom.find(".//bib/isbn_issn")[0]) + # except: + # docinfo['isbn_issn']='' return docinfo - + + # TODO: is this needed? def getNameFromIndexMeta(self,path,docinfo=None,dom=None,cut=0): """gets name info from the index.meta file at path or given by dom""" if docinfo is None: @@ -698,7 +722,7 @@ class documentViewer(Folder): path=getParentDir(path) dom = self.getDomFromIndexMeta(path) - docinfo['name']=getTextFromNode(dom.xpath("/resource/name")[0]) + docinfo['name']=getText(dom.find("name")) logging.debug("documentViewer docinfo[name] %s"%docinfo['name']) return docinfo @@ -715,15 +739,12 @@ class documentViewer(Folder): archivePath = None archiveName = None - archiveNames = dom.xpath("//resource/name") - if archiveNames and (len(archiveNames) > 0): - archiveName = getTextFromNode(archiveNames[0]) - else: + archiveName = getText(dom.find("name")) + if not archiveName: logging.warning("documentViewer (getdocinfofromtexttool) resource/name missing in: %s" % (url)) - archivePaths = dom.xpath("//resource/archive-path") - if archivePaths and (len(archivePaths) > 0): - archivePath = getTextFromNode(archivePaths[0]) + archivePath = getText(dom.find("archive-path")) + if archivePath: # clean up archive path if archivePath[0] != '/': archivePath = '/' + archivePath @@ -739,11 +760,9 @@ class documentViewer(Folder): # we balk without archive-path raise IOError("Missing archive-path (for text-tool) in %s" % (url)) - imageDirs = dom.xpath("//texttool/image") - if imageDirs and (len(imageDirs) > 0): - imageDir = getTextFromNode(imageDirs[0]) + imageDir = getText(dom.find(".//texttool/image")) - else: + if not imageDir: # we balk with no image tag / not necessary anymore because textmode is now standard #raise IOError("No text-tool info in %s"%(url)) imageDir = "" @@ -760,15 +779,13 @@ class documentViewer(Folder): docinfo['imageURL'] = self.digilibBaseUrl + "/servlet/Scaler?fn=" + imageDir - viewerUrls = dom.xpath("//texttool/digiliburlprefix") - if viewerUrls and (len(viewerUrls) > 0): - viewerUrl = getTextFromNode(viewerUrls[0]) + viewerUrl = getText(dom.find(".//texttool/digiliburlprefix")) + if viewerUrl: docinfo['viewerURL'] = viewerUrl # old style text URL - textUrls = dom.xpath("//texttool/text") - if textUrls and (len(textUrls) > 0): - textUrl = getTextFromNode(textUrls[0]) + textUrl = getText(dom.find(".//texttool/text")) + if textUrl: if urlparse.urlparse(textUrl)[0] == "": #keine url textUrl = os.path.join(archivePath, textUrl) # fix URLs starting with /mpiwg/online @@ -778,9 +795,8 @@ class documentViewer(Folder): docinfo['textURL'] = textUrl # new style text-url-path - textUrls = dom.xpath("//texttool/text-url-path") - if textUrls and (len(textUrls) > 0): - textUrl = getTextFromNode(textUrls[0]) + textUrl = getText(dom.find(".//texttool/text-url-path")) + if textUrl: docinfo['textURLPath'] = textUrl textUrlkurz = string.split(textUrl, ".")[0] docinfo['textURLPathkurz'] = textUrlkurz @@ -789,15 +805,16 @@ class documentViewer(Folder): #docinfo = self.getNumTextPages(docinfo) - presentationUrls = dom.xpath("//texttool/presentation") + presentationUrl = getText(dom.find(".//texttool/presentation")) docinfo = self.getBibinfoFromIndexMeta(url, docinfo=docinfo, dom=dom) # get info von bib tag + # TODO: is this needed here? docinfo = self.getNameFromIndexMeta(url, docinfo=docinfo, dom=dom) - if presentationUrls and (len(presentationUrls) > 0): # ueberschreibe diese durch presentation informationen + if presentationUrl: # ueberschreibe diese durch presentation informationen # presentation url ergiebt sich ersetzen von index.meta in der url der fuer die Metadaten # durch den relativen Pfad auf die presentation infos - presentationPath = getTextFromNode(presentationUrls[0]) + presentationPath = presentationUrl if url.endswith("index.meta"): presentationUrl = url.replace('index.meta', presentationPath) else: @@ -814,18 +831,9 @@ class documentViewer(Folder): """gets the bibliographical information from the preseantion entry in texttools """ dom=self.getPresentationInfoXML(url) - try: - docinfo['author']=getTextFromNode(dom.xpath("//author")[0]) - except: - pass - try: - docinfo['title']=getTextFromNode(dom.xpath("//title")[0]) - except: - pass - try: - docinfo['year']=getTextFromNode(dom.xpath("//date")[0]) - except: - pass + docinfo['author']=getText(dom.find(".//author")) + docinfo['title']=getText(dom.find(".//title")) + docinfo['year']=getText(dom.find(".//date")) return docinfo def getDocinfoFromImagePath(self,path,docinfo=None,cut=0):