documentViewer/documentViewer.py - diff

Return to documentViewer.py CVS log

Up to [Repository] / documentViewer

Diff for /documentViewer/documentViewer.py between versions 1.170 and 1.175.2.2

version 1.170, 2011/03/01 14:53:15	version 1.175.2.2, 2011/07/15 09:02:26
Line 7 from AccessControl import getSecurityMan	Line 7 from AccessControl import getSecurityMan
from Globals import package_home	from Globals import package_home
from Products.zogiLib.zogiLib import browserCheck	from Products.zogiLib.zogiLib import browserCheck

from Ft.Xml import EMPTY_NAMESPACE, Parse	#from Ft.Xml import EMPTY_NAMESPACE, Parse
import Ft.Xml.Domlette	#import Ft.Xml.Domlette

	import xml.etree.ElementTree as ET

import os.path	import os.path
import sys	import sys
import urllib	import urllib
Line 16 import urllib2	Line 19 import urllib2
import logging	import logging
import math	import math
import urlparse	import urlparse
import cStringIO
import re	import re
import string	import string

Line 32 def getInt(number, default=0):	Line 34 def getInt(number, default=0):
except:	except:
return int(default)	return int(default)

def getTextFromNode(nodename):	def getText(node):
"""get the cdata content of a node"""	"""get the cdata content of a node"""
if nodename is None:	if node is None:
return ""	return ""
nodelist=nodename.childNodes	# ET:
rc = ""	text = node.text or ""
for node in nodelist:	for e in node:
if node.nodeType == node.TEXT_NODE:	text += gettext(e)
rc = rc + node.data	if e.tail:
return rc	text += e.tail

	# 4Suite:
	#nodelist=node.childNodes
	#text = ""
	#for n in nodelist:
	# if n.nodeType == node.TEXT_NODE:
	# text = text + n.data

	return text

	getTextFromNode = getText

def serializeNode(node, encoding="utf-8"):	def serializeNode(node, encoding="utf-8"):
"""returns a string containing node as XML"""	"""returns a string containing node as XML"""
stream = cStringIO.StringIO()	s = ET.tostring(node)
logging.debug("BUF: %s"%(stream))
Ft.Xml.Domlette.PrettyPrint(node, stream=stream, encoding=encoding)	# 4Suite:
s = stream.getvalue()	# stream = cStringIO.StringIO()
logging.debug("BUF: %s"%(s))	# Ft.Xml.Domlette.Print(node, stream=stream, encoding=encoding)
stream.close()	# s = stream.getvalue()
	# stream.close()
return s	return s

def browserCheck(self):	def browserCheck(self):
Line 231 class documentViewer(Folder):	Line 245 class documentViewer(Folder):
"""get page"""	"""get page"""
return self.template.fulltextclient.getTextPage(**args)	return self.template.fulltextclient.getTextPage(**args)

	def getOrigPages(self, **args):
	"""get page"""
	return self.template.fulltextclient.getOrigPages(**args)

	def getOrigPagesNorm(self, **args):
	"""get page"""
	return self.template.fulltextclient.getOrigPagesNorm(**args)

def getQuery(self, **args):	def getQuery(self, **args):
"""get query in search"""	"""get query in search"""
return self.template.fulltextclient.getQuery(**args)	return self.template.fulltextclient.getQuery(**args)
Line 255 class documentViewer(Folder):	Line 277 class documentViewer(Folder):
"""get lemma"""	"""get lemma"""
return self.template.fulltextclient.getLemma(**args)	return self.template.fulltextclient.getLemma(**args)

	def getLemmaQuery(self, **args):
	"""get query"""
	return self.template.fulltextclient.getLemmaQuery(**args)

	def getLex(self, **args):
	"""get lex"""
	return self.template.fulltextclient.getLex(**args)

def getToc(self, **args):	def getToc(self, **args):
"""get toc"""	"""get toc"""
return self.template.fulltextclient.getToc(**args)	return self.template.fulltextclient.getToc(**args)
Line 334 class documentViewer(Folder):	Line 364 class documentViewer(Folder):
pageinfo = self.getPageinfo(start=start,current=pn, docinfo=docinfo,viewMode=viewMode,tocMode=tocMode)	pageinfo = self.getPageinfo(start=start,current=pn, docinfo=docinfo,viewMode=viewMode,tocMode=tocMode)

if (docinfo.get('textURLPath',None)):	if (docinfo.get('textURLPath',None)):
page = self.getTextPage(docinfo=docinfo, pageinfo=pageinfo)	page = self.getTextPage(mode=viewMode, docinfo=docinfo, pageinfo=pageinfo)
pageinfo['textPage'] = page	pageinfo['textPage'] = page
tt = getattr(self, 'template')	tt = getattr(self, 'template')
pt = getattr(tt, 'viewer_main')	pt = getattr(tt, 'viewer_main')
Line 469 class documentViewer(Folder):	Line 499 class documentViewer(Folder):
docinfo = {}	docinfo = {}

for x in range(cut):	for x in range(cut):

path=getParentDir(path)	path=getParentDir(path)

infoUrl=self.digilibBaseUrl+"/dirInfo-xml.jsp?mo=dir&fn="+path	infoUrl=self.digilibBaseUrl+"/dirInfo-xml.jsp?mo=dir&fn="+path
Line 480 class documentViewer(Folder):	Line 509 class documentViewer(Folder):
if txt is None:	if txt is None:
raise IOError("Unable to get dir-info from %s"%(infoUrl))	raise IOError("Unable to get dir-info from %s"%(infoUrl))

dom = Parse(txt)	dom = ET.fromstring(txt)
sizes=dom.xpath("//dir/size")	#dom = Parse(txt)
logging.debug("documentViewer (getparamfromdigilib) dirInfo:size"%sizes)	size=getText(dom.find("size"))
	#sizes=dom.xpath("//dir/size")
	logging.debug("documentViewer (getparamfromdigilib) dirInfo:size=%s"%size)

if sizes:	if size:
docinfo['numPages'] = int(getTextFromNode(sizes[0]))	docinfo['numPages'] = int(size)
else:	else:
docinfo['numPages'] = 0	docinfo['numPages'] = 0

Line 530 class documentViewer(Folder):	Line 561 class documentViewer(Folder):
if txt is None:	if txt is None:
raise IOError("Unable to read index meta from %s"%(url))	raise IOError("Unable to read index meta from %s"%(url))

dom = Parse(txt)	dom = ET.fromstring(txt)
	#dom = Parse(txt)
return dom	return dom

def getPresentationInfoXML(self, url):	def getPresentationInfoXML(self, url):
Line 549 class documentViewer(Folder):	Line 581 class documentViewer(Folder):
if txt is None:	if txt is None:
raise IOError("Unable to read infoXMLfrom %s"%(url))	raise IOError("Unable to read infoXMLfrom %s"%(url))

dom = Parse(txt)	dom = ET.fromstring(txt)
	#dom = Parse(txt)
return dom	return dom


Line 567 class documentViewer(Folder):	Line 600 class documentViewer(Folder):
path=getParentDir(path)	path=getParentDir(path)
dom = self.getDomFromIndexMeta(path)	dom = self.getDomFromIndexMeta(path)

acctype = dom.xpath("//access-conditions/access/@type")	acc = dom.find(".//access-conditions/access")
if acctype and (len(acctype)>0):	if acc is not None:
access=acctype[0].value	acctype = acc.get('type')
	#acctype = dom.xpath("//access-conditions/access/@type")
	if acctype:
	access=acctype
if access in ['group', 'institution']:	if access in ['group', 'institution']:
access = getTextFromNode(dom.xpath("//access-conditions/access/name")[0]).lower()	access = dom.find(".//access-conditions/access/name").text.lower()

docinfo['accessType'] = access	docinfo['accessType'] = access
return docinfo	return docinfo
Line 593 class documentViewer(Folder):	Line 629 class documentViewer(Folder):

logging.debug("documentViewer (getbibinfofromindexmeta cutted) path: %s"%(path))	logging.debug("documentViewer (getbibinfofromindexmeta cutted) path: %s"%(path))
# put in all raw bib fields as dict "bib"	# put in all raw bib fields as dict "bib"
bib = dom.xpath("//bib/*")	bib = dom.find(".//bib")
if bib and len(bib)>0:	#bib = dom.xpath("//bib/*")
	if bib is not None:
bibinfo = {}	bibinfo = {}
for e in bib:	for e in bib:
bibinfo[e.localName] = getTextFromNode(e)	bibinfo[e.tag] = getText(e)

docinfo['bib'] = bibinfo	docinfo['bib'] = bibinfo

# extract some fields (author, title, year) according to their mapping	# extract some fields (author, title, year) according to their mapping
metaData=self.metadata.main.meta.bib	metaData=self.metadata.main.meta.bib
bibtype=dom.xpath("//bib/@type")	bibtype=bib.get("type")
if bibtype and (len(bibtype)>0):	#bibtype=dom.xpath("//bib/@type")
bibtype=bibtype[0].value	if not bibtype:
else:
bibtype="generic"	bibtype="generic"

bibtype=bibtype.replace("-"," ") # wrong typesiin index meta "-" instead of " " (not wrong! ROC)	bibtype=bibtype.replace("-"," ") # wrong typesiin index meta "-" instead of " " (not wrong! ROC)
Line 614 class documentViewer(Folder):	Line 651 class documentViewer(Folder):
logging.debug("documentViewer (getbibinfofromindexmeta) bibmap:"+repr(bibmap))	logging.debug("documentViewer (getbibinfofromindexmeta) bibmap:"+repr(bibmap))
logging.debug("documentViewer (getbibinfofromindexmeta) bibtype:"+repr(bibtype))	logging.debug("documentViewer (getbibinfofromindexmeta) bibtype:"+repr(bibtype))
# if there is no mapping bibmap is empty (mapping sometimes has empty fields)	# if there is no mapping bibmap is empty (mapping sometimes has empty fields)
if len(bibmap) > 0 and len(bibmap['author'][0]) > 0:	if len(bibmap) > 0 and len(bibmap['author'][0]) > 0 or len(bibmap['title'][0]) > 0:
try:	try:
docinfo['author']=getTextFromNode(dom.xpath("//bib/%s"%bibmap['author'][0])[0])	docinfo['author']=getText(bib.find(bibmap['author'][0]))
except: pass	except: pass
try:	try:
docinfo['title']=getTextFromNode(dom.xpath("//bib/%s"%bibmap['title'][0])[0])	docinfo['title']=getText(bib.find(bibmap['title'][0]))
except: pass	except: pass
try:	try:
docinfo['year']=getTextFromNode(dom.xpath("//bib/%s"%bibmap['year'][0])[0])	docinfo['year']=getText(bib.find(bibmap['year'][0]))
except: pass	except: pass
logging.debug("documentViewer (getbibinfofromindexmeta) using mapping for %s"%bibtype)
try:	# ROC: why is this here?
docinfo['lang']=getTextFromNode(dom.xpath("//bib/lang")[0])	# logging.debug("documentViewer (getbibinfofromindexmeta) using mapping for %s"%bibtype)
except:	# try:
docinfo['lang']=''	# docinfo['lang']=getTextFromNode(dom.find(".//bib/lang")[0])
try:	# except:
docinfo['city']=getTextFromNode(dom.xpath("//bib/city")[0])	# docinfo['lang']=''
except:	# try:
docinfo['city']=''	# docinfo['city']=getTextFromNode(dom.find(".//bib/city")[0])
try:	# except:
docinfo['number_of_pages']=getTextFromNode(dom.xpath("//bib/number_of_pages")[0])	# docinfo['city']=''
except:	# try:
docinfo['number_of_pages']=''	# docinfo['number_of_pages']=getTextFromNode(dom.find(".//bib/number_of_pages")[0])
try:	# except:
docinfo['series_volume']=getTextFromNode(dom.xpath("//bib/series_volume")[0])	# docinfo['number_of_pages']=''
except:	# try:
docinfo['series_volume']=''	# docinfo['series_volume']=getTextFromNode(dom.find(".//bib/series_volume")[0])
try:	# except:
docinfo['number_of_volumes']=getTextFromNode(dom.xpath("//bib/number_of_volumes")[0])	# docinfo['series_volume']=''
except:	# try:
docinfo['number_of_volumes']=''	# docinfo['number_of_volumes']=getTextFromNode(dom.find(".//bib/number_of_volumes")[0])
try:	# except:
docinfo['translator']=getTextFromNode(dom.xpath("//bib/translator")[0])	# docinfo['number_of_volumes']=''
except:	# try:
docinfo['translator']=''	# docinfo['translator']=getTextFromNode(dom.find(".//bib/translator")[0])
try:	# except:
docinfo['edition']=getTextFromNode(dom.xpath("//bib/edition")[0])	# docinfo['translator']=''
except:	# try:
docinfo['edition']=''	# docinfo['edition']=getTextFromNode(dom.find(".//bib/edition")[0])
try:	# except:
docinfo['series_author']=getTextFromNode(dom.xpath("//bib/series_author")[0])	# docinfo['edition']=''
except:	# try:
docinfo['series_author']=''	# docinfo['series_author']=getTextFromNode(dom.find(".//bib/series_author")[0])
try:	# except:
docinfo['publisher']=getTextFromNode(dom.xpath("//bib/publisher")[0])	# docinfo['series_author']=''
except:	# try:
docinfo['publisher']=''	# docinfo['publisher']=getTextFromNode(dom.find(".//bib/publisher")[0])
try:	# except:
docinfo['series_title']=getTextFromNode(dom.xpath("//bib/series_title")[0])	# docinfo['publisher']=''
except:	# try:
docinfo['series_title']=''	# docinfo['series_title']=getTextFromNode(dom.find(".//bib/series_title")[0])
try:	# except:
docinfo['isbn_issn']=getTextFromNode(dom.xpath("//bib/isbn_issn")[0])	# docinfo['series_title']=''
except:	# try:
docinfo['isbn_issn']=''	# docinfo['isbn_issn']=getTextFromNode(dom.find(".//bib/isbn_issn")[0])
	# except:
	# docinfo['isbn_issn']=''
return docinfo	return docinfo


	# TODO: is this needed?
def getNameFromIndexMeta(self,path,docinfo=None,dom=None,cut=0):	def getNameFromIndexMeta(self,path,docinfo=None,dom=None,cut=0):
"""gets name info from the index.meta file at path or given by dom"""	"""gets name info from the index.meta file at path or given by dom"""
if docinfo is None:	if docinfo is None:
Line 682 class documentViewer(Folder):	Line 722 class documentViewer(Folder):
path=getParentDir(path)	path=getParentDir(path)
dom = self.getDomFromIndexMeta(path)	dom = self.getDomFromIndexMeta(path)

docinfo['name']=getTextFromNode(dom.xpath("/resource/name")[0])	docinfo['name']=getText(dom.find("name"))
logging.debug("documentViewer docinfo[name] %s"%docinfo['name'])	logging.debug("documentViewer docinfo[name] %s"%docinfo['name'])
return docinfo	return docinfo

Line 699 class documentViewer(Folder):	Line 739 class documentViewer(Folder):
archivePath = None	archivePath = None
archiveName = None	archiveName = None

archiveNames = dom.xpath("//resource/name")	archiveName = getText(dom.find("name"))
if archiveNames and (len(archiveNames) > 0):	if not archiveName:
archiveName = getTextFromNode(archiveNames[0])
else:
logging.warning("documentViewer (getdocinfofromtexttool) resource/name missing in: %s" % (url))	logging.warning("documentViewer (getdocinfofromtexttool) resource/name missing in: %s" % (url))

archivePaths = dom.xpath("//resource/archive-path")	archivePath = getText(dom.find("archive-path"))
if archivePaths and (len(archivePaths) > 0):	if archivePath:
archivePath = getTextFromNode(archivePaths[0])
# clean up archive path	# clean up archive path
if archivePath[0] != '/':	if archivePath[0] != '/':
archivePath = '/' + archivePath	archivePath = '/' + archivePath
Line 723 class documentViewer(Folder):	Line 760 class documentViewer(Folder):
# we balk without archive-path	# we balk without archive-path
raise IOError("Missing archive-path (for text-tool) in %s" % (url))	raise IOError("Missing archive-path (for text-tool) in %s" % (url))

imageDirs = dom.xpath("//texttool/image")	imageDir = getText(dom.find(".//texttool/image"))
if imageDirs and (len(imageDirs) > 0):
imageDir = getTextFromNode(imageDirs[0])

else:	if not imageDir:
# we balk with no image tag / not necessary anymore because textmode is now standard	# we balk with no image tag / not necessary anymore because textmode is now standard
#raise IOError("No text-tool info in %s"%(url))	#raise IOError("No text-tool info in %s"%(url))
imageDir = ""	imageDir = ""
Line 744 class documentViewer(Folder):	Line 779 class documentViewer(Folder):

docinfo['imageURL'] = self.digilibBaseUrl + "/servlet/Scaler?fn=" + imageDir	docinfo['imageURL'] = self.digilibBaseUrl + "/servlet/Scaler?fn=" + imageDir

viewerUrls = dom.xpath("//texttool/digiliburlprefix")	viewerUrl = getText(dom.find(".//texttool/digiliburlprefix"))
if viewerUrls and (len(viewerUrls) > 0):	if viewerUrl:
viewerUrl = getTextFromNode(viewerUrls[0])
docinfo['viewerURL'] = viewerUrl	docinfo['viewerURL'] = viewerUrl

# old style text URL	# old style text URL
textUrls = dom.xpath("//texttool/text")	textUrl = getText(dom.find(".//texttool/text"))
if textUrls and (len(textUrls) > 0):	if textUrl:
textUrl = getTextFromNode(textUrls[0])
if urlparse.urlparse(textUrl)[0] == "": #keine url	if urlparse.urlparse(textUrl)[0] == "": #keine url
textUrl = os.path.join(archivePath, textUrl)	textUrl = os.path.join(archivePath, textUrl)
# fix URLs starting with /mpiwg/online	# fix URLs starting with /mpiwg/online
Line 762 class documentViewer(Folder):	Line 795 class documentViewer(Folder):
docinfo['textURL'] = textUrl	docinfo['textURL'] = textUrl

# new style text-url-path	# new style text-url-path
textUrls = dom.xpath("//texttool/text-url-path")	textUrl = getText(dom.find(".//texttool/text-url-path"))
if textUrls and (len(textUrls) > 0):	if textUrl:
textUrl = getTextFromNode(textUrls[0])
docinfo['textURLPath'] = textUrl	docinfo['textURLPath'] = textUrl
textUrlkurz = string.split(textUrl, ".")[0]	textUrlkurz = string.split(textUrl, ".")[0]
docinfo['textURLPathkurz'] = textUrlkurz	docinfo['textURLPathkurz'] = textUrlkurz
Line 773 class documentViewer(Folder):	Line 805 class documentViewer(Folder):
#docinfo = self.getNumTextPages(docinfo)	#docinfo = self.getNumTextPages(docinfo)


presentationUrls = dom.xpath("//texttool/presentation")	presentationUrl = getText(dom.find(".//texttool/presentation"))
docinfo = self.getBibinfoFromIndexMeta(url, docinfo=docinfo, dom=dom) # get info von bib tag	docinfo = self.getBibinfoFromIndexMeta(url, docinfo=docinfo, dom=dom) # get info von bib tag
	# TODO: is this needed here?
docinfo = self.getNameFromIndexMeta(url, docinfo=docinfo, dom=dom)	docinfo = self.getNameFromIndexMeta(url, docinfo=docinfo, dom=dom)


if presentationUrls and (len(presentationUrls) > 0): # ueberschreibe diese durch presentation informationen	if presentationUrl: # ueberschreibe diese durch presentation informationen
# presentation url ergiebt sich ersetzen von index.meta in der url der fuer die Metadaten	# presentation url ergiebt sich ersetzen von index.meta in der url der fuer die Metadaten
# durch den relativen Pfad auf die presentation infos	# durch den relativen Pfad auf die presentation infos
presentationPath = getTextFromNode(presentationUrls[0])	presentationPath = presentationUrl
if url.endswith("index.meta"):	if url.endswith("index.meta"):
presentationUrl = url.replace('index.meta', presentationPath)	presentationUrl = url.replace('index.meta', presentationPath)
else:	else:
Line 798 class documentViewer(Folder):	Line 831 class documentViewer(Folder):
"""gets the bibliographical information from the preseantion entry in texttools	"""gets the bibliographical information from the preseantion entry in texttools
"""	"""
dom=self.getPresentationInfoXML(url)	dom=self.getPresentationInfoXML(url)
try:	docinfo['author']=getText(dom.find(".//author"))
docinfo['author']=getTextFromNode(dom.xpath("//author")[0])	docinfo['title']=getText(dom.find(".//title"))
except:	docinfo['year']=getText(dom.find(".//date"))
pass
try:
docinfo['title']=getTextFromNode(dom.xpath("//title")[0])
except:
pass
try:
docinfo['year']=getTextFromNode(dom.xpath("//date")[0])
except:
pass
return docinfo	return docinfo

def getDocinfoFromImagePath(self,path,docinfo=None,cut=0):	def getDocinfoFromImagePath(self,path,docinfo=None,cut=0):
Line 890 class documentViewer(Folder):	Line 914 class documentViewer(Folder):
pageinfo['viewMode'] = viewMode	pageinfo['viewMode'] = viewMode
pageinfo['tocMode'] = tocMode	pageinfo['tocMode'] = tocMode
pageinfo['characterNormalization'] = self.REQUEST.get('characterNormalization','reg')	pageinfo['characterNormalization'] = self.REQUEST.get('characterNormalization','reg')
pageinfo['optionToggle'] = self.REQUEST.get('optionToggle','')	#pageinfo['optionToggle'] = self.REQUEST.get('optionToggle','1')
pageinfo['query'] = self.REQUEST.get('query','')	pageinfo['query'] = self.REQUEST.get('query','')
pageinfo['queryType'] = self.REQUEST.get('queryType','')	pageinfo['queryType'] = self.REQUEST.get('queryType','')
pageinfo['querySearch'] =self.REQUEST.get('querySearch', 'fulltext')	pageinfo['querySearch'] =self.REQUEST.get('querySearch', 'fulltext')

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>

Legend:
Left column: removed from v.1.170
Both columns, differing text: changed lines
Right column: added in v.1.175.2.2