Annotation of documentViewer/extraFunction.py, revision 1.1.2.3
1.1.2.1 abukhman 1:
2: from OFS.Folder import Folder
3: from Products.PageTemplates.ZopePageTemplate import ZopePageTemplate
4: from Products.PageTemplates.PageTemplateFile import PageTemplateFile
5: from Products.PythonScripts.standard import url_quote
6:
7:
8: from Ft.Xml.Domlette import NonvalidatingReader
9: from Ft.Xml.Domlette import PrettyPrint, Print
10: from Ft.Xml import EMPTY_NAMESPACE, Parse
11:
12: from xml.dom.minidom import parse, parseString
13:
14: import Ft.Xml.XPath
15: import cStringIO
16: import xmlrpclib
17: import os.path
18: import sys
19: import cgi
20: import urllib
21: import logging
22: import math
23: import documentViewer
1.1.2.2 abukhman 24: import urllib2
25: import urllib
1.1.2.1 abukhman 26: import urlparse
27: from types import *
28:
def getTextFromNode(nodename):
    """Return the joined data of the text nodes directly below nodename.

    Only immediate children whose nodeType is TEXT_NODE contribute; nested
    elements are not descended into.  None yields an empty string.
    """
    if nodename is None:
        return ""
    pieces = [child.data
              for child in nodename.childNodes
              if child.nodeType == child.TEXT_NODE]
    return "".join(pieces)
39:
def serializeNode(node, encoding='utf-8'):
    """Render node as XML and return the result as a string.

    The node is written with Print into an in-memory buffer using the given
    encoding; the buffer content is returned after the buffer is closed.
    """
    stream = cStringIO.StringIO()
    Print(node, stream=stream, encoding=encoding)
    xml_text = stream.getvalue()
    stream.close()
    return xml_text
47:
48:
class extraFunction(Folder):
    """Folder whose methods query the MPDL text server over HTTP.

    The result fragments are post-processed so that the links they contain
    point back to this object (self.absolute_url()) instead of to the
    server-side scripts.
    """

    # Base URL of the server interface; every request made by this class
    # addresses a script below it.
    _mpdlInterfaceUrl = "http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/"

    def __init__(self, id, title=""):
        self.id = id
        self.title = title

    def _linksWithHref(self, pagenode):
        """Yield (link, hrefNode) for every a-element that has an href."""
        # NOTE: "//a" is an absolute path, so this walks all links of the
        # document pagenode belongs to, exactly as the inline loops did.
        for link in pagenode.xpath("//a"):
            hrefNode = link.getAttributeNodeNS(None, u"href")
            if hrefNode:
                yield link, hrefNode

    def _openInPopup(self, link, left):
        """Set the attributes that make link open in a popup window.

        left is the horizontal position inserted into the window.open() call.
        """
        link.setAttributeNS(None, 'target', '_blank')
        link.setAttributeNS(None, 'onClick', "popupWin = window.open(this.href, 'contacts', 'location,width=500,height=600,top=180, left=%s, scrollbars=1'); return false;" % left)
        link.setAttributeNS(None, 'onDblclick', 'popupWin.focus();')

    def _quoteParam(self, value):
        """Return value URL-quoted for use in a query string."""
        if isinstance(value, unicode):
            # str() on non-ASCII unicode raises UnicodeEncodeError, so encode
            # explicitly first.
            # NOTE(review): UTF-8 is assumed to be what the server expects.
            value = value.encode('utf-8')
        return url_quote(str(value))

    def getHttpData(self, url, data=None, num_tries=3, timeout=40):
        """Return the body of the HTTP GET request for url plus data.

        data becomes the query string of the request: a string is appended
        verbatim after a "?", any other value that is not None is passed
        through urllib.urlencode first, None leaves url untouched.

        Up to num_tries attempts are made.  A urllib2.URLError is logged and
        the request is tried again; a urllib2.HTTPError is logged and ends the
        attempts at once.  timeout is handed to urllib2.urlopen (or set as
        socket default timeout on Python older than 2.6).

        Raises IOError if no attempt produced a response.
        """
        # we do GET (by appending data to url)
        if data is not None:
            if isinstance(data, basestring):
                query = data
            else:
                # we assume its a dict
                query = urllib.urlencode(data)
            url = "%s?%s" % (url, query)

        response = None
        errmsg = None
        for cnt in range(num_tries):
            try:
                logging.debug("getHttpData(%s) url=%s"%(cnt+1,url))
                if sys.version_info < (2, 6):
                    # urlopen has no timeout argument here, so the timeout
                    # has to be set on the socket module -- ugly :-(
                    import socket
                    socket.setdefaulttimeout(timeout)
                    response = urllib2.urlopen(url)
                else:
                    response = urllib2.urlopen(url, timeout=timeout)
                break
            # HTTPError is a subclass of URLError, so it must be caught first.
            # The exception is fetched with sys.exc_info() because that works
            # on every interpreter version this method supports.
            except urllib2.HTTPError:
                e = sys.exc_info()[1]
                logging.error("getHttpData: HTTP error(%s): %s"%(e.code,e))
                errmsg = str(e)
                # stop trying
                break
            except urllib2.URLError:
                e = sys.exc_info()[1]
                logging.error("getHttpData: URLLIB error(%s): %s"%(e.reason,e))
                errmsg = str(e)
                # no break here: go on with the next attempt

        if response is None:
            raise IOError("ERROR fetching HTTP data from %s: %s"%(url,errmsg))

        try:
            return response.read()
        finally:
            # close the connection even if reading fails
            response.close()

    def getSearch(self, pn=1, pageinfo=None, docinfo=None, query=None, queryType=None, lemma=None):
        """Return one page of search results as serialized XML.

        All search settings are read from pageinfo ('queryPageSize',
        'searchPN', 'sn', 'highlightQuery', 'query', 'queryType', 'viewMode',
        'tocMode', 'tocPN'), the document from docinfo ('textURLPath', 'url').
        The pn, query and queryType arguments are overwritten by the pageinfo
        values and lemma is not used.

        Links in the result are rewritten to point to this object.  Returns
        "no text here" if the query type is unknown or the expected result
        div is missing.
        """
        docpath = docinfo['textURLPath']
        url = docinfo['url']
        logging.debug("documentViewer (getSearch) docpath: %s"%(docpath))
        logging.debug("documentViewer (getSearch) url: %s"%(url))
        pagesize = pageinfo['queryPageSize']
        pn = pageinfo['searchPN']
        sn = pageinfo['sn']
        highlightQuery = pageinfo['highlightQuery']
        query = pageinfo['query']
        queryType = pageinfo['queryType']
        viewMode = pageinfo['viewMode']
        tocMode = pageinfo['tocMode']
        tocPN = pageinfo['tocPN']
        selfurl = self.absolute_url()

        params = ("document=%s&mode=%s&queryType=%s&query=%s"
                  "&queryResultPageSize=%s&queryResultPN=%s"
                  "&sn=%s&viewMode=%s&highlightQuery=%s"
                  % (docpath, 'text', queryType, query, pagesize, pn, sn, viewMode, highlightQuery))
        data = self.getHttpData(self._mpdlInterfaceUrl + "doc-query.xql", params)

        pagexml = data.replace('?document=%s'%str(docpath),'?url=%s'%url)
        pagedom = Parse(pagexml)

        def rewriteFragment(href):
            # turn a page-fragment link into a link to this object that
            # carries the current search state
            texttool = ('mode=texttool&viewMode=%s&queryType=%s&query=%s'
                        '&queryResultPageSize=%s&queryResultPN=%s'
                        '&tocMode=%s&searchPN=%s&tocPN=%s'
                        % (viewMode, queryType, query, pagesize, pn, tocMode, pn, tocPN))
            return href.replace('mode=text', texttool).replace('page-fragment.xql', selfurl)

        if queryType in ("fulltext", "xpath", "xquery", "fulltextMorphLemma"):
            pagedivs = pagedom.xpath("//div[@class='queryResultPage']")
            if len(pagedivs) > 0:
                pagenode = pagedivs[0]
                for link, hrefNode in self._linksWithHref(pagenode):
                    href = hrefNode.nodeValue
                    if href.startswith('page-fragment.xql'):
                        hrefNode.nodeValue = rewriteFragment(href)
                return serializeNode(pagenode)

        if queryType == "fulltextMorph":
            pagedivs = pagedom.xpath("//div[@class='queryResult']")
            if len(pagedivs) > 0:
                pagenode = pagedivs[0]
                for link, hrefNode in self._linksWithHref(pagenode):
                    href = hrefNode.nodeValue
                    if href.startswith('page-fragment.xql'):
                        hrefNode.nodeValue = rewriteFragment(href)
                    if href.startswith('../lt/lemma.xql'):
                        hrefNode.nodeValue = href.replace('../lt/lemma.xql','%s/template/head_main_lemma_New'%(selfurl))
                        self._openInPopup(link, 400)
                return serializeNode(pagenode)

        if queryType in ("ftIndex", "ftIndexMorph"):
            pagedivs = pagedom.xpath("//div[@class='queryResultPage']")
            if len(pagedivs) > 0:
                pagenode = pagedivs[0]
                for link, hrefNode in self._linksWithHref(pagenode):
                    href = hrefNode.nodeValue
                    hrefNode.nodeValue = href.replace('mode=text','mode=texttool&viewMode=%s&tocMode=%s&tocPN=%s&pn=%s'%(viewMode,tocMode,tocPN,pn))
                    # the dictionary links below start from the unmodified
                    # href again and replace the value set above
                    if href.startswith('../lt/lex.xql'):
                        hrefNode.nodeValue = href.replace('../lt/lex.xql','%s/template/head_main_voc'%selfurl)
                        self._openInPopup(link, 400)
                    if href.startswith('../lt/lemma.xql'):
                        hrefNode.nodeValue = href.replace('../lt/lemma.xql','%s/template/head_main_lemma'%selfurl)
                        self._openInPopup(link, 400)
                return serializeNode(pagenode)

        return "no text here"

    def getNumPages(self, docinfo=None):
        """Count the pages of the fulltext and store the number in docinfo.

        Requests all pb elements of docinfo['textURLPath'], stores the number
        of "<pb " occurrences in the reply as docinfo['numPages'] and returns
        docinfo.
        """
        xquery = '//pb'
        text = self.getHttpData(self._mpdlInterfaceUrl + "xquery.xql", "document=%s&xquery=%s"%(docinfo['textURLPath'],xquery))
        docinfo['numPages'] = text.count("<pb ")
        return docinfo

    def getTextPage(self, mode="text", pn=1, docinfo=None, pageinfo=None, highlightQuery=None, sn=None, viewMode=None, tocMode=None, tocPN=None):
        """Return a single page of the fulltext as serialized XML.

        mode selects the representation: "text", "xml", "pureXml" or
        "text_dict" (requested from the server as "textPollux").  The
        viewMode, tocMode and tocPN arguments are overwritten by the values
        in pageinfo.  If highlightQuery is not None it is sent to the server
        together with sn.

        In the "text" and "text_dict" modes note links are extended with the
        current view parameters; "text_dict" also turns dictionary links into
        popup links to this object.  Returns "no text here" for any other
        mode or when the reply has no top-level div.
        """
        docpath = docinfo['textURLPath']
        url = docinfo['url']
        viewMode = pageinfo['viewMode']
        tocMode = pageinfo['tocMode']
        tocPN = pageinfo['tocPN']
        selfurl = self.absolute_url()
        if mode == "text_dict":
            textmode = "textPollux"
        else:
            textmode = mode

        textParam = "document=%s&mode=%s&pn=%s"%(docpath,textmode,pn)
        if highlightQuery is not None:
            textParam += "&highlightQuery=%s&sn=%s"%(highlightQuery,sn)

        pagexml = self.getHttpData(self._mpdlInterfaceUrl + "page-fragment.xql", textParam)
        pagedom = Parse(pagexml)

        if mode not in ("text", "xml", "pureXml", "text_dict"):
            return "no text here"

        # first div contains text
        pagedivs = pagedom.xpath("/div")
        if len(pagedivs) == 0:
            return "no text here"
        pagenode = pagedivs[0]

        if mode in ("xml", "pureXml"):
            # these modes are returned without touching any link
            return serializeNode(pagenode)

        # "text" and "text_dict": check all a-tags
        for link, hrefNode in self._linksWithHref(pagenode):
            href = hrefNode.nodeValue
            if mode == "text_dict":
                if href.startswith('lt/lex.xql'):
                    # is pollux link
                    hrefNode.nodeValue = href.replace('lt/lex.xql','%s/template/head_main_voc'%selfurl)
                    self._openInPopup(link, 700)
                if href.startswith('lt/lemma.xql'):
                    hrefNode.nodeValue = href.replace('lt/lemma.xql','%s/template/head_main_lemma'%selfurl)
                    self._openInPopup(link, 700)
            if href.startswith('#note-'):
                hrefNode.nodeValue = href.replace('#note-',"?url=%s&viewMode=%s&tocMode=%s&tocPN=%s&pn=%s#note-"%(url,viewMode,tocMode,tocPN,pn))
        return serializeNode(pagenode)

    def getTranslate(self, query=None, language=None):
        """Return the reply of the lex.xql script for query in language."""
        params = "document=&language="+str(language)+"&query="+self._quoteParam(query)
        return self.getHttpData(self._mpdlInterfaceUrl + "lt/lex.xql", params)

    def getLemma(self, lemma=None, language=None):
        """Return the reply of the lemma.xql script for lemma in language."""
        params = "document=&language="+str(language)+"&lemma="+self._quoteParam(lemma)
        return self.getHttpData(self._mpdlInterfaceUrl + "lt/lemma.xql", params)

    def getLemmaNew(self, query=None, language=None):
        """Same request as getLemma, with the lemma passed as query."""
        return self.getLemma(lemma=query, language=language)

    def getQuery(self, docinfo=None, pageinfo=None, query=None, queryType=None, pn=1):
        """Return the number of result pages of the search in pageinfo.

        The search is described by pageinfo ('queryPageSize', 'searchPN',
        'query', 'queryType'); the query, queryType and pn arguments are
        overwritten by these values.  The hit count is read from the
        queryResultHits div of the reply and taken as 0 if that div is
        missing.  The result is hits // 10 + 1.
        """
        docpath = docinfo['textURLPath']
        pagesize = pageinfo['queryPageSize']
        pn = pageinfo['searchPN']
        query = pageinfo['query']
        queryType = pageinfo['queryType']
        tocSearch = 0

        params = ("document=%s&mode=%s&queryType=%s&query=%s"
                  "&queryResultPageSize=%s&queryResultPN=%s"
                  % (docpath, 'text', queryType, query, pagesize, pn))
        pagexml = self.getHttpData(self._mpdlInterfaceUrl + "doc-query.xql", params)
        pagedom = Parse(pagexml)
        numdivs = pagedom.xpath("//div[@class='queryResultHits']")
        if len(numdivs) > 0:
            tocSearch = int(getTextFromNode(numdivs[0]))
        # NOTE(review): the divisor is a fixed 10, not pagesize, and an exact
        # multiple of 10 yields one page more than needed -- confirm intent.
        tc = tocSearch // 10 + 1
        logging.debug("documentViewer (getQuery) tc: %s"%(tc))
        return tc

    def getToc(self, mode="text", docinfo=None):
        """Load the size of the table of contents and store it in docinfo.

        Nothing is requested for mode "none" or when docinfo already has the
        key 'tocSize_<mode>'.  Otherwise the number found in the
        queryResultHits div of the reply (0 if the div is missing) is stored
        under that key.  Returns docinfo.
        """
        logging.debug("documentViewer (gettoc) mode: %s"%(mode))
        if mode == "none":
            return docinfo
        if 'tocSize_%s'%mode in docinfo:
            # cached toc
            return docinfo

        docpath = docinfo['textURLPath']
        # we need to set a result set size
        pagesize = 1000
        pn = 1
        if mode == "text":
            queryType = "toc"
        else:
            queryType = mode
        # number of entries in toc
        tocSize = 0

        pagexml = self.getHttpData(self._mpdlInterfaceUrl + "doc-query.xql", "document=%s&queryType=%s&queryResultPageSize=%s&queryResultPN=%s"%(docpath,queryType, pagesize, pn))
        # post-processing downloaded xml
        pagedom = Parse(pagexml)
        # get number of entries
        numdivs = pagedom.xpath("//div[@class='queryResultHits']")
        if len(numdivs) > 0:
            tocSize = int(getTextFromNode(numdivs[0]))
        docinfo['tocSize_%s'%mode] = tocSize
        return docinfo

    def getTocPage(self, mode="text", pn=1, pageinfo=None, docinfo=None):
        """Return a single page of the table of contents as text.

        Page size and page number are read from pageinfo ('tocPageSize',
        'tocPN'); the pn argument is overwritten.  In the reply the
        page-fragment links of the document are replaced by links to this
        object and 'mode=image' is replaced by 'mode=texttool'.
        """
        # TODO: this should use the cached TOC
        if mode == "text":
            queryType = "toc"
        else:
            queryType = mode
        docpath = docinfo['textURLPath']
        pagesize = pageinfo['tocPageSize']
        pn = pageinfo['tocPN']
        url = docinfo['url']
        selfurl = self.absolute_url()
        viewMode = pageinfo['viewMode']
        tocMode = pageinfo['tocMode']
        tocPN = pageinfo['tocPN']

        data = self.getHttpData(self._mpdlInterfaceUrl + "doc-query.xql", "document=%s&queryType=%s&queryResultPageSize=%s&queryResultPN=%s"%(docpath,queryType, pagesize, pn))

        page = data.replace('page-fragment.xql?document=%s'%str(docpath),'%s?url=%s&viewMode=%s&tocMode=%s&tocPN=%s'%(selfurl,url, viewMode, tocMode, tocPN))
        text = page.replace('mode=image','mode=texttool')
        return text
371:
372:
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>