1:
2: from OFS.SimpleItem import SimpleItem
3: from Products.PageTemplates.PageTemplateFile import PageTemplateFile
4:
5: from Ft.Xml import EMPTY_NAMESPACE, Parse
6: from Ft.Xml.Domlette import NonvalidatingReader
7: import Ft.Xml.Domlette
8: import cStringIO
9:
10: import xml.etree.ElementTree as ET
11:
12: import re
13: import logging
14: import urllib
15:
16: from SrvTxtUtils import getInt, getText, getHttpData
17:
def serialize(node):
    """Return a string containing an XML snippet of node.

    Snips off the XML declaration that ElementTree prepends when an
    encoding is given, so the result can be embedded in other markup.
    """
    s = ET.tostring(node, 'UTF-8')
    # ET.tostring returns bytes when an encoding is given (always on
    # Python 3); compare with bytes literals so this works on 2 and 3.
    if s.startswith(b'<?xml'):
        i = s.find(b'?>')
        s = s[i + 2:]
        # skip the newline after the declaration only if present (the
        # old fixed i+3 offset chopped a character when there was none)
        if s.startswith(b'\n'):
            s = s[1:]

    return s
27:
28:
def getTextFromNode(node):
    """Return the concatenated cdata (text) content of a DOM node.

    Returns the empty string for None. Only direct text-node children
    are considered (4Suite Domlette DOM interface).
    """
    if node is None:
        return ""
    # gather the data of every direct child that is a text node
    return "".join(child.data for child in node.childNodes
                   if child.nodeType == node.TEXT_NODE)
48:
def serializeNode(node, encoding="utf-8"):
    """Return node serialized as an XML string (4Suite Domlette printer)."""
    # print the node into an in-memory stream and hand back its contents
    buf = cStringIO.StringIO()
    Ft.Xml.Domlette.Print(node, stream=buf, encoding=encoding)
    xml = buf.getvalue()
    buf.close()
    return xml
60:
61:
class MpdlXmlTextServer(SimpleItem):
    """TextServer implementation for MPDL-XML eXist server"""
    # Zope meta type under which instances of this product are registered
    meta_type="MPDL-XML TextServer"

    # add a "Config" tab to the standard SimpleItem management screens
    manage_options=(
        {'label':'Config','action':'manage_changeMpdlXmlTextServerForm'},
        )+SimpleItem.manage_options

    # ZPT form backing the Config tab (submits to manage_changeMpdlXmlTextServer)
    manage_changeMpdlXmlTextServerForm = PageTemplateFile("zpt/manage_changeMpdlXmlTextServer", globals())
71:
72: def __init__(self,id,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/", serverName=None, timeout=40):
73: """constructor"""
74: self.id=id
75: self.title=title
76: self.timeout = timeout
77: if serverName is None:
78: self.serverUrl = serverUrl
79: else:
80: self.serverUrl = "http://%s/mpdl/interface/"%serverName
81:
82: def getHttpData(self, url, data=None):
83: """returns result from url+data HTTP request"""
84: return getHttpData(url,data,timeout=self.timeout)
85:
86: def getServerData(self, method, data=None):
87: """returns result from text server for method+data"""
88: url = self.serverUrl+method
89: return getHttpData(url,data,timeout=self.timeout)
90:
    # WTF: what does this really do? can it be integrated in getPage?
    def getSearch(self, pageinfo=None, docinfo=None):
        """get search list

        Runs a doc-query.xql search on the text server and returns the
        matching result div as serialized XML, with backend links
        rewritten to point back to this viewer.  Returns the string
        "no text here" when the response contains no usable result div.
        """
        logging.debug("getSearch()")
        # query parameters come from docinfo and pageinfo
        docpath = docinfo['textURLPath']
        url = docinfo['url']
        pagesize = pageinfo['queryPageSize']
        pn = pageinfo.get('searchPN',1)
        sn = pageinfo['sn']
        highlightQuery = pageinfo['highlightQuery']
        query =pageinfo['query']
        queryType =pageinfo['queryType']
        viewMode= pageinfo['viewMode']
        tocMode = pageinfo['tocMode']
        characterNormalization = pageinfo['characterNormalization']
        #optionToggle = pageinfo['optionToggle']
        tocPN = pageinfo['tocPN']
        selfurl = self.absolute_url()
        # run the search on the text server
        data = self.getServerData("doc-query.xql","document=%s&mode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s&sn=%s&viewMode=%s&characterNormalization=%s&highlightQuery=%s"%(docpath, 'text', queryType, urllib.quote(query), pagesize, pn, sn, viewMode,characterNormalization, urllib.quote(highlightQuery)))
        # rewrite backend ?document= references into viewer ?url= references
        pagexml = data.replace('?document=%s'%str(docpath),'?url=%s'%url)
        pagedom = Parse(pagexml)

        # NOTE(review): dead code kept from an earlier revision
        """
        pagedivs = pagedom.xpath("//div[@class='queryResultHits']")
        if (pagedivs == pagedom.xpath("//div[@class='queryResultHits']")):
            if len(pagedivs)>0:
                docinfo['queryResultHits'] = int(getTextFromNode(pagedivs[0]))
                s = getTextFromNode(pagedivs[0])
                s1 = int(s)/10+1
                try:
                    docinfo['queryResultHits'] = int(s1)
                    logging.debug("SEARCH ENTRIES: %s"%(s1))
                except:
                    docinfo['queryResultHits'] = 0
        """
        # plain fulltext, xpath, xquery and lemmatized-fulltext queries:
        # rewrite page-fragment links to point back to this viewer
        if (queryType=="fulltext")or(queryType=="xpath")or(queryType=="xquery")or(queryType=="fulltextMorphLemma"):
            pagedivs = pagedom.xpath("//div[@class='queryResultPage']")
            if len(pagedivs)>0:
                pagenode=pagedivs[0]
                links=pagenode.xpath("//a")
                for l in links:
                    hrefNode = l.getAttributeNodeNS(None, u"href")
                    if hrefNode:
                        href = hrefNode.nodeValue
                        if href.startswith('page-fragment.xql'):
                            selfurl = self.absolute_url()
                            # switch to texttool mode and carry the viewer state along
                            pagexml=href.replace('mode=text','mode=texttool&viewMode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s&tocMode=%s&searchPN=%s&tocPN=%s&characterNormalization=%s'%(viewMode,queryType,urllib.quote(query),pagesize,pn,tocMode,pn,tocPN, characterNormalization))
                            hrefNode.nodeValue = pagexml.replace('page-fragment.xql','%s'%selfurl)
                #logging.debug("PUREXML :%s"%(serializeNode(pagenode)))
                return serializeNode(pagenode)
        # morphological fulltext queries: additionally turn lemma links
        # into dictionary popup windows
        if (queryType=="fulltextMorph"):
            pagedivs = pagedom.xpath("//div[@class='queryResult']")
            if len(pagedivs)>0:
                pagenode=pagedivs[0]
                links=pagenode.xpath("//a")
                for l in links:
                    hrefNode = l.getAttributeNodeNS(None, u"href")
                    if hrefNode:
                        href = hrefNode.nodeValue
                        if href.startswith('page-fragment.xql'):
                            selfurl = self.absolute_url()
                            pagexml=href.replace('mode=text','mode=texttool&viewMode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s&tocMode=%s&searchPN=%s&tocPN=%s&characterNormalization=%s'%(viewMode,queryType,urllib.quote(query),pagesize,pn,tocMode,pn,tocPN,characterNormalization))
                            hrefNode.nodeValue = pagexml.replace('page-fragment.xql','%s'%selfurl)
                        # lemma links open the dictionary head in a popup window
                        if href.startswith('../lt/lemma.xql'):
                            hrefNode.nodeValue = href.replace('../lt/lemma.xql','%s/template/head_main_query'%(selfurl))
                            l.setAttributeNS(None, 'target', '_blank')
                            l.setAttributeNS(None, 'onClick',"popupWin = window.open(this.href, 'contacts', 'location,width=500,height=600,top=180, left=400, scrollbars=1'); return false;")
                            l.setAttributeNS(None, 'onDblclick', 'popupWin.focus();')
                # NOTE(review): result of this xpath is unused - looks like leftover code
                pagedivs = pagedom.xpath("//div[@class='queryResultMorphExpansion']")
                return serializeNode(pagenode)
        # fulltext index queries: rewrite mode and add dictionary popups
        if (queryType=="ftIndex")or(queryType=="ftIndexMorph"):
            pagedivs= pagedom.xpath("//div[@class='queryResultPage']")
            if len(pagedivs)>0:
                pagenode=pagedivs[0]
                links=pagenode.xpath("//a")
                for l in links:
                    hrefNode = l.getAttributeNodeNS(None, u"href")
                    if hrefNode:
                        href = hrefNode.nodeValue
                        hrefNode.nodeValue=href.replace('mode=text','mode=texttool&viewMode=%s&tocMode=%s&tocPN=%s&pn=%s&characterNormalization=%s'%(viewMode,tocMode,tocPN,pn,characterNormalization))
                        # lexicon links open the dictionary in a popup window
                        if href.startswith('../lt/lex.xql'):
                            hrefNode.nodeValue = href.replace('../lt/lex.xql','%s/template/head_main_lex'%selfurl)
                            l.setAttributeNS(None, 'target', '_blank')
                            l.setAttributeNS(None, 'onClick',"popupWin = window.open(this.href, 'contacts', 'location,width=500,height=600,top=180, left=400, scrollbars=1'); return false;")
                            l.setAttributeNS(None, 'onDblclick', 'popupWin.focus();')
                        # lemma links likewise open a popup window
                        if href.startswith('../lt/lemma.xql'):
                            hrefNode.nodeValue = href.replace('../lt/lemma.xql','%s/template/head_main_lemma'%(selfurl))
                            l.setAttributeNS(None, 'target', '_blank')
                            l.setAttributeNS(None, 'onClick',"popupWin = window.open(this.href, 'contacts', 'location,width=500,height=600,top=180, left=400, scrollbars=1'); return false;")
                            l.setAttributeNS(None, 'onDblclick', 'popupWin.focus();')
                return serializeNode(pagenode)
        return "no text here"
183:
184: def getGisPlaces(self, docinfo=None, pageinfo=None):
185: """ Show all Gis Places of whole Page"""
186: xpath='//place'
187: docpath = docinfo.get('textURLPath',None)
188: if not docpath:
189: return None
190:
191: url = docinfo['url']
192: selfurl = self.absolute_url()
193: pn = pageinfo['current']
194: hrefList=[]
195: myList= ""
196: text=self.getServerData("xpath.xql", "document=%s&xpath=%s&pn=%s"%(docinfo['textURLPath'],xpath,pn))
197: dom = ET.fromstring(text)
198: result = dom.findall(".//result/resultPage/place")
199: for l in result:
200: href = l.get("id")
201: hrefList.append(href)
202: # WTF: what does this do?
203: myList = ",".join(hrefList)
204: #logging.debug("getGisPlaces :%s"%(myList))
205: return myList
206:
207: def getAllGisPlaces (self, docinfo=None, pageinfo=None):
208: """Show all Gis Places of whole Book """
209: xpath ='//echo:place'
210: docpath =docinfo['textURLPath']
211: url = docinfo['url']
212: selfurl =self.absolute_url()
213: pn =pageinfo['current']
214: hrefList=[]
215: myList=""
216: text=self.getServerData("xpath.xql", "document=%s&xpath=%s"%(docinfo['textURLPath'],xpath))
217: dom = ET.fromstring(text)
218: result = dom.findall(".//result/resultPage/place")
219:
220: for l in result:
221: href = l.get("id")
222: hrefList.append(href)
223: # WTF: what does this do?
224: myList = ",".join(hrefList)
225: #logging.debug("getALLGisPlaces :%s"%(myList))
226: return myList
227:
228: def processPageInfo(self, dom, docinfo, pageinfo):
229: """processes page info divs from dom and stores in docinfo and pageinfo"""
230: # process all toplevel divs
231: alldivs = dom.findall(".//div")
232: pagediv = None
233: for div in alldivs:
234: dc = div.get('class')
235:
236: # page content div
237: if dc == 'pageContent':
238: pagediv = div
239:
240: # pageNumberOrig
241: elif dc == 'pageNumberOrig':
242: pageinfo['pageNumberOrig'] = div.text
243:
244: # pageNumberOrigNorm
245: elif dc == 'pageNumberOrigNorm':
246: pageinfo['pageNumberOrigNorm'] = div.text
247:
248: # pageNumberOrigNorm
249: elif dc == 'countFigureEntries':
250: docinfo['countFigureEntries'] = getInt(div.text)
251:
252: # pageNumberOrigNorm
253: elif dc == 'countTocEntries':
254: # WTF: s1 = int(s)/30+1
255: docinfo['countTocEntries'] = getInt(div.text)
256:
257: # numTextPages
258: elif dc == 'countPages':
259: np = getInt(div.text)
260: if np > 0:
261: docinfo['numTextPages'] = np
262: if docinfo.get('numPages', 0) == 0:
263: # seems to be text-only - update page count
264: docinfo['numPages'] = np
265: pageinfo['end'] = min(pageinfo['end'], np)
266: pageinfo['numgroups'] = int(np / pageinfo['groupsize'])
267: if np % pageinfo['groupsize'] > 0:
268: pageinfo['numgroups'] += 1
269:
270: return
271:
272:
    def getTextPage(self, mode="text_dict", pn=1, docinfo=None, pageinfo=None):
        """returns single page from fulltext

        Fetches page pn from the text server, stores additional page
        info via processPageInfo, and returns the serialized content of
        the <div class="pageContent"> element with its links rewritten
        according to mode (text, text_dict, xml, pureXml or gis).
        Returns "no text here" if no content div is found.
        """
        logging.debug("getTextPage mode=%s, pn=%s"%(mode,pn))
        # check for cached text -- but this shouldn't be called twice
        if pageinfo.has_key('textPage'):
            logging.debug("getTextPage: using cached text")
            return pageinfo['textPage']

        docpath = docinfo['textURLPath']
        # just checking
        if pageinfo['current'] != pn:
            logging.warning("getTextPage: current!=pn!")

        # stuff for constructing full urls
        url = docinfo['url']
        urlmode = docinfo['mode']
        sn = pageinfo.get('sn', None)
        highlightQuery = pageinfo.get('highlightQuery', None)
        tocMode = pageinfo.get('tocMode', None)
        tocPN = pageinfo.get('tocPN',None)
        characterNormalization = pageinfo.get('characterNormalization', None)
        selfurl = docinfo['viewerUrl']

        if mode == "text_dict":
            # text_dict is called textPollux in the backend
            textmode = "textPollux"
        else:
            textmode = mode

        # build the query string for page-fragment.xql
        textParam = "document=%s&mode=%s&pn=%s&characterNormalization=%s"%(docpath,textmode,pn,characterNormalization)
        if highlightQuery:
            textParam +="&highlightQuery=%s&sn=%s"%(urllib.quote(highlightQuery),sn)

        # fetch the page
        pagexml = self.getServerData("page-fragment.xql",textParam)
        dom = ET.fromstring(pagexml)
        # extract additional info
        self.processPageInfo(dom, docinfo, pageinfo)
        # page content is in <div class="pageContent">
        pagediv = None
        # ElementTree 1.2 in Python 2.6 can't do div[@class='pageContent']
        alldivs = dom.findall(".//div")
        for div in alldivs:
            dc = div.get('class')
            # page content div
            if dc == 'pageContent':
                pagediv = div
                break

        # plain text mode
        if mode == "text":
            if pagediv:
                links = pagediv.findall(".//a")
                for l in links:
                    href = l.get('href')
                    # rewrite note links so they stay within the viewer page
                    if href and href.startswith('#note-'):
                        href = href.replace('#note-',"?mode=%s&url=%s&viewMode=text&tocMode=%s&tocPN=%s&pn=%s#note-"%(urlmode,url,tocMode,tocPN,pn))
                        l.set('href', href)

                return serialize(pagediv)

        # text-with-links mode
        elif mode == "text_dict":
            if pagediv:
                # check all a-tags
                links = pagediv.findall(".//a")
                for l in links:
                    href = l.get('href')

                    if href:
                        # is link with href
                        if href.startswith('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/lt/wordInfo.xql'):
                            # is pollux link
                            selfurl = self.absolute_url()
                            # change href
                            l.set('href', href.replace('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/lt/wordInfo.xql','%s/head_main_voc'%selfurl))
                            # add target
                            l.set('target', '_blank')

                        # lemma links open the dictionary in a popup window
                        if href.startswith('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/lt/lemma.xql'):
                            selfurl = self.absolute_url()
                            l.set('href', href.replace('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/lt/lemma.xql','%s/head_main_lemma'%selfurl))
                            l.set('target', '_blank')
                            l.set('onclick',"popupWin = window.open(this.href, 'InfoWindow', 'menubar=no, location,width=500,height=600,top=180, left=700, toolbar=no, scrollbars=1'); return false;")
                            l.set('ondblclick', 'popupWin.focus();')

                        # note links stay within the viewer page
                        if href.startswith('#note-'):
                            l.set('href', href.replace('#note-',"?mode=%s&url=%s&viewMode=text_dict&tocMode=%s&tocPN=%s&pn=%s#note-"%(urlmode,url,tocMode,tocPN,pn)))

                return serialize(pagediv)

        # xml mode
        elif mode == "xml":
            if pagediv:
                return serialize(pagediv)

        # pureXml mode
        elif mode == "pureXml":
            if pagediv:
                return serialize(pagediv)

        # gis mode
        elif mode == "gis":
            name = docinfo['name']
            if pagediv:
                # check all a-tags
                links = pagediv.findall(".//a")
                for l in links:
                    href = l.get('href')
                    if href:
                        # redirect chinagis links to the viewer's REST db
                        if href.startswith('http://chinagis.mpiwg-berlin.mpg.de'):
                            l.set('href', href.replace('chinagis_REST/REST/db/chgis/mpdl','chinagis/REST/db/mpdl/%s'%name))
                            l.set('target', '_blank')

                return serialize(pagediv)

        return "no text here"
390:
391: # WTF: is this needed?
392: def getOrigPages(self, docinfo=None, pageinfo=None):
393: logging.debug("CALLED: getOrigPages!")
394: if not pageinfo.has_key('pageNumberOrig'):
395: logging.warning("getOrigPages: not in pageinfo!")
396: return None
397:
398: return pageinfo['pageNumberOrig']
399:
400: # WTF: is this needed?
401: def getOrigPagesNorm(self, docinfo=None, pageinfo=None):
402: logging.debug("CALLED: getOrigPagesNorm!")
403: if not pageinfo.has_key('pageNumberOrigNorm'):
404: logging.warning("getOrigPagesNorm: not in pageinfo!")
405: return None
406:
407: return pageinfo['pageNumberOrigNorm']
408:
409: # TODO: should be getWordInfo
410: def getTranslate(self, word=None, language=None):
411: """translate into another languages"""
412: data = self.getServerData("lt/wordInfo.xql","language="+str(language)+"&word="+urllib.quote(word)+"&output=html")
413: return data
414:
415: # WTF: what does this do?
416: def getLemma(self, lemma=None, language=None):
417: """simular words lemma """
418: data = self.getServerData("lt/lemma.xql","language="+str(language)+"&lemma="+urllib.quote(lemma)+"&output=html")
419: return data
420:
421: # WTF: what does this do?
422: def getLemmaQuery(self, query=None, language=None):
423: """simular words lemma """
424: data = self.getServerData("lt/lemma.xql","language="+str(language)+"&query="+urllib.quote(query)+"&output=html")
425: return data
426:
427: # WTF: what does this do?
428: def getLex(self, query=None, language=None):
429: #simular words lemma
430: data = self.getServerData("lt/lex.xql","document=&language="+str(language)+"&query="+urllib.quote(query))
431: return data
432:
433: # WTF: what does this do?
434: def getQuery (self, docinfo=None, pageinfo=None, query=None, queryType=None, pn=1):
435: #number of
436: docpath = docinfo['textURLPath']
437: pagesize = pageinfo['queryPageSize']
438: pn = pageinfo['searchPN']
439: query =pageinfo['query']
440: queryType =pageinfo['queryType']
441: tocSearch = 0
442: tocDiv = None
443:
444: pagexml = self.getServerData("doc-query.xql","document=%s&mode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s"%(docpath, 'text', queryType, urllib.quote(query), pagesize, pn))
445: pagedom = Parse(pagexml)
446: numdivs = pagedom.xpath("//div[@class='queryResultHits']")
447: tocSearch = int(getTextFromNode(numdivs[0]))
448: tc=int((tocSearch/10)+1)
449: return tc
450:
451: def getToc(self, mode="text", docinfo=None):
452: """loads table of contents and stores XML in docinfo"""
453: logging.debug("getToc mode=%s"%mode)
454: if mode == "none":
455: return docinfo
456:
457: if 'tocSize_%s'%mode in docinfo:
458: # cached toc
459: return docinfo
460:
461: docpath = docinfo['textURLPath']
462: # we need to set a result set size
463: pagesize = 1000
464: pn = 1
465: if mode == "text":
466: queryType = "toc"
467: else:
468: queryType = mode
469: # number of entries in toc
470: tocSize = 0
471: tocDiv = None
472: # fetch full toc
473: pagexml = self.getServerData("doc-query.xql","document=%s&queryType=%s&queryResultPageSize=%s&queryResultPN=%s"%(docpath,queryType, pagesize, pn))
474: dom = ET.fromstring(pagexml)
475: # page content is in <div class="queryResultPage">
476: pagediv = None
477: # ElementTree 1.2 in Python 2.6 can't do div[@class='queryResultPage']
478: alldivs = dom.findall("div")
479: for div in alldivs:
480: dc = div.get('class')
481: # page content div
482: if dc == 'queryResultPage':
483: pagediv = div
484:
485: elif dc == 'queryResultHits':
486: docinfo['tocSize_%s'%mode] = getInt(div.text)
487:
488: if pagediv:
489: # store XML in docinfo
490: docinfo['tocXML_%s'%mode] = ET.tostring(pagediv, 'UTF-8')
491:
492: return docinfo
493:
494: def getTocPage(self, mode="text", pn=1, pageinfo=None, docinfo=None):
495: """returns single page from the table of contents"""
496: logging.debug("getTocPage mode=%s, pn=%s"%(mode,pn))
497: if mode == "text":
498: queryType = "toc"
499: else:
500: queryType = mode
501:
502: # check for cached TOC
503: if not docinfo.has_key('tocXML_%s'%mode):
504: self.getToc(mode=mode, docinfo=docinfo)
505:
506: tocxml = docinfo.get('tocXML_%s'%mode, None)
507: if not tocxml:
508: logging.error("getTocPage: unable to find tocXML")
509: return "No ToC"
510:
511: pagesize = int(pageinfo['tocPageSize'])
512: url = docinfo['url']
513: urlmode = docinfo['mode']
514: selfurl = docinfo['viewerUrl']
515: viewMode= pageinfo['viewMode']
516: tocMode = pageinfo['tocMode']
517: tocPN = int(pageinfo['tocPN'])
518: pn = tocPN
519:
520: fulltoc = ET.fromstring(tocxml)
521:
522: if fulltoc:
523: # paginate
524: start = (pn - 1) * pagesize * 2
525: len = pagesize * 2
526: del fulltoc[:start]
527: del fulltoc[len:]
528: tocdivs = fulltoc
529:
530: # check all a-tags
531: links = tocdivs.findall(".//a")
532: for l in links:
533: href = l.get('href')
534: if href:
535: # take pn from href
536: m = re.match(r'page-fragment\.xql.*pn=(\d+)', href)
537: if m is not None:
538: # and create new url
539: l.set('href', '%s?mode=%s&url=%s&viewMode=%s&pn=%s&tocMode=%s&tocPN=%s'%(selfurl, urlmode, url, viewMode, m.group(1), tocMode, tocPN))
540: else:
541: logging.warning("getTocPage: Problem with link=%s"%href)
542:
543: return serialize(tocdivs)
544:
545:
546: def manage_changeMpdlXmlTextServer(self,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/",timeout=40,RESPONSE=None):
547: """change settings"""
548: self.title=title
549: self.timeout = timeout
550: self.serverUrl = serverUrl
551: if RESPONSE is not None:
552: RESPONSE.redirect('manage_main')
553:
# management methods
def manage_addMpdlXmlTextServerForm(self):
    """Form for adding"""
    # render the ZPT add-form acquisition-wrapped in the current context
    template = PageTemplateFile("zpt/manage_addMpdlXmlTextServer", globals())
    return template.__of__(self)()
559:
def manage_addMpdlXmlTextServer(self, id, title="", serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/", timeout=40, RESPONSE=None):
    """add zogiimage"""
    # FIX: pass timeout by keyword - passing it as the fourth positional
    # argument handed it to the serverName parameter of
    # MpdlXmlTextServer.__init__, which then built a bogus server URL
    # like "http://40/mpdl/interface/".
    newObj = MpdlXmlTextServer(id, title, serverUrl, timeout=timeout)
    self.Destination()._setObject(id, newObj)
    if RESPONSE is not None:
        RESPONSE.redirect('manage_main')
567:
568:
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>