comparison MpdlXmlTextServer.py @ 453:beb7ccb92564 elementtree

first version using elementtree instead of 4suite xml
author casties
date Thu, 14 Jul 2011 19:43:56 +0200
parents 1cea48640992
children 0a53fea83df7
comparison
equal deleted inserted replaced
408:4e84c53e49b3 453:beb7ccb92564
1 1
2 from OFS.SimpleItem import SimpleItem 2 from OFS.SimpleItem import SimpleItem
3 from Products.PageTemplates.PageTemplateFile import PageTemplateFile 3 from Products.PageTemplates.PageTemplateFile import PageTemplateFile
4 from Ft.Xml import EMPTY_NAMESPACE, Parse 4 from Ft.Xml import EMPTY_NAMESPACE, Parse
5 5 from Ft.Xml.Domlette import NonvalidatingReader
6 import Ft.Xml.Domlette
7 import cStringIO
8
9 import xml.etree.ElementTree as ET
10
11 import md5
6 import sys 12 import sys
7 import logging 13 import logging
8 import urllib 14 import urllib
9 import documentViewer 15 import documentViewer
10 from documentViewer import getTextFromNode, serializeNode 16 #from documentViewer import getTextFromNode, serializeNode
17
18 def getText(node):
19 """get the cdata content of a node"""
20 if node is None:
21 return ""
22 # ET:
23 text = node.text or ""
24 for e in node:
25 text += gettext(e)
26 if e.tail:
27 text += e.tail
28
29 return text
30
31 def serialize(node):
32 """returns a string containing an XML snippet of node"""
33 s = ET.tostring(node, 'UTF-8')
34 # snip off XML declaration
35 if s.startswith('<?xml'):
36 i = s.find('?>')
37 return s[i+3:]
38
39 return s
40
41
42 def getTextFromNode(node):
43 """get the cdata content of a node"""
44 if node is None:
45 return ""
46 # ET:
47 #text = node.text or ""
48 #for e in node:
49 # text += gettext(e)
50 # if e.tail:
51 # text += e.tail
52
53 # 4Suite:
54 nodelist=node.childNodes
55 text = ""
56 for n in nodelist:
57 if n.nodeType == node.TEXT_NODE:
58 text = text + n.data
59
60 return text
61
62 def serializeNode(node, encoding="utf-8"):
63 """returns a string containing node as XML"""
64 #s = ET.tostring(node)
65
66 # 4Suite:
67 stream = cStringIO.StringIO()
68 Ft.Xml.Domlette.Print(node, stream=stream, encoding=encoding)
69 s = stream.getvalue()
70 stream.close()
71
72 return s
73
11 74
12 class MpdlXmlTextServer(SimpleItem): 75 class MpdlXmlTextServer(SimpleItem):
13 """TextServer implementation for MPDL-XML eXist server""" 76 """TextServer implementation for MPDL-XML eXist server"""
14 meta_type="MPDL-XML TextServer" 77 meta_type="MPDL-XML TextServer"
15 78
17 {'label':'Config','action':'manage_changeMpdlXmlTextServerForm'}, 80 {'label':'Config','action':'manage_changeMpdlXmlTextServerForm'},
18 )+SimpleItem.manage_options 81 )+SimpleItem.manage_options
19 82
20 manage_changeMpdlXmlTextServerForm = PageTemplateFile("zpt/manage_changeMpdlXmlTextServer", globals()) 83 manage_changeMpdlXmlTextServerForm = PageTemplateFile("zpt/manage_changeMpdlXmlTextServer", globals())
21 84
22 def __init__(self,id,title="",serverUrl="http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/", serverName=None, timeout=40): 85 def __init__(self,id,title="",serverUrl="http://mpdl-system.mpiwg-berlin.mpg.de/mpdl/interface/", serverName=None, timeout=40):
86 #def __init__(self,id,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de:30030/mpdl/interface/", serverName=None, timeout=40):
87
23 """constructor""" 88 """constructor"""
24 self.id=id 89 self.id=id
25 self.title=title 90 self.title=title
26 self.timeout = timeout 91 self.timeout = timeout
27 if serverName is None: 92 if serverName is None:
36 def getServerData(self, method, data=None): 101 def getServerData(self, method, data=None):
37 """returns result from text server for method+data""" 102 """returns result from text server for method+data"""
38 url = self.serverUrl+method 103 url = self.serverUrl+method
39 return documentViewer.getHttpData(url,data,timeout=self.timeout) 104 return documentViewer.getHttpData(url,data,timeout=self.timeout)
40 105
41 def getSearch(self, pn=1, pageinfo=None, docinfo=None, query=None, queryType=None, lemma=None, characterNormalization=None, optionToggle=None): 106 def getSearch(self, pageinfo=None, docinfo=None):
42 """get search list""" 107 """get search list"""
43 docpath = docinfo['textURLPath'] 108 docpath = docinfo['textURLPath']
44 url = docinfo['url'] 109 url = docinfo['url']
45 pagesize = pageinfo['queryPageSize'] 110 pagesize = pageinfo['queryPageSize']
46 pn = pageinfo['searchPN'] 111 pn = pageinfo.get('searchPN',1)
47 sn = pageinfo['sn'] 112 sn = pageinfo['sn']
48 highlightQuery = pageinfo['highlightQuery'] 113 highlightQuery = pageinfo['highlightQuery']
49 query =pageinfo['query'] 114 query =pageinfo['query']
50 queryType =pageinfo['queryType'] 115 queryType =pageinfo['queryType']
51 viewMode= pageinfo['viewMode'] 116 viewMode= pageinfo['viewMode']
52 tocMode = pageinfo['tocMode'] 117 tocMode = pageinfo['tocMode']
53 characterNormalization = pageinfo['characterNormalization'] 118 characterNormalization = pageinfo['characterNormalization']
54 optionToggle = pageinfo['optionToggle'] 119 #optionToggle = pageinfo['optionToggle']
55 tocPN = pageinfo['tocPN'] 120 tocPN = pageinfo['tocPN']
56 selfurl = self.absolute_url() 121 selfurl = self.absolute_url()
57 122 data = self.getServerData("doc-query.xql","document=%s&mode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s&sn=%s&viewMode=%s&characterNormalization=%s&highlightQuery=%s"%(docpath, 'text', queryType, urllib.quote(query), pagesize, pn, sn, viewMode,characterNormalization, urllib.quote(highlightQuery)))
58 data = self.getServerData("doc-query.xql","document=%s&mode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s&sn=%s&viewMode=%s&characterNormalization=%s&optionToggle=%s&highlightQuery=%s"%(docpath, 'text', queryType, urllib.quote(query), pagesize, pn, sn, viewMode,characterNormalization,optionToggle ,urllib.quote(highlightQuery)))
59 #page=self.template.fulltextclient.eval("/mpdl/interface/doc-query.xql","document=%s&mode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s&sn=%s&viewMode=%s&highlightQuery=%s"%(docpath, 'text', queryType, query, pagesize, pn, sn, viewMode,highlightQuery) ,outputUnicode=False)
60
61 pagexml = data.replace('?document=%s'%str(docpath),'?url=%s'%url) 123 pagexml = data.replace('?document=%s'%str(docpath),'?url=%s'%url)
62 pagedom = Parse(pagexml) 124 pagedom = Parse(pagexml)
125
126 """
127 pagedivs = pagedom.xpath("//div[@class='queryResultHits']")
128 if (pagedivs == pagedom.xpath("//div[@class='queryResultHits']")):
129 if len(pagedivs)>0:
130 docinfo['queryResultHits'] = int(getTextFromNode(pagedivs[0]))
131 s = getTextFromNode(pagedivs[0])
132 s1 = int(s)/10+1
133 try:
134 docinfo['queryResultHits'] = int(s1)
135 logging.debug("SEARCH ENTRIES: %s"%(s1))
136 except:
137 docinfo['queryResultHits'] = 0
138 """
63 if (queryType=="fulltext")or(queryType=="xpath")or(queryType=="xquery")or(queryType=="fulltextMorphLemma"): 139 if (queryType=="fulltext")or(queryType=="xpath")or(queryType=="xquery")or(queryType=="fulltextMorphLemma"):
64 pagedivs = pagedom.xpath("//div[@class='queryResultPage']") 140 pagedivs = pagedom.xpath("//div[@class='queryResultPage']")
65 if len(pagedivs)>0: 141 if len(pagedivs)>0:
66 pagenode=pagedivs[0] 142 pagenode=pagedivs[0]
67 links=pagenode.xpath("//a") 143 links=pagenode.xpath("//a")
69 hrefNode = l.getAttributeNodeNS(None, u"href") 145 hrefNode = l.getAttributeNodeNS(None, u"href")
70 if hrefNode: 146 if hrefNode:
71 href = hrefNode.nodeValue 147 href = hrefNode.nodeValue
72 if href.startswith('page-fragment.xql'): 148 if href.startswith('page-fragment.xql'):
73 selfurl = self.absolute_url() 149 selfurl = self.absolute_url()
74 pagexml=href.replace('mode=text','mode=texttool&viewMode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s&tocMode=%s&searchPN=%s&tocPN=%s&optionToggle=%s&characterNormalization=%s'%(viewMode,queryType,urllib.quote(query),pagesize,pn,tocMode,pn,tocPN,optionToggle,characterNormalization)) 150 pagexml=href.replace('mode=text','mode=texttool&viewMode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s&tocMode=%s&searchPN=%s&tocPN=%s&characterNormalization=%s'%(viewMode,queryType,urllib.quote(query),pagesize,pn,tocMode,pn,tocPN, characterNormalization))
75 hrefNode.nodeValue = pagexml.replace('page-fragment.xql','%s'%selfurl) 151 hrefNode.nodeValue = pagexml.replace('page-fragment.xql','%s'%selfurl)
152 #logging.debug("PUREXML :%s"%(serializeNode(pagenode)))
76 return serializeNode(pagenode) 153 return serializeNode(pagenode)
77 if (queryType=="fulltextMorph"): 154 if (queryType=="fulltextMorph"):
78 pagedivs = pagedom.xpath("//div[@class='queryResult']") 155 pagedivs = pagedom.xpath("//div[@class='queryResult']")
79 if len(pagedivs)>0: 156 if len(pagedivs)>0:
80 pagenode=pagedivs[0] 157 pagenode=pagedivs[0]
83 hrefNode = l.getAttributeNodeNS(None, u"href") 160 hrefNode = l.getAttributeNodeNS(None, u"href")
84 if hrefNode: 161 if hrefNode:
85 href = hrefNode.nodeValue 162 href = hrefNode.nodeValue
86 if href.startswith('page-fragment.xql'): 163 if href.startswith('page-fragment.xql'):
87 selfurl = self.absolute_url() 164 selfurl = self.absolute_url()
88 pagexml=href.replace('mode=text','mode=texttool&viewMode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s&tocMode=%s&searchPN=%s&tocPN=%s&optionToggle=%s&characterNormalization=%s'%(viewMode,queryType,urllib.quote(query),pagesize,pn,tocMode,pn,tocPN,optionToggle,characterNormalization)) 165 pagexml=href.replace('mode=text','mode=texttool&viewMode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s&tocMode=%s&searchPN=%s&tocPN=%s&characterNormalization=%s'%(viewMode,queryType,urllib.quote(query),pagesize,pn,tocMode,pn,tocPN,characterNormalization))
89 hrefNode.nodeValue = pagexml.replace('page-fragment.xql','%s'%selfurl) 166 hrefNode.nodeValue = pagexml.replace('page-fragment.xql','%s'%selfurl)
90 if href.startswith('../lt/lemma.xql'): 167 if href.startswith('../lt/lemma.xql'):
91 hrefNode.nodeValue = href.replace('../lt/lemma.xql','%s/template/head_main_lemma_New'%(selfurl)) 168 hrefNode.nodeValue = href.replace('../lt/lemma.xql','%s/template/head_main_query'%(selfurl))
92 l.setAttributeNS(None, 'target', '_blank') 169 l.setAttributeNS(None, 'target', '_blank')
93 l.setAttributeNS(None, 'onClick',"popupWin = window.open(this.href, 'contacts', 'location,width=500,height=600,top=180, left=400, scrollbars=1'); return false;") 170 l.setAttributeNS(None, 'onClick',"popupWin = window.open(this.href, 'contacts', 'location,width=500,height=600,top=180, left=400, scrollbars=1'); return false;")
94 l.setAttributeNS(None, 'onClick', 'popupWin.focus();') 171 l.setAttributeNS(None, 'onDblclick', 'popupWin.focus();')
95 pagedivs = pagedom.xpath("//div[@class='queryResultMorphExpansion']") 172 pagedivs = pagedom.xpath("//div[@class='queryResultMorphExpansion']")
96 return serializeNode(pagenode) 173 return serializeNode(pagenode)
97 if (queryType=="ftIndex")or(queryType=="ftIndexMorph"): 174 if (queryType=="ftIndex")or(queryType=="ftIndexMorph"):
98 pagedivs= pagedom.xpath("//div[@class='queryResultPage']") 175 pagedivs= pagedom.xpath("//div[@class='queryResultPage']")
99 if len(pagedivs)>0: 176 if len(pagedivs)>0:
101 links=pagenode.xpath("//a") 178 links=pagenode.xpath("//a")
102 for l in links: 179 for l in links:
103 hrefNode = l.getAttributeNodeNS(None, u"href") 180 hrefNode = l.getAttributeNodeNS(None, u"href")
104 if hrefNode: 181 if hrefNode:
105 href = hrefNode.nodeValue 182 href = hrefNode.nodeValue
106 hrefNode.nodeValue=href.replace('mode=text','mode=texttool&viewMode=%s&tocMode=%s&tocPN=%s&pn=%s&optionToggle=%s&characterNormalization=%s'%(viewMode,tocMode,tocPN,pn,optionToggle,characterNormalization)) 183 hrefNode.nodeValue=href.replace('mode=text','mode=texttool&viewMode=%s&tocMode=%s&tocPN=%s&pn=%s&characterNormalization=%s'%(viewMode,tocMode,tocPN,pn,characterNormalization))
107 if href.startswith('../lt/lex.xql'): 184 if href.startswith('../lt/lex.xql'):
108 hrefNode.nodeValue = href.replace('../lt/lex.xql','%s/template/head_main_voc'%selfurl) 185 hrefNode.nodeValue = href.replace('../lt/lex.xql','%s/template/head_main_lex'%selfurl)
109 l.setAttributeNS(None, 'target', '_blank') 186 l.setAttributeNS(None, 'target', '_blank')
110 l.setAttributeNS(None, 'onClick',"popupWin = window.open(this.href, 'contacts', 'location,width=500,height=600,top=180, left=400, scrollbars=1'); return false;") 187 l.setAttributeNS(None, 'onClick',"popupWin = window.open(this.href, 'contacts', 'location,width=500,height=600,top=180, left=400, scrollbars=1'); return false;")
111 l.setAttributeNS(None, 'onClick', 'popupWin.focus();') 188 l.setAttributeNS(None, 'onDblclick', 'popupWin.focus();')
112 if href.startswith('../lt/lemma.xql'): 189 if href.startswith('../lt/lemma.xql'):
113 hrefNode.nodeValue = href.replace('../lt/lemma.xql','%s/template/head_main_lemma'%selfurl) 190 hrefNode.nodeValue = href.replace('../lt/lemma.xql','%s/template/head_main_lemma'%(selfurl))
114 l.setAttributeNS(None, 'target', '_blank') 191 l.setAttributeNS(None, 'target', '_blank')
115 l.setAttributeNS(None, 'onClick',"popupWin = window.open(this.href, 'contacts', 'location,width=500,height=600,top=180, left=400, scrollbars=1'); return false;") 192 l.setAttributeNS(None, 'onClick',"popupWin = window.open(this.href, 'contacts', 'location,width=500,height=600,top=180, left=400, scrollbars=1'); return false;")
116 l.setAttributeNS(None, 'onClick', 'popupWin.focus();') 193 l.setAttributeNS(None, 'onDblclick', 'popupWin.focus();')
117 return serializeNode(pagenode) 194 return serializeNode(pagenode)
118 return "no text here" 195 return "no text here"
119 196
120 """def getNumPages(self, docinfo):
121 ""get list of pages from fulltext and put in docinfo""
122 if 'numPages' in docinfo:
123 # already there
124 return docinfo
125 xquery = '//pb'
126 text = self.getServerData("xquery.xql","document=%s&xquery=%s"%(docinfo['textURLPath'],xquery))
127 docinfo['numPages'] = text.count("<pb ")
128 return docinfo
129 """
130 def getNumTextPages (self, docinfo):
131 """get list of pages from fulltext (texts without images) and put in docinfo"""
132 if 'numPages' in docinfo:
133 # allredy there
134 return docinfo
135 xpath ='/count(//pb)'
136 text=self.getServerData("xpath.xql", "document=%s&xpath=%s"%(docinfo['textURLPath'], xpath))
137 dom = Parse(text)
138 result= dom.xpath("//result/resultPage")
139
140 docinfo['numPages']=int(getTextFromNode(result[0]))
141 return docinfo
142
143 def getGisPlaces(self, docinfo=None, pageinfo=None): 197 def getGisPlaces(self, docinfo=None, pageinfo=None):
144 """ Show all Gis Places of whole Page""" 198 """ Show all Gis Places of whole Page"""
145 xpath='//place' 199 xpath='//place'
146 docpath = docinfo['textURLPath'] 200 docpath = docinfo.get('textURLPath',None)
201 if not docpath:
202 return None
203
147 url = docinfo['url'] 204 url = docinfo['url']
148 selfurl = self.absolute_url() 205 selfurl = self.absolute_url()
149 pn = pageinfo['current'] 206 pn = pageinfo['current']
150 hrefList=[] 207 hrefList=[]
151 myList= "" 208 myList= ""
155 for l in result: 212 for l in result:
156 hrefNode= l.getAttributeNodeNS(None, u"id") 213 hrefNode= l.getAttributeNodeNS(None, u"id")
157 href= hrefNode.nodeValue 214 href= hrefNode.nodeValue
158 hrefList.append(href) 215 hrefList.append(href)
159 myList = ",".join(hrefList) 216 myList = ",".join(hrefList)
160 logging.debug("getGisPlaces :%s"%(myList)) 217 #logging.debug("getGisPlaces :%s"%(myList))
161 return myList 218 return myList
162 219
163 def getAllGisPlaces (self, docinfo=None, pageinfo=None): 220 def getAllGisPlaces (self, docinfo=None, pageinfo=None):
164 """Show all Gis Places of whole Book """ 221 """Show all Gis Places of whole Book """
165 xpath ='//echo:place' 222 xpath ='//echo:place'
176 for l in result: 233 for l in result:
177 hrefNode = l.getAttributeNodeNS(None, u"id") 234 hrefNode = l.getAttributeNodeNS(None, u"id")
178 href= hrefNode.nodeValue 235 href= hrefNode.nodeValue
179 hrefList.append(href) 236 hrefList.append(href)
180 myList = ",".join(hrefList) 237 myList = ",".join(hrefList)
181 logging.debug("getALLGisPlaces :%s"%(myList)) 238 #logging.debug("getALLGisPlaces :%s"%(myList))
182 return myList 239 return myList
183 240
184
185 def getPDF (self, docinfo=None, pageinfo=None):
186 """Show and Save different Pages as PDF in Options"""
187 selfurl=self.absolute_url()
188 pn=pageinfo['current']
189
190 viewMode =pageinfo['viewMode']
191
192 #text = ("page-fragment.xql","document=%s&mode=%s&pn=%s&export=%s"%(docinfo['textURLPath'], 'text', pn,'pdf'))
193 #text = self.getServerData("page-fragment.xql", "document=%s&mode=%s&pn=%s&export=%s"(docinfo['textURLPath'],'text', pn,'pdf'))
194 #logging.debug("text :%s"%(text))
195 #dom =Parse(text)
196 #logging.debug("text :%s"%(text))
197 #return text
198 241
199 def getOrigPages (self, docinfo=None, pageinfo=None): 242 def getTextPage(self, mode="text_dict", pn=1, docinfo=None, pageinfo=None):
200 """Show original page """
201 docpath = docinfo['textURLPath']
202 logging.debug ("docinfo['textURLPath']=%s"%(docinfo['textURLPath']))
203 #url = docinfo['url']
204 selfurl = self.absolute_url()
205 pn =pageinfo['current']
206
207 viewMode= pageinfo['viewMode']
208 text = self.getServerData("page-fragment.xql","document=%s&mode=%s&pn=%s"%(docinfo['textURLPath'], 'text', pn))
209 dom =Parse(text)
210 pagedivs = dom.xpath("//div[@class='pageNumberOrig']")
211 logging.debug("YYYYYYpagedivs :%s"%(pagedivs))
212 if len(pagedivs)>0:
213 originalPage= getTextFromNode(pagedivs[0])
214 #return docinfo['originalPage']
215 return originalPage
216
217
218 def getTextPage(self, mode="text", pn=1, docinfo=None, pageinfo=None, viewMode=None, tocMode=None, tocPN=None, characterNormalization="regPlusNorm", highlightQuery=None, sn=None, optionToggle=None):
219 """returns single page from fulltext""" 243 """returns single page from fulltext"""
220 docpath = docinfo['textURLPath'] 244 docpath = docinfo['textURLPath']
221 path = docinfo['textURLPath'] 245 path = docinfo['textURLPath']
222 url = docinfo['url'] 246 url = docinfo.get('url',None)
223 name = docinfo['name'] 247 name = docinfo.get('name',None)
224 viewMode= pageinfo['viewMode'] 248 pn =pageinfo['current']
225 sn = pageinfo['sn'] 249 sn = pageinfo['sn']
250 #optionToggle =pageinfo ['optionToggle']
226 highlightQuery = pageinfo['highlightQuery'] 251 highlightQuery = pageinfo['highlightQuery']
227 252 #mode = pageinfo ['viewMode']
228 tocMode = pageinfo['tocMode'] 253 tocMode = pageinfo['tocMode']
229 characterNormalization=pageinfo['characterNormalization'] 254 characterNormalization=pageinfo['characterNormalization']
230 tocPN = pageinfo['tocPN'] 255 tocPN = pageinfo['tocPN']
231 selfurl = self.absolute_url() 256 selfurl = self.absolute_url()
232 if mode == "text_dict": 257 if mode == "text_dict":
233 textmode = "textPollux" 258 textmode = "textPollux"
234 else: 259 else:
235 textmode = mode 260 textmode = mode
236 #logging.debug("documentViewer highlightQuery: %s"%(highlightQuery)) 261
237 textParam = "document=%s&mode=%s&pn=%s&characterNormalization=%s"%(docpath,textmode,pn,characterNormalization) 262 textParam = "document=%s&mode=%s&pn=%s&characterNormalization=%s"%(docpath,textmode,pn,characterNormalization)
238 if highlightQuery is not None: 263 if highlightQuery is not None:
239 textParam +="&highlightQuery=%s&sn=%s"%(urllib.quote(highlightQuery),sn) 264 textParam +="&highlightQuery=%s&sn=%s"%(urllib.quote(highlightQuery),sn)
240 #logging.debug("documentViewer highlightQuery: %s"%(highlightQuery)) 265
241 pagexml = self.getServerData("page-fragment.xql",textParam) 266 pagexml = self.getServerData("page-fragment.xql",textParam)
242 logging.debug("documentViewer highlightQuery: %s"%(highlightQuery)) 267 dom = ET.fromstring(pagexml)
243 #pagexml=self.template.fulltextclient.eval("/mpdl/interface/page-fragment.xql", textParam, outputUnicode=False) 268 #dom = NonvalidatingReader.parseStream(pagexml)
244 269
245 pagedom = Parse(pagexml) 270 #original Pages
271 #pagedivs = dom.xpath("//div[@class='pageNumberOrig']")
272
273 """if pagedivs == dom.xpath("//div[@class='pageNumberOrig']"):
274 if len(pagedivs)>0:
275 docinfo['pageNumberOrig']= getTextFromNode(pagedivs[0])
276 logging.debug("ORIGINAL PAGE: %s"%(docinfo['pageNumberOrig']))
277
278 #original Pages Norm
279 pagedivs = dom.xpath("//div[@class='pageNumberOrigNorm']")
280 if pagedivs == dom.xpath("//div[@class='pageNumberOrigNorm']"):
281 if len(pagedivs)>0:
282 docinfo['pageNumberOrigNorm']= getTextFromNode(pagedivs[0])
283 logging.debug("ORIGINAL PAGE NORM: %s"%(docinfo['pageNumberOrigNorm']))
284 """
285 #figureEntries
286 # pagedivs = dom.xpath("//div[@class='countFigureEntries']")
287 # if pagedivs == dom.xpath("//div[@class='countFigureEntries']"):
288 # if len(pagedivs)>0:
289 # docinfo['countFigureEntries'] = getTextFromNode(pagedivs[0])
290 # s = getTextFromNode(pagedivs[0])
291 # if s=='0':
292 # try:
293 # docinfo['countFigureEntries'] = int(s)
294 # except:
295 # docinfo['countFigureEntries'] = 0
296 # else:
297 # s1 = int(s)/30+1
298 # try:
299 # docinfo['countFigureEntries'] = int(s1)
300 # except:
301 # docinfo['countFigureEntries'] = 0
302 #
303 # #allPlaces
304 # pagedivs = dom.xpath("//div[@class='countPlaces']")
305 # if pagedivs == dom.xpath("//div[@class='countPlaces']"):
306 # if len(pagedivs)>0:
307 # docinfo['countPlaces']= getTextFromNode(pagedivs[0])
308 # s = getTextFromNode(pagedivs[0])
309 # try:
310 # docinfo['countPlaces'] = int(s)
311 # except:
312 # docinfo['countPlaces'] = 0
313 #
314 # #tocEntries
315 # pagedivs = dom.xpath("//div[@class='countTocEntries']")
316 # if pagedivs == dom.xpath("//div[@class='countTocEntries']"):
317 # if len(pagedivs)>0:
318 # docinfo['countTocEntries'] = int(getTextFromNode(pagedivs[0]))
319 # s = getTextFromNode(pagedivs[0])
320 # if s=='0':
321 # try:
322 # docinfo['countTocEntries'] = int(s)
323 # except:
324 # docinfo['countTocEntries'] = 0
325 # else:
326 # s1 = int(s)/30+1
327 # try:
328 # docinfo['countTocEntries'] = int(s1)
329 # except:
330 # docinfo['countTocEntries'] = 0
331
332 #numTextPages
333 #pagedivs = dom.xpath("//div[@class='countPages']")
334 alldivs = dom.findall(".//div")
335 pagediv = None
336 for div in alldivs:
337 dc = div.get('class')
338 if dc == 'pageContent':
339 pagediv = div
340
341 if dc == 'countPages':
342 try:
343 np = int(div.text)
344 docinfo['numPages'] = np
345 pageinfo['end'] = min(pageinfo['end'], np)
346 pageinfo['numgroups'] = int(np / pageinfo['groupsize'])
347 if np % pageinfo['groupsize'] > 0:
348 pageinfo['numgroups'] += 1
349
350 except:
351 docinfo['numPages'] = 0
352
353 break
354
355 # ROC: why?
356 # else:
357 # #no full text -- init to 0
358 # docinfo['pageNumberOrig'] = 0
359 # docinfo['countFigureEntries'] = 0
360 # docinfo['countPlaces'] = 0
361 # docinfo['countTocEntries'] = 0
362 # docinfo['numPages'] = 0
363 # docinfo['pageNumberOrigNorm'] = 0
364 # #return docinfo
365
246 # plain text mode 366 # plain text mode
247 if mode == "text": 367 if mode == "text":
248 # first div contains text 368 #pagedivs = dom.xpath("/div")
249 pagedivs = pagedom.xpath("/div") 369 if pagediv:
250 if len(pagedivs) > 0: 370 links = pagediv.findall(".//a")
251 pagenode = pagedivs[0]
252 links = pagenode.xpath("//a")
253 for l in links: 371 for l in links:
254 hrefNode = l.getAttributeNodeNS(None, u"href") 372 href = l.get('href')
255 if hrefNode: 373 if href and href.startswith('#note-'):
256 href= hrefNode.nodeValue 374 href = href.replace('#note-',"?url=%s&viewMode=text&tocMode=%s&tocPN=%s&pn=%s#note-"%(url,tocMode,tocPN,pn))
257 if href.startswith('#note-'): 375 l.set('href', href)
258 hrefNode.nodeValue = href.replace('#note-',"?url=%s&viewMode=%s&tocMode=%s&tocPN=%s&pn=%s#note-"%(url,viewMode,tocMode,tocPN,pn)) 376 logging.debug("page=%s"%ET.tostring(pagediv, 'UTF-8'))
259 return serializeNode(pagenode) 377 return serialize(pagediv)
378
260 if mode == "xml": 379 if mode == "xml":
261 # first div contains text 380 if pagediv:
262 pagedivs = pagedom.xpath("/div") 381 return serialize(pagediv)
263 if len(pagedivs) > 0: 382
264 pagenode = pagedivs[0] 383 if mode == "pureXml":
265 return serializeNode(pagenode) 384 if pagediv:
385 return serialize(pagediv)
386
266 if mode == "gis": 387 if mode == "gis":
267 # first div contains text 388 if pagediv:
268 pagedivs = pagedom.xpath("/div") 389 # check all a-tags
269 if len(pagedivs) > 0: 390 links = pagediv.findall(".//a")
270 pagenode = pagedivs[0] 391 for l in links:
271 links =pagenode.xpath("//a") 392 href = l.get('href')
272 for l in links: 393 if href:
273 hrefNode =l.getAttributeNodeNS(None, u"href") 394 if href.startswith('http://chinagis.mpiwg-berlin.mpg.de'):
274 if hrefNode: 395 l.set('href', href.replace('chinagis_REST/REST/db/chgis/mpdl','chinagis/REST/db/mpdl/%s'%name))
275 href=hrefNode.nodeValue 396 l.set('target', '_blank')
276 if href.startswith('http://chinagis.mpiwg-berlin.mpg.de'): 397
277 hrefNode.nodeValue =href.replace('chinagis_REST/REST/db/chgis/mpdl','chinagis/REST/db/mpdl/%s'%name) 398 return serialize(pagenode)
278 l.setAttributeNS(None, 'target', '_blank')
279 return serializeNode(pagenode)
280 399
281 if mode == "pureXml":
282 # first div contains text
283 pagedivs = pagedom.xpath("/div")
284 if len(pagedivs) > 0:
285 pagenode = pagedivs[0]
286 return serializeNode(pagenode)
287 # text-with-links mode 400 # text-with-links mode
288 if mode == "text_dict": 401 if mode == "text_dict":
289 # first div contains text 402 if pagediv:
290 pagedivs = pagedom.xpath("/div")
291 if len(pagedivs) > 0:
292 pagenode = pagedivs[0]
293 # check all a-tags 403 # check all a-tags
294 links = pagenode.xpath("//a") 404 links = pagediv.findall(".//a")
295 for l in links: 405 for l in links:
296 hrefNode = l.getAttributeNodeNS(None, u"href") 406 href = l.get('href')
297 if hrefNode: 407
408 if href:
298 # is link with href 409 # is link with href
299 href = hrefNode.nodeValue 410 if href.startswith('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/lt/wordInfo.xql'):
300 if href.startswith('lt/lex.xql'):
301 # is pollux link 411 # is pollux link
302 selfurl = self.absolute_url() 412 selfurl = self.absolute_url()
303 # change href 413 # change href
304 hrefNode.nodeValue = href.replace('lt/lex.xql','%s/template/head_main_voc'%selfurl) 414 l.set('href', href.replace('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/lt/wordInfo.xql','%s/head_main_voc'%selfurl))
305 # add target 415 # add target
306 l.setAttributeNS(None, 'target', '_blank') 416 l.set('target', '_blank')
307 l.setAttributeNS(None, 'onClick',"popupWin = window.open(this.href, 'contacts', 'location,width=500,height=600,top=180, left=700, scrollbars=1'); return false;") 417
308 l.setAttributeNS(None, 'onClick', 'popupWin.focus();') 418 if href.startswith('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/lt/lemma.xql'):
309 if href.startswith('lt/lemma.xql'):
310 selfurl = self.absolute_url() 419 selfurl = self.absolute_url()
311 hrefNode.nodeValue = href.replace('lt/lemma.xql','%s/template/head_main_lemma'%selfurl) 420 l.set('href', href.replace('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/lt/lemma.xql','%s/head_main_lemma'%selfurl))
312 l.setAttributeNS(None, 'target', '_blank') 421 l.set('target', '_blank')
313 l.setAttributeNS(None, 'onClick',"popupWin = window.open(this.href, 'contacts', 'location,width=500,height=600,top=180, left=700, scrollbars=1'); return false;") 422 l.set('onclick',"popupWin = window.open(this.href, 'InfoWindow', 'menubar=no, location,width=500,height=600,top=180, left=700, toolbar=no, scrollbars=1'); return false;")
314 l.setAttributeNS(None, 'onClick', 'popupWin.focus();') 423 l.set('ondblclick', 'popupWin.focus();')
424
315 if href.startswith('#note-'): 425 if href.startswith('#note-'):
316 hrefNode.nodeValue = href.replace('#note-',"?url=%s&viewMode=%s&tocMode=%s&tocPN=%s&pn=%s#note-"%(url,viewMode,tocMode,tocPN,pn)) 426 l.set('href', href.replace('#note-',"?url=%s&viewMode=text_dict&tocMode=%s&tocPN=%s&pn=%s#note-"%(url,tocMode,tocPN,pn)))
317 return serializeNode(pagenode) 427
428 return serialize(pagediv)
429
318 return "no text here" 430 return "no text here"
319 431
320 def getTranslate(self, query=None, language=None): 432 def getOrigPages(self, docinfo=None, pageinfo=None):
433 docpath = docinfo['textURLPath']
434 pn =pageinfo['current']
435 selfurl = self.absolute_url()
436 pagexml = self.getServerData("page-fragment.xql","document=%s&pn=%s"%(docpath, pn))
437 dom = Parse(pagexml)
438 pagedivs = dom.xpath("//div[@class='pageNumberOrig']")
439 if pagedivs == dom.xpath("//div[@class='pageNumberOrig']"):
440 if len(pagedivs)>0:
441 docinfo['pageNumberOrig']= getTextFromNode(pagedivs[0])
442 return docinfo['pageNumberOrig']
443
444 def getOrigPagesNorm(self, docinfo=None, pageinfo=None):
445 docpath = docinfo['textURLPath']
446 pn =pageinfo['current']
447 selfurl = self.absolute_url()
448 pagexml = self.getServerData("page-fragment.xql","document=%s&pn=%s"%(docpath, pn))
449 dom = Parse(pagexml)
450 pagedivs = dom.xpath("//div[@class='pageNumberOrigNorm']")
451 if pagedivs == dom.xpath("//div[@class='pageNumberOrigNorm']"):
452 if len(pagedivs)>0:
453 docinfo['pageNumberOrigNorm']= getTextFromNode(pagedivs[0])
454 return docinfo['pageNumberOrigNorm']
455
456
457 def getTranslate(self, word=None, language=None):
321 """translate into another languages""" 458 """translate into another languages"""
322 data = self.getServerData("lt/lex.xql","document=&language="+str(language)+"&query="+urllib.quote(query)) 459 data = self.getServerData("lt/wordInfo.xql","language="+str(language)+"&word="+urllib.quote(word)+"&output=html")
323 #pagexml=self.template.fulltextclient.eval("/mpdl/interface/lt/lex.xql","document=&language="+str(language)+"&query="+url_quote(str(query))) 460 #pagexml=self.template.fulltextclient.eval("/mpdl/interface/lt/lex.xql","document=&language="+str(language)+"&query="+url_quote(str(query)))
324 return data 461 return data
325 462
326 def getLemma(self, lemma=None, language=None): 463 def getLemma(self, lemma=None, language=None):
327 """simular words lemma """ 464 """simular words lemma """
328 data = self.getServerData("lt/lemma.xql","document=&language="+str(language)+"&lemma="+urllib.quote(lemma)) 465 data = self.getServerData("lt/lemma.xql","language="+str(language)+"&lemma="+urllib.quote(lemma)+"&output=html")
329 #pagexml=self.template.fulltextclient.eval("/mpdl/interface/lt/lemma.xql","document=&language="+str(language)+"&lemma="+url_quote(str(lemma)))
330 return data 466 return data
331 467
332 def getLemmaNew(self, query=None, language=None): 468 def getLemmaQuery(self, query=None, language=None):
333 """simular words lemma """ 469 """simular words lemma """
334 data = self.getServerData("lt/lemma.xql","document=&language="+str(language)+"&lemma="+urllib.quote(query)) 470 data = self.getServerData("lt/lemma.xql","language="+str(language)+"&query="+urllib.quote(query)+"&output=html")
335 #pagexml=self.template.fulltextclient.eval("/mpdl/interface/lt/lemma.xql","document=&language="+str(language)+"&lemma="+url_quote(str(query)))
336 return data 471 return data
337 472
338 def getQuery (self, docinfo=None, pageinfo=None, query=None, queryType=None, pn=1, optionToggle=None): 473 def getLex(self, query=None, language=None):
339 """number of""" 474 #simular words lemma
475 data = self.getServerData("lt/lex.xql","document=&language="+str(language)+"&query="+urllib.quote(query))
476 return data
477
478 def getQuery (self, docinfo=None, pageinfo=None, query=None, queryType=None, pn=1):
479 #number of
340 docpath = docinfo['textURLPath'] 480 docpath = docinfo['textURLPath']
341 pagesize = pageinfo['queryPageSize'] 481 pagesize = pageinfo['queryPageSize']
342 pn = pageinfo['searchPN'] 482 pn = pageinfo['searchPN']
343 query =pageinfo['query'] 483 query =pageinfo['query']
344 queryType =pageinfo['queryType'] 484 queryType =pageinfo['queryType']
345 tocSearch = 0 485 tocSearch = 0
346 tocDiv = None 486 tocDiv = None
347 487
348 pagexml = self.getServerData("doc-query.xql","document=%s&mode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s"%(docpath, 'text', queryType, urllib.quote(query), pagesize, pn)) 488 pagexml = self.getServerData("doc-query.xql","document=%s&mode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s"%(docpath, 'text', queryType, urllib.quote(query), pagesize, pn))
349 #pagexml=self.template.fulltextclient.eval("/mpdl/interface/doc-query.xql","document=%s&mode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s"%(docpath, 'text', queryType, query, pagesize, pn) ,outputUnicode=False)
350 pagedom = Parse(pagexml) 489 pagedom = Parse(pagexml)
351 numdivs = pagedom.xpath("//div[@class='queryResultHits']") 490 numdivs = pagedom.xpath("//div[@class='queryResultHits']")
352 tocSearch = int(getTextFromNode(numdivs[0])) 491 tocSearch = int(getTextFromNode(numdivs[0]))
353 logging.debug("documentViewer (gettoc) tocSearch: %s"%(tocSearch))
354 tc=int((tocSearch/10)+1) 492 tc=int((tocSearch/10)+1)
355 logging.debug("documentViewer (gettoc) tc: %s"%(tc))
356 return tc 493 return tc
357 494
358 def getQueryResultHits(self, docinfo=None, pageinfo=None, query=None, queryType=None, pn=1, optionsClose=None):
359
360 """number of hits in Search mode"""
361 docpath = docinfo['textURLPath']
362 pagesize = pageinfo['queryPageSize']
363 pn = pageinfo['searchPN']
364 query =pageinfo['query']
365 queryType =pageinfo['queryType']
366 tocSearch = 0
367 tocDiv = None
368
369 pagexml = self.getServerData("doc-query.xql","document=%s&mode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s"%(docpath, 'text', queryType, urllib.quote(query), pagesize, pn))
370 #pagexml=self.template.fulltextclient.eval("/mpdl/interface/doc-query.xql","document=%s&mode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s"%(docpath, 'text', queryType, query, pagesize, pn) ,outputUnicode=False)
371 pagedom = Parse(pagexml)
372 numdivs = pagedom.xpath("//div[@class='queryResultHits']")
373 tocSearch = int(getTextFromNode(numdivs[0]))
374 tc=int((tocSearch/10)+1)
375 return tc
376
377 def getQueryResultHitsText(self, docinfo=None, pageinfo=None):
378 """number of hits in Text of Contents mode"""
379
380 docpath = docinfo['textURLPath']
381 pagesize = pageinfo['queryPageSize']
382 pn = pageinfo['searchPN']
383 query =pageinfo['query']
384 queryType =pageinfo['queryType']
385 tocSearch = 0
386 tocDiv = None
387 tocMode = pageinfo['tocMode']
388 tocPN = pageinfo['tocPN']
389 pagexml=self.getServerData("doc-query.xql", "document=%s&queryType=%s"%(docpath,'toc'))
390 pagedom = Parse(pagexml)
391 logging.debug("documentViewer (pagedom) pagedom: %s"%(pagedom))
392 numdivs = pagedom.xpath("//div[@class='queryResultHits']")
393 tocSearch = int(getTextFromNode(numdivs[0]))
394 tc=int((tocSearch/30)+1)
395 return tc
396
397 def getQueryResultHitsFigures(self, docinfo=None, pageinfo=None):
398 """number of hits in Text of Figures mode"""
399
400 docpath = docinfo['textURLPath']
401 pagesize = pageinfo['queryPageSize']
402 pn = pageinfo['searchPN']
403 query =pageinfo['query']
404 queryType =pageinfo['queryType']
405 tocSearch = 0
406 tocDiv = None
407 tocMode = pageinfo['tocMode']
408 tocPN = pageinfo['tocPN']
409 pagexml=self.getServerData("doc-query.xql", "document=%s&queryType=%s"%(docpath,'figures'))
410 pagedom = Parse(pagexml)
411 logging.debug("documentViewer (pagedom) pagedom: %s"%(pagedom))
412 numdivs = pagedom.xpath("//div[@class='queryResultHits']")
413 tocSearch = int(getTextFromNode(numdivs[0]))
414 tc=int((tocSearch/30)+1)
415 return tc
416
417
418 def getToc(self, mode="text", docinfo=None): 495 def getToc(self, mode="text", docinfo=None):
419 """loads table of contents and stores in docinfo""" 496 """loads table of contents and stores in docinfo"""
420 logging.debug("documentViewer (gettoc) mode: %s"%(mode))
421 if mode == "none": 497 if mode == "none":
422 return docinfo 498 return docinfo
423 if 'tocSize_%s'%mode in docinfo: 499 if 'tocSize_%s'%mode in docinfo:
424 # cached toc 500 # cached toc
425 return docinfo 501 return docinfo
435 # number of entries in toc 511 # number of entries in toc
436 tocSize = 0 512 tocSize = 0
437 tocDiv = None 513 tocDiv = None
438 514
439 pagexml = self.getServerData("doc-query.xql","document=%s&queryType=%s&queryResultPageSize=%s&queryResultPN=%s"%(docpath,queryType, pagesize, pn)) 515 pagexml = self.getServerData("doc-query.xql","document=%s&queryType=%s&queryResultPageSize=%s&queryResultPN=%s"%(docpath,queryType, pagesize, pn))
440 #pagexml=self.template.fulltextclient.eval("/mpdl/interface/doc-query.xql", "document=%s&queryType=%s&queryResultPageSize=%s&queryResultPN=%s"%(docpath,queryType,pagesize,pn), outputUnicode=False) 516
441 # post-processing downloaded xml 517 # post-processing downloaded xml
442 pagedom = Parse(pagexml) 518 pagedom = Parse(pagexml)
443 # get number of entries 519 # get number of entries
444 numdivs = pagedom.xpath("//div[@class='queryResultHits']") 520 numdivs = pagedom.xpath("//div[@class='queryResultHits']")
445 if len(numdivs) > 0: 521 if len(numdivs) > 0:
460 pn = pageinfo['tocPN'] 536 pn = pageinfo['tocPN']
461 url = docinfo['url'] 537 url = docinfo['url']
462 selfurl = self.absolute_url() 538 selfurl = self.absolute_url()
463 viewMode= pageinfo['viewMode'] 539 viewMode= pageinfo['viewMode']
464 characterNormalization = pageinfo ['characterNormalization'] 540 characterNormalization = pageinfo ['characterNormalization']
465 optionToggle =pageinfo ['optionToggle'] 541 #optionToggle =pageinfo ['optionToggle']
466 tocMode = pageinfo['tocMode'] 542 tocMode = pageinfo['tocMode']
467 tocPN = pageinfo['tocPN'] 543 tocPN = pageinfo['tocPN']
468 544
469 data = self.getServerData("doc-query.xql","document=%s&queryType=%s&queryResultPageSize=%s&queryResultPN=%s&characterNormalization=regPlusNorm&optionToggle=1"%(docpath,queryType, pagesize, pn)) 545 data = self.getServerData("doc-query.xql","document=%s&queryType=%s&queryResultPageSize=%s&queryResultPN=%s&characterNormalization=regPlusNorm"%(docpath,queryType, pagesize, pn))
470 page = data.replace('page-fragment.xql?document=%s'%str(path),'%s?url=%s&viewMode=%s&tocMode=%s&tocPN=%s&optionToggle=1'%(selfurl,url, viewMode, tocMode, tocPN)) 546 page = data.replace('page-fragment.xql?document=%s'%str(path),'%s?url=%s&viewMode=%s&tocMode=%s&tocPN=%s'%(selfurl,url, viewMode, tocMode, tocPN))
471 text = page.replace('mode=image','mode=texttool') 547 text = page.replace('mode=image','mode=texttool')
472 logging.debug("documentViewer (characterNormalization) characterNormalization: %s"%(characterNormalization))
473 #logging.debug("documentViewer (characterNormalization) text: %s"%(text))
474 return text 548 return text
475 549
476 def manage_changeMpdlXmlTextServer(self,title="",serverUrl="http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/",timeout=40,RESPONSE=None): 550 def manage_changeMpdlXmlTextServer(self,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/",timeout=40,RESPONSE=None):
551 #def manage_changeMpdlXmlTextServer(self,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de:30030/mpdl/interface/",timeout=40,RESPONSE=None):
477 """change settings""" 552 """change settings"""
478 self.title=title 553 self.title=title
479 self.timeout = timeout 554 self.timeout = timeout
480 self.serverUrl = serverUrl 555 self.serverUrl = serverUrl
481 if RESPONSE is not None: 556 if RESPONSE is not None:
485 def manage_addMpdlXmlTextServerForm(self): 560 def manage_addMpdlXmlTextServerForm(self):
486 """Form for adding""" 561 """Form for adding"""
487 pt = PageTemplateFile("zpt/manage_addMpdlXmlTextServer", globals()).__of__(self) 562 pt = PageTemplateFile("zpt/manage_addMpdlXmlTextServer", globals()).__of__(self)
488 return pt() 563 return pt()
489 564
490 def manage_addMpdlXmlTextServer(self,id,title="",serverUrl="http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/",timeout=40,RESPONSE=None): 565 def manage_addMpdlXmlTextServer(self,id,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/",timeout=40,RESPONSE=None):
566 #def manage_addMpdlXmlTextServer(self,id,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de:30030/mpdl/interface/",timeout=40,RESPONSE=None):
491 """add MpdlXmlTextServer""" 567 """add MpdlXmlTextServer"""
492 newObj = MpdlXmlTextServer(id,title,serverUrl,timeout) 568 newObj = MpdlXmlTextServer(id,title,serverUrl,timeout)
493 self.Destination()._setObject(id, newObj) 569 self.Destination()._setObject(id, newObj)
494 if RESPONSE is not None: 570 if RESPONSE is not None:
495 RESPONSE.redirect('manage_main') 571 RESPONSE.redirect('manage_main')
496
497
498