comparison MpdlXmlTextServer.py @ 513:67095296c95a

Merge from elementtree branch 92a6443a6f16ff25674d43814ec0d6c0a43a5e1a
author casties
date Tue, 28 Feb 2012 19:10:08 +0100
parents 91daab0c219b 551ca1641a5e
children 7d7b639d7be7
comparison
equal deleted inserted replaced
497:73fb73577961 513:67095296c95a
1
2 from OFS.SimpleItem import SimpleItem 1 from OFS.SimpleItem import SimpleItem
3 from Products.PageTemplates.PageTemplateFile import PageTemplateFile 2 from Products.PageTemplates.PageTemplateFile import PageTemplateFile
4 from Ft.Xml import EMPTY_NAMESPACE, Parse 3
5 from Ft.Xml.Domlette import NonvalidatingReader 4 import xml.etree.ElementTree as ET
6 5
7 import md5 6 import re
8 import sys
9 import logging 7 import logging
10 import urllib 8 import urllib
11 import documentViewer 9 import urlparse
12 from documentViewer import getTextFromNode, serializeNode 10 import base64
11
12 from SrvTxtUtils import getInt, getText, getHttpData
13
14 def serialize(node):
15 """returns a string containing an XML snippet of node"""
16 s = ET.tostring(node, 'UTF-8')
17 # snip off XML declaration
18 if s.startswith('<?xml'):
19 i = s.find('?>')
20 return s[i+3:]
21
22 return s
23
13 24
14 class MpdlXmlTextServer(SimpleItem): 25 class MpdlXmlTextServer(SimpleItem):
15 """TextServer implementation for MPDL-XML eXist server""" 26 """TextServer implementation for MPDL-XML eXist server"""
16 meta_type="MPDL-XML TextServer" 27 meta_type="MPDL-XML TextServer"
17 28
19 {'label':'Config','action':'manage_changeMpdlXmlTextServerForm'}, 30 {'label':'Config','action':'manage_changeMpdlXmlTextServerForm'},
20 )+SimpleItem.manage_options 31 )+SimpleItem.manage_options
21 32
22 manage_changeMpdlXmlTextServerForm = PageTemplateFile("zpt/manage_changeMpdlXmlTextServer", globals()) 33 manage_changeMpdlXmlTextServerForm = PageTemplateFile("zpt/manage_changeMpdlXmlTextServer", globals())
23 34
24 def __init__(self,id,title="",serverUrl="http://mpdl-test.mpiwg-berlin.mpg.de/mpdl/interface/", serverName=None, timeout=40): 35 def __init__(self,id,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/", serverName=None, timeout=40):
25 #def __init__(self,id,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de:30030/mpdl/interface/", serverName=None, timeout=40):
26
27 """constructor""" 36 """constructor"""
28 self.id=id 37 self.id=id
29 self.title=title 38 self.title=title
30 self.timeout = timeout 39 self.timeout = timeout
31 if serverName is None: 40 if serverName is None:
33 else: 42 else:
34 self.serverUrl = "http://%s/mpdl/interface/"%serverName 43 self.serverUrl = "http://%s/mpdl/interface/"%serverName
35 44
36 def getHttpData(self, url, data=None): 45 def getHttpData(self, url, data=None):
37 """returns result from url+data HTTP request""" 46 """returns result from url+data HTTP request"""
38 return documentViewer.getHttpData(url,data,timeout=self.timeout) 47 return getHttpData(url,data,timeout=self.timeout)
39 48
40 def getServerData(self, method, data=None): 49 def getServerData(self, method, data=None):
41 """returns result from text server for method+data""" 50 """returns result from text server for method+data"""
42 url = self.serverUrl+method 51 url = self.serverUrl+method
43 return documentViewer.getHttpData(url,data,timeout=self.timeout) 52 return getHttpData(url,data,timeout=self.timeout)
44 53
45 def getSearch(self, pageinfo=None, docinfo=None): 54
46 """get search list""" 55 def getPlacesOnPage(self, docinfo=None, pn=None):
47 docpath = docinfo['textURLPath'] 56 """Returns list of GIS places of page pn"""
48 url = docinfo['url']
49 pagesize = pageinfo['queryPageSize']
50 pn = pageinfo.get('searchPN',1)
51 #sn = pageinfo['sn']
52 s = pageinfo['s']
53 highlightElementPos =pageinfo ['highlightElementPos']
54 highlightElement = pageinfo ['highlightElement']
55
56 highlightQuery = pageinfo['highlightQuery']
57 query =pageinfo['query']
58 queryType =pageinfo['queryType']
59 viewMode= pageinfo['viewMode']
60 tocMode = pageinfo['tocMode']
61 characterNormalization = pageinfo['characterNormalization']
62 #optionToggle = pageinfo['optionToggle']
63 tocPN = pageinfo['tocPN']
64 selfurl = self.absolute_url()
65 data = self.getServerData("doc-query.xql","document=%s&mode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s&s=%s&viewMode=%s&characterNormalization=%s&highlightElementPos=%s&highlightElement=%s&highlightQuery=%s"%(docpath, 'text', queryType, urllib.quote(query), pagesize, pn, s, viewMode,characterNormalization, highlightElementPos, highlightElement, urllib.quote(highlightQuery)))
66 #data = self.getServerData("doc-query.xql","document=%s&mode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s&sn=%s&viewMode=%s&characterNormalization=%s&highlightQuery=%s"%(docpath, 'text', queryType, urllib.quote(query), pagesize, pn, sn, viewMode,characterNormalization, urllib.quote(highlightQuery)))
67 pagexml = data.replace('?document=%s'%str(docpath),'?url=%s'%url)
68 pagedom = Parse(pagexml)
69
70 """
71 pagedivs = pagedom.xpath("//div[@class='queryResultHits']")
72 if (pagedivs == pagedom.xpath("//div[@class='queryResultHits']")):
73 if len(pagedivs)>0:
74 docinfo['queryResultHits'] = int(getTextFromNode(pagedivs[0]))
75 s = getTextFromNode(pagedivs[0])
76 s1 = int(s)/10+1
77 try:
78 docinfo['queryResultHits'] = int(s1)
79 logging.debug("SEARCH ENTRIES: %s"%(s1))
80 except:
81 docinfo['queryResultHits'] = 0
82 """
83 if (queryType=="fulltext")or(queryType=="xpath")or(queryType=="xquery")or(queryType=="fulltextMorphLemma"):
84 pagedivs = pagedom.xpath("//div[@class='queryResultPage']")
85 if len(pagedivs)>0:
86 pagenode=pagedivs[0]
87 links=pagenode.xpath("//a")
88 for l in links:
89 hrefNode = l.getAttributeNodeNS(None, u"href")
90 if hrefNode:
91 href = hrefNode.nodeValue
92 if href.startswith('page-fragment.xql'):
93 selfurl = self.absolute_url()
94 pagexml=href.replace('mode=text','mode=texttool&viewMode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s&tocMode=%s&searchPN=%s&tocPN=%s&characterNormalization=%s'%(viewMode,queryType,urllib.quote(query),pagesize,pn,tocMode,pn,tocPN, characterNormalization))
95 hrefNode.nodeValue = pagexml.replace('page-fragment.xql','%s'%selfurl)
96 #logging.debug("PUREXML :%s"%(serializeNode(pagenode)))
97 return serializeNode(pagenode)
98 if (queryType=="fulltextMorph"):
99 pagedivs = pagedom.xpath("//div[@class='queryResult']")
100 if len(pagedivs)>0:
101 pagenode=pagedivs[0]
102 links=pagenode.xpath("//a")
103 for l in links:
104 hrefNode = l.getAttributeNodeNS(None, u"href")
105 if hrefNode:
106 href = hrefNode.nodeValue
107 if href.startswith('page-fragment.xql'):
108 selfurl = self.absolute_url()
109 pagexml=href.replace('mode=text','mode=texttool&viewMode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s&tocMode=%s&searchPN=%s&tocPN=%s&characterNormalization=%s'%(viewMode,queryType,urllib.quote(query),pagesize,pn,tocMode,pn,tocPN,characterNormalization))
110 hrefNode.nodeValue = pagexml.replace('page-fragment.xql','%s'%selfurl)
111 if href.startswith('../lt/lemma.xql'):
112 hrefNode.nodeValue = href.replace('../lt/lemma.xql','%s/template/head_main_query'%(selfurl))
113 l.setAttributeNS(None, 'target', '_blank')
114 l.setAttributeNS(None, 'onClick',"popupWin = window.open(this.href, 'contacts', 'location,width=300,height=400,top=180, left=400, scrollbars=1'); return false;")
115 l.setAttributeNS(None, 'onDblclick', 'popupWin.focus();')
116 pagedivs = pagedom.xpath("//div[@class='queryResultMorphExpansion']")
117 return serializeNode(pagenode)
118 if (queryType=="ftIndex")or(queryType=="ftIndexMorph"):
119 pagedivs= pagedom.xpath("//div[@class='queryResultPage']")
120 if len(pagedivs)>0:
121 pagenode=pagedivs[0]
122 links=pagenode.xpath("//a")
123 for l in links:
124 hrefNode = l.getAttributeNodeNS(None, u"href")
125 if hrefNode:
126 href = hrefNode.nodeValue
127 hrefNode.nodeValue=href.replace('mode=text','mode=texttool&viewMode=%s&tocMode=%s&tocPN=%s&pn=%s&characterNormalization=%s'%(viewMode,tocMode,tocPN,pn,characterNormalization))
128 if href.startswith('../lt/lex.xql'):
129 hrefNode.nodeValue = href.replace('../lt/lex.xql','%s/template/head_main_lex'%selfurl)
130 l.setAttributeNS(None, 'target', '_blank')
131 l.setAttributeNS(None, 'onClick',"popupWin = window.open(this.href, 'contacts', 'location,width=500,height=600,top=180, left=400, scrollbars=1'); return false;")
132 l.setAttributeNS(None, 'onDblclick', 'popupWin.focus();')
133 if href.startswith('../lt/lemma.xql'):
134 hrefNode.nodeValue = href.replace('../lt/lemma.xql','%s/template/head_main_lemma'%(selfurl))
135 l.setAttributeNS(None, 'target', '_blank')
136 l.setAttributeNS(None, 'onClick',"popupWin = window.open(this.href, 'contacts', 'location,width=300,height=400,top=180, left=400, scrollbars=1'); return false;")
137 l.setAttributeNS(None, 'onDblclick', 'popupWin.focus();')
138 return serializeNode(pagenode)
139 return "no text here"
140
141 def getGisPlaces(self, docinfo=None, pageinfo=None):
142 """ Show all Gis Places of whole Page"""
143 xpath='//place'
144 docpath = docinfo.get('textURLPath',None) 57 docpath = docinfo.get('textURLPath',None)
145 if not docpath: 58 if not docpath:
146 return None 59 return None
147 60
148 url = docinfo['url'] 61 places=[]
149 selfurl = self.absolute_url() 62 text=self.getServerData("xpath.xql", "document=%s&xpath=//place&pn=%s"%(docpath,pn))
150 pn = pageinfo['current'] 63 dom = ET.fromstring(text)
151 hrefList=[] 64 result = dom.findall(".//resultPage/place")
152 myList= ""
153 text=self.getServerData("xpath.xql", "document=%s&xpath=%s&pn=%s"%(docinfo['textURLPath'],xpath,pn))
154 dom = Parse(text)
155 result = dom.xpath("//result/resultPage/place")
156 for l in result: 65 for l in result:
157 hrefNode= l.getAttributeNodeNS(None, u"id") 66 id = l.get("id")
158 href= hrefNode.nodeValue 67 name = l.text
159 hrefList.append(href) 68 place = {'id': id, 'name': name}
160 myList = ",".join(hrefList) 69 places.append(place)
161 #logging.debug("getGisPlaces :%s"%(myList)) 70
162 return myList 71 return places
163 72
164 def getAllGisPlaces (self, docinfo=None, pageinfo=None):
165 """Show all Gis Places of whole Book """
166 xpath ='//echo:place'
167 docpath =docinfo['textURLPath']
168 url = docinfo['url']
169 selfurl =self.absolute_url()
170 pn =pageinfo['current']
171 hrefList=[]
172 myList=""
173 text=self.getServerData("xpath.xql", "document=%s&xpath=%s"%(docinfo['textURLPath'],xpath))
174 dom =Parse(text)
175 result = dom.xpath("//result/resultPage/place")
176
177 for l in result:
178 hrefNode = l.getAttributeNodeNS(None, u"id")
179 href= hrefNode.nodeValue
180 hrefList.append(href)
181 myList = ",".join(hrefList)
182 #logging.debug("getALLGisPlaces :%s"%(myList))
183 return myList
184 73
74 def processPageInfo(self, dom, docinfo, pageinfo):
75 """processes page info divs from dom and stores in docinfo and pageinfo"""
76 # assume first second level div is pageMeta
77 alldivs = dom.find("div")
78
79 if alldivs is None or alldivs.get('class', '') != 'pageMeta':
80 logging.error("processPageInfo: pageMeta div not found!")
81 return
82
83 for div in alldivs:
84 dc = div.get('class')
85
86 # pageNumberOrig
87 if dc == 'pageNumberOrig':
88 pageinfo['pageNumberOrig'] = div.text
89
90 # pageNumberOrigNorm
91 elif dc == 'pageNumberOrigNorm':
92 pageinfo['pageNumberOrigNorm'] = div.text
93
94 # pageHeaderTitle
95 elif dc == 'pageHeaderTitle':
96 pageinfo['pageHeaderTitle'] = div.text
97
98 # numFigureEntries
99 elif dc == 'countFigureEntries':
100 docinfo['numFigureEntries'] = getInt(div.text)
101
102 # numTocEntries
103 elif dc == 'countTocEntries':
104 # WTF: s1 = int(s)/30+1
105 docinfo['numTocEntries'] = getInt(div.text)
106
107 # numPlaces
108 elif dc == 'countPlaces':
109 docinfo['numPlaces'] = getInt(div.text)
110
111 # numTextPages
112 elif dc == 'countPages':
113 np = getInt(div.text)
114 if np > 0:
115 docinfo['numTextPages'] = np
116 if docinfo.get('numPages', 0) == 0:
117 # seems to be text-only - update page count
118 docinfo['numPages'] = np
119 #pageinfo['end'] = min(pageinfo['end'], np)
120 pageinfo['numgroups'] = int(np / pageinfo['groupsize'])
121 if np % pageinfo['groupsize'] > 0:
122 pageinfo['numgroups'] += 1
123
124 #logging.debug("processPageInfo: pageinfo=%s"%repr(pageinfo))
125 return
126
185 127
186 def getTextPage(self, mode="text_dict", pn=1, docinfo=None, pageinfo=None): 128 def getTextPage(self, mode="text", pn=1, docinfo=None, pageinfo=None):
187 """returns single page from fulltext""" 129 """returns single page from fulltext"""
130
131 logging.debug("getTextPage mode=%s, pn=%s"%(mode,pn))
132 # check for cached text -- but ideally this shouldn't be called twice
133 if pageinfo.has_key('textPage'):
134 logging.debug("getTextPage: using cached text")
135 return pageinfo['textPage']
136
188 docpath = docinfo['textURLPath'] 137 docpath = docinfo['textURLPath']
189 path = docinfo['textURLPath'] 138 # just checking
190 url = docinfo.get('url',None) 139 if pageinfo['current'] != pn:
191 name = docinfo.get('name',None) 140 logging.warning("getTextPage: current!=pn!")
192 pn =pageinfo['current'] 141
193 #sn = pageinfo['sn'] 142 # stuff for constructing full urls
194 s = pageinfo['s'] 143 selfurl = docinfo['viewerUrl']
195 highlightElementPos =pageinfo ['highlightElementPos'] 144 textParams = {'document': docpath,
196 highlightElement = pageinfo ['highlightElement'] 145 'pn': pn}
197 #optionToggle =pageinfo ['optionToggle'] 146 if 'characterNormalization' in pageinfo:
198 highlightQuery = pageinfo['highlightQuery'] 147 textParams['characterNormalization'] = pageinfo['characterNormalization']
199 #mode = pageinfo ['viewMode'] 148
200 tocMode = pageinfo['tocMode'] 149 if not mode:
201 xpointer = pageinfo['xpointer'] 150 # default is dict
202 characterNormalization=pageinfo['characterNormalization'] 151 mode = 'text'
203 tocPN = pageinfo['tocPN'] 152
204 selfurl = self.absolute_url() 153 modes = mode.split(',')
205 154 # check for multiple layers
206 if mode == "text_dict": 155 if len(modes) > 1:
207 textmode = "textPollux" 156 logging.debug("getTextPage: more than one mode=%s"%mode)
157
158 # search mode
159 if 'search' in modes:
160 # add highlighting
161 highlightQuery = pageinfo.get('highlightQuery', None)
162 if highlightQuery:
163 textParams['highlightQuery'] = highlightQuery
164 textParams['highlightElement'] = pageinfo.get('highlightElement', '')
165 textParams['highlightElementPos'] = pageinfo.get('highlightElementPos', '')
166
167 # ignore mode in the following
168 modes.remove('search')
169
170 # other modes don't combine
171 if 'dict' in modes:
172 # dict is called textPollux in the backend
173 textmode = 'textPollux'
174 elif len(modes) == 0:
175 # text is default mode
176 textmode = 'text'
208 else: 177 else:
209 textmode = mode 178 # just take first mode
210 179 textmode = modes[0]
211 textParam = "document=%s&mode=%s&pn=%s&characterNormalization=%s&xpointer=%s&options=withIdentifier"%(docpath,textmode,pn,characterNormalization, xpointer) 180
212 if highlightQuery is not None: 181 textParams['mode'] = textmode
213 #textParam +="&highlightQuery=%s&sn=%s"%(urllib.quote(highlightQuery),sn) 182
214 textParam +="&highlightQuery=%s&s=%s&highlightElement=%s&highlightElementPos=%s"%(urllib.quote(highlightQuery),s, highlightElement, highlightElementPos) 183 # fetch the page
215 184 pagexml = self.getServerData("page-fragment.xql",urllib.urlencode(textParams))
216 pagexml = self.getServerData("page-fragment.xql",textParam) 185 dom = ET.fromstring(pagexml)
217 dom = Parse(pagexml) 186 # extract additional info
218 #dom = NonvalidatingReader.parseStream(pagexml) 187 self.processPageInfo(dom, docinfo, pageinfo)
219 188 # page content is in <div class="pageContent">
220 #original Pages 189 pagediv = None
221 pagedivs = dom.xpath("//div[@class='pageNumberOrig']") 190 # ElementTree 1.2 in Python 2.6 can't do div[@class='pageContent']
222 191 # so we look at the second level divs
223 """if pagedivs == dom.xpath("//div[@class='pageNumberOrig']"): 192 alldivs = dom.findall("div")
224 if len(pagedivs)>0: 193 for div in alldivs:
225 docinfo['pageNumberOrig']= getTextFromNode(pagedivs[0]) 194 dc = div.get('class')
226 logging.debug("ORIGINAL PAGE: %s"%(docinfo['pageNumberOrig'])) 195 # page content div
227 196 if dc == 'pageContent':
228 #original Pages Norm 197 pagediv = div
229 pagedivs = dom.xpath("//div[@class='pageNumberOrigNorm']") 198 break
230 if pagedivs == dom.xpath("//div[@class='pageNumberOrigNorm']"):
231 if len(pagedivs)>0:
232 docinfo['pageNumberOrigNorm']= getTextFromNode(pagedivs[0])
233 logging.debug("ORIGINAL PAGE NORM: %s"%(docinfo['pageNumberOrigNorm']))
234 """
235 #figureEntries
236 pagedivs = dom.xpath("//div[@class='countFigureEntries']")
237 if pagedivs == dom.xpath("//div[@class='countFigureEntries']"):
238 if len(pagedivs)>0:
239 docinfo['countFigureEntries'] = getTextFromNode(pagedivs[0])
240 s = getTextFromNode(pagedivs[0])
241 if s=='0':
242 try:
243 docinfo['countFigureEntries'] = int(s)
244 except:
245 docinfo['countFigureEntries'] = 0
246 else:
247 s1 = int(s)/30+1
248 try:
249 docinfo['countFigureEntries'] = int(s1)
250 except:
251 docinfo['countFigureEntries'] = 0
252
253 #allPlaces
254 pagedivs = dom.xpath("//div[@class='countPlaces']")
255 if pagedivs == dom.xpath("//div[@class='countPlaces']"):
256 if len(pagedivs)>0:
257 docinfo['countPlaces']= getTextFromNode(pagedivs[0])
258 s = getTextFromNode(pagedivs[0])
259 try:
260 docinfo['countPlaces'] = int(s)
261 except:
262 docinfo['countPlaces'] = 0
263
264 #tocEntries
265 pagedivs = dom.xpath("//div[@class='countTocEntries']")
266 if pagedivs == dom.xpath("//div[@class='countTocEntries']"):
267 if len(pagedivs)>0:
268 docinfo['countTocEntries'] = int(getTextFromNode(pagedivs[0]))
269 s = getTextFromNode(pagedivs[0])
270 if s=='0':
271 try:
272 docinfo['countTocEntries'] = int(s)
273 except:
274 docinfo['countTocEntries'] = 0
275 else:
276 s1 = int(s)/30+1
277 try:
278 docinfo['countTocEntries'] = int(s1)
279 except:
280 docinfo['countTocEntries'] = 0
281
282 #numTextPages
283 pagedivs = dom.xpath("//div[@class='countPages']")
284 if pagedivs == dom.xpath("//div[@class='countPages']"):
285 if len(pagedivs)>0:
286 docinfo['numPages'] = getTextFromNode(pagedivs[0])
287 s = getTextFromNode(pagedivs[0])
288
289 try:
290 docinfo['numPages'] = int(s)
291 #logging.debug("PAGE NUMBER: %s"%(s))
292
293 np = docinfo['numPages']
294 pageinfo['end'] = min(pageinfo['end'], np)
295 pageinfo['numgroups'] = int(np / pageinfo['groupsize'])
296 if np % pageinfo['groupsize'] > 0:
297 pageinfo['numgroups'] += 1
298 except:
299 docinfo['numPages'] = 0
300
301 else:
302 #no full text -- init to 0
303 docinfo['pageNumberOrig'] = 0
304 docinfo['countFigureEntries'] = 0
305 docinfo['countPlaces'] = 0
306 docinfo['countTocEntries'] = 0
307 docinfo['numPages'] = 0
308 docinfo['pageNumberOrigNorm'] = 0
309 #return docinfo
310 199
311 # plain text mode 200 # plain text mode
312 if mode == "text": 201 if textmode == "text":
313 # first div contains text 202 # get full url assuming documentViewer is parent
314 pagedivs = dom.xpath("/div") 203 selfurl = self.getLink()
315 if len(pagedivs) > 0: 204 if pagediv is not None:
316 pagenode = pagedivs[0] 205 links = pagediv.findall(".//a")
317 links = pagenode.xpath("//a")
318 for l in links: 206 for l in links:
319 hrefNode = l.getAttributeNodeNS(None, u"href") 207 href = l.get('href')
320 if hrefNode: 208 if href and href.startswith('#note-'):
321 href= hrefNode.nodeValue 209 href = href.replace('#note-',"%s#note-"%selfurl)
322 if href.startswith('#note-'): 210 l.set('href', href)
323 hrefNode.nodeValue = href.replace('#note-',"?url=%s&viewMode=text&tocMode=%s&tocPN=%s&pn=%s#note-"%(url,tocMode,tocPN,pn)) 211
324 #if href.startswith(): 212 return serialize(pagediv)
325 return serializeNode(pagenode) 213
326 if mode == "xml": 214 # text-with-links mode
327 # first div contains text 215 elif textmode == "textPollux":
328 pagedivs = dom.xpath("/div") 216 if pagediv is not None:
329 if len(pagedivs) > 0: 217 viewerurl = docinfo['viewerUrl']
330 pagenode = pagedivs[0] 218 selfurl = self.getLink()
331 return serializeNode(pagenode) 219 # check all a-tags
332 if mode == "gis": 220 links = pagediv.findall(".//a")
333 # first div contains text 221 for l in links:
334 pagedivs = dom.xpath("/div") 222 href = l.get('href')
335 if len(pagedivs) > 0:
336 pagenode = pagedivs[0]
337 links =pagenode.xpath("//a")
338 for l in links:
339 hrefNode =l.getAttributeNodeNS(None, u"href")
340 if hrefNode:
341 href=hrefNode.nodeValue
342 if href.startswith('http://mappit.mpiwg-berlin.mpg.de'):
343 hrefNode.nodeValue =href.replace('db/REST/db/chgis/mpdl','db/RESTdb/db/mpdl/%s'%name)
344 l.setAttributeNS(None, 'target', '_blank')
345 return serializeNode(pagenode)
346 223
347 if mode == "pureXml": 224 if href:
348 # first div contains text
349 pagedivs = dom.xpath("/div")
350 if len(pagedivs) > 0:
351 pagenode = pagedivs[0]
352 return serializeNode(pagenode)
353 # text-with-links mode
354 if mode == "text_dict":
355 # first div contains text
356 #mode = pageinfo ['viewMode']
357 pagedivs = dom.xpath("/div")
358 if len(pagedivs) > 0:
359 pagenode = pagedivs[0]
360 # check all a-tags
361 links = pagenode.xpath("//a")
362
363 for l in links:
364 hrefNode = l.getAttributeNodeNS(None, u"href")
365
366 if hrefNode:
367 # is link with href 225 # is link with href
368 href = hrefNode.nodeValue 226 linkurl = urlparse.urlparse(href)
369 if href.startswith('http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/lt/wordInfo.xql'): 227 #logging.debug("getTextPage: linkurl=%s"%repr(linkurl))
370 # is pollux link 228 if linkurl.path.endswith('GetDictionaryEntries'):
371 selfurl = self.absolute_url() 229 #TODO: replace wordInfo page
372 # change href 230 # is dictionary link - change href (keeping parameters)
373 hrefNode.nodeValue = href.replace('http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/lt/wordInfo.xql','%s/head_main_voc'%selfurl) 231 #l.set('href', href.replace('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/lt/wordInfo.xql','%s/template/viewer_wordinfo'%viewerurl))
374 # add target 232 # add target to open new page
375 l.setAttributeNS(None, 'target', '_blank') 233 l.set('target', '_blank')
376 #l.setAttributeNS(None, 'onclick',"popupWin = window.open(this.href, 'InfoWindow', 'menubar=no, location,width=500,height=600,top=180, left=700, toolbar=no, scrollbars=1'); return false;")
377 #l.setAttributeNS(None, "ondblclick", "popupWin.focus();")
378 #window.open("this.href, 'InfoWindow', 'menubar=no, location,width=500,height=600,top=180, left=700, toolbar=yes, scrollbars=1'"); return false;")
379 234
380 if href.startswith('http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/lt/lemma.xql'): 235 # TODO: is this needed?
381 selfurl = self.absolute_url() 236 # if href.startswith('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/lt/lemma.xql'):
382 hrefNode.nodeValue = href.replace('http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/lt/lemma.xql','%s/head_main_lemma'%selfurl) 237 # selfurl = self.absolute_url()
383 l.setAttributeNS(None, 'target', '_blank') 238 # l.set('href', href.replace('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/lt/lemma.xql','%s/head_main_lemma'%selfurl))
384 l.setAttributeNS(None, 'onclick',"popupWin = window.open(this.href, 'InfoWindow', 'menubar=no, location,width=300,height=400,top=180, left=700, toolbar=no, scrollbars=1'); return false;") 239 # l.set('target', '_blank')
385 l.setAttributeNS(None, 'ondblclick', 'popupWin.focus();') 240 # l.set('onclick',"popupWin = window.open(this.href, 'InfoWindow', 'menubar=no, location,width=500,height=600,top=180, left=700, toolbar=no, scrollbars=1'); return false;")
241 # l.set('ondblclick', 'popupWin.focus();')
386 242
387 if href.startswith('#note-'): 243 if href.startswith('#note-'):
388 hrefNode.nodeValue = href.replace('#note-',"?url=%s&viewMode=text_dict&tocMode=%s&tocPN=%s&pn=%s#note-"%(url,tocMode,tocPN,pn)) 244 # note link
245 l.set('href', href.replace('#note-',"%s#note-"%selfurl))
389 246
390 return serializeNode(pagenode) 247 return serialize(pagediv)
391 return "no text here" 248
392 249 # xml mode
393 def getOrigPages(self, docinfo=None, pageinfo=None): 250 elif textmode == "xml":
251 if pagediv is not None:
252 return serialize(pagediv)
253
254 # pureXml mode
255 elif textmode == "pureXml":
256 if pagediv is not None:
257 return serialize(pagediv)
258
259 # gis mode
260 elif textmode == "gis":
261 if pagediv is not None:
262 # check all a-tags
263 links = pagediv.findall(".//a")
264 # add our URL as backlink
265 selfurl = self.getLink()
266 doc = base64.b64encode(selfurl)
267 for l in links:
268 href = l.get('href')
269 if href:
270 if href.startswith('http://mappit.mpiwg-berlin.mpg.de'):
271 l.set('href', re.sub(r'doc=[\w+/=]+', 'doc=%s'%doc, href))
272 l.set('target', '_blank')
273
274 return serialize(pagediv)
275
276 return None
277
278
279 def getSearchResults(self, mode, query=None, pageinfo=None, docinfo=None):
280 """loads list of search results and stores XML in docinfo"""
281
282 logging.debug("getSearchResults mode=%s query=%s"%(mode, query))
283 if mode == "none":
284 return docinfo
285
286 cachedQuery = docinfo.get('cachedQuery', None)
287 if cachedQuery is not None:
288 # cached search result
289 if cachedQuery == '%s_%s'%(mode,query):
290 # same query
291 return docinfo
292
293 else:
294 # different query
295 del docinfo['resultSize']
296 del docinfo['resultXML']
297
298 # cache query
299 docinfo['cachedQuery'] = '%s_%s'%(mode,query)
300
301 # fetch full results
394 docpath = docinfo['textURLPath'] 302 docpath = docinfo['textURLPath']
395 pn =pageinfo['current'] 303 params = {'document': docpath,
396 selfurl = self.absolute_url() 304 'mode': 'text',
397 pagexml = self.getServerData("page-fragment.xql","document=%s&pn=%s"%(docpath, pn)) 305 'queryType': mode,
398 dom = Parse(pagexml) 306 'query': query,
399 pagedivs = dom.xpath("//div[@class='pageNumberOrig']") 307 'queryResultPageSize': 1000,
400 if pagedivs == dom.xpath("//div[@class='pageNumberOrig']"): 308 'queryResultPN': 1,
401 if len(pagedivs)>0: 309 'characterNormalization': pageinfo.get('characterNormalization', 'reg')}
402 docinfo['pageNumberOrig']= getTextFromNode(pagedivs[0]) 310 pagexml = self.getServerData("doc-query.xql",urllib.urlencode(params))
403 return docinfo['pageNumberOrig'] 311 #pagexml = self.getServerData("doc-query.xql","document=%s&mode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s&s=%s&viewMode=%s&characterNormalization=%s&highlightElementPos=%s&highlightElement=%s&highlightQuery=%s"%(docpath, 'text', queryType, urllib.quote(query), pagesize, pn, s, viewMode,characterNormalization, highlightElementPos, highlightElement, urllib.quote(highlightQuery)))
404 312 dom = ET.fromstring(pagexml)
405 def getOrigPagesNorm(self, docinfo=None, pageinfo=None): 313 # page content is in <div class="queryResultPage">
406 docpath = docinfo['textURLPath'] 314 pagediv = None
407 pn =pageinfo['current'] 315 # ElementTree 1.2 in Python 2.6 can't do div[@class='queryResultPage']
408 selfurl = self.absolute_url() 316 alldivs = dom.findall("div")
409 pagexml = self.getServerData("page-fragment.xql","document=%s&pn=%s"%(docpath, pn)) 317 for div in alldivs:
410 dom = Parse(pagexml) 318 dc = div.get('class')
411 pagedivs = dom.xpath("//div[@class='pageNumberOrigNorm']") 319 # page content div
412 if pagedivs == dom.xpath("//div[@class='pageNumberOrigNorm']"): 320 if dc == 'queryResultPage':
413 if len(pagedivs)>0: 321 pagediv = div
414 docinfo['pageNumberOrigNorm']= getTextFromNode(pagedivs[0]) 322
415 return docinfo['pageNumberOrigNorm'] 323 elif dc == 'queryResultHits':
416 324 docinfo['resultSize'] = getInt(div.text)
417 325
418 def getTranslate(self, word=None, language=None, display=None): 326 if pagediv is not None:
419 """translate into another languages""" 327 # store XML in docinfo
420 data = self.getServerData("lt/wordInfo.xql","language="+str(language)+"&word="+urllib.quote(word)+"&display="+urllib.quote(display)+"&output=html") 328 docinfo['resultXML'] = ET.tostring(pagediv, 'UTF-8')
421 #pagexml=self.template.fulltextclient.eval("/mpdl/interface/lt/lex.xql","document=&language="+str(language)+"&query="+url_quote(str(query))) 329
422 return data 330 return docinfo
423 331
424 def getLemma(self, lemma=None, language=None): 332
425 """simular words lemma """ 333 def getResultsPage(self, mode="text", query=None, pn=None, start=None, size=None, pageinfo=None, docinfo=None):
426 data = self.getServerData("lt/lemma.xql","language="+str(language)+"&lemma="+urllib.quote(lemma)+"&output=html") 334 """returns single page from the table of contents"""
427 return data 335 logging.debug("getResultsPage mode=%s, pn=%s"%(mode,pn))
428 336 # check for cached result
429 def getLemmaQuery(self, query=None, language=None): 337 if not 'resultXML' in docinfo:
430 """simular words lemma """ 338 self.getSearchResults(mode=mode, query=query, pageinfo=pageinfo, docinfo=docinfo)
431 data = self.getServerData("lt/lemma.xql","language="+str(language)+"&query="+urllib.quote(query)+"&output=html") 339
432 return data 340 resultxml = docinfo.get('resultXML', None)
433 341 if not resultxml:
434 def getLex(self, query=None, language=None): 342 logging.error("getResultPage: unable to find resultXML")
435 #simular words lemma 343 return "Error: no result!"
436 data = self.getServerData("lt/lex.xql","document=&language="+str(language)+"&query="+urllib.quote(query)) 344
437 return data 345 if size is None:
438 346 size = pageinfo.get('resultPageSize', 10)
439 def getQuery (self, docinfo=None, pageinfo=None, query=None, queryType=None, pn=1): 347
440 #number of 348 if start is None:
441 docpath = docinfo['textURLPath'] 349 start = (pn - 1) * size
442 pagesize = pageinfo['queryPageSize'] 350
443 pn = pageinfo['searchPN'] 351 fullresult = ET.fromstring(resultxml)
444 query =pageinfo['query'] 352
445 queryType =pageinfo['queryType'] 353 if fullresult is not None:
446 tocSearch = 0 354 # paginate
447 tocDiv = None 355 first = start-1
448 356 len = size
449 pagexml = self.getServerData("doc-query.xql","document=%s&mode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s"%(docpath, 'text', queryType, urllib.quote(query), pagesize, pn)) 357 del fullresult[:first]
450 pagedom = Parse(pagexml) 358 del fullresult[len:]
451 numdivs = pagedom.xpath("//div[@class='queryResultHits']") 359 tocdivs = fullresult
452 tocSearch = int(getTextFromNode(numdivs[0])) 360
453 tc=int((tocSearch/10)+1) 361 # check all a-tags
454 return tc 362 links = tocdivs.findall(".//a")
455 363 for l in links:
364 href = l.get('href')
365 if href:
366 # assume all links go to pages
367 linkUrl = urlparse.urlparse(href)
368 linkParams = urlparse.parse_qs(linkUrl.query)
369 # take some parameters
370 params = {'pn': linkParams['pn'],
371 'highlightQuery': linkParams.get('highlightQuery',''),
372 'highlightElement': linkParams.get('highlightElement',''),
373 'highlightElementPos': linkParams.get('highlightElementPos','')
374 }
375 url = self.getLink(params=params)
376 l.set('href', url)
377
378 return serialize(tocdivs)
379
380 return "ERROR: no results!"
381
382
456 def getToc(self, mode="text", docinfo=None): 383 def getToc(self, mode="text", docinfo=None):
457 """loads table of contents and stores in docinfo""" 384 """loads table of contents and stores XML in docinfo"""
385 logging.debug("getToc mode=%s"%mode)
458 if mode == "none": 386 if mode == "none":
459 return docinfo 387 return docinfo
388
460 if 'tocSize_%s'%mode in docinfo: 389 if 'tocSize_%s'%mode in docinfo:
461 # cached toc 390 # cached toc
462 return docinfo 391 return docinfo
463 392
464 docpath = docinfo['textURLPath'] 393 docpath = docinfo['textURLPath']
470 else: 399 else:
471 queryType = mode 400 queryType = mode
472 # number of entries in toc 401 # number of entries in toc
473 tocSize = 0 402 tocSize = 0
474 tocDiv = None 403 tocDiv = None
475 404 # fetch full toc
476 pagexml = self.getServerData("doc-query.xql","document=%s&queryType=%s&queryResultPageSize=%s&queryResultPN=%s"%(docpath,queryType, pagesize, pn)) 405 pagexml = self.getServerData("doc-query.xql","document=%s&queryType=%s&queryResultPageSize=%s&queryResultPN=%s"%(docpath,queryType, pagesize, pn))
477 406 dom = ET.fromstring(pagexml)
478 # post-processing downloaded xml 407 # page content is in <div class="queryResultPage">
479 pagedom = Parse(pagexml) 408 pagediv = None
480 # get number of entries 409 # ElementTree 1.2 in Python 2.6 can't do div[@class='queryResultPage']
481 numdivs = pagedom.xpath("//div[@class='queryResultHits']") 410 alldivs = dom.findall("div")
482 if len(numdivs) > 0: 411 for div in alldivs:
483 tocSize = int(getTextFromNode(numdivs[0])) 412 dc = div.get('class')
484 docinfo['tocSize_%s'%mode] = tocSize 413 # page content div
414 if dc == 'queryResultPage':
415 pagediv = div
416
417 elif dc == 'queryResultHits':
418 docinfo['tocSize_%s'%mode] = getInt(div.text)
419
420 if pagediv is not None:
421 # store XML in docinfo
422 docinfo['tocXML_%s'%mode] = ET.tostring(pagediv, 'UTF-8')
423
485 return docinfo 424 return docinfo
486 425
487 def getTocPage(self, mode="text", pn=1, pageinfo=None, docinfo=None): 426 def getTocPage(self, mode="text", pn=None, start=None, size=None, pageinfo=None, docinfo=None):
488 """returns single page from the table of contents""" 427 """returns single page from the table of contents"""
489 # TODO: this should use the cached TOC 428 logging.debug("getTocPage mode=%s, pn=%s"%(mode,pn))
490 if mode == "text": 429 if mode == "text":
491 queryType = "toc" 430 queryType = "toc"
492 else: 431 else:
493 queryType = mode 432 queryType = mode
494 docpath = docinfo['textURLPath'] 433
495 path = docinfo['textURLPath'] 434 # check for cached TOC
496 pagesize = pageinfo['tocPageSize'] 435 if not docinfo.has_key('tocXML_%s'%mode):
497 pn = pageinfo['tocPN'] 436 self.getToc(mode=mode, docinfo=docinfo)
498 url = docinfo['url'] 437
499 selfurl = self.absolute_url() 438 tocxml = docinfo.get('tocXML_%s'%mode, None)
500 viewMode= pageinfo['viewMode'] 439 if not tocxml:
501 characterNormalization = pageinfo ['characterNormalization'] 440 logging.error("getTocPage: unable to find tocXML")
502 #optionToggle =pageinfo ['optionToggle'] 441 return "Error: no table of contents!"
503 tocMode = pageinfo['tocMode'] 442
504 tocPN = pageinfo['tocPN'] 443 if size is None:
505 444 size = pageinfo.get('tocPageSize', 30)
506 data = self.getServerData("doc-query.xql","document=%s&queryType=%s&queryResultPageSize=%s&queryResultPN=%s&characterNormalization=regPlusNorm"%(docpath,queryType, pagesize, pn)) 445
507 page = data.replace('page-fragment.xql?document=%s'%str(path),'%s?url=%s&viewMode=%s&tocMode=%s&tocPN=%s'%(selfurl,url, viewMode, tocMode, tocPN)) 446 if start is None:
508 text = page.replace('mode=image','mode=texttool') 447 start = (pn - 1) * size
509 return text 448
449 fulltoc = ET.fromstring(tocxml)
450
451 if fulltoc is not None:
452 # paginate
453 first = (start - 1) * 2
454 len = size * 2
455 del fulltoc[:first]
456 del fulltoc[len:]
457 tocdivs = fulltoc
458
459 # check all a-tags
460 links = tocdivs.findall(".//a")
461 for l in links:
462 href = l.get('href')
463 if href:
464 # take pn from href
465 m = re.match(r'page-fragment\.xql.*pn=(\d+)', href)
466 if m is not None:
467 # and create new url (assuming parent is documentViewer)
468 url = self.getLink('pn', m.group(1))
469 l.set('href', url)
470 else:
471 logging.warning("getTocPage: Problem with link=%s"%href)
472
473 # fix two-divs-per-row with containing div
474 newtoc = ET.Element('div', {'class':'queryResultPage'})
475 for (d1,d2) in zip(tocdivs[::2],tocdivs[1::2]):
476 e = ET.Element('div',{'class':'tocline'})
477 e.append(d1)
478 e.append(d2)
479 newtoc.append(e)
480
481 return serialize(newtoc)
482
483 return "ERROR: no table of contents!"
484
510 485
511 def manage_changeMpdlXmlTextServer(self,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/",timeout=40,RESPONSE=None): 486 def manage_changeMpdlXmlTextServer(self,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/",timeout=40,RESPONSE=None):
512 #def manage_changeMpdlXmlTextServer(self,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de:30030/mpdl/interface/",timeout=40,RESPONSE=None):
513 """change settings""" 487 """change settings"""
514 self.title=title 488 self.title=title
515 self.timeout = timeout 489 self.timeout = timeout
516 self.serverUrl = serverUrl 490 self.serverUrl = serverUrl
517 if RESPONSE is not None: 491 if RESPONSE is not None:
528 """add zogiimage""" 502 """add zogiimage"""
529 newObj = MpdlXmlTextServer(id,title,serverUrl,timeout) 503 newObj = MpdlXmlTextServer(id,title,serverUrl,timeout)
530 self.Destination()._setObject(id, newObj) 504 self.Destination()._setObject(id, newObj)
531 if RESPONSE is not None: 505 if RESPONSE is not None:
532 RESPONSE.redirect('manage_main') 506 RESPONSE.redirect('manage_main')
507
508