Mercurial > hg > documentViewer
annotate MpdlXmlTextServer.py @ 513:67095296c95a
Merge from elementtree branch
92a6443a6f16ff25674d43814ec0d6c0a43a5e1a
author | casties |
---|---|
date | Tue, 28 Feb 2012 19:10:08 +0100 |
parents | 91daab0c219b 551ca1641a5e |
children | 7d7b639d7be7 |
rev | line source |
---|---|
129
9404b6c37920
more modular version with separate object MpdlXmlTextServer
casties
parents:
diff
changeset
|
1 from OFS.SimpleItem import SimpleItem |
9404b6c37920
more modular version with separate object MpdlXmlTextServer
casties
parents:
diff
changeset
|
2 from Products.PageTemplates.PageTemplateFile import PageTemplateFile |
455 | 3 |
453
beb7ccb92564
first version using elementtree instead of 4suite xml
casties
parents:
407
diff
changeset
|
4 import xml.etree.ElementTree as ET |
beb7ccb92564
first version using elementtree instead of 4suite xml
casties
parents:
407
diff
changeset
|
5 |
455 | 6 import re |
129
9404b6c37920
more modular version with separate object MpdlXmlTextServer
casties
parents:
diff
changeset
|
7 import logging |
134 | 8 import urllib |
511 | 9 import urlparse |
506 | 10 import base64 |
453
beb7ccb92564
first version using elementtree instead of 4suite xml
casties
parents:
407
diff
changeset
|
11 |
458 | 12 from SrvTxtUtils import getInt, getText, getHttpData |
453
beb7ccb92564
first version using elementtree instead of 4suite xml
casties
parents:
407
diff
changeset
|
13 |
beb7ccb92564
first version using elementtree instead of 4suite xml
casties
parents:
407
diff
changeset
|
def serialize(node):
    """Return an XML snippet of node without the leading XML declaration.

    node is serialized with ET.tostring(node, 'UTF-8'), which produces an
    encoded byte string. If the result starts with an XML declaration
    ('<?xml ... ?>') the declaration and any whitespace following it are
    removed; otherwise the serialization is returned unchanged.
    """
    s = ET.tostring(node, 'UTF-8')
    # snip off XML declaration
    if s.startswith(b'<?xml'):
        end = s.find(b'?>')
        if end != -1:
            # don't rely on exactly one newline after the declaration:
            # skip the '?>' itself and drop whatever whitespace follows
            return s[end + 2:].lstrip()

    return s
beb7ccb92564
first version using elementtree instead of 4suite xml
casties
parents:
407
diff
changeset
|
23 |
beb7ccb92564
first version using elementtree instead of 4suite xml
casties
parents:
407
diff
changeset
|
24 |
129
9404b6c37920
more modular version with separate object MpdlXmlTextServer
casties
parents:
diff
changeset
|
class MpdlXmlTextServer(SimpleItem):
    """TextServer implementation for MPDL-XML eXist server.

    Fetches page text, search results and tables of contents from the
    remote text server via HTTP, parses the returned XML with ElementTree
    and rewrites the contained links. Results are cached in the docinfo
    dict passed in by the caller.
    """
    meta_type="MPDL-XML TextServer"

    manage_options=(
        {'label':'Config','action':'manage_changeMpdlXmlTextServerForm'},
        )+SimpleItem.manage_options

    manage_changeMpdlXmlTextServerForm = PageTemplateFile("zpt/manage_changeMpdlXmlTextServer", globals())

    def __init__(self,id,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/", serverName=None, timeout=40):
        """constructor

        If serverName is given it takes precedence and serverUrl is built
        as "http://<serverName>/mpdl/interface/"; otherwise serverUrl is
        used as is. timeout is passed on to every HTTP request.
        """
        self.id=id
        self.title=title
        self.timeout = timeout
        if serverName is None:
            self.serverUrl = serverUrl
        else:
            self.serverUrl = "http://%s/mpdl/interface/"%serverName

    def getHttpData(self, url, data=None):
        """returns result from url+data HTTP request"""
        # delegates to the module-level helper, adding the configured timeout
        return getHttpData(url,data,timeout=self.timeout)

    def getServerData(self, method, data=None):
        """returns result from text server for method+data

        method is appended to the configured serverUrl.
        """
        url = self.serverUrl+method
        return getHttpData(url,data,timeout=self.timeout)


    def getPlacesOnPage(self, docinfo=None, pn=None):
        """Returns list of GIS places of page pn.

        Each place is a dict with the keys 'id' and 'name'. Returns None
        if docinfo is missing or has no 'textURLPath'.
        """
        docpath = docinfo.get('textURLPath',None) if docinfo else None
        if not docpath:
            return None

        # url-encode the parameters like the other request methods do
        query = urllib.urlencode([('document', docpath),
                                  ('xpath', '//place'),
                                  ('pn', pn)])
        text = self.getServerData("xpath.xql", query)
        dom = ET.fromstring(text)
        return [{'id': place.get("id"), 'name': place.text}
                for place in dom.findall(".//resultPage/place")]


    def processPageInfo(self, dom, docinfo, pageinfo):
        """processes page info divs from dom and stores in docinfo and pageinfo

        Expects the first div child of dom to have class 'pageMeta'; logs
        an error and returns without changes otherwise. Requires
        pageinfo['groupsize'] when the page count has to be updated.
        """
        # assume first second level div is pageMeta
        metadiv = dom.find("div")

        if metadiv is None or metadiv.get('class', '') != 'pageMeta':
            logging.error("processPageInfo: pageMeta div not found!")
            return

        # div classes whose text is copied to pageinfo under the same key
        pageTexts = ('pageNumberOrig', 'pageNumberOrigNorm', 'pageHeaderTitle')
        # div classes holding counts -> key in docinfo
        docCounts = {'countFigureEntries': 'numFigureEntries',
                     'countTocEntries': 'numTocEntries',
                     'countPlaces': 'numPlaces'}

        for div in metadiv:
            dc = div.get('class')

            if dc in pageTexts:
                pageinfo[dc] = div.text

            elif dc in docCounts:
                docinfo[docCounts[dc]] = getInt(div.text)

            # numTextPages
            elif dc == 'countPages':
                np = getInt(div.text)
                if np > 0:
                    docinfo['numTextPages'] = np
                    if docinfo.get('numPages', 0) == 0:
                        # seems to be text-only - update page count
                        docinfo['numPages'] = np
                        # number of page groups, rounded up
                        numgroups, rest = divmod(np, pageinfo['groupsize'])
                        if rest > 0:
                            numgroups += 1
                        pageinfo['numgroups'] = numgroups

        return


    def getTextPage(self, mode="text", pn=1, docinfo=None, pageinfo=None):
        """returns single page from fulltext

        mode is a comma separated list of modes. 'search' adds the
        highlighting parameters from pageinfo and is otherwise ignored;
        'dict' selects the backend mode 'textPollux'; without any other
        mode 'text' is used; otherwise the first remaining mode is taken.

        Returns the serialized div with class 'pageContent' for the modes
        text, textPollux, xml, pureXml and gis. Returns None if the
        content div is missing or the mode is unknown.
        """
        logging.debug("getTextPage mode=%s, pn=%s"%(mode,pn))
        # check for cached text -- but ideally this shouldn't be called twice
        if 'textPage' in pageinfo:
            logging.debug("getTextPage: using cached text")
            return pageinfo['textPage']

        docpath = docinfo['textURLPath']
        # just checking
        if pageinfo['current'] != pn:
            logging.warning("getTextPage: current!=pn!")

        # parameters for the page request
        textParams = {'document': docpath,
                      'pn': pn}
        if 'characterNormalization' in pageinfo:
            textParams['characterNormalization'] = pageinfo['characterNormalization']

        if not mode:
            # default is text
            mode = 'text'

        modes = mode.split(',')
        # check for multiple layers
        if len(modes) > 1:
            logging.debug("getTextPage: more than one mode=%s"%mode)

        # search mode
        if 'search' in modes:
            # add highlighting
            highlightQuery = pageinfo.get('highlightQuery', None)
            if highlightQuery:
                textParams['highlightQuery'] = highlightQuery
                textParams['highlightElement'] = pageinfo.get('highlightElement', '')
                textParams['highlightElementPos'] = pageinfo.get('highlightElementPos', '')

            # ignore mode in the following
            modes.remove('search')

        # other modes don't combine
        if 'dict' in modes:
            # dict is called textPollux in the backend
            textmode = 'textPollux'
        elif not modes:
            # text is default mode
            textmode = 'text'
        else:
            # just take first mode
            textmode = modes[0]

        textParams['mode'] = textmode

        # fetch the page
        pagexml = self.getServerData("page-fragment.xql",urllib.urlencode(textParams))
        dom = ET.fromstring(pagexml)
        # extract additional info
        self.processPageInfo(dom, docinfo, pageinfo)
        # page content is in <div class="pageContent">
        pagediv = None
        # ElementTree 1.2 in Python 2.6 can't do div[@class='pageContent']
        # so we look at the second level divs
        for div in dom.findall("div"):
            if div.get('class') == 'pageContent':
                pagediv = div
                break

        if pagediv is None:
            # nothing to return in any mode
            return None

        # plain text mode and text-with-links mode
        if textmode in ("text", "textPollux"):
            # get full url assuming documentViewer is parent
            # (getLink is not defined in this class)
            selfurl = self.getLink()
            for l in pagediv.findall(".//a"):
                href = l.get('href')
                if not href:
                    continue

                if textmode == "textPollux":
                    linkurl = urlparse.urlparse(href)
                    if linkurl.path.endswith('GetDictionaryEntries'):
                        #TODO: replace wordInfo page
                        # is dictionary link - add target to open new page
                        l.set('target', '_blank')

                if href.startswith('#note-'):
                    # note link - make absolute
                    l.set('href', href.replace('#note-',"%s#note-"%selfurl))

            return serialize(pagediv)

        # xml mode and pureXml mode return the content unchanged
        elif textmode in ("xml", "pureXml"):
            return serialize(pagediv)

        # gis mode
        elif textmode == "gis":
            # add our URL as backlink
            doc = base64.b64encode(self.getLink())
            # check all a-tags
            for l in pagediv.findall(".//a"):
                href = l.get('href')
                if href and href.startswith('http://mappit.mpiwg-berlin.mpg.de'):
                    l.set('href', re.sub(r'doc=[\w+/=]+', 'doc=%s'%doc, href))
                    l.set('target', '_blank')

            return serialize(pagediv)

        return None


    def getSearchResults(self, mode, query=None, pageinfo=None, docinfo=None):
        """loads list of search results and stores XML in docinfo

        Stores the hit count in docinfo['resultSize'] and the serialized
        result div in docinfo['resultXML']. The result is cached per
        mode+query in docinfo['cachedQuery']. Returns docinfo.
        """
        logging.debug("getSearchResults mode=%s query=%s"%(mode, query))
        if mode == "none":
            return docinfo

        queryKey = '%s_%s'%(mode,query)
        cachedQuery = docinfo.get('cachedQuery', None)
        if cachedQuery is not None:
            # cached search result
            if cachedQuery == queryKey:
                # same query
                return docinfo

            # different query - drop the old result.
            # the keys may be missing if the old query produced no
            # hit count or no result page, so don't use del
            docinfo.pop('resultSize', None)
            docinfo.pop('resultXML', None)

        # cache query
        docinfo['cachedQuery'] = queryKey

        # fetch full results
        docpath = docinfo['textURLPath']
        params = {'document': docpath,
                  'mode': 'text',
                  'queryType': mode,
                  'query': query,
                  'queryResultPageSize': 1000,
                  'queryResultPN': 1,
                  'characterNormalization': pageinfo.get('characterNormalization', 'reg')}
        pagexml = self.getServerData("doc-query.xql",urllib.urlencode(params))
        dom = ET.fromstring(pagexml)
        # page content is in <div class="queryResultPage">
        pagediv = None
        # ElementTree 1.2 in Python 2.6 can't do div[@class='queryResultPage']
        for div in dom.findall("div"):
            dc = div.get('class')
            # page content div
            if dc == 'queryResultPage':
                pagediv = div

            elif dc == 'queryResultHits':
                docinfo['resultSize'] = getInt(div.text)

        if pagediv is not None:
            # store XML in docinfo
            docinfo['resultXML'] = ET.tostring(pagediv, 'UTF-8')

        return docinfo


    def getResultsPage(self, mode="text", query=None, pn=None, start=None, size=None, pageinfo=None, docinfo=None):
        """returns single page from the list of search results

        start is the 1-based index of the first result to show. If it is
        not given it is derived from the page number pn (first page if pn
        is not given either). size defaults to pageinfo['resultPageSize']
        or 10. Returns the serialized result div with rewritten links or
        an error message string.
        """
        logging.debug("getResultsPage mode=%s, pn=%s"%(mode,pn))
        # check for cached result
        if 'resultXML' not in docinfo:
            self.getSearchResults(mode=mode, query=query, pageinfo=pageinfo, docinfo=docinfo)

        resultxml = docinfo.get('resultXML', None)
        if not resultxml:
            logging.error("getResultPage: unable to find resultXML")
            return "Error: no result!"

        if size is None:
            size = pageinfo.get('resultPageSize', 10)

        if start is None:
            # start is 1-based (see below)
            start = ((pn or 1) - 1) * size + 1

        fullresult = ET.fromstring(resultxml)

        # paginate. a negative index would delete from the end instead
        first = max(start - 1, 0)
        del fullresult[:first]
        del fullresult[size:]

        # check all a-tags
        for l in fullresult.findall(".//a"):
            href = l.get('href')
            if not href:
                continue

            # assume all links go to pages
            linkUrl = urlparse.urlparse(href)
            linkParams = urlparse.parse_qs(linkUrl.query)
            if 'pn' not in linkParams:
                # not a page link - leave it alone
                logging.warning("getResultsPage: Problem with link=%s"%href)
                continue

            # take some parameters (parse_qs returns lists of values)
            params = {'pn': linkParams['pn'],
                      'highlightQuery': linkParams.get('highlightQuery',''),
                      'highlightElement': linkParams.get('highlightElement',''),
                      'highlightElementPos': linkParams.get('highlightElementPos','')
                      }
            url = self.getLink(params=params)
            l.set('href', url)

        return serialize(fullresult)


    def getToc(self, mode="text", docinfo=None):
        """loads table of contents and stores XML in docinfo

        Stores the number of entries in docinfo['tocSize_<mode>'] and the
        serialized result div in docinfo['tocXML_<mode>']. Does nothing if
        mode is "none" or 'tocSize_<mode>' is already present.
        Returns docinfo.
        """
        logging.debug("getToc mode=%s"%mode)
        if mode == "none":
            return docinfo

        if 'tocSize_%s'%mode in docinfo:
            # cached toc
            return docinfo

        docpath = docinfo['textURLPath']
        if mode == "text":
            queryType = "toc"
        else:
            queryType = mode

        # fetch full toc (we need to set a result set size)
        query = urllib.urlencode([('document', docpath),
                                  ('queryType', queryType),
                                  ('queryResultPageSize', 1000),
                                  ('queryResultPN', 1)])
        pagexml = self.getServerData("doc-query.xql", query)
        dom = ET.fromstring(pagexml)
        # page content is in <div class="queryResultPage">
        pagediv = None
        # ElementTree 1.2 in Python 2.6 can't do div[@class='queryResultPage']
        for div in dom.findall("div"):
            dc = div.get('class')
            # page content div
            if dc == 'queryResultPage':
                pagediv = div

            elif dc == 'queryResultHits':
                docinfo['tocSize_%s'%mode] = getInt(div.text)

        if pagediv is not None:
            # store XML in docinfo
            docinfo['tocXML_%s'%mode] = ET.tostring(pagediv, 'UTF-8')

        return docinfo

    def getTocPage(self, mode="text", pn=None, start=None, size=None, pageinfo=None, docinfo=None):
        """returns single page from the table of contents

        start is the 1-based index of the first entry to show. If it is
        not given it is derived from the page number pn (first page if pn
        is not given either). size defaults to pageinfo['tocPageSize'] or
        30. Returns the serialized div of class 'queryResultPage' holding
        one 'tocline' div per entry, or an error message string.
        """
        logging.debug("getTocPage mode=%s, pn=%s"%(mode,pn))

        # check for cached TOC
        if 'tocXML_%s'%mode not in docinfo:
            self.getToc(mode=mode, docinfo=docinfo)

        tocxml = docinfo.get('tocXML_%s'%mode, None)
        if not tocxml:
            logging.error("getTocPage: unable to find tocXML")
            return "Error: no table of contents!"

        if size is None:
            size = pageinfo.get('tocPageSize', 30)

        if start is None:
            # start is 1-based (see below)
            start = ((pn or 1) - 1) * size + 1

        fulltoc = ET.fromstring(tocxml)

        # paginate - every entry consists of two divs.
        # a negative index would delete from the end instead
        first = max(start - 1, 0) * 2
        del fulltoc[:first]
        del fulltoc[size * 2:]

        # check all a-tags
        for l in fulltoc.findall(".//a"):
            href = l.get('href')
            if href:
                # take pn from href
                m = re.match(r'page-fragment\.xql.*pn=(\d+)', href)
                if m is not None:
                    # and create new url (assuming parent is documentViewer)
                    url = self.getLink('pn', m.group(1))
                    l.set('href', url)
                else:
                    logging.warning("getTocPage: Problem with link=%s"%href)

        # fix two-divs-per-row with containing div
        newtoc = ET.Element('div', {'class':'queryResultPage'})
        tocdivs = list(fulltoc)
        for (d1,d2) in zip(tocdivs[::2],tocdivs[1::2]):
            e = ET.Element('div',{'class':'tocline'})
            e.append(d1)
            e.append(d2)
            newtoc.append(e)

        return serialize(newtoc)


    def manage_changeMpdlXmlTextServer(self,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/",timeout=40,RESPONSE=None):
        """change settings

        Redirects to 'manage_main' if RESPONSE is given.
        """
        self.title=title
        self.timeout = timeout
        self.serverUrl = serverUrl
        if RESPONSE is not None:
            RESPONSE.redirect('manage_main')
9404b6c37920
more modular version with separate object MpdlXmlTextServer
casties
parents:
diff
changeset
|
493 |
9404b6c37920
more modular version with separate object MpdlXmlTextServer
casties
parents:
diff
changeset
|
494 # management methods |
9404b6c37920
more modular version with separate object MpdlXmlTextServer
casties
parents:
diff
changeset
|
def manage_addMpdlXmlTextServerForm(self):
    """Render the form for adding an MpdlXmlTextServer."""
    template = PageTemplateFile("zpt/manage_addMpdlXmlTextServer", globals())
    # wrap the template in the context of self before calling it
    return template.__of__(self)()
9404b6c37920
more modular version with separate object MpdlXmlTextServer
casties
parents:
diff
changeset
|
499 |
453
beb7ccb92564
first version using elementtree instead of 4suite xml
casties
parents:
407
diff
changeset
|
def manage_addMpdlXmlTextServer(self,id,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/",timeout=40,RESPONSE=None):
    """Add a new MpdlXmlTextServer with the given id to the destination.

    Redirects to 'manage_main' if RESPONSE is given.
    """
    # timeout has to be passed by keyword: the fourth positional parameter
    # of the constructor is serverName, which would override serverUrl
    newObj = MpdlXmlTextServer(id, title, serverUrl, timeout=timeout)
    self.Destination()._setObject(id, newObj)
    if RESPONSE is not None:
        RESPONSE.redirect('manage_main')
507 | |
508 |