Mercurial > hg > documentViewer
comparison MpdlXmlTextServer.py @ 455:0a53fea83df7 elementtree
more work renovating
author | casties |
---|---|
date | Fri, 15 Jul 2011 21:34:41 +0200 |
parents | beb7ccb92564 |
children | b27a7d2f06ff |
comparison
equal
deleted
inserted
replaced
454:73e3273c7624 | 455:0a53fea83df7 |
---|---|
1 | 1 |
2 from OFS.SimpleItem import SimpleItem | 2 from OFS.SimpleItem import SimpleItem |
3 from Products.PageTemplates.PageTemplateFile import PageTemplateFile | 3 from Products.PageTemplates.PageTemplateFile import PageTemplateFile |
4 | |
4 from Ft.Xml import EMPTY_NAMESPACE, Parse | 5 from Ft.Xml import EMPTY_NAMESPACE, Parse |
5 from Ft.Xml.Domlette import NonvalidatingReader | 6 from Ft.Xml.Domlette import NonvalidatingReader |
6 import Ft.Xml.Domlette | 7 import Ft.Xml.Domlette |
7 import cStringIO | 8 import cStringIO |
8 | 9 |
9 import xml.etree.ElementTree as ET | 10 import xml.etree.ElementTree as ET |
10 | 11 |
11 import md5 | 12 import re |
12 import sys | |
13 import logging | 13 import logging |
14 import urllib | 14 import urllib |
15 import documentViewer | 15 import documentViewer |
16 #from documentViewer import getTextFromNode, serializeNode | 16 #from documentViewer import getTextFromNode, serializeNode |
17 | |
18 def intOr0(s, default=0): | |
19 """convert s to int or return default""" | |
20 try: | |
21 return int(s) | |
22 except: | |
23 return default | |
17 | 24 |
18 def getText(node): | 25 def getText(node): |
19 """get the cdata content of a node""" | 26 """get the cdata content of a node""" |
20 if node is None: | 27 if node is None: |
21 return "" | 28 return "" |
42 def getTextFromNode(node): | 49 def getTextFromNode(node): |
43 """get the cdata content of a node""" | 50 """get the cdata content of a node""" |
44 if node is None: | 51 if node is None: |
45 return "" | 52 return "" |
46 # ET: | 53 # ET: |
47 #text = node.text or "" | 54 # text = node.text or "" |
48 #for e in node: | 55 # for e in node: |
49 # text += gettext(e) | 56 # text += gettext(e) |
50 # if e.tail: | 57 # if e.tail: |
51 # text += e.tail | 58 # text += e.tail |
52 | 59 |
53 # 4Suite: | 60 # 4Suite: |
54 nodelist=node.childNodes | 61 nodelist=node.childNodes |
55 text = "" | 62 text = "" |
56 for n in nodelist: | 63 for n in nodelist: |
80 {'label':'Config','action':'manage_changeMpdlXmlTextServerForm'}, | 87 {'label':'Config','action':'manage_changeMpdlXmlTextServerForm'}, |
81 )+SimpleItem.manage_options | 88 )+SimpleItem.manage_options |
82 | 89 |
83 manage_changeMpdlXmlTextServerForm = PageTemplateFile("zpt/manage_changeMpdlXmlTextServer", globals()) | 90 manage_changeMpdlXmlTextServerForm = PageTemplateFile("zpt/manage_changeMpdlXmlTextServer", globals()) |
84 | 91 |
85 def __init__(self,id,title="",serverUrl="http://mpdl-system.mpiwg-berlin.mpg.de/mpdl/interface/", serverName=None, timeout=40): | 92 def __init__(self,id,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/", serverName=None, timeout=40): |
86 #def __init__(self,id,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de:30030/mpdl/interface/", serverName=None, timeout=40): | |
87 | 93 |
88 """constructor""" | 94 """constructor""" |
89 self.id=id | 95 self.id=id |
90 self.title=title | 96 self.title=title |
91 self.timeout = timeout | 97 self.timeout = timeout |
101 def getServerData(self, method, data=None): | 107 def getServerData(self, method, data=None): |
102 """returns result from text server for method+data""" | 108 """returns result from text server for method+data""" |
103 url = self.serverUrl+method | 109 url = self.serverUrl+method |
104 return documentViewer.getHttpData(url,data,timeout=self.timeout) | 110 return documentViewer.getHttpData(url,data,timeout=self.timeout) |
105 | 111 |
112 # WTF: what does this really do? can it be integrated in getPage? | |
106 def getSearch(self, pageinfo=None, docinfo=None): | 113 def getSearch(self, pageinfo=None, docinfo=None): |
107 """get search list""" | 114 """get search list""" |
115 logging.debug("getSearch()") | |
108 docpath = docinfo['textURLPath'] | 116 docpath = docinfo['textURLPath'] |
109 url = docinfo['url'] | 117 url = docinfo['url'] |
110 pagesize = pageinfo['queryPageSize'] | 118 pagesize = pageinfo['queryPageSize'] |
111 pn = pageinfo.get('searchPN',1) | 119 pn = pageinfo.get('searchPN',1) |
112 sn = pageinfo['sn'] | 120 sn = pageinfo['sn'] |
205 selfurl = self.absolute_url() | 213 selfurl = self.absolute_url() |
206 pn = pageinfo['current'] | 214 pn = pageinfo['current'] |
207 hrefList=[] | 215 hrefList=[] |
208 myList= "" | 216 myList= "" |
209 text=self.getServerData("xpath.xql", "document=%s&xpath=%s&pn=%s"%(docinfo['textURLPath'],xpath,pn)) | 217 text=self.getServerData("xpath.xql", "document=%s&xpath=%s&pn=%s"%(docinfo['textURLPath'],xpath,pn)) |
210 dom = Parse(text) | 218 dom = ET.fromstring(text) |
211 result = dom.xpath("//result/resultPage/place") | 219 result = dom.findall(".//result/resultPage/place") |
212 for l in result: | 220 for l in result: |
213 hrefNode= l.getAttributeNodeNS(None, u"id") | 221 href = l.get("id") |
214 href= hrefNode.nodeValue | |
215 hrefList.append(href) | 222 hrefList.append(href) |
223 # WTF: what does this do? | |
216 myList = ",".join(hrefList) | 224 myList = ",".join(hrefList) |
217 #logging.debug("getGisPlaces :%s"%(myList)) | 225 #logging.debug("getGisPlaces :%s"%(myList)) |
218 return myList | 226 return myList |
219 | 227 |
220 def getAllGisPlaces (self, docinfo=None, pageinfo=None): | 228 def getAllGisPlaces (self, docinfo=None, pageinfo=None): |
225 selfurl =self.absolute_url() | 233 selfurl =self.absolute_url() |
226 pn =pageinfo['current'] | 234 pn =pageinfo['current'] |
227 hrefList=[] | 235 hrefList=[] |
228 myList="" | 236 myList="" |
229 text=self.getServerData("xpath.xql", "document=%s&xpath=%s"%(docinfo['textURLPath'],xpath)) | 237 text=self.getServerData("xpath.xql", "document=%s&xpath=%s"%(docinfo['textURLPath'],xpath)) |
230 dom =Parse(text) | 238 dom = ET.fromstring(text) |
231 result = dom.xpath("//result/resultPage/place") | 239 result = dom.findall(".//result/resultPage/place") |
232 | 240 |
233 for l in result: | 241 for l in result: |
234 hrefNode = l.getAttributeNodeNS(None, u"id") | 242 href = l.get("id") |
235 href= hrefNode.nodeValue | |
236 hrefList.append(href) | 243 hrefList.append(href) |
244 # WTF: what does this do? | |
237 myList = ",".join(hrefList) | 245 myList = ",".join(hrefList) |
238 #logging.debug("getALLGisPlaces :%s"%(myList)) | 246 #logging.debug("getALLGisPlaces :%s"%(myList)) |
239 return myList | 247 return myList |
240 | 248 |
241 | 249 def processPageInfo(self, dom, docinfo, pageinfo): |
242 def getTextPage(self, mode="text_dict", pn=1, docinfo=None, pageinfo=None): | 250 """processes page info divs from dom and stores in docinfo and pageinfo""" |
243 """returns single page from fulltext""" | 251 # process all toplevel divs |
244 docpath = docinfo['textURLPath'] | |
245 path = docinfo['textURLPath'] | |
246 url = docinfo.get('url',None) | |
247 name = docinfo.get('name',None) | |
248 pn =pageinfo['current'] | |
249 sn = pageinfo['sn'] | |
250 #optionToggle =pageinfo ['optionToggle'] | |
251 highlightQuery = pageinfo['highlightQuery'] | |
252 #mode = pageinfo ['viewMode'] | |
253 tocMode = pageinfo['tocMode'] | |
254 characterNormalization=pageinfo['characterNormalization'] | |
255 tocPN = pageinfo['tocPN'] | |
256 selfurl = self.absolute_url() | |
257 if mode == "text_dict": | |
258 textmode = "textPollux" | |
259 else: | |
260 textmode = mode | |
261 | |
262 textParam = "document=%s&mode=%s&pn=%s&characterNormalization=%s"%(docpath,textmode,pn,characterNormalization) | |
263 if highlightQuery is not None: | |
264 textParam +="&highlightQuery=%s&sn=%s"%(urllib.quote(highlightQuery),sn) | |
265 | |
266 pagexml = self.getServerData("page-fragment.xql",textParam) | |
267 dom = ET.fromstring(pagexml) | |
268 #dom = NonvalidatingReader.parseStream(pagexml) | |
269 | |
270 #original Pages | |
271 #pagedivs = dom.xpath("//div[@class='pageNumberOrig']") | |
272 | |
273 """if pagedivs == dom.xpath("//div[@class='pageNumberOrig']"): | |
274 if len(pagedivs)>0: | |
275 docinfo['pageNumberOrig']= getTextFromNode(pagedivs[0]) | |
276 logging.debug("ORIGINAL PAGE: %s"%(docinfo['pageNumberOrig'])) | |
277 | |
278 #original Pages Norm | |
279 pagedivs = dom.xpath("//div[@class='pageNumberOrigNorm']") | |
280 if pagedivs == dom.xpath("//div[@class='pageNumberOrigNorm']"): | |
281 if len(pagedivs)>0: | |
282 docinfo['pageNumberOrigNorm']= getTextFromNode(pagedivs[0]) | |
283 logging.debug("ORIGINAL PAGE NORM: %s"%(docinfo['pageNumberOrigNorm'])) | |
284 """ | |
285 #figureEntries | |
286 # pagedivs = dom.xpath("//div[@class='countFigureEntries']") | |
287 # if pagedivs == dom.xpath("//div[@class='countFigureEntries']"): | |
288 # if len(pagedivs)>0: | |
289 # docinfo['countFigureEntries'] = getTextFromNode(pagedivs[0]) | |
290 # s = getTextFromNode(pagedivs[0]) | |
291 # if s=='0': | |
292 # try: | |
293 # docinfo['countFigureEntries'] = int(s) | |
294 # except: | |
295 # docinfo['countFigureEntries'] = 0 | |
296 # else: | |
297 # s1 = int(s)/30+1 | |
298 # try: | |
299 # docinfo['countFigureEntries'] = int(s1) | |
300 # except: | |
301 # docinfo['countFigureEntries'] = 0 | |
302 # | |
303 # #allPlaces | |
304 # pagedivs = dom.xpath("//div[@class='countPlaces']") | |
305 # if pagedivs == dom.xpath("//div[@class='countPlaces']"): | |
306 # if len(pagedivs)>0: | |
307 # docinfo['countPlaces']= getTextFromNode(pagedivs[0]) | |
308 # s = getTextFromNode(pagedivs[0]) | |
309 # try: | |
310 # docinfo['countPlaces'] = int(s) | |
311 # except: | |
312 # docinfo['countPlaces'] = 0 | |
313 # | |
314 # #tocEntries | |
315 # pagedivs = dom.xpath("//div[@class='countTocEntries']") | |
316 # if pagedivs == dom.xpath("//div[@class='countTocEntries']"): | |
317 # if len(pagedivs)>0: | |
318 # docinfo['countTocEntries'] = int(getTextFromNode(pagedivs[0])) | |
319 # s = getTextFromNode(pagedivs[0]) | |
320 # if s=='0': | |
321 # try: | |
322 # docinfo['countTocEntries'] = int(s) | |
323 # except: | |
324 # docinfo['countTocEntries'] = 0 | |
325 # else: | |
326 # s1 = int(s)/30+1 | |
327 # try: | |
328 # docinfo['countTocEntries'] = int(s1) | |
329 # except: | |
330 # docinfo['countTocEntries'] = 0 | |
331 | |
332 #numTextPages | |
333 #pagedivs = dom.xpath("//div[@class='countPages']") | |
334 alldivs = dom.findall(".//div") | 252 alldivs = dom.findall(".//div") |
335 pagediv = None | 253 pagediv = None |
336 for div in alldivs: | 254 for div in alldivs: |
337 dc = div.get('class') | 255 dc = div.get('class') |
256 | |
257 # page content div | |
338 if dc == 'pageContent': | 258 if dc == 'pageContent': |
339 pagediv = div | 259 pagediv = div |
260 | |
261 # pageNumberOrig | |
262 elif dc == 'pageNumberOrig': | |
263 pageinfo['pageNumberOrig'] = div.text | |
340 | 264 |
341 if dc == 'countPages': | 265 # pageNumberOrigNorm |
342 try: | 266 elif dc == 'pageNumberOrigNorm': |
343 np = int(div.text) | 267 pageinfo['pageNumberOrigNorm'] = div.text |
344 docinfo['numPages'] = np | 268 |
345 pageinfo['end'] = min(pageinfo['end'], np) | 269 # countFigureEntries |
346 pageinfo['numgroups'] = int(np / pageinfo['groupsize']) | 270 elif dc == 'countFigureEntries': |
347 if np % pageinfo['groupsize'] > 0: | 271 docinfo['countFigureEntries'] = intOr0(div.text) |
348 pageinfo['numgroups'] += 1 | 272 |
349 | 273 # countTocEntries |
350 except: | 274 elif dc == 'countTocEntries': |
351 docinfo['numPages'] = 0 | 275 # WTF: s1 = int(s)/30+1 |
352 | 276 docinfo['countTocEntries'] = intOr0(div.text) |
277 | |
278 # numTextPages | |
279 elif dc == 'countPages': | |
280 np = intOr0(div.text) | |
281 if np > 0: | |
282 docinfo['numTextPages'] = np | |
283 if docinfo.get('numPages', 0) == 0: | |
284 # seems to be text-only | |
285 docinfo['numTextPages'] = np | |
286 pageinfo['end'] = min(pageinfo['end'], np) | |
287 pageinfo['numgroups'] = int(np / pageinfo['groupsize']) | |
288 if np % pageinfo['groupsize'] > 0: | |
289 pageinfo['numgroups'] += 1 | |
290 | |
291 return | |
292 | |
293 | |
294 def getTextPage(self, mode="text_dict", pn=1, docinfo=None, pageinfo=None): | |
295 """returns single page from fulltext""" | |
296 logging.debug("getTextPage mode=%s, pn=%s"%(mode,pn)) | |
297 # check for cached text -- but this shouldn't be called twice | |
298 if pageinfo.has_key('textPage'): | |
299 logging.debug("getTextPage: using cached text") | |
300 return pageinfo['textPage'] | |
301 | |
302 docpath = docinfo['textURLPath'] | |
303 # just checking | |
304 if pageinfo['current'] != pn: | |
305 logging.warning("getTextPage: current!=pn!") | |
306 | |
307 # stuff for constructing full urls | |
308 url = docinfo['url'] | |
309 urlmode = docinfo['mode'] | |
310 sn = pageinfo.get('sn', None) | |
311 highlightQuery = pageinfo.get('highlightQuery', None) | |
312 tocMode = pageinfo.get('tocMode', None) | |
313 tocPN = pageinfo.get('tocPN',None) | |
314 characterNormalization = pageinfo.get('characterNormalization', None) | |
315 selfurl = docinfo['viewerUrl'] | |
316 | |
317 if mode == "text_dict": | |
318 # text_dict is called textPollux in the backend | |
319 textmode = "textPollux" | |
320 else: | |
321 textmode = mode | |
322 | |
323 textParam = "document=%s&mode=%s&pn=%s&characterNormalization=%s"%(docpath,textmode,pn,characterNormalization) | |
324 if highlightQuery: | |
325 textParam +="&highlightQuery=%s&sn=%s"%(urllib.quote(highlightQuery),sn) | |
326 | |
327 # fetch the page | |
328 pagexml = self.getServerData("page-fragment.xql",textParam) | |
329 dom = ET.fromstring(pagexml) | |
330 # extract additional info | |
331 self.processPageInfo(dom, docinfo, pageinfo) | |
332 # page content is in <div class="pageContent"> | |
333 pagediv = None | |
334 # ElementTree 1.2 in Python 2.6 can't do div[@class='pageContent'] | |
335 alldivs = dom.findall(".//div") | |
336 for div in alldivs: | |
337 dc = div.get('class') | |
338 # page content div | |
339 if dc == 'pageContent': | |
340 pagediv = div | |
353 break | 341 break |
354 | |
355 # ROC: why? | |
356 # else: | |
357 # #no full text -- init to 0 | |
358 # docinfo['pageNumberOrig'] = 0 | |
359 # docinfo['countFigureEntries'] = 0 | |
360 # docinfo['countPlaces'] = 0 | |
361 # docinfo['countTocEntries'] = 0 | |
362 # docinfo['numPages'] = 0 | |
363 # docinfo['pageNumberOrigNorm'] = 0 | |
364 # #return docinfo | |
365 | 342 |
366 # plain text mode | 343 # plain text mode |
367 if mode == "text": | 344 if mode == "text": |
368 #pagedivs = dom.xpath("/div") | |
369 if pagediv: | 345 if pagediv: |
370 links = pagediv.findall(".//a") | 346 links = pagediv.findall(".//a") |
371 for l in links: | 347 for l in links: |
372 href = l.get('href') | 348 href = l.get('href') |
373 if href and href.startswith('#note-'): | 349 if href and href.startswith('#note-'): |
374 href = href.replace('#note-',"?url=%s&viewMode=text&tocMode=%s&tocPN=%s&pn=%s#note-"%(url,tocMode,tocPN,pn)) | 350 href = href.replace('#note-',"?mode=%s&url=%s&viewMode=text&tocMode=%s&tocPN=%s&pn=%s#note-"%(urlmode,url,tocMode,tocPN,pn)) |
375 l.set('href', href) | 351 l.set('href', href) |
376 logging.debug("page=%s"%ET.tostring(pagediv, 'UTF-8')) | 352 |
377 return serialize(pagediv) | 353 return serialize(pagediv) |
378 | 354 |
379 if mode == "xml": | |
380 if pagediv: | |
381 return serialize(pagediv) | |
382 | |
383 if mode == "pureXml": | |
384 if pagediv: | |
385 return serialize(pagediv) | |
386 | |
387 if mode == "gis": | |
388 if pagediv: | |
389 # check all a-tags | |
390 links = pagediv.findall(".//a") | |
391 for l in links: | |
392 href = l.get('href') | |
393 if href: | |
394 if href.startswith('http://chinagis.mpiwg-berlin.mpg.de'): | |
395 l.set('href', href.replace('chinagis_REST/REST/db/chgis/mpdl','chinagis/REST/db/mpdl/%s'%name)) | |
396 l.set('target', '_blank') | |
397 | |
398 return serialize(pagenode) | |
399 | |
400 # text-with-links mode | 355 # text-with-links mode |
401 if mode == "text_dict": | 356 elif mode == "text_dict": |
402 if pagediv: | 357 if pagediv: |
403 # check all a-tags | 358 # check all a-tags |
404 links = pagediv.findall(".//a") | 359 links = pagediv.findall(".//a") |
405 for l in links: | 360 for l in links: |
406 href = l.get('href') | 361 href = l.get('href') |
421 l.set('target', '_blank') | 376 l.set('target', '_blank') |
422 l.set('onclick',"popupWin = window.open(this.href, 'InfoWindow', 'menubar=no, location,width=500,height=600,top=180, left=700, toolbar=no, scrollbars=1'); return false;") | 377 l.set('onclick',"popupWin = window.open(this.href, 'InfoWindow', 'menubar=no, location,width=500,height=600,top=180, left=700, toolbar=no, scrollbars=1'); return false;") |
423 l.set('ondblclick', 'popupWin.focus();') | 378 l.set('ondblclick', 'popupWin.focus();') |
424 | 379 |
425 if href.startswith('#note-'): | 380 if href.startswith('#note-'): |
426 l.set('href', href.replace('#note-',"?url=%s&viewMode=text_dict&tocMode=%s&tocPN=%s&pn=%s#note-"%(url,tocMode,tocPN,pn))) | 381 l.set('href', href.replace('#note-',"?mode=%s&url=%s&viewMode=text_dict&tocMode=%s&tocPN=%s&pn=%s#note-"%(urlmode,url,tocMode,tocPN,pn))) |
427 | 382 |
428 return serialize(pagediv) | 383 return serialize(pagediv) |
429 | 384 |
385 # xml mode | |
386 elif mode == "xml": | |
387 if pagediv: | |
388 return serialize(pagediv) | |
389 | |
390 # pureXml mode | |
391 elif mode == "pureXml": | |
392 if pagediv: | |
393 return serialize(pagediv) | |
394 | |
395 # gis mode | |
396 elif mode == "gis": | |
397 name = docinfo['name'] | |
398 if pagediv: | |
399 # check all a-tags | |
400 links = pagediv.findall(".//a") | |
401 for l in links: | |
402 href = l.get('href') | |
403 if href: | |
404 if href.startswith('http://chinagis.mpiwg-berlin.mpg.de'): | |
405 l.set('href', href.replace('chinagis_REST/REST/db/chgis/mpdl','chinagis/REST/db/mpdl/%s'%name)) | |
406 l.set('target', '_blank') | |
407 | |
408 return serialize(pagediv) | |
409 | |
430 return "no text here" | 410 return "no text here" |
431 | 411 |
412 # WTF: is this needed? | |
432 def getOrigPages(self, docinfo=None, pageinfo=None): | 413 def getOrigPages(self, docinfo=None, pageinfo=None): |
433 docpath = docinfo['textURLPath'] | 414 logging.debug("CALLED: getOrigPages!") |
434 pn =pageinfo['current'] | 415 if not pageinfo.has_key('pageNumberOrig'): |
435 selfurl = self.absolute_url() | 416 logging.warning("getOrigPages: not in pageinfo!") |
436 pagexml = self.getServerData("page-fragment.xql","document=%s&pn=%s"%(docpath, pn)) | 417 return None |
437 dom = Parse(pagexml) | 418 |
438 pagedivs = dom.xpath("//div[@class='pageNumberOrig']") | 419 return pageinfo['pageNumberOrig'] |
439 if pagedivs == dom.xpath("//div[@class='pageNumberOrig']"): | 420 |
440 if len(pagedivs)>0: | 421 # WTF: is this needed? |
441 docinfo['pageNumberOrig']= getTextFromNode(pagedivs[0]) | |
442 return docinfo['pageNumberOrig'] | |
443 | |
444 def getOrigPagesNorm(self, docinfo=None, pageinfo=None): | 422 def getOrigPagesNorm(self, docinfo=None, pageinfo=None): |
445 docpath = docinfo['textURLPath'] | 423 logging.debug("CALLED: getOrigPagesNorm!") |
446 pn =pageinfo['current'] | 424 if not pageinfo.has_key('pageNumberOrigNorm'): |
447 selfurl = self.absolute_url() | 425 logging.warning("getOrigPagesNorm: not in pageinfo!") |
448 pagexml = self.getServerData("page-fragment.xql","document=%s&pn=%s"%(docpath, pn)) | 426 return None |
449 dom = Parse(pagexml) | 427 |
450 pagedivs = dom.xpath("//div[@class='pageNumberOrigNorm']") | 428 return pageinfo['pageNumberOrigNorm'] |
451 if pagedivs == dom.xpath("//div[@class='pageNumberOrigNorm']"): | |
452 if len(pagedivs)>0: | |
453 docinfo['pageNumberOrigNorm']= getTextFromNode(pagedivs[0]) | |
454 return docinfo['pageNumberOrigNorm'] | |
455 | |
456 | 429 |
430 # TODO: should be getWordInfo | |
457 def getTranslate(self, word=None, language=None): | 431 def getTranslate(self, word=None, language=None): |
458 """translate into another language""" | 432 """translate into another language""" |
459 data = self.getServerData("lt/wordInfo.xql","language="+str(language)+"&word="+urllib.quote(word)+"&output=html") | 433 data = self.getServerData("lt/wordInfo.xql","language="+str(language)+"&word="+urllib.quote(word)+"&output=html") |
460 #pagexml=self.template.fulltextclient.eval("/mpdl/interface/lt/lex.xql","document=&language="+str(language)+"&query="+url_quote(str(query))) | |
461 return data | 434 return data |
462 | 435 |
436 # WTF: what does this do? | |
463 def getLemma(self, lemma=None, language=None): | 437 def getLemma(self, lemma=None, language=None): |
464 """similar words lemma """ | 438 """similar words lemma """ |
465 data = self.getServerData("lt/lemma.xql","language="+str(language)+"&lemma="+urllib.quote(lemma)+"&output=html") | 439 data = self.getServerData("lt/lemma.xql","language="+str(language)+"&lemma="+urllib.quote(lemma)+"&output=html") |
466 return data | 440 return data |
467 | 441 |
442 # WTF: what does this do? | |
468 def getLemmaQuery(self, query=None, language=None): | 443 def getLemmaQuery(self, query=None, language=None): |
469 """similar words lemma """ | 444 """similar words lemma """ |
470 data = self.getServerData("lt/lemma.xql","language="+str(language)+"&query="+urllib.quote(query)+"&output=html") | 445 data = self.getServerData("lt/lemma.xql","language="+str(language)+"&query="+urllib.quote(query)+"&output=html") |
471 return data | 446 return data |
472 | 447 |
448 # WTF: what does this do? | |
473 def getLex(self, query=None, language=None): | 449 def getLex(self, query=None, language=None): |
474 #similar words lemma | 450 #similar words lemma |
475 data = self.getServerData("lt/lex.xql","document=&language="+str(language)+"&query="+urllib.quote(query)) | 451 data = self.getServerData("lt/lex.xql","document=&language="+str(language)+"&query="+urllib.quote(query)) |
476 return data | 452 return data |
477 | 453 |
454 # WTF: what does this do? | |
478 def getQuery (self, docinfo=None, pageinfo=None, query=None, queryType=None, pn=1): | 455 def getQuery (self, docinfo=None, pageinfo=None, query=None, queryType=None, pn=1): |
479 #number of | 456 #number of |
480 docpath = docinfo['textURLPath'] | 457 docpath = docinfo['textURLPath'] |
481 pagesize = pageinfo['queryPageSize'] | 458 pagesize = pageinfo['queryPageSize'] |
482 pn = pageinfo['searchPN'] | 459 pn = pageinfo['searchPN'] |
491 tocSearch = int(getTextFromNode(numdivs[0])) | 468 tocSearch = int(getTextFromNode(numdivs[0])) |
492 tc=int((tocSearch/10)+1) | 469 tc=int((tocSearch/10)+1) |
493 return tc | 470 return tc |
494 | 471 |
495 def getToc(self, mode="text", docinfo=None): | 472 def getToc(self, mode="text", docinfo=None): |
496 """loads table of contents and stores in docinfo""" | 473 """loads table of contents and stores XML in docinfo""" |
474 logging.debug("getToc mode=%s"%mode) | |
497 if mode == "none": | 475 if mode == "none": |
498 return docinfo | 476 return docinfo |
477 | |
499 if 'tocSize_%s'%mode in docinfo: | 478 if 'tocSize_%s'%mode in docinfo: |
500 # cached toc | 479 # cached toc |
501 return docinfo | 480 return docinfo |
502 | 481 |
503 docpath = docinfo['textURLPath'] | 482 docpath = docinfo['textURLPath'] |
509 else: | 488 else: |
510 queryType = mode | 489 queryType = mode |
511 # number of entries in toc | 490 # number of entries in toc |
512 tocSize = 0 | 491 tocSize = 0 |
513 tocDiv = None | 492 tocDiv = None |
514 | 493 # fetch full toc |
515 pagexml = self.getServerData("doc-query.xql","document=%s&queryType=%s&queryResultPageSize=%s&queryResultPN=%s"%(docpath,queryType, pagesize, pn)) | 494 pagexml = self.getServerData("doc-query.xql","document=%s&queryType=%s&queryResultPageSize=%s&queryResultPN=%s"%(docpath,queryType, pagesize, pn)) |
516 | 495 dom = ET.fromstring(pagexml) |
517 # post-processing downloaded xml | 496 # page content is in <div class="queryResultPage"> |
518 pagedom = Parse(pagexml) | 497 pagediv = None |
519 # get number of entries | 498 # ElementTree 1.2 in Python 2.6 can't do div[@class='queryResultPage'] |
520 numdivs = pagedom.xpath("//div[@class='queryResultHits']") | 499 alldivs = dom.findall("div") |
521 if len(numdivs) > 0: | 500 for div in alldivs: |
522 tocSize = int(getTextFromNode(numdivs[0])) | 501 dc = div.get('class') |
523 docinfo['tocSize_%s'%mode] = tocSize | 502 # page content div |
503 if dc == 'queryResultPage': | |
504 pagediv = div | |
505 | |
506 elif dc == 'queryResultHits': | |
507 docinfo['tocSize_%s'%mode] = intOr0(div.text) | |
508 | |
509 if pagediv: | |
510 # # split xml in chunks | |
511 # tocs = [] | |
512 # tocdivs = pagediv.findall('div') | |
513 # for p in zip(tocdivs[::2], tocdivs[1::2]): | |
514 # toc = serialize(p[0]) | |
515 # toc += serialize(p[1]) | |
516 # tocs.append(toc) | |
517 # logging.debug("pair: %s"%(toc)) | |
518 # store XML in docinfo | |
519 docinfo['tocXML_%s'%mode] = ET.tostring(pagediv, 'UTF-8') | |
520 | |
524 return docinfo | 521 return docinfo |
525 | 522 |
526 def getTocPage(self, mode="text", pn=1, pageinfo=None, docinfo=None): | 523 def getTocPage(self, mode="text", pn=1, pageinfo=None, docinfo=None): |
527 """returns single page from the table of contents""" | 524 """returns single page from the table of contents""" |
528 # TODO: this should use the cached TOC | 525 logging.debug("getTocPage mode=%s, pn=%s"%(mode,pn)) |
529 if mode == "text": | 526 if mode == "text": |
530 queryType = "toc" | 527 queryType = "toc" |
531 else: | 528 else: |
532 queryType = mode | 529 queryType = mode |
533 docpath = docinfo['textURLPath'] | 530 |
534 path = docinfo['textURLPath'] | 531 # check for cached TOC |
535 pagesize = pageinfo['tocPageSize'] | 532 if not docinfo.has_key('tocXML_%s'%mode): |
536 pn = pageinfo['tocPN'] | 533 self.getToc(mode=mode, docinfo=docinfo) |
534 | |
535 tocxml = docinfo.get('tocXML_%s'%mode, None) | |
536 if not tocxml: | |
537 logging.error("getTocPage: unable to find tocXML") | |
538 return "No ToC" | |
539 | |
540 pagesize = int(pageinfo['tocPageSize']) | |
537 url = docinfo['url'] | 541 url = docinfo['url'] |
538 selfurl = self.absolute_url() | 542 urlmode = docinfo['mode'] |
543 selfurl = docinfo['viewerUrl'] | |
539 viewMode= pageinfo['viewMode'] | 544 viewMode= pageinfo['viewMode'] |
540 characterNormalization = pageinfo ['characterNormalization'] | |
541 #optionToggle =pageinfo ['optionToggle'] | |
542 tocMode = pageinfo['tocMode'] | 545 tocMode = pageinfo['tocMode'] |
543 tocPN = pageinfo['tocPN'] | 546 tocPN = int(pageinfo['tocPN']) |
544 | 547 |
545 data = self.getServerData("doc-query.xql","document=%s&queryType=%s&queryResultPageSize=%s&queryResultPN=%s&characterNormalization=regPlusNorm"%(docpath,queryType, pagesize, pn)) | 548 fulltoc = ET.fromstring(tocxml) |
546 page = data.replace('page-fragment.xql?document=%s'%str(path),'%s?url=%s&viewMode=%s&tocMode=%s&tocPN=%s'%(selfurl,url, viewMode, tocMode, tocPN)) | 549 |
547 text = page.replace('mode=image','mode=texttool') | 550 if fulltoc: |
548 return text | 551 # paginate |
552 #start = (pn - 1) * pagesize * 2 | |
553 #end = start + pagesize * 2 | |
554 #tocdivs = fulltoc[start:end] | |
555 tocdivs = fulltoc | |
556 | |
557 # check all a-tags | |
558 links = tocdivs.findall(".//a") | |
559 for l in links: | |
560 href = l.get('href') | |
561 if href: | |
562 # take pn from href | |
563 m = re.match(r'page-fragment\.xql.*pn=(\d+)', href) | |
564 if m is not None: | |
565 # and create new url | |
566 l.set('href', '%s?mode=%s&url=%s&viewMode=%s&pn=%s&tocMode=%s&tocPN=%s'%(selfurl, urlmode, url, viewMode, m.group(1), tocMode, tocPN)) | |
567 else: | |
568 logging.warning("getTocPage: Problem with link=%s"%href) | |
569 | |
570 return serialize(tocdivs) | |
571 | |
549 | 572 |
550 def manage_changeMpdlXmlTextServer(self,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/",timeout=40,RESPONSE=None): | 573 def manage_changeMpdlXmlTextServer(self,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/",timeout=40,RESPONSE=None): |
551 #def manage_changeMpdlXmlTextServer(self,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de:30030/mpdl/interface/",timeout=40,RESPONSE=None): | |
552 """change settings""" | 574 """change settings""" |
553 self.title=title | 575 self.title=title |
554 self.timeout = timeout | 576 self.timeout = timeout |
555 self.serverUrl = serverUrl | 577 self.serverUrl = serverUrl |
556 if RESPONSE is not None: | 578 if RESPONSE is not None: |
567 """add zogiimage""" | 589 """add zogiimage""" |
568 newObj = MpdlXmlTextServer(id,title,serverUrl,timeout) | 590 newObj = MpdlXmlTextServer(id,title,serverUrl,timeout) |
569 self.Destination()._setObject(id, newObj) | 591 self.Destination()._setObject(id, newObj) |
570 if RESPONSE is not None: | 592 if RESPONSE is not None: |
571 RESPONSE.redirect('manage_main') | 593 RESPONSE.redirect('manage_main') |
594 | |
595 |