Context Navigation

source: documentViewer/MpdlXmlTextServer.py @ 511:551ca1641a5e

elementtree

Last change on this file since 511:551ca1641a5e was 511:551ca1641a5e, checked in by casties, 12 years ago
more cleanup. search really works now.
File size: 19.4 KB

Line
1	from OFS.SimpleItem import SimpleItem
2	from Products.PageTemplates.PageTemplateFile import PageTemplateFile
3
4	import xml.etree.ElementTree as ET
5
6	import re
7	import logging
8	import urllib
9	import urlparse
10	import base64
11
12	from SrvTxtUtils import getInt, getText, getHttpData
13
def serialize(node):
    """returns a string containing an XML snippet of node

    node -- ElementTree element; it is serialized with
            ET.tostring(node, 'UTF-8').

    A leading XML declaration is removed. The result has the same string
    type that ET.tostring() produced.
    """
    s = ET.tostring(node, 'UTF-8')
    # use markers of the same type as the serialized data so that the
    # comparison works for byte strings as well as for text strings
    if isinstance(s, str):
        declStart, declEnd = '<?xml', '?>'
    else:
        declStart, declEnd = b'<?xml', b'?>'

    if s.startswith(declStart):
        i = s.find(declEnd)
        if i >= 0:
            # snip off the XML declaration and the whitespace after it
            # (don't rely on exactly one newline following the declaration)
            return s[i + len(declEnd):].lstrip()

    return s
23
24
class MpdlXmlTextServer(SimpleItem):
    """TextServer implementation for MPDL-XML eXist server"""
    # presumably the type name Zope displays for objects of this class -- confirm
    meta_type="MPDL-XML TextServer"

    # puts a 'Config' entry, pointing at the change form defined below,
    # in front of the entries taken over from SimpleItem
    manage_options=(
        {'label':'Config','action':'manage_changeMpdlXmlTextServerForm'},
        )+SimpleItem.manage_options

    # page template used as target of the 'Config' entry
    manage_changeMpdlXmlTextServerForm = PageTemplateFile("zpt/manage_changeMpdlXmlTextServer", globals())
34
def __init__(self,id,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/", serverName=None, timeout=40):
    """constructor

    Stores id, title and timeout. The text server URL is serverUrl unless
    a serverName is given; in that case the URL is built from serverName.
    """
    if serverName is not None:
        # a server name wins over the full URL
        serverUrl = "http://%s/mpdl/interface/"%serverName

    self.id = id
    self.title = title
    self.timeout = timeout
    self.serverUrl = serverUrl
44
def getHttpData(self, url, data=None):
    """returns result from url+data HTTP request

    Passes the request on to the getHttpData helper imported from
    SrvTxtUtils, using the timeout configured on this object.
    """
    timeout = self.timeout
    return getHttpData(url, data, timeout=timeout)
48
def getServerData(self, method, data=None):
    """returns result from text server for method+data

    method is appended to the configured serverUrl; the request uses the
    timeout configured on this object.
    """
    return getHttpData(self.serverUrl + method, data, timeout=self.timeout)
53
54
def getPlacesOnPage(self, docinfo=None, pn=None):
    """Returns list of GIS places of page pn

    Reads 'textURLPath' from docinfo and returns None if it is missing or
    empty. Otherwise returns a list with one dict {'id':..., 'name':...}
    per <place> element found below <resultPage> in the server response.
    """
    docpath = docinfo.get('textURLPath',None)
    if not docpath:
        return None

    query = "document=%s&xpath=//place&pn=%s"%(docpath,pn)
    dom = ET.fromstring(self.getServerData("xpath.xql", query))
    # id comes from the attribute, name from the text of the element
    return [{'id': placeElem.get("id"), 'name': placeElem.text}
            for placeElem in dom.findall(".//resultPage/place")]
72
73
def processPageInfo(self, dom, docinfo, pageinfo):
    """processes page info divs from dom and stores in docinfo and pageinfo

    The first div child of dom has to be the div with class 'pageMeta';
    otherwise an error is logged and nothing is changed. The children of
    that div are dispatched on their class attribute:
    page number and header title texts go into pageinfo, the counts go
    (converted by getInt) into docinfo.
    """
    # assume first second level div is pageMeta
    metaDiv = dom.find("div")
    if metaDiv is None or metaDiv.get('class', '') != 'pageMeta':
        logging.error("processPageInfo: pageMeta div not found!")
        return

    # classes whose text is stored in pageinfo under the same name
    textFields = ('pageNumberOrig', 'pageNumberOrigNorm', 'pageHeaderTitle')
    # classes holding a count, mapped to the docinfo key they are stored in
    countFields = {'countFigureEntries': 'numFigureEntries',
                   'countTocEntries': 'numTocEntries',
                   'countPlaces': 'numPlaces'}

    for entry in metaDiv:
        cls = entry.get('class')

        if cls in textFields:
            pageinfo[cls] = entry.text

        elif cls in countFields:
            docinfo[countFields[cls]] = getInt(entry.text)

        elif cls == 'countPages':
            numTextPages = getInt(entry.text)
            if not numTextPages > 0:
                continue

            docinfo['numTextPages'] = numTextPages
            if docinfo.get('numPages', 0) == 0:
                # seems to be text-only - update page count and the
                # number of page groups (rounded up)
                docinfo['numPages'] = numTextPages
                groupsize = pageinfo['groupsize']
                numgroups = int(numTextPages / groupsize)
                if numTextPages % groupsize > 0:
                    numgroups += 1
                pageinfo['numgroups'] = numgroups
126
127
def getTextPage(self, mode="text", pn=1, docinfo=None, pageinfo=None):
    """returns single page from fulltext

    mode     -- comma separated list of modes. 'search' adds the highlight
                parameters found in pageinfo to the request and is ignored
                otherwise. 'dict' is sent to the backend as 'textPollux'.
                Without any other mode 'text' is used, else the first
                remaining mode.
    pn       -- number of the page to fetch
    docinfo  -- dict; 'textURLPath' is read, processPageInfo may update it
    pageinfo -- dict; 'current' is read, a 'textPage' entry is returned
                unchanged as cached result

    Returns the serialized <div class="pageContent"> for the modes text,
    textPollux, xml, pureXml and gis. Returns None for any other mode and
    when the server response has no pageContent div.
    """
    logging.debug("getTextPage mode=%s, pn=%s"%(mode,pn))
    # check for cached text -- but ideally this shouldn't be called twice
    if 'textPage' in pageinfo:
        logging.debug("getTextPage: using cached text")
        return pageinfo['textPage']

    docpath = docinfo['textURLPath']
    # just checking
    if pageinfo['current'] != pn:
        logging.warning("getTextPage: current!=pn!")

    # parameters of the backend request
    textParams = {'document': docpath,
                  'pn': pn}
    if 'characterNormalization' in pageinfo:
        textParams['characterNormalization'] = pageinfo['characterNormalization']

    if not mode:
        mode = 'text'

    modes = mode.split(',')
    # check for multiple layers
    if len(modes) > 1:
        logging.debug("getTextPage: more than one mode=%s"%mode)

    # search mode
    if 'search' in modes:
        # add highlighting
        highlightQuery = pageinfo.get('highlightQuery', None)
        if highlightQuery:
            textParams['highlightQuery'] = highlightQuery
            textParams['highlightElement'] = pageinfo.get('highlightElement', '')
            textParams['highlightElementPos'] = pageinfo.get('highlightElementPos', '')

        # ignore mode in the following
        modes.remove('search')

    # other modes don't combine
    if 'dict' in modes:
        # dict is called textPollux in the backend
        textmode = 'textPollux'
    elif len(modes) == 0:
        # text is default mode
        textmode = 'text'
    else:
        # just take first mode
        textmode = modes[0]

    textParams['mode'] = textmode

    # fetch the page
    pagexml = self.getServerData("page-fragment.xql",urllib.urlencode(textParams))
    dom = ET.fromstring(pagexml)
    # extract additional info
    self.processPageInfo(dom, docinfo, pageinfo)

    # page content is in <div class="pageContent">
    # ElementTree 1.2 in Python 2.6 can't do div[@class='pageContent']
    # so we look at the second level divs
    pagediv = None
    for div in dom.findall("div"):
        if div.get('class') == 'pageContent':
            pagediv = div
            break

    if pagediv is None:
        # nothing to return, whatever the mode is
        return None

    # xml and pureXml mode: content is returned as it is
    if textmode in ("xml", "pureXml"):
        return serialize(pagediv)

    # plain text mode
    if textmode == "text":
        # get full url assuming documentViewer is parent
        selfurl = self.getLink()
        for link in pagediv.findall(".//a"):
            href = link.get('href')
            if href and href.startswith('#note-'):
                # note link: put our own URL in front of the fragment
                link.set('href', href.replace('#note-',"%s#note-"%selfurl))

        return serialize(pagediv)

    # text-with-links mode
    if textmode == "textPollux":
        selfurl = self.getLink()
        # check all a-tags
        for link in pagediv.findall(".//a"):
            href = link.get('href')
            if not href:
                continue

            linkurl = urlparse.urlparse(href)
            if linkurl.path.endswith('GetDictionaryEntries'):
                #TODO: replace wordInfo page
                # is dictionary link - add target to open new page
                link.set('target', '_blank')

            if href.startswith('#note-'):
                # note link
                link.set('href', href.replace('#note-',"%s#note-"%selfurl))

        return serialize(pagediv)

    # gis mode
    if textmode == "gis":
        # add our URL as backlink
        selfurl = self.getLink()
        doc = base64.b64encode(selfurl)
        # check all a-tags
        for link in pagediv.findall(".//a"):
            href = link.get('href')
            if href and href.startswith('http://mappit.mpiwg-berlin.mpg.de'):
                link.set('href', re.sub(r'doc=[\w+/=]+', 'doc=%s'%doc, href))
                link.set('target', '_blank')

        return serialize(pagediv)

    # unknown mode
    return None
277
278
def getSearchResults(self, mode, query=None, pageinfo=None, docinfo=None):
    """loads list of search results and stores XML in docinfo

    mode     -- query type sent to the backend; "none" does nothing
    query    -- query string
    pageinfo -- dict; 'characterNormalization' is read (default 'reg')
    docinfo  -- dict; 'textURLPath' is read. The results are stored as
                'resultXML' (serialized <div class="queryResultPage">)
                and 'resultSize' (from <div class="queryResultHits">),
                the query is remembered as 'cachedQuery'.

    Returns docinfo. If the same mode and query are already cached the
    server is not asked again.
    """
    logging.debug("getSearchResults mode=%s query=%s"%(mode, query))
    if mode == "none":
        return docinfo

    queryKey = '%s_%s'%(mode,query)
    cachedQuery = docinfo.get('cachedQuery', None)
    if cachedQuery is not None:
        if cachedQuery == queryKey:
            # same query - use cached search result
            return docinfo

        # different query - forget the old result. The previous query
        # may have stored no size or no XML at all, so don't insist on
        # the keys being there.
        docinfo.pop('resultSize', None)
        docinfo.pop('resultXML', None)
        docinfo.pop('cachedQuery', None)

    # fetch full results
    docpath = docinfo['textURLPath']
    params = {'document': docpath,
              'mode': 'text',
              'queryType': mode,
              'query': query,
              'queryResultPageSize': 1000,
              'queryResultPN': 1,
              'characterNormalization': pageinfo.get('characterNormalization', 'reg')}
    pagexml = self.getServerData("doc-query.xql",urllib.urlencode(params))
    dom = ET.fromstring(pagexml)

    # page content is in <div class="queryResultPage">
    # ElementTree 1.2 in Python 2.6 can't do div[@class='queryResultPage']
    pagediv = None
    for div in dom.findall("div"):
        dc = div.get('class')
        if dc == 'queryResultPage':
            # page content div
            pagediv = div

        elif dc == 'queryResultHits':
            docinfo['resultSize'] = getInt(div.text)

    if pagediv is not None:
        # store XML in docinfo
        docinfo['resultXML'] = ET.tostring(pagediv, 'UTF-8')

    # cache query - only now, so that a request that failed above is
    # not taken for a cached result the next time
    docinfo['cachedQuery'] = queryKey

    return docinfo
331
332
def getResultsPage(self, mode="text", query=None, pn=None, start=None, size=None, pageinfo=None, docinfo=None):
    """returns single page from the search results

    mode, query -- passed to getSearchResults if docinfo holds no
                   'resultXML' yet
    pn          -- number of the result page (counted from 1); only used
                   if start is not given
    start       -- number of the first result on the page (counted from 1)
    size        -- results per page; default is pageinfo['resultPageSize']
                   or 10
    pageinfo    -- dict, see size
    docinfo     -- dict holding the cached 'resultXML'

    Returns the serialized result element cut down to the requested page,
    with all links pointing back to the viewer. Returns an error message
    string if there is no search result.
    """
    logging.debug("getResultsPage mode=%s, pn=%s"%(mode,pn))
    # check for cached result
    if 'resultXML' not in docinfo:
        self.getSearchResults(mode=mode, query=query, pageinfo=pageinfo, docinfo=docinfo)

    resultxml = docinfo.get('resultXML', None)
    if not resultxml:
        logging.error("getResultPage: unable to find resultXML")
        return "Error: no result!"

    if size is None:
        size = pageinfo.get('resultPageSize', 10)

    if start is None:
        # start counts from 1 (see below); without pn show the first page
        start = ((pn or 1) - 1) * size + 1

    fullresult = ET.fromstring(resultxml)

    # paginate: drop everything before and after the requested page.
    # A negative index would count from the end of the list, so stop at 0.
    first = max(start - 1, 0)
    del fullresult[:first]
    del fullresult[size:]

    # check all a-tags
    for link in fullresult.findall(".//a"):
        href = link.get('href')
        if not href:
            continue

        # assume all links go to pages
        linkUrl = urlparse.urlparse(href)
        linkParams = urlparse.parse_qs(linkUrl.query)
        if 'pn' not in linkParams:
            # not a page link - leave it alone
            logging.warning("getResultsPage: Problem with link=%s"%href)
            continue

        # take some parameters
        params = {'pn': linkParams['pn'],
                  'highlightQuery': linkParams.get('highlightQuery',''),
                  'highlightElement': linkParams.get('highlightElement',''),
                  'highlightElementPos': linkParams.get('highlightElementPos','')
                  }
        link.set('href', self.getLink(params=params))

    return serialize(fullresult)
381
382
def getToc(self, mode="text", docinfo=None):
    """loads table of contents and stores XML in docinfo

    mode    -- "none" does nothing, "text" asks the backend for query
               type "toc", any other mode is used as query type itself
    docinfo -- dict; 'textURLPath' is read. The result is stored as
               'tocXML_<mode>' (serialized <div class="queryResultPage">)
               and 'tocSize_<mode>' (from <div class="queryResultHits">).

    Returns docinfo. If 'tocSize_<mode>' is already there the server is
    not asked again.
    """
    logging.debug("getToc mode=%s"%mode)
    if mode == "none":
        return docinfo

    if 'tocSize_%s'%mode in docinfo:
        # cached toc
        return docinfo

    if mode == "text":
        queryType = "toc"
    else:
        queryType = mode

    # fetch full toc (we need to set a result set size).
    # The parameters are URL-encoded so that special characters in the
    # document path can't break the query string.
    params = {'document': docinfo['textURLPath'],
              'queryType': queryType,
              'queryResultPageSize': 1000,
              'queryResultPN': 1}
    pagexml = self.getServerData("doc-query.xql",urllib.urlencode(params))
    dom = ET.fromstring(pagexml)

    # page content is in <div class="queryResultPage">
    # ElementTree 1.2 in Python 2.6 can't do div[@class='queryResultPage']
    pagediv = None
    for div in dom.findall("div"):
        dc = div.get('class')
        if dc == 'queryResultPage':
            # page content div
            pagediv = div

        elif dc == 'queryResultHits':
            docinfo['tocSize_%s'%mode] = getInt(div.text)

    if pagediv is not None:
        # store XML in docinfo
        docinfo['tocXML_%s'%mode] = ET.tostring(pagediv, 'UTF-8')

    return docinfo
425
def getTocPage(self, mode="text", pn=None, start=None, size=None, pageinfo=None, docinfo=None):
    """returns single page from the table of contents

    mode     -- toc mode, passed to getToc if docinfo holds no
                'tocXML_<mode>' yet
    pn       -- number of the toc page (counted from 1); only used if
                start is not given
    start    -- number of the first toc entry on the page (counted from 1)
    size     -- entries per page; default is pageinfo['tocPageSize'] or 30
    pageinfo -- dict, see size
    docinfo  -- dict holding the cached 'tocXML_<mode>'

    Returns a serialized <div class="queryResultPage"> with one
    <div class="tocline"> per entry and all links pointing back to the
    viewer. Returns an error message string if there is no table of
    contents.
    """
    logging.debug("getTocPage mode=%s, pn=%s"%(mode,pn))
    tocKey = 'tocXML_%s'%mode

    # check for cached TOC
    if tocKey not in docinfo:
        self.getToc(mode=mode, docinfo=docinfo)

    tocxml = docinfo.get(tocKey, None)
    if not tocxml:
        logging.error("getTocPage: unable to find tocXML")
        return "Error: no table of contents!"

    if size is None:
        size = pageinfo.get('tocPageSize', 30)

    if start is None:
        # start counts from 1 (see below); without pn show the first page
        start = ((pn or 1) - 1) * size + 1

    fulltoc = ET.fromstring(tocxml)

    # paginate: every toc entry consists of two consecutive divs.
    # A negative index would count from the end of the list, so stop at 0.
    first = max((start - 1) * 2, 0)
    del fulltoc[:first]
    del fulltoc[size * 2:]

    # check all a-tags
    for link in fulltoc.findall(".//a"):
        href = link.get('href')
        if not href:
            continue

        # take pn from href
        m = re.match(r'page-fragment\.xql.*pn=(\d+)', href)
        if m is not None:
            # and create new url (assuming parent is documentViewer)
            link.set('href', self.getLink('pn', m.group(1)))
        else:
            logging.warning("getTocPage: Problem with link=%s"%href)

    # fix two-divs-per-row with containing div
    tocdivs = list(fulltoc)
    newtoc = ET.Element('div', {'class':'queryResultPage'})
    for (d1,d2) in zip(tocdivs[::2],tocdivs[1::2]):
        e = ET.Element('div',{'class':'tocline'})
        e.append(d1)
        e.append(d2)
        newtoc.append(e)

    return serialize(newtoc)
484
485
def manage_changeMpdlXmlTextServer(self,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/",timeout=40,RESPONSE=None):
    """change settings

    Stores title, timeout and serverUrl on the object and redirects to
    'manage_main' if a RESPONSE is given.
    """
    self.title, self.timeout, self.serverUrl = title, timeout, serverUrl
    if RESPONSE is None:
        return
    RESPONSE.redirect('manage_main')
493
494	# management methods
def manage_addMpdlXmlTextServerForm(self):
    """Form for adding

    Renders the page template zpt/manage_addMpdlXmlTextServer in the
    context of self.
    """
    template = PageTemplateFile("zpt/manage_addMpdlXmlTextServer", globals())
    return template.__of__(self)()
499
def manage_addMpdlXmlTextServer(self,id,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/",timeout=40,RESPONSE=None):
    """add MpdlXmlTextServer

    Creates a new MpdlXmlTextServer with the given id, title, serverUrl
    and timeout, stores it in self.Destination() and redirects to
    'manage_main' if a RESPONSE is given.
    """
    # pass the settings by name: the fourth positional parameter of the
    # constructor is serverName, not timeout
    newObj = MpdlXmlTextServer(id, title=title, serverUrl=serverUrl, timeout=timeout)
    self.Destination()._setObject(id, newObj)
    if RESPONSE is not None:
        RESPONSE.redirect('manage_main')
507
508

Note: See TracBrowser for help on using the repository browser.

Download in other formats: