comparison MpdlXmlTextServer.py @ 513:67095296c95a
Merge from elementtree branch
author:   casties
date:     Tue, 28 Feb 2012 19:10:08 +0100
parents:  91daab0c219b 551ca1641a5e
children: 7d7b639d7be7
497:73fb73577961 | 513:67095296c95a |
---|---|
1 | |
2 from OFS.SimpleItem import SimpleItem | 1 from OFS.SimpleItem import SimpleItem |
3 from Products.PageTemplates.PageTemplateFile import PageTemplateFile | 2 from Products.PageTemplates.PageTemplateFile import PageTemplateFile |
4 from Ft.Xml import EMPTY_NAMESPACE, Parse | 3 |
5 from Ft.Xml.Domlette import NonvalidatingReader | 4 import xml.etree.ElementTree as ET |
6 | 5 |
7 import md5 | 6 import re |
8 import sys | |
9 import logging | 7 import logging |
10 import urllib | 8 import urllib |
11 import documentViewer | 9 import urlparse |
12 from documentViewer import getTextFromNode, serializeNode | 10 import base64 |
11 | |
12 from SrvTxtUtils import getInt, getText, getHttpData | |
13 | |
14 def serialize(node): | |
15 """returns a string containing an XML snippet of node""" | |
16 s = ET.tostring(node, 'UTF-8') | |
17 # snip off XML declaration | |
18 if s.startswith('<?xml'): | |
19 i = s.find('?>') | |
20 return s[i+3:] | |
21 | |
22 return s | |
23 | |
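
The new serialize() helper replaces the old documentViewer.serializeNode(). A minimal sketch of its behavior with an invented element (ET.tostring() with an encoding argument prepends an XML declaration plus a newline, which the helper snips off):

    import xml.etree.ElementTree as ET

    div = ET.Element('div', {'class': 'pageContent'})
    div.text = 'Hello'
    print ET.tostring(div, 'UTF-8')  # <?xml version='1.0' encoding='UTF-8'?>\n<div class="pageContent">Hello</div>
    print serialize(div)             # <div class="pageContent">Hello</div>
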
13 | 24 |
14 class MpdlXmlTextServer(SimpleItem): | 25 class MpdlXmlTextServer(SimpleItem): |
15 """TextServer implementation for MPDL-XML eXist server""" | 26 """TextServer implementation for MPDL-XML eXist server""" |
16 meta_type="MPDL-XML TextServer" | 27 meta_type="MPDL-XML TextServer" |
17 | 28 |
19 {'label':'Config','action':'manage_changeMpdlXmlTextServerForm'}, | 30 {'label':'Config','action':'manage_changeMpdlXmlTextServerForm'}, |
20 )+SimpleItem.manage_options | 31 )+SimpleItem.manage_options |
21 | 32 |
22 manage_changeMpdlXmlTextServerForm = PageTemplateFile("zpt/manage_changeMpdlXmlTextServer", globals()) | 33 manage_changeMpdlXmlTextServerForm = PageTemplateFile("zpt/manage_changeMpdlXmlTextServer", globals()) |
23 | 34 |
24 def __init__(self,id,title="",serverUrl="http://mpdl-test.mpiwg-berlin.mpg.de/mpdl/interface/", serverName=None, timeout=40): | 35 def __init__(self,id,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/", serverName=None, timeout=40): |
25 #def __init__(self,id,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de:30030/mpdl/interface/", serverName=None, timeout=40): | |
26 | |
27 """constructor""" | 36 """constructor""" |
28 self.id=id | 37 self.id=id |
29 self.title=title | 38 self.title=title |
30 self.timeout = timeout | 39 self.timeout = timeout |
31 if serverName is None: | 40 if serverName is None: |
33 else: | 42 else: |
34 self.serverUrl = "http://%s/mpdl/interface/"%serverName | 43 self.serverUrl = "http://%s/mpdl/interface/"%serverName |
35 | 44 |
36 def getHttpData(self, url, data=None): | 45 def getHttpData(self, url, data=None): |
37 """returns result from url+data HTTP request""" | 46 """returns result from url+data HTTP request""" |
38 return documentViewer.getHttpData(url,data,timeout=self.timeout) | 47 return getHttpData(url,data,timeout=self.timeout) |
39 | 48 |
40 def getServerData(self, method, data=None): | 49 def getServerData(self, method, data=None): |
41 """returns result from text server for method+data""" | 50 """returns result from text server for method+data""" |
42 url = self.serverUrl+method | 51 url = self.serverUrl+method |
43 return documentViewer.getHttpData(url,data,timeout=self.timeout) | 52 return getHttpData(url,data,timeout=self.timeout) |
44 | 53 |
45 def getSearch(self, pageinfo=None, docinfo=None): | 54 |
46 """get search list""" | 55 def getPlacesOnPage(self, docinfo=None, pn=None): |
47 docpath = docinfo['textURLPath'] | 56 """Returns list of GIS places of page pn""" |
48 url = docinfo['url'] | |
49 pagesize = pageinfo['queryPageSize'] | |
50 pn = pageinfo.get('searchPN',1) | |
51 #sn = pageinfo['sn'] | |
52 s = pageinfo['s'] | |
53 highlightElementPos =pageinfo ['highlightElementPos'] | |
54 highlightElement = pageinfo ['highlightElement'] | |
55 | |
56 highlightQuery = pageinfo['highlightQuery'] | |
57 query =pageinfo['query'] | |
58 queryType =pageinfo['queryType'] | |
59 viewMode= pageinfo['viewMode'] | |
60 tocMode = pageinfo['tocMode'] | |
61 characterNormalization = pageinfo['characterNormalization'] | |
62 #optionToggle = pageinfo['optionToggle'] | |
63 tocPN = pageinfo['tocPN'] | |
64 selfurl = self.absolute_url() | |
65 data = self.getServerData("doc-query.xql","document=%s&mode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s&s=%s&viewMode=%s&characterNormalization=%s&highlightElementPos=%s&highlightElement=%s&highlightQuery=%s"%(docpath, 'text', queryType, urllib.quote(query), pagesize, pn, s, viewMode,characterNormalization, highlightElementPos, highlightElement, urllib.quote(highlightQuery))) | |
66 #data = self.getServerData("doc-query.xql","document=%s&mode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s&sn=%s&viewMode=%s&characterNormalization=%s&highlightQuery=%s"%(docpath, 'text', queryType, urllib.quote(query), pagesize, pn, sn, viewMode,characterNormalization, urllib.quote(highlightQuery))) | |
67 pagexml = data.replace('?document=%s'%str(docpath),'?url=%s'%url) | |
68 pagedom = Parse(pagexml) | |
69 | |
70 """ | |
71 pagedivs = pagedom.xpath("//div[@class='queryResultHits']") | |
72 if (pagedivs == pagedom.xpath("//div[@class='queryResultHits']")): | |
73 if len(pagedivs)>0: | |
74 docinfo['queryResultHits'] = int(getTextFromNode(pagedivs[0])) | |
75 s = getTextFromNode(pagedivs[0]) | |
76 s1 = int(s)/10+1 | |
77 try: | |
78 docinfo['queryResultHits'] = int(s1) | |
79 logging.debug("SEARCH ENTRIES: %s"%(s1)) | |
80 except: | |
81 docinfo['queryResultHits'] = 0 | |
82 """ | |
83 if (queryType=="fulltext")or(queryType=="xpath")or(queryType=="xquery")or(queryType=="fulltextMorphLemma"): | |
84 pagedivs = pagedom.xpath("//div[@class='queryResultPage']") | |
85 if len(pagedivs)>0: | |
86 pagenode=pagedivs[0] | |
87 links=pagenode.xpath("//a") | |
88 for l in links: | |
89 hrefNode = l.getAttributeNodeNS(None, u"href") | |
90 if hrefNode: | |
91 href = hrefNode.nodeValue | |
92 if href.startswith('page-fragment.xql'): | |
93 selfurl = self.absolute_url() | |
94 pagexml=href.replace('mode=text','mode=texttool&viewMode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s&tocMode=%s&searchPN=%s&tocPN=%s&characterNormalization=%s'%(viewMode,queryType,urllib.quote(query),pagesize,pn,tocMode,pn,tocPN, characterNormalization)) | |
95 hrefNode.nodeValue = pagexml.replace('page-fragment.xql','%s'%selfurl) | |
96 #logging.debug("PUREXML :%s"%(serializeNode(pagenode))) | |
97 return serializeNode(pagenode) | |
98 if (queryType=="fulltextMorph"): | |
99 pagedivs = pagedom.xpath("//div[@class='queryResult']") | |
100 if len(pagedivs)>0: | |
101 pagenode=pagedivs[0] | |
102 links=pagenode.xpath("//a") | |
103 for l in links: | |
104 hrefNode = l.getAttributeNodeNS(None, u"href") | |
105 if hrefNode: | |
106 href = hrefNode.nodeValue | |
107 if href.startswith('page-fragment.xql'): | |
108 selfurl = self.absolute_url() | |
109 pagexml=href.replace('mode=text','mode=texttool&viewMode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s&tocMode=%s&searchPN=%s&tocPN=%s&characterNormalization=%s'%(viewMode,queryType,urllib.quote(query),pagesize,pn,tocMode,pn,tocPN,characterNormalization)) | |
110 hrefNode.nodeValue = pagexml.replace('page-fragment.xql','%s'%selfurl) | |
111 if href.startswith('../lt/lemma.xql'): | |
112 hrefNode.nodeValue = href.replace('../lt/lemma.xql','%s/template/head_main_query'%(selfurl)) | |
113 l.setAttributeNS(None, 'target', '_blank') | |
114 l.setAttributeNS(None, 'onClick',"popupWin = window.open(this.href, 'contacts', 'location,width=300,height=400,top=180, left=400, scrollbars=1'); return false;") | |
115 l.setAttributeNS(None, 'onDblclick', 'popupWin.focus();') | |
116 pagedivs = pagedom.xpath("//div[@class='queryResultMorphExpansion']") | |
117 return serializeNode(pagenode) | |
118 if (queryType=="ftIndex")or(queryType=="ftIndexMorph"): | |
119 pagedivs= pagedom.xpath("//div[@class='queryResultPage']") | |
120 if len(pagedivs)>0: | |
121 pagenode=pagedivs[0] | |
122 links=pagenode.xpath("//a") | |
123 for l in links: | |
124 hrefNode = l.getAttributeNodeNS(None, u"href") | |
125 if hrefNode: | |
126 href = hrefNode.nodeValue | |
127 hrefNode.nodeValue=href.replace('mode=text','mode=texttool&viewMode=%s&tocMode=%s&tocPN=%s&pn=%s&characterNormalization=%s'%(viewMode,tocMode,tocPN,pn,characterNormalization)) | |
128 if href.startswith('../lt/lex.xql'): | |
129 hrefNode.nodeValue = href.replace('../lt/lex.xql','%s/template/head_main_lex'%selfurl) | |
130 l.setAttributeNS(None, 'target', '_blank') | |
131 l.setAttributeNS(None, 'onClick',"popupWin = window.open(this.href, 'contacts', 'location,width=500,height=600,top=180, left=400, scrollbars=1'); return false;") | |
132 l.setAttributeNS(None, 'onDblclick', 'popupWin.focus();') | |
133 if href.startswith('../lt/lemma.xql'): | |
134 hrefNode.nodeValue = href.replace('../lt/lemma.xql','%s/template/head_main_lemma'%(selfurl)) | |
135 l.setAttributeNS(None, 'target', '_blank') | |
136 l.setAttributeNS(None, 'onClick',"popupWin = window.open(this.href, 'contacts', 'location,width=300,height=400,top=180, left=400, scrollbars=1'); return false;") | |
137 l.setAttributeNS(None, 'onDblclick', 'popupWin.focus();') | |
138 return serializeNode(pagenode) | |
139 return "no text here" | |
140 | |
141 def getGisPlaces(self, docinfo=None, pageinfo=None): | |
142 """ Show all Gis Places of whole Page""" | |
143 xpath='//place' | |
144 docpath = docinfo.get('textURLPath',None) | 57 docpath = docinfo.get('textURLPath',None) |
145 if not docpath: | 58 if not docpath: |
146 return None | 59 return None |
147 | 60 |
148 url = docinfo['url'] | 61 places=[] |
149 selfurl = self.absolute_url() | 62 text=self.getServerData("xpath.xql", "document=%s&xpath=//place&pn=%s"%(docpath,pn)) |
150 pn = pageinfo['current'] | 63 dom = ET.fromstring(text) |
151 hrefList=[] | 64 result = dom.findall(".//resultPage/place") |
152 myList= "" | |
153 text=self.getServerData("xpath.xql", "document=%s&xpath=%s&pn=%s"%(docinfo['textURLPath'],xpath,pn)) | |
154 dom = Parse(text) | |
155 result = dom.xpath("//result/resultPage/place") | |
156 for l in result: | 65 for l in result: |
157 hrefNode= l.getAttributeNodeNS(None, u"id") | 66 id = l.get("id") |
158 href= hrefNode.nodeValue | 67 name = l.text |
159 hrefList.append(href) | 68 place = {'id': id, 'name': name} |
160 myList = ",".join(hrefList) | 69 places.append(place) |
161 #logging.debug("getGisPlaces :%s"%(myList)) | 70 |
162 return myList | 71 return places |
163 | 72 |
164 def getAllGisPlaces (self, docinfo=None, pageinfo=None): | |
165 """Show all Gis Places of whole Book """ | |
166 xpath ='//echo:place' | |
167 docpath =docinfo['textURLPath'] | |
168 url = docinfo['url'] | |
169 selfurl =self.absolute_url() | |
170 pn =pageinfo['current'] | |
171 hrefList=[] | |
172 myList="" | |
173 text=self.getServerData("xpath.xql", "document=%s&xpath=%s"%(docinfo['textURLPath'],xpath)) | |
174 dom =Parse(text) | |
175 result = dom.xpath("//result/resultPage/place") | |
176 | |
177 for l in result: | |
178 hrefNode = l.getAttributeNodeNS(None, u"id") | |
179 href= hrefNode.nodeValue | |
180 hrefList.append(href) | |
181 myList = ",".join(hrefList) | |
182 #logging.debug("getALLGisPlaces :%s"%(myList)) | |
183 return myList | |
184 | 73 |
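
A hedged sketch of what the new getPlacesOnPage() expects back from xpath.xql; the sample XML is invented, but the element names follow the findall() path used above:

    import xml.etree.ElementTree as ET

    text = '''<result>
      <resultPage>
        <place id="G123">Roma</place>
        <place id="G456">Firenze</place>
      </resultPage>
    </result>'''
    dom = ET.fromstring(text)
    places = [{'id': p.get('id'), 'name': p.text}
              for p in dom.findall(".//resultPage/place")]
    # [{'id': 'G123', 'name': 'Roma'}, {'id': 'G456', 'name': 'Firenze'}]
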
74 def processPageInfo(self, dom, docinfo, pageinfo): | |
75 """processes page info divs from dom and stores in docinfo and pageinfo""" | |
76 # assume first second level div is pageMeta | |
77 alldivs = dom.find("div") | |
78 | |
79 if alldivs is None or alldivs.get('class', '') != 'pageMeta': | |
80 logging.error("processPageInfo: pageMeta div not found!") | |
81 return | |
82 | |
83 for div in alldivs: | |
84 dc = div.get('class') | |
85 | |
86 # pageNumberOrig | |
87 if dc == 'pageNumberOrig': | |
88 pageinfo['pageNumberOrig'] = div.text | |
89 | |
90 # pageNumberOrigNorm | |
91 elif dc == 'pageNumberOrigNorm': | |
92 pageinfo['pageNumberOrigNorm'] = div.text | |
93 | |
94 # pageHeaderTitle | |
95 elif dc == 'pageHeaderTitle': | |
96 pageinfo['pageHeaderTitle'] = div.text | |
97 | |
98 # numFigureEntries | |
99 elif dc == 'countFigureEntries': | |
100 docinfo['numFigureEntries'] = getInt(div.text) | |
101 | |
102 # numTocEntries | |
103 elif dc == 'countTocEntries': | |
104 # WTF: s1 = int(s)/30+1 | |
105 docinfo['numTocEntries'] = getInt(div.text) | |
106 | |
107 # numPlaces | |
108 elif dc == 'countPlaces': | |
109 docinfo['numPlaces'] = getInt(div.text) | |
110 | |
111 # numTextPages | |
112 elif dc == 'countPages': | |
113 np = getInt(div.text) | |
114 if np > 0: | |
115 docinfo['numTextPages'] = np | |
116 if docinfo.get('numPages', 0) == 0: | |
117 # seems to be text-only - update page count | |
118 docinfo['numPages'] = np | |
119 #pageinfo['end'] = min(pageinfo['end'], np) | |
120 pageinfo['numgroups'] = int(np / pageinfo['groupsize']) | |
121 if np % pageinfo['groupsize'] > 0: | |
122 pageinfo['numgroups'] += 1 | |
123 | |
124 #logging.debug("processPageInfo: pageinfo=%s"%repr(pageinfo)) | |
125 return | |
126 | |
185 | 127 |
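
processPageInfo() assumes the first second-level div is the pageMeta block. A minimal sketch of the expected input shape and its effect, with invented values (server stands for a MpdlXmlTextServer instance):

    import xml.etree.ElementTree as ET

    dom = ET.fromstring('''<div>
      <div class="pageMeta">
        <div class="pageNumberOrig">23</div>
        <div class="countPages">412</div>
      </div>
    </div>''')
    docinfo = {}
    pageinfo = {'groupsize': 10}
    server.processPageInfo(dom, docinfo, pageinfo)
    # pageinfo['pageNumberOrig'] == '23'
    # docinfo['numTextPages'] == 412, pageinfo['numgroups'] == 42
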
186 def getTextPage(self, mode="text_dict", pn=1, docinfo=None, pageinfo=None): | 128 def getTextPage(self, mode="text", pn=1, docinfo=None, pageinfo=None): |
187 """returns single page from fulltext""" | 129 """returns single page from fulltext""" |
130 | |
131 logging.debug("getTextPage mode=%s, pn=%s"%(mode,pn)) | |
132 # check for cached text -- but ideally this shouldn't be called twice | |
133 if pageinfo.has_key('textPage'): | |
134 logging.debug("getTextPage: using cached text") | |
135 return pageinfo['textPage'] | |
136 | |
188 docpath = docinfo['textURLPath'] | 137 docpath = docinfo['textURLPath'] |
189 path = docinfo['textURLPath'] | 138 # just checking |
190 url = docinfo.get('url',None) | 139 if pageinfo['current'] != pn: |
191 name = docinfo.get('name',None) | 140 logging.warning("getTextPage: current!=pn!") |
192 pn =pageinfo['current'] | 141 |
193 #sn = pageinfo['sn'] | 142 # stuff for constructing full urls |
194 s = pageinfo['s'] | 143 selfurl = docinfo['viewerUrl'] |
195 highlightElementPos =pageinfo ['highlightElementPos'] | 144 textParams = {'document': docpath, |
196 highlightElement = pageinfo ['highlightElement'] | 145 'pn': pn} |
197 #optionToggle =pageinfo ['optionToggle'] | 146 if 'characterNormalization' in pageinfo: |
198 highlightQuery = pageinfo['highlightQuery'] | 147 textParams['characterNormalization'] = pageinfo['characterNormalization'] |
199 #mode = pageinfo ['viewMode'] | 148 |
200 tocMode = pageinfo['tocMode'] | 149 if not mode: |
201 xpointer = pageinfo['xpointer'] | 150 # default is dict |
202 characterNormalization=pageinfo['characterNormalization'] | 151 mode = 'text' |
203 tocPN = pageinfo['tocPN'] | 152 |
204 selfurl = self.absolute_url() | 153 modes = mode.split(',') |
205 | 154 # check for multiple layers |
206 if mode == "text_dict": | 155 if len(modes) > 1: |
207 textmode = "textPollux" | 156 logging.debug("getTextPage: more than one mode=%s"%mode) |
157 | |
158 # search mode | |
159 if 'search' in modes: | |
160 # add highlighting | |
161 highlightQuery = pageinfo.get('highlightQuery', None) | |
162 if highlightQuery: | |
163 textParams['highlightQuery'] = highlightQuery | |
164 textParams['highlightElement'] = pageinfo.get('highlightElement', '') | |
165 textParams['highlightElementPos'] = pageinfo.get('highlightElementPos', '') | |
166 | |
167 # ignore mode in the following | |
168 modes.remove('search') | |
169 | |
170 # other modes don't combine | |
171 if 'dict' in modes: | |
172 # dict is called textPollux in the backend | |
173 textmode = 'textPollux' | |
174 elif len(modes) == 0: | |
175 # text is default mode | |
176 textmode = 'text' | |
208 else: | 177 else: |
209 textmode = mode | 178 # just take first mode |
210 | 179 textmode = modes[0] |
211 textParam = "document=%s&mode=%s&pn=%s&characterNormalization=%s&xpointer=%s&options=withIdentifier"%(docpath,textmode,pn,characterNormalization, xpointer) | 180 |
212 if highlightQuery is not None: | 181 textParams['mode'] = textmode |
213 #textParam +="&highlightQuery=%s&sn=%s"%(urllib.quote(highlightQuery),sn) | 182 |
214 textParam +="&highlightQuery=%s&s=%s&highlightElement=%s&highlightElementPos=%s"%(urllib.quote(highlightQuery),s, highlightElement, highlightElementPos) | 183 # fetch the page |
215 | 184 pagexml = self.getServerData("page-fragment.xql",urllib.urlencode(textParams)) |
216 pagexml = self.getServerData("page-fragment.xql",textParam) | 185 dom = ET.fromstring(pagexml) |
217 dom = Parse(pagexml) | 186 # extract additional info |
218 #dom = NonvalidatingReader.parseStream(pagexml) | 187 self.processPageInfo(dom, docinfo, pageinfo) |
219 | 188 # page content is in <div class="pageContent"> |
220 #original Pages | 189 pagediv = None |
221 pagedivs = dom.xpath("//div[@class='pageNumberOrig']") | 190 # ElementTree 1.2 in Python 2.6 can't do div[@class='pageContent'] |
222 | 191 # so we look at the second level divs |
223 """if pagedivs == dom.xpath("//div[@class='pageNumberOrig']"): | 192 alldivs = dom.findall("div") |
224 if len(pagedivs)>0: | 193 for div in alldivs: |
225 docinfo['pageNumberOrig']= getTextFromNode(pagedivs[0]) | 194 dc = div.get('class') |
226 logging.debug("ORIGINAL PAGE: %s"%(docinfo['pageNumberOrig'])) | 195 # page content div |
227 | 196 if dc == 'pageContent': |
228 #original Pages Norm | 197 pagediv = div |
229 pagedivs = dom.xpath("//div[@class='pageNumberOrigNorm']") | 198 break |
230 if pagedivs == dom.xpath("//div[@class='pageNumberOrigNorm']"): | |
231 if len(pagedivs)>0: | |
232 docinfo['pageNumberOrigNorm']= getTextFromNode(pagedivs[0]) | |
233 logging.debug("ORIGINAL PAGE NORM: %s"%(docinfo['pageNumberOrigNorm'])) | |
234 """ | |
235 #figureEntries | |
236 pagedivs = dom.xpath("//div[@class='countFigureEntries']") | |
237 if pagedivs == dom.xpath("//div[@class='countFigureEntries']"): | |
238 if len(pagedivs)>0: | |
239 docinfo['countFigureEntries'] = getTextFromNode(pagedivs[0]) | |
240 s = getTextFromNode(pagedivs[0]) | |
241 if s=='0': | |
242 try: | |
243 docinfo['countFigureEntries'] = int(s) | |
244 except: | |
245 docinfo['countFigureEntries'] = 0 | |
246 else: | |
247 s1 = int(s)/30+1 | |
248 try: | |
249 docinfo['countFigureEntries'] = int(s1) | |
250 except: | |
251 docinfo['countFigureEntries'] = 0 | |
252 | |
253 #allPlaces | |
254 pagedivs = dom.xpath("//div[@class='countPlaces']") | |
255 if pagedivs == dom.xpath("//div[@class='countPlaces']"): | |
256 if len(pagedivs)>0: | |
257 docinfo['countPlaces']= getTextFromNode(pagedivs[0]) | |
258 s = getTextFromNode(pagedivs[0]) | |
259 try: | |
260 docinfo['countPlaces'] = int(s) | |
261 except: | |
262 docinfo['countPlaces'] = 0 | |
263 | |
264 #tocEntries | |
265 pagedivs = dom.xpath("//div[@class='countTocEntries']") | |
266 if pagedivs == dom.xpath("//div[@class='countTocEntries']"): | |
267 if len(pagedivs)>0: | |
268 docinfo['countTocEntries'] = int(getTextFromNode(pagedivs[0])) | |
269 s = getTextFromNode(pagedivs[0]) | |
270 if s=='0': | |
271 try: | |
272 docinfo['countTocEntries'] = int(s) | |
273 except: | |
274 docinfo['countTocEntries'] = 0 | |
275 else: | |
276 s1 = int(s)/30+1 | |
277 try: | |
278 docinfo['countTocEntries'] = int(s1) | |
279 except: | |
280 docinfo['countTocEntries'] = 0 | |
281 | |
282 #numTextPages | |
283 pagedivs = dom.xpath("//div[@class='countPages']") | |
284 if pagedivs == dom.xpath("//div[@class='countPages']"): | |
285 if len(pagedivs)>0: | |
286 docinfo['numPages'] = getTextFromNode(pagedivs[0]) | |
287 s = getTextFromNode(pagedivs[0]) | |
288 | |
289 try: | |
290 docinfo['numPages'] = int(s) | |
291 #logging.debug("PAGE NUMBER: %s"%(s)) | |
292 | |
293 np = docinfo['numPages'] | |
294 pageinfo['end'] = min(pageinfo['end'], np) | |
295 pageinfo['numgroups'] = int(np / pageinfo['groupsize']) | |
296 if np % pageinfo['groupsize'] > 0: | |
297 pageinfo['numgroups'] += 1 | |
298 except: | |
299 docinfo['numPages'] = 0 | |
300 | |
301 else: | |
302 #no full text -- init to 0 | |
303 docinfo['pageNumberOrig'] = 0 | |
304 docinfo['countFigureEntries'] = 0 | |
305 docinfo['countPlaces'] = 0 | |
306 docinfo['countTocEntries'] = 0 | |
307 docinfo['numPages'] = 0 | |
308 docinfo['pageNumberOrigNorm'] = 0 | |
309 #return docinfo | |
310 | 199 |
311 # plain text mode | 200 # plain text mode |
312 if mode == "text": | 201 if textmode == "text": |
313 # first div contains text | 202 # get full url assuming documentViewer is parent |
314 pagedivs = dom.xpath("/div") | 203 selfurl = self.getLink() |
315 if len(pagedivs) > 0: | 204 if pagediv is not None: |
316 pagenode = pagedivs[0] | 205 links = pagediv.findall(".//a") |
317 links = pagenode.xpath("//a") | |
318 for l in links: | 206 for l in links: |
319 hrefNode = l.getAttributeNodeNS(None, u"href") | 207 href = l.get('href') |
320 if hrefNode: | 208 if href and href.startswith('#note-'): |
321 href= hrefNode.nodeValue | 209 href = href.replace('#note-',"%s#note-"%selfurl) |
322 if href.startswith('#note-'): | 210 l.set('href', href) |
323 hrefNode.nodeValue = href.replace('#note-',"?url=%s&viewMode=text&tocMode=%s&tocPN=%s&pn=%s#note-"%(url,tocMode,tocPN,pn)) | 211 |
324 #if href.startswith(): | 212 return serialize(pagediv) |
325 return serializeNode(pagenode) | 213 |
326 if mode == "xml": | 214 # text-with-links mode |
327 # first div contains text | 215 elif textmode == "textPollux": |
328 pagedivs = dom.xpath("/div") | 216 if pagediv is not None: |
329 if len(pagedivs) > 0: | 217 viewerurl = docinfo['viewerUrl'] |
330 pagenode = pagedivs[0] | 218 selfurl = self.getLink() |
331 return serializeNode(pagenode) | 219 # check all a-tags |
332 if mode == "gis": | 220 links = pagediv.findall(".//a") |
333 # first div contains text | 221 for l in links: |
334 pagedivs = dom.xpath("/div") | 222 href = l.get('href') |
335 if len(pagedivs) > 0: | |
336 pagenode = pagedivs[0] | |
337 links =pagenode.xpath("//a") | |
338 for l in links: | |
339 hrefNode =l.getAttributeNodeNS(None, u"href") | |
340 if hrefNode: | |
341 href=hrefNode.nodeValue | |
342 if href.startswith('http://mappit.mpiwg-berlin.mpg.de'): | |
343 hrefNode.nodeValue =href.replace('db/REST/db/chgis/mpdl','db/RESTdb/db/mpdl/%s'%name) | |
344 l.setAttributeNS(None, 'target', '_blank') | |
345 return serializeNode(pagenode) | |
346 | 223 |
347 if mode == "pureXml": | 224 if href: |
348 # first div contains text | |
349 pagedivs = dom.xpath("/div") | |
350 if len(pagedivs) > 0: | |
351 pagenode = pagedivs[0] | |
352 return serializeNode(pagenode) | |
353 # text-with-links mode | |
354 if mode == "text_dict": | |
355 # first div contains text | |
356 #mode = pageinfo ['viewMode'] | |
357 pagedivs = dom.xpath("/div") | |
358 if len(pagedivs) > 0: | |
359 pagenode = pagedivs[0] | |
360 # check all a-tags | |
361 links = pagenode.xpath("//a") | |
362 | |
363 for l in links: | |
364 hrefNode = l.getAttributeNodeNS(None, u"href") | |
365 | |
366 if hrefNode: | |
367 # is link with href | 225 # is link with href |
368 href = hrefNode.nodeValue | 226 linkurl = urlparse.urlparse(href) |
369 if href.startswith('http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/lt/wordInfo.xql'): | 227 #logging.debug("getTextPage: linkurl=%s"%repr(linkurl)) |
370 # is pollux link | 228 if linkurl.path.endswith('GetDictionaryEntries'): |
371 selfurl = self.absolute_url() | 229 #TODO: replace wordInfo page |
372 # change href | 230 # is dictionary link - change href (keeping parameters) |
373 hrefNode.nodeValue = href.replace('http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/lt/wordInfo.xql','%s/head_main_voc'%selfurl) | 231 #l.set('href', href.replace('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/lt/wordInfo.xql','%s/template/viewer_wordinfo'%viewerurl)) |
374 # add target | 232 # add target to open new page |
375 l.setAttributeNS(None, 'target', '_blank') | 233 l.set('target', '_blank') |
376 #l.setAttributeNS(None, 'onclick',"popupWin = window.open(this.href, 'InfoWindow', 'menubar=no, location,width=500,height=600,top=180, left=700, toolbar=no, scrollbars=1'); return false;") | |
377 #l.setAttributeNS(None, "ondblclick", "popupWin.focus();") | |
378 #window.open("this.href, 'InfoWindow', 'menubar=no, location,width=500,height=600,top=180, left=700, toolbar=yes, scrollbars=1'"); return false;") | |
379 | 234 |
380 if href.startswith('http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/lt/lemma.xql'): | 235 # TODO: is this needed? |
381 selfurl = self.absolute_url() | 236 # if href.startswith('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/lt/lemma.xql'): |
382 hrefNode.nodeValue = href.replace('http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/lt/lemma.xql','%s/head_main_lemma'%selfurl) | 237 # selfurl = self.absolute_url() |
383 l.setAttributeNS(None, 'target', '_blank') | 238 # l.set('href', href.replace('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/lt/lemma.xql','%s/head_main_lemma'%selfurl)) |
384 l.setAttributeNS(None, 'onclick',"popupWin = window.open(this.href, 'InfoWindow', 'menubar=no, location,width=300,height=400,top=180, left=700, toolbar=no, scrollbars=1'); return false;") | 239 # l.set('target', '_blank') |
385 l.setAttributeNS(None, 'ondblclick', 'popupWin.focus();') | 240 # l.set('onclick',"popupWin = window.open(this.href, 'InfoWindow', 'menubar=no, location,width=500,height=600,top=180, left=700, toolbar=no, scrollbars=1'); return false;") |
241 # l.set('ondblclick', 'popupWin.focus();') | |
386 | 242 |
387 if href.startswith('#note-'): | 243 if href.startswith('#note-'): |
388 hrefNode.nodeValue = href.replace('#note-',"?url=%s&viewMode=text_dict&tocMode=%s&tocPN=%s&pn=%s#note-"%(url,tocMode,tocPN,pn)) | 244 # note link |
245 l.set('href', href.replace('#note-',"%s#note-"%selfurl)) | |
389 | 246 |
390 return serializeNode(pagenode) | 247 return serialize(pagediv) |
391 return "no text here" | 248 |
392 | 249 # xml mode |
393 def getOrigPages(self, docinfo=None, pageinfo=None): | 250 elif textmode == "xml": |
251 if pagediv is not None: | |
252 return serialize(pagediv) | |
253 | |
254 # pureXml mode | |
255 elif textmode == "pureXml": | |
256 if pagediv is not None: | |
257 return serialize(pagediv) | |
258 | |
259 # gis mode | |
260 elif textmode == "gis": | |
261 if pagediv is not None: | |
262 # check all a-tags | |
263 links = pagediv.findall(".//a") | |
264 # add our URL as backlink | |
265 selfurl = self.getLink() | |
266 doc = base64.b64encode(selfurl) | |
267 for l in links: | |
268 href = l.get('href') | |
269 if href: | |
270 if href.startswith('http://mappit.mpiwg-berlin.mpg.de'): | |
271 l.set('href', re.sub(r'doc=[\w+/=]+', 'doc=%s'%doc, href)) | |
272 l.set('target', '_blank') | |
273 | |
274 return serialize(pagediv) | |
275 | |
276 return None | |
277 | |
278 | |
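
The repeated loops over alldivs exist because ElementTree 1.2 (as shipped with Python 2.6) cannot evaluate paths like div[@class='pageContent']. The workaround, factored into a hypothetical helper for illustration:

    def findDivByClass(parent, classname):
        """return the first child div of parent with the given class attribute"""
        for div in parent.findall("div"):
            if div.get('class') == classname:
                return div
        return None

    # e.g. pagediv = findDivByClass(dom, 'pageContent')
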
279 def getSearchResults(self, mode, query=None, pageinfo=None, docinfo=None): | |
280 """loads list of search results and stores XML in docinfo""" | |
281 | |
282 logging.debug("getSearchResults mode=%s query=%s"%(mode, query)) | |
283 if mode == "none": | |
284 return docinfo | |
285 | |
286 cachedQuery = docinfo.get('cachedQuery', None) | |
287 if cachedQuery is not None: | |
288 # cached search result | |
289 if cachedQuery == '%s_%s'%(mode,query): | |
290 # same query | |
291 return docinfo | |
292 | |
293 else: | |
294 # different query | |
295 del docinfo['resultSize'] | |
296 del docinfo['resultXML'] | |
297 | |
298 # cache query | |
299 docinfo['cachedQuery'] = '%s_%s'%(mode,query) | |
300 | |
301 # fetch full results | |
394 docpath = docinfo['textURLPath'] | 302 docpath = docinfo['textURLPath'] |
395 pn =pageinfo['current'] | 303 params = {'document': docpath, |
396 selfurl = self.absolute_url() | 304 'mode': 'text', |
397 pagexml = self.getServerData("page-fragment.xql","document=%s&pn=%s"%(docpath, pn)) | 305 'queryType': mode, |
398 dom = Parse(pagexml) | 306 'query': query, |
399 pagedivs = dom.xpath("//div[@class='pageNumberOrig']") | 307 'queryResultPageSize': 1000, |
400 if pagedivs == dom.xpath("//div[@class='pageNumberOrig']"): | 308 'queryResultPN': 1, |
401 if len(pagedivs)>0: | 309 'characterNormalization': pageinfo.get('characterNormalization', 'reg')} |
402 docinfo['pageNumberOrig']= getTextFromNode(pagedivs[0]) | 310 pagexml = self.getServerData("doc-query.xql",urllib.urlencode(params)) |
403 return docinfo['pageNumberOrig'] | 311 #pagexml = self.getServerData("doc-query.xql","document=%s&mode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s&s=%s&viewMode=%s&characterNormalization=%s&highlightElementPos=%s&highlightElement=%s&highlightQuery=%s"%(docpath, 'text', queryType, urllib.quote(query), pagesize, pn, s, viewMode,characterNormalization, highlightElementPos, highlightElement, urllib.quote(highlightQuery))) |
404 | 312 dom = ET.fromstring(pagexml) |
405 def getOrigPagesNorm(self, docinfo=None, pageinfo=None): | 313 # page content is in <div class="queryResultPage"> |
406 docpath = docinfo['textURLPath'] | 314 pagediv = None |
407 pn =pageinfo['current'] | 315 # ElementTree 1.2 in Python 2.6 can't do div[@class='queryResultPage'] |
408 selfurl = self.absolute_url() | 316 alldivs = dom.findall("div") |
409 pagexml = self.getServerData("page-fragment.xql","document=%s&pn=%s"%(docpath, pn)) | 317 for div in alldivs: |
410 dom = Parse(pagexml) | 318 dc = div.get('class') |
411 pagedivs = dom.xpath("//div[@class='pageNumberOrigNorm']") | 319 # page content div |
412 if pagedivs == dom.xpath("//div[@class='pageNumberOrigNorm']"): | 320 if dc == 'queryResultPage': |
413 if len(pagedivs)>0: | 321 pagediv = div |
414 docinfo['pageNumberOrigNorm']= getTextFromNode(pagedivs[0]) | 322 |
415 return docinfo['pageNumberOrigNorm'] | 323 elif dc == 'queryResultHits': |
416 | 324 docinfo['resultSize'] = getInt(div.text) |
417 | 325 |
418 def getTranslate(self, word=None, language=None, display=None): | 326 if pagediv is not None: |
419 """translate into another languages""" | 327 # store XML in docinfo |
420 data = self.getServerData("lt/wordInfo.xql","language="+str(language)+"&word="+urllib.quote(word)+"&display="+urllib.quote(display)+"&output=html") | 328 docinfo['resultXML'] = ET.tostring(pagediv, 'UTF-8') |
421 #pagexml=self.template.fulltextclient.eval("/mpdl/interface/lt/lex.xql","document=&language="+str(language)+"&query="+url_quote(str(query))) | 329 |
422 return data | 330 return docinfo |
423 | 331 |
424 def getLemma(self, lemma=None, language=None): | 332 |
425 """simular words lemma """ | 333 def getResultsPage(self, mode="text", query=None, pn=None, start=None, size=None, pageinfo=None, docinfo=None): |
426 data = self.getServerData("lt/lemma.xql","language="+str(language)+"&lemma="+urllib.quote(lemma)+"&output=html") | 334 """returns single page from the table of contents""" |
427 return data | 335 logging.debug("getResultsPage mode=%s, pn=%s"%(mode,pn)) |
428 | 336 # check for cached result |
429 def getLemmaQuery(self, query=None, language=None): | 337 if not 'resultXML' in docinfo: |
430 """simular words lemma """ | 338 self.getSearchResults(mode=mode, query=query, pageinfo=pageinfo, docinfo=docinfo) |
431 data = self.getServerData("lt/lemma.xql","language="+str(language)+"&query="+urllib.quote(query)+"&output=html") | 339 |
432 return data | 340 resultxml = docinfo.get('resultXML', None) |
433 | 341 if not resultxml: |
434 def getLex(self, query=None, language=None): | 342 logging.error("getResultPage: unable to find resultXML") |
435 #simular words lemma | 343 return "Error: no result!" |
436 data = self.getServerData("lt/lex.xql","document=&language="+str(language)+"&query="+urllib.quote(query)) | 344 |
437 return data | 345 if size is None: |
438 | 346 size = pageinfo.get('resultPageSize', 10) |
439 def getQuery (self, docinfo=None, pageinfo=None, query=None, queryType=None, pn=1): | 347 |
440 #number of | 348 if start is None: |
441 docpath = docinfo['textURLPath'] | 349 start = (pn - 1) * size |
442 pagesize = pageinfo['queryPageSize'] | 350 |
443 pn = pageinfo['searchPN'] | 351 fullresult = ET.fromstring(resultxml) |
444 query =pageinfo['query'] | 352 |
445 queryType =pageinfo['queryType'] | 353 if fullresult is not None: |
446 tocSearch = 0 | 354 # paginate |
447 tocDiv = None | 355 first = start-1 |
448 | 356 len = size |
449 pagexml = self.getServerData("doc-query.xql","document=%s&mode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s"%(docpath, 'text', queryType, urllib.quote(query), pagesize, pn)) | 357 del fullresult[:first] |
450 pagedom = Parse(pagexml) | 358 del fullresult[len:] |
451 numdivs = pagedom.xpath("//div[@class='queryResultHits']") | 359 tocdivs = fullresult |
452 tocSearch = int(getTextFromNode(numdivs[0])) | 360 |
453 tc=int((tocSearch/10)+1) | 361 # check all a-tags |
454 return tc | 362 links = tocdivs.findall(".//a") |
455 | 363 for l in links: |
364 href = l.get('href') | |
365 if href: | |
366 # assume all links go to pages | |
367 linkUrl = urlparse.urlparse(href) | |
368 linkParams = urlparse.parse_qs(linkUrl.query) | |
369 # take some parameters | |
370 params = {'pn': linkParams['pn'], | |
371 'highlightQuery': linkParams.get('highlightQuery',''), | |
372 'highlightElement': linkParams.get('highlightElement',''), | |
373 'highlightElementPos': linkParams.get('highlightElementPos','') | |
374 } | |
375 url = self.getLink(params=params) | |
376 l.set('href', url) | |
377 | |
378 return serialize(tocdivs) | |
379 | |
380 return "ERROR: no results!" | |
381 | |
382 | |
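
getSearchResults() stores the full result XML in docinfo, and getResultsPage() paginates it later by deleting child elements in place; ElementTree Elements accept list-style del with slices. A small standalone sketch (first = start-1 above implies the code treats start as 1-based):

    import xml.etree.ElementTree as ET

    root = ET.fromstring('<div>' + ''.join('<p>%d</p>' % i for i in range(1, 8)) + '</div>')
    start, size = 3, 3            # keep hits 3..5 (1-based)
    del root[:start - 1]          # drop everything before the window
    del root[size:]               # drop everything after it
    print [p.text for p in root]  # ['3', '4', '5']
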
456 def getToc(self, mode="text", docinfo=None): | 383 def getToc(self, mode="text", docinfo=None): |
457 """loads table of contents and stores in docinfo""" | 384 """loads table of contents and stores XML in docinfo""" |
385 logging.debug("getToc mode=%s"%mode) | |
458 if mode == "none": | 386 if mode == "none": |
459 return docinfo | 387 return docinfo |
388 | |
460 if 'tocSize_%s'%mode in docinfo: | 389 if 'tocSize_%s'%mode in docinfo: |
461 # cached toc | 390 # cached toc |
462 return docinfo | 391 return docinfo |
463 | 392 |
464 docpath = docinfo['textURLPath'] | 393 docpath = docinfo['textURLPath'] |
470 else: | 399 else: |
471 queryType = mode | 400 queryType = mode |
472 # number of entries in toc | 401 # number of entries in toc |
473 tocSize = 0 | 402 tocSize = 0 |
474 tocDiv = None | 403 tocDiv = None |
475 | 404 # fetch full toc |
476 pagexml = self.getServerData("doc-query.xql","document=%s&queryType=%s&queryResultPageSize=%s&queryResultPN=%s"%(docpath,queryType, pagesize, pn)) | 405 pagexml = self.getServerData("doc-query.xql","document=%s&queryType=%s&queryResultPageSize=%s&queryResultPN=%s"%(docpath,queryType, pagesize, pn)) |
477 | 406 dom = ET.fromstring(pagexml) |
478 # post-processing downloaded xml | 407 # page content is in <div class="queryResultPage"> |
479 pagedom = Parse(pagexml) | 408 pagediv = None |
480 # get number of entries | 409 # ElementTree 1.2 in Python 2.6 can't do div[@class='queryResultPage'] |
481 numdivs = pagedom.xpath("//div[@class='queryResultHits']") | 410 alldivs = dom.findall("div") |
482 if len(numdivs) > 0: | 411 for div in alldivs: |
483 tocSize = int(getTextFromNode(numdivs[0])) | 412 dc = div.get('class') |
484 docinfo['tocSize_%s'%mode] = tocSize | 413 # page content div |
414 if dc == 'queryResultPage': | |
415 pagediv = div | |
416 | |
417 elif dc == 'queryResultHits': | |
418 docinfo['tocSize_%s'%mode] = getInt(div.text) | |
419 | |
420 if pagediv is not None: | |
421 # store XML in docinfo | |
422 docinfo['tocXML_%s'%mode] = ET.tostring(pagediv, 'UTF-8') | |
423 | |
485 return docinfo | 424 return docinfo |
486 | 425 |
487 def getTocPage(self, mode="text", pn=1, pageinfo=None, docinfo=None): | 426 def getTocPage(self, mode="text", pn=None, start=None, size=None, pageinfo=None, docinfo=None): |
488 """returns single page from the table of contents""" | 427 """returns single page from the table of contents""" |
489 # TODO: this should use the cached TOC | 428 logging.debug("getTocPage mode=%s, pn=%s"%(mode,pn)) |
490 if mode == "text": | 429 if mode == "text": |
491 queryType = "toc" | 430 queryType = "toc" |
492 else: | 431 else: |
493 queryType = mode | 432 queryType = mode |
494 docpath = docinfo['textURLPath'] | 433 |
495 path = docinfo['textURLPath'] | 434 # check for cached TOC |
496 pagesize = pageinfo['tocPageSize'] | 435 if not docinfo.has_key('tocXML_%s'%mode): |
497 pn = pageinfo['tocPN'] | 436 self.getToc(mode=mode, docinfo=docinfo) |
498 url = docinfo['url'] | 437 |
499 selfurl = self.absolute_url() | 438 tocxml = docinfo.get('tocXML_%s'%mode, None) |
500 viewMode= pageinfo['viewMode'] | 439 if not tocxml: |
501 characterNormalization = pageinfo ['characterNormalization'] | 440 logging.error("getTocPage: unable to find tocXML") |
502 #optionToggle =pageinfo ['optionToggle'] | 441 return "Error: no table of contents!" |
503 tocMode = pageinfo['tocMode'] | 442 |
504 tocPN = pageinfo['tocPN'] | 443 if size is None: |
505 | 444 size = pageinfo.get('tocPageSize', 30) |
506 data = self.getServerData("doc-query.xql","document=%s&queryType=%s&queryResultPageSize=%s&queryResultPN=%s&characterNormalization=regPlusNorm"%(docpath,queryType, pagesize, pn)) | 445 |
507 page = data.replace('page-fragment.xql?document=%s'%str(path),'%s?url=%s&viewMode=%s&tocMode=%s&tocPN=%s'%(selfurl,url, viewMode, tocMode, tocPN)) | 446 if start is None: |
508 text = page.replace('mode=image','mode=texttool') | 447 start = (pn - 1) * size |
509 return text | 448 |
449 fulltoc = ET.fromstring(tocxml) | |
450 | |
451 if fulltoc is not None: | |
452 # paginate | |
453 first = (start - 1) * 2 | |
454 len = size * 2 | |
455 del fulltoc[:first] | |
456 del fulltoc[len:] | |
457 tocdivs = fulltoc | |
458 | |
459 # check all a-tags | |
460 links = tocdivs.findall(".//a") | |
461 for l in links: | |
462 href = l.get('href') | |
463 if href: | |
464 # take pn from href | |
465 m = re.match(r'page-fragment\.xql.*pn=(\d+)', href) | |
466 if m is not None: | |
467 # and create new url (assuming parent is documentViewer) | |
468 url = self.getLink('pn', m.group(1)) | |
469 l.set('href', url) | |
470 else: | |
471 logging.warning("getTocPage: Problem with link=%s"%href) | |
472 | |
473 # fix two-divs-per-row with containing div | |
474 newtoc = ET.Element('div', {'class':'queryResultPage'}) | |
475 for (d1,d2) in zip(tocdivs[::2],tocdivs[1::2]): | |
476 e = ET.Element('div',{'class':'tocline'}) | |
477 e.append(d1) | |
478 e.append(d2) | |
479 newtoc.append(e) | |
480 | |
481 return serialize(newtoc) | |
482 | |
483 return "ERROR: no table of contents!" | |
484 | |
510 | 485 |
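
The two-divs-per-row fix in getTocPage() pairs even- and odd-indexed children via zip() over slices. A minimal sketch of the regrouping with invented content:

    import xml.etree.ElementTree as ET

    tocdivs = ET.fromstring('<div><div>entry 1</div><div>page 1</div>'
                            '<div>entry 2</div><div>page 2</div></div>')
    newtoc = ET.Element('div', {'class': 'queryResultPage'})
    for (d1, d2) in zip(tocdivs[::2], tocdivs[1::2]):
        e = ET.Element('div', {'class': 'tocline'})
        e.append(d1)
        e.append(d2)
        newtoc.append(e)
    # newtoc now wraps each entry/page pair in one div.tocline row
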
511 def manage_changeMpdlXmlTextServer(self,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/",timeout=40,RESPONSE=None): | 486 def manage_changeMpdlXmlTextServer(self,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/",timeout=40,RESPONSE=None): |
512 #def manage_changeMpdlXmlTextServer(self,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de:30030/mpdl/interface/",timeout=40,RESPONSE=None): | |
513 """change settings""" | 487 """change settings""" |
514 self.title=title | 488 self.title=title |
515 self.timeout = timeout | 489 self.timeout = timeout |
516 self.serverUrl = serverUrl | 490 self.serverUrl = serverUrl |
517 if RESPONSE is not None: | 491 if RESPONSE is not None: |
528 """add zogiimage""" | 502 """add zogiimage""" |
529 newObj = MpdlXmlTextServer(id,title,serverUrl,timeout) | 503 newObj = MpdlXmlTextServer(id,title,serverUrl,timeout) |
530 self.Destination()._setObject(id, newObj) | 504 self.Destination()._setObject(id, newObj) |
531 if RESPONSE is not None: | 505 if RESPONSE is not None: |
532 RESPONSE.redirect('manage_main') | 506 RESPONSE.redirect('manage_main') |
507 | |
508 |
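
Taken together, the merge replaces the 4Suite (Ft.Xml) DOM API with the standard library's ElementTree; a rough correspondence of the calls seen in this diff:

    # old (Ft.Xml / 4Suite)                           new (xml.etree.ElementTree)
    # Parse(text)                                     ET.fromstring(text)
    # dom.xpath("//div[@class='x']")                  scan dom.findall("div") for get('class') == 'x'
    # l.getAttributeNodeNS(None, u"href").nodeValue   l.get('href')
    # l.setAttributeNS(None, 'target', '_blank')      l.set('target', '_blank')
    # getTextFromNode(node)                           node.text / getText() from SrvTxtUtils
    # serializeNode(node)                             serialize(node), wrapping ET.tostring()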