Changeset 513:67095296c95a in documentViewer for MpdlXmlTextServer.py
- Timestamp:
- Feb 28, 2012, 6:10:08 PM (12 years ago)
- Branch:
- default
- Parents:
- 497:73fb73577961 (diff), 512:92a6443a6f16 (diff)
Note: this is a merge changeset; the changes displayed below correspond to the merge itself.
Use the (diff) links above to see all the changes relative to each parent.
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
MpdlXmlTextServer.py
r451 r513 1 2 1 from OFS.SimpleItem import SimpleItem 3 2 from Products.PageTemplates.PageTemplateFile import PageTemplateFile 4 from Ft.Xml import EMPTY_NAMESPACE, Parse 5 from Ft.Xml.Domlette import NonvalidatingReader 6 7 import md5 8 import sys 3 4 import xml.etree.ElementTree as ET 5 6 import re 9 7 import logging 10 8 import urllib 11 import documentViewer 12 from documentViewer import getTextFromNode, serializeNode 9 import urlparse 10 import base64 11 12 from SrvTxtUtils import getInt, getText, getHttpData 13 14 def serialize(node): 15 """returns a string containing an XML snippet of node""" 16 s = ET.tostring(node, 'UTF-8') 17 # snip off XML declaration 18 if s.startswith('<?xml'): 19 i = s.find('?>') 20 return s[i+3:] 21 22 return s 23 13 24 14 25 class MpdlXmlTextServer(SimpleItem): … … 22 33 manage_changeMpdlXmlTextServerForm = PageTemplateFile("zpt/manage_changeMpdlXmlTextServer", globals()) 23 34 24 def __init__(self,id,title="",serverUrl="http://mpdl-test.mpiwg-berlin.mpg.de/mpdl/interface/", serverName=None, timeout=40): 25 #def __init__(self,id,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de:30030/mpdl/interface/", serverName=None, timeout=40): 26 35 def __init__(self,id,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/", serverName=None, timeout=40): 27 36 """constructor""" 28 37 self.id=id … … 36 45 def getHttpData(self, url, data=None): 37 46 """returns result from url+data HTTP request""" 38 return documentViewer.getHttpData(url,data,timeout=self.timeout)47 return getHttpData(url,data,timeout=self.timeout) 39 48 40 49 def getServerData(self, method, data=None): 41 50 """returns result from text server for method+data""" 42 51 url = self.serverUrl+method 43 return documentViewer.getHttpData(url,data,timeout=self.timeout) 44 45 def getSearch(self, pageinfo=None, docinfo=None): 46 """get search list""" 47 docpath = docinfo['textURLPath'] 48 url = docinfo['url'] 49 pagesize = pageinfo['queryPageSize'] 50 pn = 
pageinfo.get('searchPN',1) 51 #sn = pageinfo['sn'] 52 s = pageinfo['s'] 53 highlightElementPos =pageinfo ['highlightElementPos'] 54 highlightElement = pageinfo ['highlightElement'] 55 56 highlightQuery = pageinfo['highlightQuery'] 57 query =pageinfo['query'] 58 queryType =pageinfo['queryType'] 59 viewMode= pageinfo['viewMode'] 60 tocMode = pageinfo['tocMode'] 61 characterNormalization = pageinfo['characterNormalization'] 62 #optionToggle = pageinfo['optionToggle'] 63 tocPN = pageinfo['tocPN'] 64 selfurl = self.absolute_url() 65 data = self.getServerData("doc-query.xql","document=%s&mode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s&s=%s&viewMode=%s&characterNormalization=%s&highlightElementPos=%s&highlightElement=%s&highlightQuery=%s"%(docpath, 'text', queryType, urllib.quote(query), pagesize, pn, s, viewMode,characterNormalization, highlightElementPos, highlightElement, urllib.quote(highlightQuery))) 66 #data = self.getServerData("doc-query.xql","document=%s&mode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s&sn=%s&viewMode=%s&characterNormalization=%s&highlightQuery=%s"%(docpath, 'text', queryType, urllib.quote(query), pagesize, pn, sn, viewMode,characterNormalization, urllib.quote(highlightQuery))) 67 pagexml = data.replace('?document=%s'%str(docpath),'?url=%s'%url) 68 pagedom = Parse(pagexml) 69 70 """ 71 pagedivs = pagedom.xpath("//div[@class='queryResultHits']") 72 if (pagedivs == pagedom.xpath("//div[@class='queryResultHits']")): 73 if len(pagedivs)>0: 74 docinfo['queryResultHits'] = int(getTextFromNode(pagedivs[0])) 75 s = getTextFromNode(pagedivs[0]) 76 s1 = int(s)/10+1 77 try: 78 docinfo['queryResultHits'] = int(s1) 79 logging.debug("SEARCH ENTRIES: %s"%(s1)) 80 except: 81 docinfo['queryResultHits'] = 0 82 """ 83 if (queryType=="fulltext")or(queryType=="xpath")or(queryType=="xquery")or(queryType=="fulltextMorphLemma"): 84 pagedivs = pagedom.xpath("//div[@class='queryResultPage']") 85 if len(pagedivs)>0: 86 
pagenode=pagedivs[0] 87 links=pagenode.xpath("//a") 88 for l in links: 89 hrefNode = l.getAttributeNodeNS(None, u"href") 90 if hrefNode: 91 href = hrefNode.nodeValue 92 if href.startswith('page-fragment.xql'): 93 selfurl = self.absolute_url() 94 pagexml=href.replace('mode=text','mode=texttool&viewMode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s&tocMode=%s&searchPN=%s&tocPN=%s&characterNormalization=%s'%(viewMode,queryType,urllib.quote(query),pagesize,pn,tocMode,pn,tocPN, characterNormalization)) 95 hrefNode.nodeValue = pagexml.replace('page-fragment.xql','%s'%selfurl) 96 #logging.debug("PUREXML :%s"%(serializeNode(pagenode))) 97 return serializeNode(pagenode) 98 if (queryType=="fulltextMorph"): 99 pagedivs = pagedom.xpath("//div[@class='queryResult']") 100 if len(pagedivs)>0: 101 pagenode=pagedivs[0] 102 links=pagenode.xpath("//a") 103 for l in links: 104 hrefNode = l.getAttributeNodeNS(None, u"href") 105 if hrefNode: 106 href = hrefNode.nodeValue 107 if href.startswith('page-fragment.xql'): 108 selfurl = self.absolute_url() 109 pagexml=href.replace('mode=text','mode=texttool&viewMode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s&tocMode=%s&searchPN=%s&tocPN=%s&characterNormalization=%s'%(viewMode,queryType,urllib.quote(query),pagesize,pn,tocMode,pn,tocPN,characterNormalization)) 110 hrefNode.nodeValue = pagexml.replace('page-fragment.xql','%s'%selfurl) 111 if href.startswith('../lt/lemma.xql'): 112 hrefNode.nodeValue = href.replace('../lt/lemma.xql','%s/template/head_main_query'%(selfurl)) 113 l.setAttributeNS(None, 'target', '_blank') 114 l.setAttributeNS(None, 'onClick',"popupWin = window.open(this.href, 'contacts', 'location,width=300,height=400,top=180, left=400, scrollbars=1'); return false;") 115 l.setAttributeNS(None, 'onDblclick', 'popupWin.focus();') 116 pagedivs = pagedom.xpath("//div[@class='queryResultMorphExpansion']") 117 return serializeNode(pagenode) 118 if (queryType=="ftIndex")or(queryType=="ftIndexMorph"): 
119 pagedivs= pagedom.xpath("//div[@class='queryResultPage']") 120 if len(pagedivs)>0: 121 pagenode=pagedivs[0] 122 links=pagenode.xpath("//a") 123 for l in links: 124 hrefNode = l.getAttributeNodeNS(None, u"href") 125 if hrefNode: 126 href = hrefNode.nodeValue 127 hrefNode.nodeValue=href.replace('mode=text','mode=texttool&viewMode=%s&tocMode=%s&tocPN=%s&pn=%s&characterNormalization=%s'%(viewMode,tocMode,tocPN,pn,characterNormalization)) 128 if href.startswith('../lt/lex.xql'): 129 hrefNode.nodeValue = href.replace('../lt/lex.xql','%s/template/head_main_lex'%selfurl) 130 l.setAttributeNS(None, 'target', '_blank') 131 l.setAttributeNS(None, 'onClick',"popupWin = window.open(this.href, 'contacts', 'location,width=500,height=600,top=180, left=400, scrollbars=1'); return false;") 132 l.setAttributeNS(None, 'onDblclick', 'popupWin.focus();') 133 if href.startswith('../lt/lemma.xql'): 134 hrefNode.nodeValue = href.replace('../lt/lemma.xql','%s/template/head_main_lemma'%(selfurl)) 135 l.setAttributeNS(None, 'target', '_blank') 136 l.setAttributeNS(None, 'onClick',"popupWin = window.open(this.href, 'contacts', 'location,width=300,height=400,top=180, left=400, scrollbars=1'); return false;") 137 l.setAttributeNS(None, 'onDblclick', 'popupWin.focus();') 138 return serializeNode(pagenode) 139 return "no text here" 140 141 def getGisPlaces(self, docinfo=None, pageinfo=None): 142 """ Show all Gis Places of whole Page""" 143 xpath='//place' 52 return getHttpData(url,data,timeout=self.timeout) 53 54 55 def getPlacesOnPage(self, docinfo=None, pn=None): 56 """Returns list of GIS places of page pn""" 144 57 docpath = docinfo.get('textURLPath',None) 145 58 if not docpath: 146 59 return None 147 60 148 url = docinfo['url'] 149 selfurl = self.absolute_url() 150 pn = pageinfo['current'] 151 hrefList=[] 152 myList= "" 153 text=self.getServerData("xpath.xql", "document=%s&xpath=%s&pn=%s"%(docinfo['textURLPath'],xpath,pn)) 154 dom = Parse(text) 155 result = 
dom.xpath("//result/resultPage/place") 61 places=[] 62 text=self.getServerData("xpath.xql", "document=%s&xpath=//place&pn=%s"%(docpath,pn)) 63 dom = ET.fromstring(text) 64 result = dom.findall(".//resultPage/place") 156 65 for l in result: 157 hrefNode= l.getAttributeNodeNS(None, u"id") 158 href= hrefNode.nodeValue 159 hrefList.append(href) 160 myList = ",".join(hrefList) 161 #logging.debug("getGisPlaces :%s"%(myList)) 162 return myList 163 164 def getAllGisPlaces (self, docinfo=None, pageinfo=None): 165 """Show all Gis Places of whole Book """ 166 xpath ='//echo:place' 167 docpath =docinfo['textURLPath'] 168 url = docinfo['url'] 169 selfurl =self.absolute_url() 170 pn =pageinfo['current'] 171 hrefList=[] 172 myList="" 173 text=self.getServerData("xpath.xql", "document=%s&xpath=%s"%(docinfo['textURLPath'],xpath)) 174 dom =Parse(text) 175 result = dom.xpath("//result/resultPage/place") 176 177 for l in result: 178 hrefNode = l.getAttributeNodeNS(None, u"id") 179 href= hrefNode.nodeValue 180 hrefList.append(href) 181 myList = ",".join(hrefList) 182 #logging.debug("getALLGisPlaces :%s"%(myList)) 183 return myList 66 id = l.get("id") 67 name = l.text 68 place = {'id': id, 'name': name} 69 places.append(place) 70 71 return places 72 184 73 74 def processPageInfo(self, dom, docinfo, pageinfo): 75 """processes page info divs from dom and stores in docinfo and pageinfo""" 76 # assume first second level div is pageMeta 77 alldivs = dom.find("div") 78 79 if alldivs is None or alldivs.get('class', '') != 'pageMeta': 80 logging.error("processPageInfo: pageMeta div not found!") 81 return 82 83 for div in alldivs: 84 dc = div.get('class') 85 86 # pageNumberOrig 87 if dc == 'pageNumberOrig': 88 pageinfo['pageNumberOrig'] = div.text 89 90 # pageNumberOrigNorm 91 elif dc == 'pageNumberOrigNorm': 92 pageinfo['pageNumberOrigNorm'] = div.text 93 94 # pageHeaderTitle 95 elif dc == 'pageHeaderTitle': 96 pageinfo['pageHeaderTitle'] = div.text 97 98 # numFigureEntries 99 elif dc == 
'countFigureEntries': 100 docinfo['numFigureEntries'] = getInt(div.text) 101 102 # numTocEntries 103 elif dc == 'countTocEntries': 104 # WTF: s1 = int(s)/30+1 105 docinfo['numTocEntries'] = getInt(div.text) 106 107 # numPlaces 108 elif dc == 'countPlaces': 109 docinfo['numPlaces'] = getInt(div.text) 110 111 # numTextPages 112 elif dc == 'countPages': 113 np = getInt(div.text) 114 if np > 0: 115 docinfo['numTextPages'] = np 116 if docinfo.get('numPages', 0) == 0: 117 # seems to be text-only - update page count 118 docinfo['numPages'] = np 119 #pageinfo['end'] = min(pageinfo['end'], np) 120 pageinfo['numgroups'] = int(np / pageinfo['groupsize']) 121 if np % pageinfo['groupsize'] > 0: 122 pageinfo['numgroups'] += 1 123 124 #logging.debug("processPageInfo: pageinfo=%s"%repr(pageinfo)) 125 return 126 185 127 186 def getTextPage(self, mode="text _dict", pn=1, docinfo=None, pageinfo=None):128 def getTextPage(self, mode="text", pn=1, docinfo=None, pageinfo=None): 187 129 """returns single page from fulltext""" 130 131 logging.debug("getTextPage mode=%s, pn=%s"%(mode,pn)) 132 # check for cached text -- but ideally this shouldn't be called twice 133 if pageinfo.has_key('textPage'): 134 logging.debug("getTextPage: using cached text") 135 return pageinfo['textPage'] 136 188 137 docpath = docinfo['textURLPath'] 189 path = docinfo['textURLPath'] 190 url = docinfo.get('url',None) 191 name = docinfo.get('name',None) 192 pn =pageinfo['current'] 193 #sn = pageinfo['sn'] 194 s = pageinfo['s'] 195 highlightElementPos =pageinfo ['highlightElementPos'] 196 highlightElement = pageinfo ['highlightElement'] 197 #optionToggle =pageinfo ['optionToggle'] 198 highlightQuery = pageinfo['highlightQuery'] 199 #mode = pageinfo ['viewMode'] 200 tocMode = pageinfo['tocMode'] 201 xpointer = pageinfo['xpointer'] 202 characterNormalization=pageinfo['characterNormalization'] 203 tocPN = pageinfo['tocPN'] 204 selfurl = self.absolute_url() 205 206 if mode == "text_dict": 207 textmode = "textPollux" 138 # 
just checking 139 if pageinfo['current'] != pn: 140 logging.warning("getTextPage: current!=pn!") 141 142 # stuff for constructing full urls 143 selfurl = docinfo['viewerUrl'] 144 textParams = {'document': docpath, 145 'pn': pn} 146 if 'characterNormalization' in pageinfo: 147 textParams['characterNormalization'] = pageinfo['characterNormalization'] 148 149 if not mode: 150 # default is dict 151 mode = 'text' 152 153 modes = mode.split(',') 154 # check for multiple layers 155 if len(modes) > 1: 156 logging.debug("getTextPage: more than one mode=%s"%mode) 157 158 # search mode 159 if 'search' in modes: 160 # add highlighting 161 highlightQuery = pageinfo.get('highlightQuery', None) 162 if highlightQuery: 163 textParams['highlightQuery'] = highlightQuery 164 textParams['highlightElement'] = pageinfo.get('highlightElement', '') 165 textParams['highlightElementPos'] = pageinfo.get('highlightElementPos', '') 166 167 # ignore mode in the following 168 modes.remove('search') 169 170 # other modes don't combine 171 if 'dict' in modes: 172 # dict is called textPollux in the backend 173 textmode = 'textPollux' 174 elif len(modes) == 0: 175 # text is default mode 176 textmode = 'text' 208 177 else: 209 textmode = mode 210 211 textParam = "document=%s&mode=%s&pn=%s&characterNormalization=%s&xpointer=%s&options=withIdentifier"%(docpath,textmode,pn,characterNormalization, xpointer) 212 if highlightQuery is not None: 213 #textParam +="&highlightQuery=%s&sn=%s"%(urllib.quote(highlightQuery),sn) 214 textParam +="&highlightQuery=%s&s=%s&highlightElement=%s&highlightElementPos=%s"%(urllib.quote(highlightQuery),s, highlightElement, highlightElementPos) 215 216 pagexml = self.getServerData("page-fragment.xql",textParam) 217 dom = Parse(pagexml) 218 #dom = NonvalidatingReader.parseStream(pagexml) 219 220 #original Pages 221 pagedivs = dom.xpath("//div[@class='pageNumberOrig']") 222 223 """if pagedivs == dom.xpath("//div[@class='pageNumberOrig']"): 224 if len(pagedivs)>0: 225 
docinfo['pageNumberOrig']= getTextFromNode(pagedivs[0]) 226 logging.debug("ORIGINAL PAGE: %s"%(docinfo['pageNumberOrig'])) 227 228 #original Pages Norm 229 pagedivs = dom.xpath("//div[@class='pageNumberOrigNorm']") 230 if pagedivs == dom.xpath("//div[@class='pageNumberOrigNorm']"): 231 if len(pagedivs)>0: 232 docinfo['pageNumberOrigNorm']= getTextFromNode(pagedivs[0]) 233 logging.debug("ORIGINAL PAGE NORM: %s"%(docinfo['pageNumberOrigNorm'])) 234 """ 235 #figureEntries 236 pagedivs = dom.xpath("//div[@class='countFigureEntries']") 237 if pagedivs == dom.xpath("//div[@class='countFigureEntries']"): 238 if len(pagedivs)>0: 239 docinfo['countFigureEntries'] = getTextFromNode(pagedivs[0]) 240 s = getTextFromNode(pagedivs[0]) 241 if s=='0': 242 try: 243 docinfo['countFigureEntries'] = int(s) 244 except: 245 docinfo['countFigureEntries'] = 0 246 else: 247 s1 = int(s)/30+1 248 try: 249 docinfo['countFigureEntries'] = int(s1) 250 except: 251 docinfo['countFigureEntries'] = 0 252 253 #allPlaces 254 pagedivs = dom.xpath("//div[@class='countPlaces']") 255 if pagedivs == dom.xpath("//div[@class='countPlaces']"): 256 if len(pagedivs)>0: 257 docinfo['countPlaces']= getTextFromNode(pagedivs[0]) 258 s = getTextFromNode(pagedivs[0]) 259 try: 260 docinfo['countPlaces'] = int(s) 261 except: 262 docinfo['countPlaces'] = 0 263 264 #tocEntries 265 pagedivs = dom.xpath("//div[@class='countTocEntries']") 266 if pagedivs == dom.xpath("//div[@class='countTocEntries']"): 267 if len(pagedivs)>0: 268 docinfo['countTocEntries'] = int(getTextFromNode(pagedivs[0])) 269 s = getTextFromNode(pagedivs[0]) 270 if s=='0': 271 try: 272 docinfo['countTocEntries'] = int(s) 273 except: 274 docinfo['countTocEntries'] = 0 275 else: 276 s1 = int(s)/30+1 277 try: 278 docinfo['countTocEntries'] = int(s1) 279 except: 280 docinfo['countTocEntries'] = 0 281 282 #numTextPages 283 pagedivs = dom.xpath("//div[@class='countPages']") 284 if pagedivs == dom.xpath("//div[@class='countPages']"): 285 if len(pagedivs)>0: 
286 docinfo['numPages'] = getTextFromNode(pagedivs[0]) 287 s = getTextFromNode(pagedivs[0]) 288 289 try: 290 docinfo['numPages'] = int(s) 291 #logging.debug("PAGE NUMBER: %s"%(s)) 292 293 np = docinfo['numPages'] 294 pageinfo['end'] = min(pageinfo['end'], np) 295 pageinfo['numgroups'] = int(np / pageinfo['groupsize']) 296 if np % pageinfo['groupsize'] > 0: 297 pageinfo['numgroups'] += 1 298 except: 299 docinfo['numPages'] = 0 300 301 else: 302 #no full text -- init to 0 303 docinfo['pageNumberOrig'] = 0 304 docinfo['countFigureEntries'] = 0 305 docinfo['countPlaces'] = 0 306 docinfo['countTocEntries'] = 0 307 docinfo['numPages'] = 0 308 docinfo['pageNumberOrigNorm'] = 0 309 #return docinfo 178 # just take first mode 179 textmode = modes[0] 180 181 textParams['mode'] = textmode 182 183 # fetch the page 184 pagexml = self.getServerData("page-fragment.xql",urllib.urlencode(textParams)) 185 dom = ET.fromstring(pagexml) 186 # extract additional info 187 self.processPageInfo(dom, docinfo, pageinfo) 188 # page content is in <div class="pageContent"> 189 pagediv = None 190 # ElementTree 1.2 in Python 2.6 can't do div[@class='pageContent'] 191 # so we look at the second level divs 192 alldivs = dom.findall("div") 193 for div in alldivs: 194 dc = div.get('class') 195 # page content div 196 if dc == 'pageContent': 197 pagediv = div 198 break 310 199 311 200 # plain text mode 312 if mode == "text": 313 # first div contains text 314 pagedivs = dom.xpath("/div") 315 if len(pagedivs) > 0: 316 pagenode = pagedivs[0] 317 links = pagenode.xpath("//a") 201 if textmode == "text": 202 # get full url assuming documentViewer is parent 203 selfurl = self.getLink() 204 if pagediv is not None: 205 links = pagediv.findall(".//a") 318 206 for l in links: 319 hrefNode = l.getAttributeNodeNS(None, u"href") 320 if hrefNode: 321 href= hrefNode.nodeValue 322 if href.startswith('#note-'): 323 hrefNode.nodeValue = 
href.replace('#note-',"?url=%s&viewMode=text&tocMode=%s&tocPN=%s&pn=%s#note-"%(url,tocMode,tocPN,pn)) 324 #if href.startswith(): 325 return serializeNode(pagenode) 326 if mode == "xml": 327 # first div contains text 328 pagedivs = dom.xpath("/div") 329 if len(pagedivs) > 0: 330 pagenode = pagedivs[0] 331 return serializeNode(pagenode) 332 if mode == "gis": 333 # first div contains text 334 pagedivs = dom.xpath("/div") 335 if len(pagedivs) > 0: 336 pagenode = pagedivs[0] 337 links =pagenode.xpath("//a") 338 for l in links: 339 hrefNode =l.getAttributeNodeNS(None, u"href") 340 if hrefNode: 341 href=hrefNode.nodeValue 342 if href.startswith('http://mappit.mpiwg-berlin.mpg.de'): 343 hrefNode.nodeValue =href.replace('db/REST/db/chgis/mpdl','db/RESTdb/db/mpdl/%s'%name) 344 l.setAttributeNS(None, 'target', '_blank') 345 return serializeNode(pagenode) 207 href = l.get('href') 208 if href and href.startswith('#note-'): 209 href = href.replace('#note-',"%s#note-"%selfurl) 210 l.set('href', href) 211 212 return serialize(pagediv) 213 214 # text-with-links mode 215 elif textmode == "textPollux": 216 if pagediv is not None: 217 viewerurl = docinfo['viewerUrl'] 218 selfurl = self.getLink() 219 # check all a-tags 220 links = pagediv.findall(".//a") 221 for l in links: 222 href = l.get('href') 346 223 347 if mode == "pureXml": 348 # first div contains text 349 pagedivs = dom.xpath("/div") 350 if len(pagedivs) > 0: 351 pagenode = pagedivs[0] 352 return serializeNode(pagenode) 353 # text-with-links mode 354 if mode == "text_dict": 355 # first div contains text 356 #mode = pageinfo ['viewMode'] 357 pagedivs = dom.xpath("/div") 358 if len(pagedivs) > 0: 359 pagenode = pagedivs[0] 360 # check all a-tags 361 links = pagenode.xpath("//a") 362 363 for l in links: 364 hrefNode = l.getAttributeNodeNS(None, u"href") 365 366 if hrefNode: 224 if href: 367 225 # is link with href 368 href = hrefNode.nodeValue 369 if 
href.startswith('http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/lt/wordInfo.xql'): 370 # is pollux link 371 selfurl = self.absolute_url() 372 # change href 373 hrefNode.nodeValue = href.replace('http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/lt/wordInfo.xql','%s/head_main_voc'%selfurl) 374 # add target 375 l.setAttributeNS(None, 'target', '_blank') 376 #l.setAttributeNS(None, 'onclick',"popupWin = window.open(this.href, 'InfoWindow', 'menubar=no, location,width=500,height=600,top=180, left=700, toolbar=no, scrollbars=1'); return false;") 377 #l.setAttributeNS(None, "ondblclick", "popupWin.focus();") 378 #window.open("this.href, 'InfoWindow', 'menubar=no, location,width=500,height=600,top=180, left=700, toolbar=yes, scrollbars=1'"); return false;") 226 linkurl = urlparse.urlparse(href) 227 #logging.debug("getTextPage: linkurl=%s"%repr(linkurl)) 228 if linkurl.path.endswith('GetDictionaryEntries'): 229 #TODO: replace wordInfo page 230 # is dictionary link - change href (keeping parameters) 231 #l.set('href', href.replace('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/lt/wordInfo.xql','%s/template/viewer_wordinfo'%viewerurl)) 232 # add target to open new page 233 l.set('target', '_blank') 379 234 380 if href.startswith('http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/lt/lemma.xql'): 381 selfurl = self.absolute_url() 382 hrefNode.nodeValue = href.replace('http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/lt/lemma.xql','%s/head_main_lemma'%selfurl) 383 l.setAttributeNS(None, 'target', '_blank') 384 l.setAttributeNS(None, 'onclick',"popupWin = window.open(this.href, 'InfoWindow', 'menubar=no, location,width=300,height=400,top=180, left=700, toolbar=no, scrollbars=1'); return false;") 385 l.setAttributeNS(None, 'ondblclick', 'popupWin.focus();') 235 # TODO: is this needed? 
236 # if href.startswith('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/lt/lemma.xql'): 237 # selfurl = self.absolute_url() 238 # l.set('href', href.replace('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/lt/lemma.xql','%s/head_main_lemma'%selfurl)) 239 # l.set('target', '_blank') 240 # l.set('onclick',"popupWin = window.open(this.href, 'InfoWindow', 'menubar=no, location,width=500,height=600,top=180, left=700, toolbar=no, scrollbars=1'); return false;") 241 # l.set('ondblclick', 'popupWin.focus();') 386 242 387 243 if href.startswith('#note-'): 388 hrefNode.nodeValue = href.replace('#note-',"?url=%s&viewMode=text_dict&tocMode=%s&tocPN=%s&pn=%s#note-"%(url,tocMode,tocPN,pn)) 244 # note link 245 l.set('href', href.replace('#note-',"%s#note-"%selfurl)) 389 246 390 return serializeNode(pagenode) 391 return "no text here" 392 393 def getOrigPages(self, docinfo=None, pageinfo=None): 247 return serialize(pagediv) 248 249 # xml mode 250 elif textmode == "xml": 251 if pagediv is not None: 252 return serialize(pagediv) 253 254 # pureXml mode 255 elif textmode == "pureXml": 256 if pagediv is not None: 257 return serialize(pagediv) 258 259 # gis mode 260 elif textmode == "gis": 261 if pagediv is not None: 262 # check all a-tags 263 links = pagediv.findall(".//a") 264 # add our URL as backlink 265 selfurl = self.getLink() 266 doc = base64.b64encode(selfurl) 267 for l in links: 268 href = l.get('href') 269 if href: 270 if href.startswith('http://mappit.mpiwg-berlin.mpg.de'): 271 l.set('href', re.sub(r'doc=[\w+/=]+', 'doc=%s'%doc, href)) 272 l.set('target', '_blank') 273 274 return serialize(pagediv) 275 276 return None 277 278 279 def getSearchResults(self, mode, query=None, pageinfo=None, docinfo=None): 280 """loads list of search results and stores XML in docinfo""" 281 282 logging.debug("getSearchResults mode=%s query=%s"%(mode, query)) 283 if mode == "none": 284 return docinfo 285 286 cachedQuery = docinfo.get('cachedQuery', None) 287 if cachedQuery is not None: 288 # cached 
search result 289 if cachedQuery == '%s_%s'%(mode,query): 290 # same query 291 return docinfo 292 293 else: 294 # different query 295 del docinfo['resultSize'] 296 del docinfo['resultXML'] 297 298 # cache query 299 docinfo['cachedQuery'] = '%s_%s'%(mode,query) 300 301 # fetch full results 394 302 docpath = docinfo['textURLPath'] 395 pn =pageinfo['current'] 396 selfurl = self.absolute_url() 397 pagexml = self.getServerData("page-fragment.xql","document=%s&pn=%s"%(docpath, pn)) 398 dom = Parse(pagexml) 399 pagedivs = dom.xpath("//div[@class='pageNumberOrig']") 400 if pagedivs == dom.xpath("//div[@class='pageNumberOrig']"): 401 if len(pagedivs)>0: 402 docinfo['pageNumberOrig']= getTextFromNode(pagedivs[0]) 403 return docinfo['pageNumberOrig'] 404 405 def getOrigPagesNorm(self, docinfo=None, pageinfo=None): 406 docpath = docinfo['textURLPath'] 407 pn =pageinfo['current'] 408 selfurl = self.absolute_url() 409 pagexml = self.getServerData("page-fragment.xql","document=%s&pn=%s"%(docpath, pn)) 410 dom = Parse(pagexml) 411 pagedivs = dom.xpath("//div[@class='pageNumberOrigNorm']") 412 if pagedivs == dom.xpath("//div[@class='pageNumberOrigNorm']"): 413 if len(pagedivs)>0: 414 docinfo['pageNumberOrigNorm']= getTextFromNode(pagedivs[0]) 415 return docinfo['pageNumberOrigNorm'] 416 417 418 def getTranslate(self, word=None, language=None, display=None): 419 """translate into another languages""" 420 data = self.getServerData("lt/wordInfo.xql","language="+str(language)+"&word="+urllib.quote(word)+"&display="+urllib.quote(display)+"&output=html") 421 #pagexml=self.template.fulltextclient.eval("/mpdl/interface/lt/lex.xql","document=&language="+str(language)+"&query="+url_quote(str(query))) 422 return data 423 424 def getLemma(self, lemma=None, language=None): 425 """simular words lemma """ 426 data = self.getServerData("lt/lemma.xql","language="+str(language)+"&lemma="+urllib.quote(lemma)+"&output=html") 427 return data 428 429 def getLemmaQuery(self, query=None, language=None): 
430 """simular words lemma """ 431 data = self.getServerData("lt/lemma.xql","language="+str(language)+"&query="+urllib.quote(query)+"&output=html") 432 return data 433 434 def getLex(self, query=None, language=None): 435 #simular words lemma 436 data = self.getServerData("lt/lex.xql","document=&language="+str(language)+"&query="+urllib.quote(query)) 437 return data 438 439 def getQuery (self, docinfo=None, pageinfo=None, query=None, queryType=None, pn=1): 440 #number of 441 docpath = docinfo['textURLPath'] 442 pagesize = pageinfo['queryPageSize'] 443 pn = pageinfo['searchPN'] 444 query =pageinfo['query'] 445 queryType =pageinfo['queryType'] 446 tocSearch = 0 447 tocDiv = None 448 449 pagexml = self.getServerData("doc-query.xql","document=%s&mode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s"%(docpath, 'text', queryType, urllib.quote(query), pagesize, pn)) 450 pagedom = Parse(pagexml) 451 numdivs = pagedom.xpath("//div[@class='queryResultHits']") 452 tocSearch = int(getTextFromNode(numdivs[0])) 453 tc=int((tocSearch/10)+1) 454 return tc 455 303 params = {'document': docpath, 304 'mode': 'text', 305 'queryType': mode, 306 'query': query, 307 'queryResultPageSize': 1000, 308 'queryResultPN': 1, 309 'characterNormalization': pageinfo.get('characterNormalization', 'reg')} 310 pagexml = self.getServerData("doc-query.xql",urllib.urlencode(params)) 311 #pagexml = self.getServerData("doc-query.xql","document=%s&mode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s&s=%s&viewMode=%s&characterNormalization=%s&highlightElementPos=%s&highlightElement=%s&highlightQuery=%s"%(docpath, 'text', queryType, urllib.quote(query), pagesize, pn, s, viewMode,characterNormalization, highlightElementPos, highlightElement, urllib.quote(highlightQuery))) 312 dom = ET.fromstring(pagexml) 313 # page content is in <div class="queryResultPage"> 314 pagediv = None 315 # ElementTree 1.2 in Python 2.6 can't do div[@class='queryResultPage'] 316 alldivs = 
dom.findall("div") 317 for div in alldivs: 318 dc = div.get('class') 319 # page content div 320 if dc == 'queryResultPage': 321 pagediv = div 322 323 elif dc == 'queryResultHits': 324 docinfo['resultSize'] = getInt(div.text) 325 326 if pagediv is not None: 327 # store XML in docinfo 328 docinfo['resultXML'] = ET.tostring(pagediv, 'UTF-8') 329 330 return docinfo 331 332 333 def getResultsPage(self, mode="text", query=None, pn=None, start=None, size=None, pageinfo=None, docinfo=None): 334 """returns single page from the table of contents""" 335 logging.debug("getResultsPage mode=%s, pn=%s"%(mode,pn)) 336 # check for cached result 337 if not 'resultXML' in docinfo: 338 self.getSearchResults(mode=mode, query=query, pageinfo=pageinfo, docinfo=docinfo) 339 340 resultxml = docinfo.get('resultXML', None) 341 if not resultxml: 342 logging.error("getResultPage: unable to find resultXML") 343 return "Error: no result!" 344 345 if size is None: 346 size = pageinfo.get('resultPageSize', 10) 347 348 if start is None: 349 start = (pn - 1) * size 350 351 fullresult = ET.fromstring(resultxml) 352 353 if fullresult is not None: 354 # paginate 355 first = start-1 356 len = size 357 del fullresult[:first] 358 del fullresult[len:] 359 tocdivs = fullresult 360 361 # check all a-tags 362 links = tocdivs.findall(".//a") 363 for l in links: 364 href = l.get('href') 365 if href: 366 # assume all links go to pages 367 linkUrl = urlparse.urlparse(href) 368 linkParams = urlparse.parse_qs(linkUrl.query) 369 # take some parameters 370 params = {'pn': linkParams['pn'], 371 'highlightQuery': linkParams.get('highlightQuery',''), 372 'highlightElement': linkParams.get('highlightElement',''), 373 'highlightElementPos': linkParams.get('highlightElementPos','') 374 } 375 url = self.getLink(params=params) 376 l.set('href', url) 377 378 return serialize(tocdivs) 379 380 return "ERROR: no results!" 
381 382 456 383 def getToc(self, mode="text", docinfo=None): 457 """loads table of contents and stores in docinfo""" 384 """loads table of contents and stores XML in docinfo""" 385 logging.debug("getToc mode=%s"%mode) 458 386 if mode == "none": 459 return docinfo 387 return docinfo 388 460 389 if 'tocSize_%s'%mode in docinfo: 461 390 # cached toc … … 473 402 tocSize = 0 474 403 tocDiv = None 475 404 # fetch full toc 476 405 pagexml = self.getServerData("doc-query.xql","document=%s&queryType=%s&queryResultPageSize=%s&queryResultPN=%s"%(docpath,queryType, pagesize, pn)) 477 478 # post-processing downloaded xml 479 pagedom = Parse(pagexml) 480 # get number of entries 481 numdivs = pagedom.xpath("//div[@class='queryResultHits']") 482 if len(numdivs) > 0: 483 tocSize = int(getTextFromNode(numdivs[0])) 484 docinfo['tocSize_%s'%mode] = tocSize 406 dom = ET.fromstring(pagexml) 407 # page content is in <div class="queryResultPage"> 408 pagediv = None 409 # ElementTree 1.2 in Python 2.6 can't do div[@class='queryResultPage'] 410 alldivs = dom.findall("div") 411 for div in alldivs: 412 dc = div.get('class') 413 # page content div 414 if dc == 'queryResultPage': 415 pagediv = div 416 417 elif dc == 'queryResultHits': 418 docinfo['tocSize_%s'%mode] = getInt(div.text) 419 420 if pagediv is not None: 421 # store XML in docinfo 422 docinfo['tocXML_%s'%mode] = ET.tostring(pagediv, 'UTF-8') 423 485 424 return docinfo 486 425 487 def getTocPage(self, mode="text", pn= 1, pageinfo=None, docinfo=None):426 def getTocPage(self, mode="text", pn=None, start=None, size=None, pageinfo=None, docinfo=None): 488 427 """returns single page from the table of contents""" 489 # TODO: this should use the cached TOC428 logging.debug("getTocPage mode=%s, pn=%s"%(mode,pn)) 490 429 if mode == "text": 491 430 queryType = "toc" 492 431 else: 493 432 queryType = mode 494 docpath = docinfo['textURLPath'] 495 path = docinfo['textURLPath'] 496 pagesize = pageinfo['tocPageSize'] 497 pn = pageinfo['tocPN'] 498 
url = docinfo['url'] 499 selfurl = self.absolute_url() 500 viewMode= pageinfo['viewMode'] 501 characterNormalization = pageinfo ['characterNormalization'] 502 #optionToggle =pageinfo ['optionToggle'] 503 tocMode = pageinfo['tocMode'] 504 tocPN = pageinfo['tocPN'] 505 506 data = self.getServerData("doc-query.xql","document=%s&queryType=%s&queryResultPageSize=%s&queryResultPN=%s&characterNormalization=regPlusNorm"%(docpath,queryType, pagesize, pn)) 507 page = data.replace('page-fragment.xql?document=%s'%str(path),'%s?url=%s&viewMode=%s&tocMode=%s&tocPN=%s'%(selfurl,url, viewMode, tocMode, tocPN)) 508 text = page.replace('mode=image','mode=texttool') 509 return text 433 434 # check for cached TOC 435 if not docinfo.has_key('tocXML_%s'%mode): 436 self.getToc(mode=mode, docinfo=docinfo) 437 438 tocxml = docinfo.get('tocXML_%s'%mode, None) 439 if not tocxml: 440 logging.error("getTocPage: unable to find tocXML") 441 return "Error: no table of contents!" 442 443 if size is None: 444 size = pageinfo.get('tocPageSize', 30) 445 446 if start is None: 447 start = (pn - 1) * size 448 449 fulltoc = ET.fromstring(tocxml) 450 451 if fulltoc is not None: 452 # paginate 453 first = (start - 1) * 2 454 len = size * 2 455 del fulltoc[:first] 456 del fulltoc[len:] 457 tocdivs = fulltoc 458 459 # check all a-tags 460 links = tocdivs.findall(".//a") 461 for l in links: 462 href = l.get('href') 463 if href: 464 # take pn from href 465 m = re.match(r'page-fragment\.xql.*pn=(\d+)', href) 466 if m is not None: 467 # and create new url (assuming parent is documentViewer) 468 url = self.getLink('pn', m.group(1)) 469 l.set('href', url) 470 else: 471 logging.warning("getTocPage: Problem with link=%s"%href) 472 473 # fix two-divs-per-row with containing div 474 newtoc = ET.Element('div', {'class':'queryResultPage'}) 475 for (d1,d2) in zip(tocdivs[::2],tocdivs[1::2]): 476 e = ET.Element('div',{'class':'tocline'}) 477 e.append(d1) 478 e.append(d2) 479 newtoc.append(e) 480 481 return 
serialize(newtoc) 482 483 return "ERROR: no table of contents!" 484 510 485 511 486 def manage_changeMpdlXmlTextServer(self,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/",timeout=40,RESPONSE=None): 512 #def manage_changeMpdlXmlTextServer(self,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de:30030/mpdl/interface/",timeout=40,RESPONSE=None):513 487 """change settings""" 514 488 self.title=title … … 531 505 if RESPONSE is not None: 532 506 RESPONSE.redirect('manage_main') 507 508
Note: See TracChangeset for help on using the changeset viewer.