documentViewer/MpdlXmlTextServer.py - view

File: [Repository] / documentViewer / MpdlXmlTextServer.py
Revision 1.8: download - view: text, annotated - select for diffs - revision graph
Thu Aug 12 10:08:24 2010 UTC (13 years, 10 months ago) by abukhman
Branches: MAIN
CVS tags: HEAD

characterNormalization (reg,norm, none)

from OFS.SimpleItem import SimpleItem
from Products.PageTemplates.PageTemplateFile import PageTemplateFile

from Ft.Xml import EMPTY_NAMESPACE, Parse

import sys
import logging
import urllib
import documentViewer
from documentViewer import getTextFromNode, serializeNode


# onClick handler that opens a link in a popup window; %s is the left offset
_POPUP_ONCLICK = "popupWin = window.open(this.href, 'contacts', 'location,width=500,height=600,top=180, left=%s, scrollbars=1'); return false;"

# modes of getTextPage that return the first div of the page fragment
_TEXT_PAGE_MODES = ("text", "xml", "gis", "pureXml", "text_dict")


def _hrefNodes(node):
    """yields (a-element, href attribute node) for every link with a href.

    The XPath "//a" is evaluated with node as context, exactly as the
    inlined loops did before.
    """
    for link in node.xpath("//a"):
        hrefNode = link.getAttributeNodeNS(None, u"href")
        if hrefNode:
            yield link, hrefNode


def _markAsPopup(link, left):
    """sets the target and onClick attributes used for dictionary/lemma links.

    left is the horizontal offset put into the window.open() feature string.
    """
    link.setAttributeNS(None, 'target', '_blank')
    link.setAttributeNS(None, 'onClick', _POPUP_ONCLICK % left)
    # NOTE(review): this second assignment uses the same attribute name, so it
    # presumably replaces the window.open() handler set just above. Kept as is
    # to preserve the current output -- confirm which handler is intended.
    link.setAttributeNS(None, 'onClick', 'popupWin.focus();')


class MpdlXmlTextServer(SimpleItem):
    """TextServer implementation for MPDL-XML eXist server"""
    meta_type="MPDL-XML TextServer"

    manage_options=(
       {'label':'Config','action':'manage_changeMpdlXmlTextServerForm'},
       )+SimpleItem.manage_options

    manage_changeMpdlXmlTextServerForm = PageTemplateFile("zpt/manage_changeMpdlXmlTextServer", globals())

    def __init__(self,id,title="",serverUrl="http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/", serverName=None, timeout=40):
        """constructor

        If serverName is given the server URL is built from it and
        serverUrl is ignored.
        """
        self.id=id
        self.title=title
        self.timeout = timeout
        if serverName is None:
            self.serverUrl = serverUrl
        else:
            self.serverUrl = "http://%s/mpdl/interface/"%serverName

    def getHttpData(self, url, data=None):
        """returns result from url+data HTTP request"""
        return documentViewer.getHttpData(url,data,timeout=self.timeout)

    def getServerData(self, method, data=None):
        """returns result from text server for method+data"""
        return self.getHttpData(self.serverUrl+method, data)

    def getSearch(self, pn=1, pageinfo=None, docinfo=None, query=None, queryType=None, lemma=None):
        """get search list

        Runs doc-query.xql for the query described in pageinfo and returns
        the serialized result div with its links rewritten to point back to
        this object. Returns "no text here" when the query type is not
        handled or the expected result div is missing.

        The pn, query and queryType arguments are overridden by the values
        in pageinfo; lemma is not used.
        """
        docpath = docinfo['textURLPath']
        url = docinfo['url']
        logging.debug("documentViewer (gettoc) docpath: %s", docpath)
        logging.debug("documentViewer (gettoc) url: %s", url)
        pagesize = pageinfo['queryPageSize']
        pn = pageinfo['searchPN']
        sn = pageinfo['sn']
        highlightQuery = pageinfo['highlightQuery']
        query = pageinfo['query']
        queryType = pageinfo['queryType']
        viewMode = pageinfo['viewMode']
        tocMode = pageinfo['tocMode']
        characterNormalization = pageinfo['characterNormalization']
        tocPN = pageinfo['tocPN']
        selfurl = self.absolute_url()

        def fragmentHref(href):
            """rewrites a page-fragment.xql href into a link to this object"""
            texttool = href.replace('mode=text','mode=texttool&viewMode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s&tocMode=%s&searchPN=%s&tocPN=%s&characterNormalization=%s'%(viewMode,queryType,query,pagesize,pn,tocMode,pn,tocPN,characterNormalization))
            return texttool.replace('page-fragment.xql','%s'%selfurl)

        data = self.getServerData("doc-query.xql","document=%s&mode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s&sn=%s&viewMode=%s&highlightQuery=%s"%(docpath, 'text', queryType, query, pagesize, pn, sn, viewMode,highlightQuery))

        pagexml = data.replace('?document=%s'%str(docpath),'?url=%s'%url)
        pagedom = Parse(pagexml)

        if queryType in ("fulltext", "xpath", "xquery", "fulltextMorphLemma"):
            pagedivs = pagedom.xpath("//div[@class='queryResultPage']")
            if len(pagedivs) > 0:
                pagenode = pagedivs[0]
                for link, hrefNode in _hrefNodes(pagenode):
                    href = hrefNode.nodeValue
                    if href.startswith('page-fragment.xql'):
                        hrefNode.nodeValue = fragmentHref(href)
                return serializeNode(pagenode)

        if queryType == "fulltextMorph":
            pagedivs = pagedom.xpath("//div[@class='queryResult']")
            if len(pagedivs) > 0:
                pagenode = pagedivs[0]
                for link, hrefNode in _hrefNodes(pagenode):
                    href = hrefNode.nodeValue
                    if href.startswith('page-fragment.xql'):
                        hrefNode.nodeValue = fragmentHref(href)
                    if href.startswith('../lt/lemma.xql'):
                        hrefNode.nodeValue = href.replace('../lt/lemma.xql','%s/template/head_main_lemma_New'%(selfurl))
                        _markAsPopup(link, 400)
                return serializeNode(pagenode)

        if queryType in ("ftIndex", "ftIndexMorph"):
            pagedivs = pagedom.xpath("//div[@class='queryResultPage']")
            if len(pagedivs) > 0:
                pagenode = pagedivs[0]
                for link, hrefNode in _hrefNodes(pagenode):
                    href = hrefNode.nodeValue
                    hrefNode.nodeValue = href.replace('mode=text','mode=texttool&viewMode=%s&tocMode=%s&tocPN=%s&pn=%s&characterNormalization=%s'%(viewMode,tocMode,tocPN,pn,characterNormalization))
                    # dictionary and lemma links are rebuilt from the original
                    # href, which replaces the value assigned just above
                    if href.startswith('../lt/lex.xql'):
                        hrefNode.nodeValue = href.replace('../lt/lex.xql','%s/template/head_main_voc'%selfurl)
                        _markAsPopup(link, 400)
                    if href.startswith('../lt/lemma.xql'):
                        hrefNode.nodeValue = href.replace('../lt/lemma.xql','%s/template/head_main_lemma'%selfurl)
                        _markAsPopup(link, 400)
                return serializeNode(pagenode)

        return "no text here"

    def getNumPages(self, docinfo):
        """get list of pages from fulltext and put in docinfo

        Stores the number of "<pb " occurrences in the server response as
        docinfo['numPages'] unless that key is already present.
        """
        if 'numPages' in docinfo:
            # already there
            return docinfo

        xquery = '//pb'
        text = self.getServerData("xquery.xql","document=%s&xquery=%s"%(docinfo['textURLPath'],xquery))
        docinfo['numPages'] = text.count("<pb ")
        return docinfo

    def getTextPage(self, mode="text", pn=1, docinfo=None, pageinfo=None, highlightQuery=None,sn=None, viewMode=None, tocMode=None, tocPN=None, characterNormalization=None):
        """returns single page from fulltext

        Fetches page pn with page-fragment.xql and returns the serialized
        first div. In the modes "text" and "text_dict" note links are
        rewritten, in "text_dict" dictionary and lemma links as well.
        Returns "no text here" for other modes or when there is no div.

        The viewMode, tocMode, tocPN and characterNormalization arguments
        are overridden by the values in pageinfo.
        """
        docpath = docinfo['textURLPath']
        url = docinfo['url']
        viewMode = pageinfo['viewMode']
        tocMode = pageinfo['tocMode']
        characterNormalization = pageinfo['characterNormalization']
        tocPN = pageinfo['tocPN']
        selfurl = self.absolute_url()
        # text_dict pages are requested from the server as textPollux
        if mode == "text_dict":
            textmode = "textPollux"
        else:
            textmode = mode

        textParam = "document=%s&mode=%s&pn=%s"%(docpath,textmode,pn)
        if highlightQuery is not None:
            textParam += "&highlightQuery=%s&sn=%s"%(highlightQuery,sn)

        pagexml = self.getServerData("page-fragment.xql",textParam)
        pagedom = Parse(pagexml)

        if mode not in _TEXT_PAGE_MODES:
            return "no text here"

        # first div contains text
        pagedivs = pagedom.xpath("/div")
        if len(pagedivs) == 0:
            return "no text here"

        pagenode = pagedivs[0]
        if mode in ("text", "text_dict"):
            for link, hrefNode in _hrefNodes(pagenode):
                href = hrefNode.nodeValue
                if mode == "text_dict":
                    if href.startswith('lt/lex.xql'):
                        # is pollux link
                        hrefNode.nodeValue = href.replace('lt/lex.xql','%s/template/head_main_voc'%selfurl)
                        _markAsPopup(link, 700)
                    if href.startswith('lt/lemma.xql'):
                        hrefNode.nodeValue = href.replace('lt/lemma.xql','%s/template/head_main_lemma'%selfurl)
                        _markAsPopup(link, 700)
                if href.startswith('#note-'):
                    hrefNode.nodeValue = href.replace('#note-',"?url=%s&viewMode=%s&tocMode=%s&characterNormalization=%s&tocPN=%s&pn=%s#note-"%(url,viewMode,tocMode,characterNormalization,tocPN,pn))

        return serializeNode(pagenode)

    def getTranslate(self, query=None, language=None):
        """returns the result of lt/lex.xql for query in the given language"""
        data = self.getServerData("lt/lex.xql","document=&language="+str(language)+"&query="+urllib.quote(query))
        return data

    def getLemma(self, lemma=None, language=None):
        """returns the result of lt/lemma.xql for lemma in the given language"""
        data = self.getServerData("lt/lemma.xql","document=&language="+str(language)+"&lemma="+urllib.quote(lemma))
        return data

    def getLemmaNew(self, query=None, language=None):
        """returns the result of lt/lemma.xql for query in the given language"""
        data = self.getServerData("lt/lemma.xql","document=&language="+str(language)+"&lemma="+urllib.quote(query))
        return data

    def getQuery (self, docinfo=None, pageinfo=None, query=None, queryType=None, pn=1):
        """returns the number of hits divided by 10 (rounded down) plus 1

        The query, queryType and pn arguments are overridden by the values
        in pageinfo. A response without a queryResultHits div counts as
        zero hits.
        """
        docpath = docinfo['textURLPath']
        pagesize = pageinfo['queryPageSize']
        pn = pageinfo['searchPN']
        query = pageinfo['query']
        queryType = pageinfo['queryType']
        tocSearch = 0

        pagexml = self.getServerData("doc-query.xql","document=%s&mode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s"%(docpath, 'text', queryType, query, pagesize, pn))
        pagedom = Parse(pagexml)
        numdivs = pagedom.xpath("//div[@class='queryResultHits']")
        # same guard as in getToc: a missing hits div must not raise IndexError
        if len(numdivs) > 0:
            tocSearch = int(getTextFromNode(numdivs[0]))
        tc = tocSearch // 10 + 1
        logging.debug("documentViewer (gettoc) tc: %s", tc)
        return tc

    def getToc(self, mode="text", docinfo=None):
        """loads table of contents and stores in docinfo

        Only the number of entries is stored, as docinfo['tocSize_<mode>'].
        Nothing is done for mode "none" or when the size is already there.
        """
        logging.debug("documentViewer (gettoc) mode: %s", mode)
        if mode == "none":
            return docinfo
        if 'tocSize_%s'%mode in docinfo:
            # cached toc
            return docinfo

        docpath = docinfo['textURLPath']
        # we need to set a result set size
        pagesize = 1000
        pn = 1
        if mode == "text":
            queryType = "toc"
        else:
            queryType = mode
        # number of entries in toc
        tocSize = 0

        pagexml = self.getServerData("doc-query.xql","document=%s&queryType=%s&queryResultPageSize=%s&queryResultPN=%s"%(docpath,queryType, pagesize, pn))
        # post-processing downloaded xml
        pagedom = Parse(pagexml)
        # get number of entries
        numdivs = pagedom.xpath("//div[@class='queryResultHits']")
        if len(numdivs) > 0:
            tocSize = int(getTextFromNode(numdivs[0]))
        docinfo['tocSize_%s'%mode] = tocSize
        return docinfo

    def getTocPage(self, mode="text", pn=1, pageinfo=None, docinfo=None):
        """returns single page from the table of contents

        The pn argument is overridden by pageinfo['tocPN']. Links to
        page-fragment.xql in the server response are replaced by links to
        this object.
        """
        # TODO: this should use the cached TOC
        if mode == "text":
            queryType = "toc"
        else:
            queryType = mode
        docpath = docinfo['textURLPath']
        pagesize = pageinfo['tocPageSize']
        pn = pageinfo['tocPN']
        url = docinfo['url']
        selfurl = self.absolute_url()
        viewMode = pageinfo['viewMode']
        characterNormalization = pageinfo['characterNormalization']
        tocMode = pageinfo['tocMode']
        tocPN = pageinfo['tocPN']

        data = self.getServerData("doc-query.xql","document=%s&queryType=%s&queryResultPageSize=%s&queryResultPN=%s"%(docpath,queryType, pagesize, pn))

        page = data.replace('page-fragment.xql?document=%s'%str(docpath),'%s?url=%s&viewMode=%s&tocMode=%s&tocPN=%s&characterNormalization=%s'%(selfurl,url, viewMode, tocMode, tocPN, characterNormalization))
        text = page.replace('mode=image','mode=texttool')
        return text

    def manage_changeMpdlXmlTextServer(self,title="",serverUrl="http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/",timeout=40,RESPONSE=None):
        """change settings"""
        self.title=title
        self.timeout = timeout
        self.serverUrl = serverUrl
        if RESPONSE is not None:
            RESPONSE.redirect('manage_main')


# management methods
def manage_addMpdlXmlTextServerForm(self):
    """Form for adding"""
    pt = PageTemplateFile("zpt/manage_addMpdlXmlTextServer", globals()).__of__(self)
    return pt()


def manage_addMpdlXmlTextServer(self,id,title="",serverUrl="http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/",timeout=40,RESPONSE=None):
    """add MpdlXmlTextServer"""
    # timeout must be passed by keyword: the fourth positional parameter of
    # the constructor is serverName, which would replace serverUrl
    newObj = MpdlXmlTextServer(id, title, serverUrl, timeout=timeout)
    self.Destination()._setObject(id, newObj)
    if RESPONSE is not None:
        RESPONSE.redirect('manage_main')