from OFS.SimpleItem import SimpleItem
from Products.PageTemplates.PageTemplateFile import PageTemplateFile

import xml.etree.ElementTree as ET

import re
import logging
import urllib
import urlparse
import base64

from SrvTxtUtils import getInt, getText, getHttpData, serialize


def _fixEmptyDivs(pagediv):
    """Give every childless, text-less div below pagediv a single space as text."""
    for d in pagediv.findall('.//div'):
        if len(d) == 0 and not d.text:
            # make empty divs non-empty
            d.text = ' '


class MpdlXmlTextServer(SimpleItem):
    """TextServer implementation for MPDL-XML eXist server"""
    meta_type = "MPDL-XML TextServer"

    manage_options = (
        {'label': 'Config', 'action': 'manage_changeMpdlXmlTextServerForm'},
    ) + SimpleItem.manage_options

    manage_changeMpdlXmlTextServerForm = PageTemplateFile("zpt/manage_changeMpdlXmlTextServer", globals())

    def __init__(self, id, title="", serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/", serverName=None, timeout=40, repositoryType='production'):
        """Set up the text server object.

        If serverName is given it takes precedence over serverUrl and the
        URL is built as "http://<serverName>/mpdl/interface/".
        """
        self.id = id
        self.title = title
        self.timeout = timeout
        self.repositoryType = repositoryType
        if serverName is None:
            self.serverUrl = serverUrl
        else:
            self.serverUrl = "http://%s/mpdl/interface/" % serverName

    def getHttpData(self, url, data=None):
        """returns result from url+data HTTP request (using the configured timeout)"""
        return getHttpData(url, data, timeout=self.timeout)

    def getServerData(self, method, data=None):
        """returns result from text server for method+data

        method is appended to self.serverUrl to form the request URL.
        """
        url = self.serverUrl + method
        return getHttpData(url, data, timeout=self.timeout)

    def getRepositoryType(self):
        """returns the repository type, e.g. 'production' (None if the attribute is not set)"""
        return getattr(self, 'repositoryType', None)

    def getTextDownloadUrl(self, type='xml', docinfo=None):
        """returns a URL to download the current text

        Returns None if docinfo is missing or has no 'textURLPath'.
        The '.xml' part of the document path is replaced by '.' + type.
        """
        if docinfo is None:
            return None

        docpath = docinfo.get('textURLPath', None)
        if not docpath:
            return None

        docpath = docpath.replace('.xml', '.' + type)
        baseUrl = self.serverUrl.replace('interface/', '')
        return '%sgetDoc?doc=%s' % (baseUrl, docpath)

    def getPlacesOnPage(self, docinfo=None, pn=None):
        """Returns list of GIS places of page pn

        Each place is a dict with the keys 'id' and 'name'.
        Returns None if docinfo is missing or has no 'textURLPath'.
        """
        if docinfo is None:
            return None

        docpath = docinfo.get('textURLPath', None)
        if not docpath:
            return None

        text = self.getServerData("xpath.xql", "document=%s&xpath=//place&pn=%s" % (docpath, pn))
        dom = ET.fromstring(text)
        return [{'id': place.get("id"), 'name': place.text}
                for place in dom.findall(".//resultPage/place")]

    def getTextInfo(self, mode='', docinfo=None):
        """reads document info, including page concordance, from text server

        mode is '', 'toc' or 'figures' (anything else is treated as '').
        Results are stored in docinfo, which is also returned.  Information
        that is already present in docinfo is not fetched again.
        """
        logging.debug("getTextInfo mode=%s", mode)
        if docinfo is None:
            docinfo = {}

        if mode not in ('toc', 'figures', ''):
            mode = ''

        # check cached info
        if mode:
            # cached toc-request?
            if 'full_%s' % mode in docinfo:
                return docinfo
        else:
            # no toc-request
            if 'numTextPages' in docinfo:
                return docinfo

        docpath = docinfo.get('textURLPath', None)
        if docpath is None:
            logging.error("getTextInfo: no textURLPath!")
            return docinfo

        try:
            # we need to set a result set size
            pagesize = 10000
            firstPage = 1
            # fetch docinfo
            pagexml = self.getServerData("doc-info.xql", "document=%s&info=%s&pageSize=%s&pn=%s" % (docpath, mode, pagesize, firstPage))
            dom = ET.fromstring(pagexml)
            # all info is inside the "document" element
            doc = dom.find("document")
        except Exception as e:
            logging.error("getTextInfo: Error reading doc info: %s", e)
            return docinfo

        if doc is None:
            logging.error("getTextInfo: unable to find document-tag!")
            return docinfo

        # go through all child elements
        for tag in doc:
            name = tag.tag
            if name == 'countPages':
                # numTextPages (only stored when positive)
                np = getInt(tag.text)
                if np > 0:
                    docinfo['numTextPages'] = np

            elif name == 'countFigureEntries':
                # numFigureEntries
                docinfo['numFigureEntries'] = getInt(tag.text)

            elif name == 'countTocEntries':
                # numTocEntries
                # WTF: s1 = int(s)/30+1
                docinfo['numTocEntries'] = getInt(tag.text)

            elif name == 'countPlaces':
                # numPlaces
                docinfo['numPlaces'] = getInt(tag.text)

            elif name == 'pageNumbers':
                # contains one child element per page with the sub-tags
                # n=scan number, no=original page no, non=normalized original page no
                # pageNumbers is a dict indexed by scan number
                pages = {}
                for pageTag in tag:
                    page = {}
                    n = 0
                    for p in pageTag:
                        if p.tag == 'n':
                            n = getInt(p.text)
                            page['pn'] = n
                        elif p.tag == 'no':
                            page['no'] = p.text
                        elif p.tag == 'non':
                            page['non'] = p.text

                    # entries without a positive scan number are dropped
                    if n > 0:
                        pages[n] = page

                docinfo['pageNumbers'] = pages

            elif name == 'toc':
                # contains one child element per table of contents/figures
                # entry with the sub-tags page, level, content, level-string
                # and real-level
                tocs = []
                for entryTag in tag:
                    toc = {}
                    for t in entryTag:
                        if t.tag == 'page':
                            toc['pn'] = getInt(t.text)
                        elif t.tag in ('level', 'content', 'level-string', 'real-level'):
                            toc[t.tag] = t.text

                    tocs.append(toc)

                # save as full_toc/full_figures
                docinfo['full_%s' % mode] = tocs

        return docinfo

    def processPageInfo(self, dom, docinfo, pageinfo):
        """processes page info divs from dom and stores in pageinfo

        Copies the text of the pageNumberOrig, pageNumberOrigNorm and
        pageHeaderTitle divs into pageinfo under the same keys.
        docinfo is accepted for interface compatibility and is not used.
        """
        # assume first second level div is pageMeta
        metadiv = dom.find("div")
        if metadiv is None or metadiv.get('class', '') != 'pageMeta':
            logging.error("processPageInfo: pageMeta div not found!")
            return

        for div in metadiv:
            dc = div.get('class')
            if dc in ('pageNumberOrig', 'pageNumberOrigNorm', 'pageHeaderTitle'):
                pageinfo[dc] = div.text

        return

    def getTextPage(self, mode="text", pn=1, docinfo=None, pageinfo=None):
        """returns single page from fulltext

        mode is a comma separated list of layers.  'search' adds the
        highlighting parameters from pageinfo and 'pundit' adds pundit
        attributes; of the remaining layers exactly one is used, with the
        precedence dict, xml, gis, text (default).

        Returns the serialized page content div or None if the document has
        no text, the request fails or the page content div is missing.
        """
        logging.debug("getTextPage mode=%s, pn=%s", mode, pn)
        # check for cached text -- but ideally this shouldn't be called twice
        if 'textPage' in pageinfo:
            logging.debug("getTextPage: using cached text")
            return pageinfo['textPage']

        docpath = docinfo.get('textURLPath', None)
        if not docpath:
            return None

        textParams = {'document': docpath,
                      'pn': pn}
        if 'characterNormalization' in pageinfo:
            textParams['characterNormalization'] = pageinfo['characterNormalization']

        if not mode:
            # default is text
            mode = 'text'

        modes = mode.split(',')
        # check for multiple layers
        if len(modes) > 1:
            logging.debug("getTextPage: more than one mode=%s", mode)

        # search mode
        if 'search' in modes:
            # add highlighting
            highlightQuery = pageinfo.get('highlightQuery', None)
            if highlightQuery:
                textParams['highlightQuery'] = highlightQuery
                textParams['highlightElement'] = pageinfo.get('highlightElement', '')
                textParams['highlightElementPos'] = pageinfo.get('highlightElementPos', '')

            # ignore mode in the following
            modes.remove('search')

        # pundit mode
        punditMode = 'pundit' in modes
        if punditMode:
            # ignore mode in the following
            modes.remove('pundit')

        # other modes don't combine
        if 'dict' in modes:
            # dict is called textPollux in the backend
            textmode = 'textPollux'
        elif 'xml' in modes:
            # xml mode
            textmode = 'xml'
            textParams['characterNormalization'] = 'orig'
        elif 'gis' in modes:
            textmode = 'gis'
        else:
            # text is default mode
            textmode = 'text'

        textParams['mode'] = textmode

        try:
            # fetch the page
            pagexml = self.getServerData("page-fragment.xql", urllib.urlencode(textParams))
            dom = ET.fromstring(pagexml)
        except Exception as e:
            logging.error("getTextPage: Error reading page: %s", e)
            return None

        # extract additional info
        self.processPageInfo(dom, docinfo, pageinfo)

        # page content is in the div with class "pageContent"
        # ElementTree 1.2 in Python 2.6 can't do div[@class='pageContent']
        # so we look at the second level divs
        pagediv = None
        for div in dom.findall('div'):
            if div.get('class') == 'pageContent':
                pagediv = div
                break

        if pagediv is None:
            return None

        if textmode == "xml":
            # xml mode: content is returned unchanged
            return serialize(pagediv)

        # NOTE(review): getLink is not defined in this class; presumably it is
        # acquired from the enclosing documentViewer -- confirm.
        selfurl = self.getLink()

        if textmode == "text":
            # plain text mode
            if punditMode:
                pagediv = self.addPunditAttributes(pagediv, pageinfo, docinfo)

            _fixEmptyDivs(pagediv)
            # make note links absolute
            for l in pagediv.findall('.//a'):
                href = l.get('href')
                if href and href.startswith('#note-'):
                    l.set('href', href.replace('#note-', "%s#note-" % selfurl))

            return serialize(pagediv)

        if textmode == "textPollux":
            # text-with-links mode
            if punditMode:
                pagediv = self.addPunditAttributes(pagediv, pageinfo, docinfo)

            _fixEmptyDivs(pagediv)
            for l in pagediv.findall(".//a"):
                href = l.get('href')
                if not href:
                    continue

                linkurl = urlparse.urlparse(href)
                if linkurl.path.endswith('GetDictionaryEntries'):
                    # TODO: replace wordInfo page
                    # is dictionary link - add target to open new page
                    l.set('target', '_blank')

                if href.startswith('#note-'):
                    # note link
                    l.set('href', href.replace('#note-', "%s#note-" % selfurl))

            return serialize(pagediv)

        if textmode == "gis":
            _fixEmptyDivs(pagediv)
            # add our URL as backlink
            doc = base64.b64encode(selfurl)
            for l in pagediv.findall(".//a"):
                href = l.get('href')
                if href and href.startswith('http://mappit.mpiwg-berlin.mpg.de'):
                    l.set('href', re.sub(r'doc=[\w+/=]+', 'doc=%s' % doc, href))
                    l.set('target', '_blank')

            return serialize(pagediv)

        return None

    def addPunditAttributes(self, pagediv, pageinfo, docinfo):
        """add about attributes for pundit annotation tool

        Every div below pagediv that has an id gets an 'about' attribute and
        the additional class 'pundit-content'.  Returns pagediv.
        """
        textid = docinfo.get('DRI', "fn=%s" % docinfo.get('documentPath', '???'))
        pn = pageinfo.get('pn', '1')
        # TODO: use pn as well?
        # check all div-tags
        for d in pagediv.findall(".//div"):
            divId = d.get('id')
            if divId:
                d.set('about', "http://echo.mpiwg-berlin.mpg.de/%s/pn=%s/#%s" % (textid, pn, divId))
                cls = d.get('class', '') + ' pundit-content'
                d.set('class', cls.strip())

        return pagediv

    def getSearchResults(self, mode, query=None, pageinfo=None, docinfo=None):
        """loads list of search results and stores XML in docinfo

        The result is cached in docinfo under 'resultXML' and 'resultSize';
        'cachedQuery' records which mode/query the cache belongs to.
        Returns docinfo.
        """
        logging.debug("getSearchResults mode=%s query=%s", mode, query)
        if mode == "none":
            return docinfo

        queryKey = '%s_%s' % (mode, query)
        cachedQuery = docinfo.get('cachedQuery', None)
        if cachedQuery is not None:
            # cached search result
            if cachedQuery == queryKey:
                # same query
                return docinfo

            # different query: drop the stale result.  The previous query may
            # not have stored anything, so missing keys must not raise.
            docinfo.pop('resultSize', None)
            docinfo.pop('resultXML', None)
            docinfo.pop('cachedQuery', None)

        # fetch full results
        docpath = docinfo['textURLPath']
        params = {'document': docpath,
                  'mode': 'text',
                  'queryType': mode,
                  'query': query,
                  'queryResultPageSize': 1000,
                  'queryResultPN': 1,
                  'characterNormalization': pageinfo.get('characterNormalization', 'reg')}
        pagexml = self.getServerData("doc-query.xql", urllib.urlencode(params))
        dom = ET.fromstring(pagexml)

        # page content is in the div with class "queryResultPage"
        # ElementTree 1.2 in Python 2.6 can't do div[@class='queryResultPage']
        pagediv = None
        for div in dom.findall("div"):
            dc = div.get('class')
            if dc == 'queryResultPage':
                # page content div
                pagediv = div
            elif dc == 'queryResultHits':
                docinfo['resultSize'] = getInt(div.text)

        if pagediv is not None:
            # store XML in docinfo
            docinfo['resultXML'] = ET.tostring(pagediv, 'UTF-8')

        # remember the query only after the fetch succeeded, so that a failed
        # request is retried instead of being treated as a cached result
        docinfo['cachedQuery'] = queryKey
        return docinfo

    def getResultsPage(self, mode="text", query=None, pn=None, start=None, size=None, pageinfo=None, docinfo=None):
        """returns single page from the list of search results

        start is the 1-based index of the first result to show.  If it is
        not given it is derived from the page number pn (default: page 1)
        and size.  size defaults to pageinfo['resultPageSize'] or 10.
        """
        logging.debug("getResultsPage mode=%s, pn=%s", mode, pn)
        # get (cached) result
        self.getSearchResults(mode=mode, query=query, pageinfo=pageinfo, docinfo=docinfo)

        resultxml = docinfo.get('resultXML', None)
        if not resultxml:
            logging.error("getResultPage: unable to find resultXML")
            return "Error: no result!"

        if size is None:
            size = pageinfo.get('resultPageSize', 10)

        if start is None:
            # start is 1-based, so page pn begins at (pn-1)*size + 1
            start = ((pn or 1) - 1) * size + 1

        fullresult = ET.fromstring(resultxml)

        # paginate: remove everything before and after the requested window.
        # A negative index would count from the end, so clamp at 0.
        first = max(start - 1, 0)
        del fullresult[:first]
        del fullresult[size:]
        tocdivs = fullresult

        # check all a-tags
        for l in tocdivs.findall(".//a"):
            href = l.get('href')
            if href:
                # assume all links go to pages
                linkUrl = urlparse.urlparse(href)
                linkParams = urlparse.parse_qs(linkUrl.query)
                # take some parameters
                params = {'pn': linkParams['pn'],
                          'highlightQuery': linkParams.get('highlightQuery', ''),
                          'highlightElement': linkParams.get('highlightElement', ''),
                          'highlightElementPos': linkParams.get('highlightElementPos', '')
                          }
                url = self.getLink(params=params)
                l.set('href', url)

        return serialize(tocdivs)

    def getToc(self, mode='text', docinfo=None):
        """returns list of table of contents from docinfo

        mode 'text' selects the table of contents ('toc'); any other mode is
        used as query type directly.  The list is fetched with getTextInfo if
        it is not in docinfo yet.
        """
        logging.debug("getToc mode=%s", mode)
        if mode == 'text':
            queryType = 'toc'
        else:
            queryType = mode

        tocKey = 'full_%s' % queryType
        if tocKey not in docinfo:
            # get new toc
            docinfo = self.getTextInfo(queryType, docinfo)

        return docinfo.get(tocKey, [])

    def getTocPage(self, mode='text', pn=None, start=None, size=None, pageinfo=None, docinfo=None):
        """returns single page from the table of contents as HTML

        start is the 1-based index of the first entry to show.  If it is not
        given it is derived from the page number pn (default: page 1) and
        size.  size defaults to pageinfo['tocPageSize'] or 30.
        """
        logging.debug("getTocPage mode=%s, pn=%s start=%s size=%s", mode, repr(pn), repr(start), repr(size))
        fulltoc = self.getToc(mode=mode, docinfo=docinfo)
        if not fulltoc:
            logging.error("getTocPage: unable to find toc!")
            return "Error: no table of contents!"

        if size is None:
            size = pageinfo.get('tocPageSize', 30)

        if start is None:
            # start is 1-based, so page pn begins at (pn-1)*size + 1
            start = ((pn or 1) - 1) * size + 1

        # paginate (a negative index would count from the end, so clamp at 0)
        first = max(start - 1, 0)
        last = first + size
        tocs = fulltoc[first:last]

        # NOTE(review): the HTML tags inside these literals were missing from
        # the copy of the file under review (only the "[%s %s]" label and the
        # format arguments survived).  The markup below is a reconstruction;
        # confirm element and class names against the viewer's templates/CSS.
        parts = ['<div>']
        for toc in tocs:
            pageurl = self.getLink('pn', toc['pn'])
            parts.append('<div class="tocline">')
            parts.append('<div class="toc name">[%s %s]</div>' % (toc.get('level-string', ''), toc.get('content', '')))
            parts.append('<div class="toc float right page"><a href="%s">Page: %s</a></div>' % (pageurl, toc['pn']))
            parts.append('</div>\n')

        parts.append('</div>\n')
        return ''.join(parts)

    def manage_changeMpdlXmlTextServer(self, title="", serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/", timeout=40, repositoryType=None, RESPONSE=None):
        """change settings

        repositoryType is only changed if a non-empty value is given.
        """
        self.title = title
        self.timeout = timeout
        self.serverUrl = serverUrl
        if repositoryType:
            self.repositoryType = repositoryType

        if RESPONSE is not None:
            RESPONSE.redirect('manage_main')


# management methods
def manage_addMpdlXmlTextServerForm(self):
    """Form for adding"""
    pt = PageTemplateFile("zpt/manage_addMpdlXmlTextServer", globals()).__of__(self)
    return pt()


def manage_addMpdlXmlTextServer(self, id, title="", serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/", timeout=40, RESPONSE=None):
    """add MpdlXmlTextServer"""
    # timeout must be passed by keyword: the fourth positional parameter of
    # the constructor is serverName, which would override serverUrl
    newObj = MpdlXmlTextServer(id, title, serverUrl, timeout=timeout)
    self.Destination()._setObject(id, newObj)
    if RESPONSE is not None:
        RESPONSE.redirect('manage_main')