# MpiwgXmlTextServer: Zope product (Python 2) that proxies the MPIWG-XML
# fulltext server (mpiwg-mpdl-cms-web) for the documentViewer.
# NOTE(review): this file's original formatting was lost; indentation below is
# reconstructed. Code tokens are unchanged; only comments were repaired.

from OFS.SimpleItem import SimpleItem
from Products.PageTemplates.PageTemplateFile import PageTemplateFile

import xml.etree.ElementTree as ET

import re
import logging
import urllib
import urlparse
import base64
from datetime import datetime

from SrvTxtUtils import getInt, getText, getHttpData, serialize

# mapping of fields in the output of /mpiwg-mpdl-cms-web/query/GetDocInfo
# to documentViewer docinfo keys
textinfoFieldMap = {
    'countPages' : 'numTextPages',
    'countFigures' : 'numFigureEntries',
    'countNotesHandwritten' : 'numHandwritten',
    'countNotes' : 'numNotes',
    'countPlaces' : 'numPlaces',
    'countTocEntries' : 'numTocEntries'
}


class MpiwgXmlTextServer(SimpleItem):
    """TextServer implementation for MPIWG-XML server"""

    # Zope management interface configuration
    meta_type="MPIWG-XML TextServer"
    manage_options=(
        {'label':'Config','action':'manage_changeMpiwgXmlTextServerForm'},
        )+SimpleItem.manage_options

    manage_changeMpiwgXmlTextServerForm = PageTemplateFile("zpt/manage_changeMpiwgXmlTextServer", globals())

    def __init__(self,id,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpiwg-mpdl-cms-web/",
                 timeout=40, serverName=None, repositoryType='production'):
        """constructor

        :param id: Zope object id
        :param title: display title
        :param serverUrl: base URL of the text server (used when serverName is None)
        :param timeout: HTTP timeout in seconds for all server requests
        :param serverName: optional host name; overrides serverUrl if given
        :param repositoryType: repository stage, e.g. 'production'
        """
        self.id=id
        self.title=title
        self.timeout = timeout
        self.repositoryType = repositoryType
        if serverName is None:
            self.serverUrl = serverUrl
        else:
            # serverName wins over serverUrl
            self.serverUrl = "http://%s/mpiwg-mpdl-cms-web/"%serverName

    def getHttpData(self, url, data=None):
        """returns result from url+data HTTP request"""
        return getHttpData(url,data,timeout=self.timeout)

    def getServerData(self, method, data=None):
        """returns result from text server for method+data

        method is appended to self.serverUrl to build the request URL.
        """
        url = self.serverUrl+method
        return getHttpData(url,data,timeout=self.timeout)

    def getRepositoryType(self):
        """returns the repository type, e.g. 'production'"""
        # getattr guards old persistent instances created before the attribute existed
        return getattr(self, 'repositoryType', None)

    def getTextDownloadUrl(self, type='xml', docinfo=None):
        """returns a URL to download the current text

        :param type: download format, replaces the '.xml' extension (e.g. 'xml', 'html')
        :param docinfo: document info dict; must contain 'textURLPath'
        :returns: URL string or None if there is no text path
        """
        docpath = docinfo.get('textURLPath', None)
        if not docpath:
            return None

        docpath = docpath.replace('.xml','.'+type)
        url = '%sdoc/GetDocument?id=%s'%(self.serverUrl.replace('interface/',''), docpath)
        return url

    def getPlacesOnPage(self, docinfo=None, pn=None):
        """Returns list of GIS places of page pn

        Fills docinfo['places'] via getTextInfo on first use.
        """
        logging.debug("getPlacesOnPage(pn=%s"%pn)
        if not 'places' in docinfo:
            self.getTextInfo('places', docinfo)

        allplaces = docinfo.get('places', None)
        # NOTE(review): if getTextInfo failed to set 'places', allplaces is None
        # and len() raises TypeError here — confirm whether that can happen.
        if len(allplaces) == 0:
            return []

        # search for places on this page TODO: is there a better way?
        places = [p for p in allplaces if p['pn'] == pn]
        return places

        """OLD: docpath = docinfo.get('textURLPath',None)
        if not docpath:
            return None

        places=[]
        text=self.getServerData("xpath.xql", "document=%s&xpath=//place&pn=%s"%(docpath,pn))
        dom = ET.fromstring(text)
        result = dom.findall(".//resultPage/place")
        for l in result:
            id = l.get("id")
            name = l.text
            place = {'id': id, 'name': name}
            places.append(place)

        return places"""

    def getTextInfo(self, mode=None, docinfo=None):
        """reads document info, including page concordance, from text server

        :param mode: one of 'pages', 'toc', 'figures', 'notes', 'handwritten',
            'places' to fetch that list, or None for general counts only
        :param docinfo: document info dict (updated in place and returned)
        """
        logging.debug("getTextInfo mode=%s"%mode)

        field = ''
        if mode in ['pages', 'toc', 'figures', 'notes', 'handwritten', 'places']:
            # translate mode to field param for the GetDocInfo query
            if mode == 'handwritten':
                field = '&field=notesHandwritten'
            else:
                field = '&field=%s'%mode
        else:
            mode = None

        # check cached info
        if mode:
            # cached toc-request?
            if 'full_%s'%mode in docinfo:
                return docinfo

        else:
            # cached but no toc-request?
            if 'numTextPages' in docinfo:
                return docinfo

        docpath = docinfo.get('textURLPath', None)
        if docpath is None:
            logging.error("getTextInfo: no textURLPath!")
            return docinfo

        # fetch docinfo
        pagexml = self.getServerData("query/GetDocInfo","docId=%s%s"%(docpath,field))
        dom = ET.fromstring(pagexml)
        # all info is in the root doc tag
        doc = dom
        if doc is None:
            logging.error("getTextInfo: unable to find document-tag!")
        else:
            if mode is None:
                # get general info from the system tag
                sys = doc.find('system')
                if sys is not None:
                    for (k,v) in textinfoFieldMap.items():
                        # copy into docinfo (even if empty)
                        docinfo[v] = getInt(getText(sys.find(k)))

            else:
                # result is in the list tag
                l = doc.find('list')
                if l is not None:
                    # look for general info
                    for (k,v) in textinfoFieldMap.items():
                        # copy into docinfo (only if not empty)
                        s = doc.find(k)
                        if s is not None:
                            docinfo[v] = getInt(getText(s))

                    lt = l.get('type')
                    #
                    # pageNumbers
                    #
                    if lt == 'pages':
                        # list contains page tags with page numbers:
                        # n=scan number, o=original page no, on=normalized original page no
                        # pageNumbers is a dict indexed by scan number
                        pages = {}
                        for i in l:
                            page = {}
                            pn = getInt(i.get('n'))
                            page['pn'] = pn
                            no = i.get('o')
                            page['no'] = no
                            non = i.get('o-norm')
                            page['non'] = non

                            if pn > 0:
                                pages[pn] = page

                        docinfo['pageNumbers'] = pages

                    #
                    # toc
                    #
                    elif lt in ['toc', 'figures', 'notes', 'notesHandwritten']:
                        # list contains item tags with table of contents/figures,
                        # each with a ref child holding the page number
                        tocs = []
                        for te in l:
                            if te.tag == 'item':
                                toc = {}
                                toc['level-string'] = te.get('n')
                                toc['level'] = te.get('lv')
                                # NOTE(review): te.text may be None for empty items;
                                # .strip() would raise AttributeError then — confirm.
                                toc['content'] = te.text.strip()
                                ref = te.find('ref')
                                toc['pn'] = getInt(ref.text)
                                toc['no'] = ref.get('o')
                                toc['non'] = ref.get('o-norm')
                                tocs.append(toc)

                        # save as full_toc/full_figures
                        docinfo['full_%s'%mode] = tocs

                    #
                    # places
                    #
                    elif lt in ['places']:
                        # list contains item tags with place-ids and a ref
                        # child holding the page number
                        places = []
                        for p in l:
                            if p.tag == 'item':
                                place = {}
                                place['id'] = p.get('id')
                                ref = p.find('ref')
                                place['pn'] = getInt(ref.text)
                                places.append(place)

                        docinfo['places'] = places

        return docinfo

    def getTextPage(self, mode="text", pn=1, docinfo=None, pageinfo=None):
        """returns single page from fulltext

        :param mode: comma-separated combination of 'text'/'dict'/'xml'/'gis'/
            'search'/'pundit' display layers
        :param pn: page (scan) number
        :param docinfo: document info dict; must contain 'textURLPath'
        :param pageinfo: page info dict (normalization, highlighting, cache)
        :returns: serialized HTML/XML string or None on error
        """
        logging.debug("getTextPage mode=%s, pn=%s"%(mode,pn))
        startTime = datetime.now()
        # check for cached text -- but ideally this shouldn't be called twice
        if pageinfo.has_key('textPage'):
            logging.debug("getTextPage: using cached text")
            return pageinfo['textPage']

        docpath = docinfo.get('textURLPath', None)
        if not docpath:
            return None

        # stuff for constructing full urls
        selfurl = docinfo['viewerUrl']
        textParams = {'docId': docpath,
                      'page': pn}

        normMode = pageinfo.get('characterNormalization', 'reg')
        # TODO: change values in form
        if normMode == 'regPlusNorm':
            normMode = 'norm'

        # TODO: this should not be necessary when the backend is fixed
        #textParams['normalization'] = normMode

        if not mode:
            # default is dict
            mode = 'text'

        modes = mode.split(',')
        # check for multiple layers
        if len(modes) > 1:
            logging.debug("getTextPage: more than one mode=%s"%mode)

        # mode defaults
        gisMode = False
        punditMode = False

        # search mode
        if 'search' in modes:
            # add highlighting
            highlightQuery = pageinfo.get('highlightQuery', None)
            if highlightQuery:
                textParams['highlightQuery'] = highlightQuery
                textParams['highlightElem'] = pageinfo.get('highlightElement', '')
                textParams['highlightElemPos'] = pageinfo.get('highlightElementPos', '')

            # ignore mode in the following
            modes.remove('search')

        # pundit mode
        if 'pundit' in modes:
            punditMode = True
            # ignore mode in the following
            modes.remove('pundit')

        # other modes don't combine
        if 'dict' in modes:
            textmode = 'dict'
            textParams['outputFormat'] = 'html'
        elif 'xml' in modes:
            textmode = 'xml'
            textParams['outputFormat'] = 'xmlDisplay'
            normMode = 'orig'
        elif 'gis' in modes:
            gisMode = True
            # gis mode uses plain text
            textmode = 'plain'
            textParams['outputFormat'] = 'html'
        else:
            # text is default mode
            textmode = 'plain'
            textParams['outputFormat'] = 'html'

        try:
            # fetch the page
            pagexml = self.getServerData("query/GetPage",urllib.urlencode(textParams))
            dom = ET.fromstring(pagexml)
        except Exception, e:
            logging.error("Error reading page: %s"%e)
            return None

        # plain text or text-with-links mode
        if textmode == 'plain' or textmode == 'dict':
            # the text is in div@class=text
            pagediv = dom.find(".//div[@class='text']")
            logging.debug("pagediv: %s"%repr(pagediv))
            if pagediv is not None:
                # add textmode and normMode classes
                #pagediv.set('class', 'text %s %s'%(textmode, normMode))
                self._processWTags(textmode, normMode, pagediv)
                #self._processPbTag(pagediv, pageinfo)
                self._processFigures(pagediv, docinfo)
                #self._fixEmptyDivs(pagediv)
                # get full url assuming documentViewer is parent
                selfurl = self.getLink()
                # check all a-tags
                links = pagediv.findall('.//a')
                for l in links:
                    href = l.get('href')
                    if href:
                        # is link with href
                        linkurl = urlparse.urlparse(href)
                        if linkurl.path.endswith('GetDictionaryEntries'):
                            #TODO: replace wordInfo page
                            # add target to open new page
                            l.set('target', '_blank')

                if punditMode:
                    self._addPunditAttributes(pagediv, pageinfo, docinfo)

                if gisMode:
                    self._addGisTags(pagediv, pageinfo, docinfo)

                s = serialize(pagediv)
                logging.debug("getTextPage done in %s"%(datetime.now()-startTime))
                return s

        # xml mode
        elif textmode == "xml":
            # the text is in body
            pagediv = dom.find(".//body")
            logging.debug("pagediv: %s"%repr(pagediv))
            if pagediv is not None:
                return serialize(pagediv)

        logging.error("getTextPage: error in text mode %s or in text!"%(textmode))
        return None

    def _processWTags(self, textMode, normMode, pagediv):
        """selects the necessary information from w-spans and removes the rest from pagediv

        Each word is wrapped in a span@class=w containing alternative
        orig/reg/norm renderings (with and without dictionary links); only the
        variant matching textMode/normMode is kept, and the wrapper tags are
        suppressed (tag = None) so serialization emits just the text.
        """
        logging.debug("processWTags(textMode=%s,norm=%s,pagediv"%(repr(textMode),repr(normMode)))
        startTime = datetime.now()
        wtags = pagediv.findall(".//span[@class='w']")
        for wtag in wtags:
            if textMode == 'dict':
                # delete non-a-tags
                wtag.remove(wtag.find("span[@class='nodictionary orig']"))
                wtag.remove(wtag.find("span[@class='nodictionary reg']"))
                wtag.remove(wtag.find("span[@class='nodictionary norm']"))
                # delete non-matching children of a-tag and suppress remaining tag name
                atag = wtag.find("*[@class='dictionary']")
                if atag is None:
                    # not found because the class attribute contains additional entries
                    for w in wtag.findall("a"):
                        val = w.attrib.get("class","")
                        if val.startswith("dictionary"):
                            atag=w
                            break

                if normMode == 'orig':
                    atag.remove(atag.find("span[@class='reg']"))
                    atag.remove(atag.find("span[@class='norm']"))
                    atag.find("span[@class='orig']").tag = None
                elif normMode == 'reg':
                    atag.remove(atag.find("span[@class='orig']"))
                    atag.remove(atag.find("span[@class='norm']"))
                    atag.find("span[@class='reg']").tag = None
                elif normMode == 'norm':
                    atag.remove(atag.find("span[@class='orig']"))
                    atag.remove(atag.find("span[@class='reg']"))
                    atag.find("span[@class='norm']").tag = None

            else:
                # delete a-tag
                wt = wtag.find("*[@class='dictionary']")
                if wt is None:
                    # not found because the class attribute contains additional entries
                    for w in wtag.findall("a"):
                        val = w.attrib.get("class","")
                        if val.startswith("dictionary"):
                            wt=w
                            break

                wtag.remove(wt)
                # delete non-matching children and suppress remaining tag name
                if normMode == 'orig':
                    wtag.remove(wtag.find("span[@class='nodictionary reg']"))
                    wtag.remove(wtag.find("span[@class='nodictionary norm']"))
                    wtag.find("span[@class='nodictionary orig']").tag = None
                elif normMode == 'reg':
                    wtag.remove(wtag.find("span[@class='nodictionary orig']"))
                    wtag.remove(wtag.find("span[@class='nodictionary norm']"))
                    wtag.find("span[@class='nodictionary reg']").tag = None
                elif normMode == 'norm':
                    wtag.remove(wtag.find("span[@class='nodictionary orig']"))
                    wtag.remove(wtag.find("span[@class='nodictionary reg']"))
                    wtag.find("span[@class='nodictionary norm']").tag = None

            # suppress w-tag name
            wtag.tag = None

        logging.debug("processWTags in %s"%(datetime.now()-startTime))
        return pagediv

    def _processPbTag(self, pagediv, pageinfo):
        """extracts information from pb-tag and removes it from pagediv

        Stores the running head (span@class=rhead) in pageinfo['pageHeaderTitle'].
        """
        pbdiv = pagediv.find(".//span[@class='pb']")
        if pbdiv is None:
            logging.warning("getTextPage: no pb-span!")
            return pagediv

        # extract running head
        rh = pbdiv.find(".//span[@class='rhead']")
        if rh is not None:
            pageinfo['pageHeaderTitle'] = getText(rh)

        # remove pb-div from parent
        ppdiv = pagediv.find(".//span[@class='pb']/..")
        ppdiv.remove(pbdiv)
        return pagediv

    def _addPunditAttributes(self, pagediv, pageinfo, docinfo):
        """add about-attributes to divs for pundit annotation tool"""
        textid = docinfo.get('DRI', "fn=%s"%docinfo.get('documentPath', '???'))
        pn = pageinfo.get('pn', '1')
        # check all div-tags
        divs = pagediv.findall(".//div")
        for d in divs:
            id = d.get('id')
            if id:
                # TODO: check path (cf RFC2396)
                d.set('about', "http://echo.mpiwg-berlin.mpg.de/%s/pn=%s/#%s"%(textid,pn,id))
                cls = d.get('class','')
                cls += ' pundit-content'
                d.set('class', cls.strip())

        return pagediv

    def _addGisTags(self, pagediv, pageinfo, docinfo):
        """add links for gis places

        Turns every span@class=place with an id into an a-tag linking to the
        mappit GIS service, with this viewer's URL as base64-encoded backlink.
        """
        # use last part of documentPath as db-id
        docpath = docinfo.get('documentPath', '')
        textid = docpath.split('/')[-1]
        # add our URL as backlink
        selfurl = self.getLink()
        doc = base64.b64encode(selfurl)
        # check all span@class=place
        spans = pagediv.findall(".//span[@class='place']")
        for s in spans:
            id = s.get('id')
            if id:
                # make links like http://mappit.mpiwg-berlin.mpg.de/db/RESTdb/db/mpdl/songy_tiang_zh_1637?id=N400061-02&doc=aHR...&format=gis
                s.tag = 'a'
                # TODO: make links configurable
                url = "http://mappit.mpiwg-berlin.mpg.de/db/RESTdb/db/mpdl/%s?id=%s&doc=%s&format=gis"%(textid,id,doc)
                s.set('href', url)
                s.set('target', '_blank')

        return pagediv

    def _processFigures(self, pagediv, docinfo):
        """processes figure-tags

        Rewrites figure img@src to the digilib scaler (thumbnail size) and
        a@href to the digilib viewer, opening in a new window.
        """
        # unfortunately etree can not select class.startswith('figure')
        divs = pagediv.findall(".//span[@class]")
        scalerUrl = docinfo['digilibScalerUrl']
        viewerUrl = docinfo['digilibViewerUrl']
        for d in divs:
            if not d.get('class').startswith('figure'):
                continue

            try:
                a = d.find('a')
                img = a.find('img')
                imgsrc = img.get('src')
                imgurl = urlparse.urlparse(imgsrc)
                imgq = imgurl.query
                imgparams = urlparse.parse_qs(imgq)
                fn = imgparams.get('fn', None)
                if fn is not None:
                    # parse_qs puts parameters in lists
                    fn = fn[0]
                    # TODO: check valid path
                    # fix img@src
                    newsrc = '%s?fn=%s&dw=200&dh=200'%(scalerUrl,fn)
                    img.set('src', newsrc)
                    # fix a@href
                    newlink = '%s?fn=%s'%(viewerUrl,fn)
                    a.set('href', newlink)
                    a.set('target', '_blank')

            except:
                # NOTE(review): bare except deliberately keeps rendering best-effort
                logging.warn("processFigures: strange figure!")

    def _cleanSearchResult(self, pagediv):
        """fixes search result html (change pbs and figures)"""
        # replace figure-tag with figureNumText
        for fig in pagediv.findall(".//span[@class='figure']"):
            txt = fig.findtext(".//span[@class='figureNumText']")
            tail = fig.tail
            fig.clear()
            fig.set('class', 'figure')
            fig.text = txt
            fig.tail = tail

        # replace lb-tag with "//"
        for lb in pagediv.findall(".//br[@class='lb']"):
            lb.tag = 'span'
            lb.text = '//'

        # replace pb-tag with "///"
        for pb in pagediv.findall(".//span[@class='pb']"):
            tail = pb.tail
            pb.clear()
            pb.set('class', 'pb')
            pb.text = '///'
            pb.tail = tail

        return pagediv

    def _cleanSearchResult2(self, pagediv):
        """fixes search result html (change pbs and figures)"""
        # unfortunately etree can not select class.startswith('figure')
        divs = pagediv.findall(".//span[@class]")
        for d in divs:
            cls = d.get('class')
            if cls.startswith('figure'):
                # replace figure-tag with figureNumText
                txt = d.findtext(".//span[@class='figureNumText']")
                d.clear()
                d.set('class', 'figure')
                d.text = txt

            elif cls.startswith('pb'):
                # replace pb-tag with "//"
                d.clear()
                d.set('class', 'pb')
                d.text = '//'

        return pagediv

    def _fixEmptyDivs(self, pagediv):
        """fixes empty div-tags by inserting a space"""
        divs = pagediv.findall('.//div')
        for d in divs:
            if len(d) == 0 and not d.text:
                # make empty divs non-empty
                d.text = ' '

        return pagediv

    def getSearchResults(self, mode, query=None, pageinfo=None, docinfo=None):
        """loads list of search results and stores XML in docinfo

        Results are cached in docinfo keyed by (mode, query, normalization);
        a different query invalidates the cache.
        """
        normMode = pageinfo.get('characterNormalization', 'reg')
        logging.debug("getSearchResults mode=%s query=%s norm=%s"%(mode, query, normMode))
        if mode == "none":
            return docinfo

        #TODO: put mode into query

        cachedQuery = docinfo.get('cachedQuery', None)
        if cachedQuery is not None:
            # cached search result
            if cachedQuery == '%s_%s_%s'%(mode,query,normMode):
                # same query
                return docinfo

            else:
                # different query
                del docinfo['resultSize']
                del docinfo['results']

        # cache query
        docinfo['cachedQuery'] = '%s_%s_%s'%(mode,query,normMode)

        # fetch full results
        docpath = docinfo['textURLPath']
        params = {'docId': docpath,
                  'query': query,
                  'pageSize': 1000,
                  'page': 1,
                  'outputFormat': 'html'}
        pagexml = self.getServerData("query/QueryDocument",urllib.urlencode(params))
        results = []
        try:
            dom = ET.fromstring(pagexml)
            # clean html output
            self._processWTags('plain', normMode, dom)
            self._cleanSearchResult(dom)
            # page content is currently in multiple tr@class=hit rows
            alldivs = dom.findall(".//tr[@class='hit']")
            for div in alldivs:
                # change tr to div
                div.tag = 'div'
                # change td to span
                for d in div.findall('td'):
                    d.tag = 'span'

                # TODO: can we put etree in the session?
                results.append(div)

        except Exception, e:
            logging.error("GetSearchResults: Error parsing search result: %s"%e)

        # store results in docinfo
        docinfo['resultSize'] = len(results)
        docinfo['results'] = results

        return docinfo

    def getResultsPage(self, mode="text", query=None, pn=None, start=None, size=None, pageinfo=None, docinfo=None):
        """returns single page from the list of search results

        :param pn: result page number (used when start is None)
        :param start: 1-based index of first result to show
        :param size: page size (defaults to pageinfo['resultPageSize'])
        """
        logging.debug("getResultsPage mode=%s, pn=%s"%(mode,pn))
        # get (cached) result
        self.getSearchResults(mode=mode, query=query, pageinfo=pageinfo, docinfo=docinfo)

        resultxml = docinfo.get('results', None)
        if not resultxml:
            logging.error("getResultPage: unable to find results")
            return "Error: no result!"

        if size is None:
            size = pageinfo.get('resultPageSize', 10)

        if start is None:
            # NOTE(review): start is used 1-based below (first = start-1) but this
            # default is 0-based, so pn=1 gives first=-1 — looks off-by-one; confirm.
            start = (pn - 1) * size

        if resultxml is not None:
            # paginate
            first = start-1
            last = first+size
            tocdivs = resultxml[first:last]
            toc = ET.Element('div', attrib={'class':'queryResultPage'})
            for div in tocdivs:
                # check all a-tags
                links = div.findall(".//a")
                for l in links:
                    href = l.get('href')
                    if href:
                        # assume all links go to pages
                        linkUrl = urlparse.urlparse(href)
                        linkParams = urlparse.parse_qs(linkUrl.query)
                        # take some parameters (make sure it works even if the link was already parsed)
                        params = {'pn': linkParams.get('page',linkParams.get('pn', None)),
                                  'highlightQuery': linkParams.get('highlightQuery',None),
                                  'highlightElement': linkParams.get('highlightElem',linkParams.get('highlightElement',None)),
                                  'highlightElementPos': linkParams.get('highlightElemPos',linkParams.get('highlightElementPos',None))
                                  }
                        if not params['pn']:
                            logging.warn("getResultsPage: link has no page: %s"%href)

                        url = self.getLink(params=params)
                        l.set('href', url)

                toc.append(div)

            return serialize(toc)

        return "ERROR: no results!"
def getToc(self, mode='text', docinfo=None): """returns list of table of contents from docinfo""" logging.debug("getToc mode=%s"%mode) if mode == 'text': queryType = 'toc' else: queryType = mode if not 'full_%s'%queryType in docinfo: # get new toc docinfo = self.getTextInfo(queryType, docinfo) return docinfo.get('full_%s'%queryType, []) def getTocPage(self, mode='text', pn=None, start=None, size=None, pageinfo=None, docinfo=None): """returns single page from the table of contents""" logging.debug("getTocPage mode=%s, pn=%s start=%s size=%s"%(mode,repr(pn),repr(start),repr(size))) fulltoc = self.getToc(mode=mode, docinfo=docinfo) if len(fulltoc) < 1: logging.error("getTocPage: unable to find toc!") return "Error: no table of contents!" if size is None: size = pageinfo.get('tocPageSize', 30) if start is None: start = (pn - 1) * size # paginate first = (start - 1) last = first + size tocs = fulltoc[first:last] tp = '
' label = {'figures': 'Figure', 'notes': 'Note', 'handwritten': 'Handwritten note'}.get(mode, 'Item') for toc in tocs: pageurl = self.getLink('pn', toc['pn']) tp += '
' content = toc['content'] lvs = toc['level-string'] if content: tp += '
[%s] %s
'%(lvs, toc['content']) elif lvs: tp += '
[%s %s]
'%(label, lvs) else: tp += '
[%s]
'%(label) if toc.get('no', None): tp += ''%(pageurl, toc['pn'], toc['no']) else: tp += ''%(pageurl, toc['pn']) tp += '
\n' tp += '
\n' return tp def manage_changeMpiwgXmlTextServer(self,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/",timeout=40,repositoryType=None,RESPONSE=None): """change settings""" self.title=title self.timeout = timeout self.serverUrl = serverUrl if repositoryType: self.repositoryType = repositoryType if RESPONSE is not None: RESPONSE.redirect('manage_main') # management methods def manage_addMpiwgXmlTextServerForm(self): """Form for adding""" pt = PageTemplateFile("zpt/manage_addMpiwgXmlTextServer", globals()).__of__(self) return pt() def manage_addMpiwgXmlTextServer(self,id,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/",timeout=40,RESPONSE=None): """add MpiwgXmlTextServer""" newObj = MpiwgXmlTextServer(id=id,title=title,serverUrl=serverUrl,timeout=timeout) self.Destination()._setObject(id, newObj) if RESPONSE is not None: RESPONSE.redirect('manage_main')