# HG changeset patch # User casties # Date 1350484573 -7200 # Node ID b2c7e272e0752b54b7b817d4bce4151291f8696f # Parent f0e5e9c6737fa192bab7c7b2cb200d80055327c1 new w-tag solution with etree. search works now. diff -r f0e5e9c6737f -r b2c7e272e075 MpiwgXmlTextServer.py --- a/MpiwgXmlTextServer.py Tue Oct 16 19:46:53 2012 +0200 +++ b/MpiwgXmlTextServer.py Wed Oct 17 16:36:13 2012 +0200 @@ -9,6 +9,8 @@ import urlparse import base64 +from datetime import datetime + from SrvTxtUtils import getInt, getText, getHttpData def serialize(node): @@ -186,6 +188,7 @@ """returns single page from fulltext""" logging.debug("getTextPage mode=%s, pn=%s"%(mode,pn)) + startTime = datetime.now() # check for cached text -- but ideally this shouldn't be called twice if pageinfo.has_key('textPage'): logging.debug("getTextPage: using cached text") @@ -208,7 +211,10 @@ # TODO: change values in form if normMode == 'regPlusNorm': normMode = 'norm' - + + # TODO: this should not be necessary when the backend is fixed + textParams['normalization'] = normMode + if not mode: # default is dict mode = 'text' @@ -240,20 +246,17 @@ # other modes don't combine if 'dict' in modes: textmode = 'dict' - textParams['mode'] = 'tokenized' textParams['outputFormat'] = 'html' elif 'xml' in modes: textmode = 'xml' - textParams['mode'] = 'untokenized' textParams['outputFormat'] = 'xmlDisplay' - textParams['normMode'] = 'orig' + normMode = 'orig' elif 'gis' in modes: #FIXME! textmode = 'gis' else: # text is default mode textmode = 'plain' - textParams['mode'] = 'untokenized' textParams['outputFormat'] = 'html' try: @@ -272,7 +275,7 @@ if pagediv is not None: # add textmode and normMode classes pagediv.set('class', 'text %s %s'%(textmode, normMode)) - #self._processWTags(textmode, normMode, pagediv) + self._processWTags(textmode, normMode, pagediv) #self._processPbTag(pagediv, pageinfo) self._processFigures(pagediv, docinfo) #self._fixEmptyDivs(pagediv) @@ -287,19 +290,15 @@ linkurl = urlparse.urlparse(href) if linkurl.path.endswith('GetDictionaryEntries'): #TODO: replace wordInfo page - # is dictionary link - change href (keeping parameters) - #l.set('href', href.replace('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/lt/wordInfo.xql','%s/template/viewer_wordinfo'%viewerurl)) # add target to open new page l.set('target', '_blank') - - elif href.startswith('#note-'): - # note link FIXME! - l.set('href', href.replace('#note-',"%s#note-"%selfurl)) if punditMode: self._addPunditAttributes(pagediv, pageinfo, docinfo) - - return serialize(pagediv) + + s = serialize(pagediv) + logging.debug("getTextPage done in %s"%(datetime.now()-startTime)) + return s # xml mode elif textmode == "xml": @@ -345,31 +344,50 @@ def _processWTags(self, textMode, normMode, pagediv): """selects the necessary information from w-spans and removes the rest from pagediv""" logging.debug("processWTags(textMode=%s,norm=%s,pagediv"%(repr(textMode),repr(normMode))) + startTime = datetime.now() wtags = pagediv.findall(".//span[@class='w']") for wtag in wtags: - text = None - attr = None if textMode == 'dict': - # take a-tag and matching child - attr = wtag.find('a').items() - text = wtag.find("a/span[@class='%s']"%normMode).text + # delete non-a-tags + wtag.remove(wtag.find("span[@class='nodictionary orig']")) + wtag.remove(wtag.find("span[@class='nodictionary reg']")) + wtag.remove(wtag.find("span[@class='nodictionary norm']")) + # delete non-matching children of a-tag and suppress remaining tag name + atag = wtag.find("a[@class='dictionary']") + if normMode == 'orig': + atag.remove(atag.find("span[@class='reg']")) + atag.remove(atag.find("span[@class='norm']")) + atag.find("span[@class='orig']").tag = None + elif normMode == 'reg': + atag.remove(atag.find("span[@class='orig']")) + atag.remove(atag.find("span[@class='norm']")) + atag.find("span[@class='reg']").tag = None + elif normMode == 'norm': + atag.remove(atag.find("span[@class='orig']")) + atag.remove(atag.find("span[@class='reg']")) + atag.find("span[@class='norm']").tag = None + else: - # take matching child - text = wtag.find("span[@class='nodictionary %s']"%normMode).text + # delete a-tag + wtag.remove(wtag.find("a[@class='dictionary']")) + # delete non-matching children and suppress remaining tag name + if normMode == 'orig': + wtag.remove(wtag.find("span[@class='nodictionary reg']")) + wtag.remove(wtag.find("span[@class='nodictionary norm']")) + wtag.find("span[@class='nodictionary orig']").tag = None + elif normMode == 'reg': + wtag.remove(wtag.find("span[@class='nodictionary orig']")) + wtag.remove(wtag.find("span[@class='nodictionary norm']")) + wtag.find("span[@class='nodictionary reg']").tag = None + elif normMode == 'norm': + wtag.remove(wtag.find("span[@class='nodictionary orig']")) + wtag.remove(wtag.find("span[@class='nodictionary reg']")) + wtag.find("span[@class='nodictionary norm']").tag = None - if text: - # replace wtag by new content - logging.debug("new w-tag attr=%s text=%s"%(attr,text)) - wtag.clear() - - if attr: - # make dictionary link - wtag.tag = 'a' - wtag.attrib.update(dict(attr)) - - # text content - wtag.text = text - + # suppress w-tag name + wtag.tag = None + + logging.debug("processWTags in %s"%(datetime.now()-startTime)) return pagediv def _processPbTag(self, pagediv, pageinfo): @@ -409,10 +427,14 @@ def _processFigures(self, pagediv, docinfo): """processes figure-tags""" - divs = pagediv.findall(".//span[@class='figure']") + # unfortunately etree can not select class.startswith('figure') + divs = pagediv.findall(".//span[@class]") scalerUrl = docinfo['digilibScalerUrl'] viewerUrl = docinfo['digilibViewerUrl'] for d in divs: + if not d.get('class').startswith('figure'): + continue + try: a = d.find('a') img = a.find('img') @@ -484,8 +506,14 @@ try: dom = ET.fromstring(pagexml) # page content is currently in multiple