# HG changeset patch
# User casties
# Date 1350484573 -7200
# Node ID b2c7e272e0752b54b7b817d4bce4151291f8696f
# Parent  f0e5e9c6737fa192bab7c7b2cb200d80055327c1
new w-tag solution with etree. search works now.

diff -r f0e5e9c6737f -r b2c7e272e075 MpiwgXmlTextServer.py
--- a/MpiwgXmlTextServer.py	Tue Oct 16 19:46:53 2012 +0200
+++ b/MpiwgXmlTextServer.py	Wed Oct 17 16:36:13 2012 +0200
@@ -9,6 +9,8 @@
 import urlparse
 import base64
 
+from datetime import datetime
+
 from SrvTxtUtils import getInt, getText, getHttpData
 
 def serialize(node):
@@ -186,6 +188,7 @@
         """returns single page from fulltext"""
         logging.debug("getTextPage mode=%s, pn=%s"%(mode,pn))
+        startTime = datetime.now()
 
         # check for cached text -- but ideally this shouldn't be called twice
         if pageinfo.has_key('textPage'):
             logging.debug("getTextPage: using cached text")
@@ -208,7 +211,10 @@
         # TODO: change values in form
         if normMode == 'regPlusNorm':
             normMode = 'norm'
-        
+            
+        # TODO: this should not be necessary when the backend is fixed
+        textParams['normalization'] = normMode
+        
         if not mode:
             # default is dict
             mode = 'text'
@@ -240,20 +246,17 @@
         # other modes don't combine
         if 'dict' in modes:
             textmode = 'dict'
-            textParams['mode'] = 'tokenized'
             textParams['outputFormat'] = 'html'
         elif 'xml' in modes:
             textmode = 'xml'
-            textParams['mode'] = 'untokenized'
             textParams['outputFormat'] = 'xmlDisplay'
-            textParams['normMode'] = 'orig'
+            normMode = 'orig'
         elif 'gis' in modes:
             #FIXME!
             textmode = 'gis'
         else:
             # text is default mode
             textmode = 'plain'
-            textParams['mode'] = 'untokenized'
             textParams['outputFormat'] = 'html'
         
         try:
@@ -272,7 +275,7 @@
             if pagediv is not None:
                 # add textmode and normMode classes
                 pagediv.set('class', 'text %s %s'%(textmode, normMode))
-                #self._processWTags(textmode, normMode, pagediv)
+                self._processWTags(textmode, normMode, pagediv)
                 #self._processPbTag(pagediv, pageinfo)
                 self._processFigures(pagediv, docinfo)
                 #self._fixEmptyDivs(pagediv)
@@ -287,19 +290,15 @@
                         linkurl = urlparse.urlparse(href)
                         if linkurl.path.endswith('GetDictionaryEntries'):
                             #TODO: replace wordInfo page
-                            # is dictionary link - change href (keeping parameters)
-                            #l.set('href', href.replace('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/lt/wordInfo.xql','%s/template/viewer_wordinfo'%viewerurl))
                             # add target to open new page
                             l.set('target', '_blank')
-                            
-                        elif href.startswith('#note-'):
-                            # note link FIXME!
-                            l.set('href', href.replace('#note-',"%s#note-"%selfurl))
 
                 if punditMode:
                     self._addPunditAttributes(pagediv, pageinfo, docinfo)
-                
-                return serialize(pagediv)
+                
+                s = serialize(pagediv)
+                logging.debug("getTextPage done in %s"%(datetime.now()-startTime))
+                return s
 
         # xml mode
         elif textmode == "xml":
@@ -345,31 +344,50 @@
     def _processWTags(self, textMode, normMode, pagediv):
         """selects the necessary information from w-spans and removes the rest from pagediv"""
        logging.debug("processWTags(textMode=%s,norm=%s,pagediv"%(repr(textMode),repr(normMode)))
+        startTime = datetime.now()
         wtags = pagediv.findall(".//span[@class='w']")
         for wtag in wtags:
-            text = None
-            attr = None
             if textMode == 'dict':
-                # take a-tag and matching child
-                attr = wtag.find('a').items()
-                text = wtag.find("a/span[@class='%s']"%normMode).text
+                # delete non-a-tags
+                wtag.remove(wtag.find("span[@class='nodictionary orig']"))
+                wtag.remove(wtag.find("span[@class='nodictionary reg']"))
+                wtag.remove(wtag.find("span[@class='nodictionary norm']"))
+                # delete non-matching children of a-tag and suppress remaining tag name
+                atag = wtag.find("a[@class='dictionary']")
+                if normMode == 'orig':
+                    atag.remove(atag.find("span[@class='reg']"))
+                    atag.remove(atag.find("span[@class='norm']"))
+                    atag.find("span[@class='orig']").tag = None
+                elif normMode == 'reg':
+                    atag.remove(atag.find("span[@class='orig']"))
+                    atag.remove(atag.find("span[@class='norm']"))
+                    atag.find("span[@class='reg']").tag = None
+                elif normMode == 'norm':
+                    atag.remove(atag.find("span[@class='orig']"))
+                    atag.remove(atag.find("span[@class='reg']"))
+                    atag.find("span[@class='norm']").tag = None
+                
             else:
-                # take matching child
-                text = wtag.find("span[@class='nodictionary %s']"%normMode).text
+                # delete a-tag
+                wtag.remove(wtag.find("a[@class='dictionary']"))
+                # delete non-matching children and suppress remaining tag name
+                if normMode == 'orig':
+                    wtag.remove(wtag.find("span[@class='nodictionary reg']"))
+                    wtag.remove(wtag.find("span[@class='nodictionary norm']"))
+                    wtag.find("span[@class='nodictionary orig']").tag = None
+                elif normMode == 'reg':
+                    wtag.remove(wtag.find("span[@class='nodictionary orig']"))
+                    wtag.remove(wtag.find("span[@class='nodictionary norm']"))
+                    wtag.find("span[@class='nodictionary reg']").tag = None
+                elif normMode == 'norm':
+                    wtag.remove(wtag.find("span[@class='nodictionary orig']"))
+                    wtag.remove(wtag.find("span[@class='nodictionary reg']"))
+                    wtag.find("span[@class='nodictionary norm']").tag = None
             
-            if text:
-                # replace wtag by new content
-                logging.debug("new w-tag attr=%s text=%s"%(attr,text))
-                wtag.clear()
-                
-                if attr:
-                    # make dictionary link
-                    wtag.tag = 'a'
-                    wtag.attrib.update(dict(attr))
-                    
-                # text content
-                wtag.text = text
-            
+            # suppress w-tag name
+            wtag.tag = None
+        
+        logging.debug("processWTags in %s"%(datetime.now()-startTime))
         return pagediv
 
     def _processPbTag(self, pagediv, pageinfo):
@@ -409,10 +427,14 @@
     def _processFigures(self, pagediv, docinfo):
         """processes figure-tags"""
-        divs = pagediv.findall(".//span[@class='figure']")
+        # unfortunately etree can not select class.startswith('figure')
+        divs = pagediv.findall(".//span[@class]")
         scalerUrl = docinfo['digilibScalerUrl']
         viewerUrl = docinfo['digilibViewerUrl']
         for d in divs:
+            if not d.get('class').startswith('figure'):
+                continue
+            
             try:
                 a = d.find('a')
                 img = a.find('img')
@@ -484,8 +506,14 @@
         try:
             dom = ET.fromstring(pagexml)
             # page content is currently in multiple <td align="left">
-            alldivs = dom.findall(".//td[@align='left']")
+            alldivs = dom.findall(".//tr[@class='hit']")
             for div in alldivs:
+                # change tr to div
+                div.tag = 'div'
+                # change td to span
+                for d in div.findall('td'):
+                    d.tag = 'span'
+                
                 # TODO: can we put etree in the session?
                 results.append(div)
@@ -516,34 +544,44 @@
         if start is None:
             start = (pn - 1) * size
         
-        fullresult = ET.fromstring(resultxml)
+        #fullresult = ET.fromstring(resultxml)
+        #fullresult = resultxml
+        #logging.debug("resultxml=%s"%repr(resultxml))
         
-        if fullresult is not None:
+        if resultxml is not None:
             # paginate
             first = start-1
-            len = size
-            del fullresult[:first]
-            del fullresult[len:]
-            tocdivs = fullresult
+            last = first+size
+            tocdivs = resultxml[first:last]
+            #del fullresult[:first]
+            #del fullresult[len:]
+            #tocdivs = fullresult
             
-            # check all a-tags
-            links = tocdivs.findall(".//a")
-            for l in links:
-                href = l.get('href')
-                if href:
-                    # assume all links go to pages
-                    linkUrl = urlparse.urlparse(href)
-                    linkParams = urlparse.parse_qs(linkUrl.query)
-                    # take some parameters
-                    params = {'pn': linkParams['pn'],
-                              'highlightQuery': linkParams.get('highlightQuery',''),
-                              'highlightElement': linkParams.get('highlightElement',''),
-                              'highlightElementPos': linkParams.get('highlightElementPos','')
-                              }
-                    url = self.getLink(params=params)
-                    l.set('href', url)
+            toc = ET.Element('div', attrib={'class':'queryResultPage'})
+            for div in tocdivs:
+                # check all a-tags
+                links = div.findall(".//a")
+                for l in links:
+                    href = l.get('href')
+                    if href:
+                        # assume all links go to pages
+                        linkUrl = urlparse.urlparse(href)
+                        linkParams = urlparse.parse_qs(linkUrl.query)
+                        # take some parameters (make sure it works even if the link was already parsed)
+                        params = {'pn': linkParams.get('page',linkParams.get('pn', None)),
+                                  'highlightQuery': linkParams.get('highlightQuery',None),
+                                  'highlightElement': linkParams.get('highlightElem',linkParams.get('highlightElement',None)),
+                                  'highlightElementPos': linkParams.get('highlightElemPos',linkParams.get('highlightElementPos',None))
+                                  }
+                        if not params['pn']:
+                            logging.warn("getResultsPage: link has no page: %s"%href)
+                            
+                        url = self.getLink(params=params)
+                        l.set('href', url)
             
-            return serialize(tocdivs)
+                toc.append(div)
+            
+            return serialize(toc)
         
         return "ERROR: no results!"
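
[editor's note] The rewritten _processWTags above works by in-place etree surgery: it removes the form variants that do not match normMode and sets tag = None on what survives, which ElementTree then serializes as bare text without a wrapper element. Below is a minimal, self-contained sketch of the same idea for the plain-text branch; the sample markup and word forms are assumptions for illustration only, not part of the patch.

import xml.etree.ElementTree as ET

# assumed sample of one w-span in plain-text mode (no dictionary link wanted);
# the class names follow the patch, the word forms are invented
SAMPLE = ("<div>"
          "<span class='w'>"
          "<a class='dictionary' href='#'>"
          "<span class='orig'>vnd</span><span class='reg'>und</span><span class='norm'>und</span>"
          "</a>"
          "<span class='nodictionary orig'>vnd</span>"
          "<span class='nodictionary reg'>und</span>"
          "<span class='nodictionary norm'>und</span>"
          "</span>"
          "</div>")

def keepPlainForm(pagediv, normMode):
    """keep only the matching nodictionary form of each w-span"""
    for wtag in pagediv.findall(".//span[@class='w']"):
        # drop the dictionary link entirely
        wtag.remove(wtag.find("a[@class='dictionary']"))
        for form in ('orig', 'reg', 'norm'):
            child = wtag.find("span[@class='nodictionary %s']"%form)
            if form == normMode:
                # a None tag is serialized as bare text by ElementTree
                child.tag = None
            else:
                wtag.remove(child)
        # suppress the w-span wrapper itself
        wtag.tag = None
    return pagediv

print(ET.tostring(keepPlainForm(ET.fromstring(SAMPLE), 'orig')))
# prints <div>vnd</div> (a bytestring in Python 3)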
diff -r f0e5e9c6737f -r b2c7e272e075 css/docuviewer.css
--- a/css/docuviewer.css	Tue Oct 16 19:46:53 2012 +0200
+++ b/css/docuviewer.css	Wed Oct 17 16:36:13 2012 +0200
@@ -268,7 +268,7 @@
 	margin-top: 0.5em;
 	margin-bottom: 0.25em;
 }
-/* normalization forms */
+/* normalization forms *
 div.col.main div.content.text div.text.orig span.w span.reg,
 div.col.main div.content.text div.text.orig span.w span.norm {
 	display: none;
@@ -281,13 +281,14 @@
 div.col.main div.content.text div.text.norm span.w span.reg {
 	display: none;
 }
-/* dictionary forms */
+/* dictionary forms *
 div.col.main div.content.text div.text.plain span.w a.dictionary {
 	display: none;
 }
 div.col.main div.content.text div.text.dict span.w span.nodictionary {
 	display: none;
-}
+}
+*/
 /* page break */
 div.col.main div.content.text span.pb span.n,
 div.col.main div.content.text span.pb span.o {
@@ -300,7 +301,7 @@
 	margin-bottom: 1em;
 }
 /* note */
-div.col.main div.content.text span.note {
+div.col.main div.content.text span.note span.noteBody {
 	display: block;
 	/* float: left; */
 	margin-top: 0.5em;
@@ -309,7 +310,9 @@
 	border: 1px dashed silver;
 }
 div.col.main div.content.text span.note span.noteSign {
-	display: none;
+	display: none;
+	/* font-size: 70%;
+	vertical-align: super; */
 }
 /* figure */
 div.col.main div.content.text span.figure {
@@ -354,6 +357,12 @@
 div.col.results div.query {
 	margin-bottom: 0.5em;
 }
+div.col.results div.content div.hit {
+	margin-bottom: 0.5em;
+}
+div.col.results div.content div.hit span.hitLink {
+	margin-right: 0.5em;
+}
 
 /*
  * index page
@@ -399,6 +408,9 @@
 	font-family: Monaco,Courier,monospace;
 	font-size: 12px;
 }
+div.col.main div.content.xml ul {
+	padding-left: 1em;
+}
 div.col.main div.content.xml div.pageHeaderTitle {
 	display: none;
 }
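
[editor's note] The new div.hit rules above pair with the Python-side change that rewrites the backend's search-result table (tr.hit becomes div, its td cells become span) so hits can be styled as blocks. Below is a small sketch of that rewrite under assumed result markup; the hitLink class placement and the cell contents are invented for illustration.

import xml.etree.ElementTree as ET

# assumed shape of one backend search-result row; only the 'hit' and
# 'hitLink' class names follow the patch and CSS, the contents are invented
RESULT = ("<table>"
          "<tr class='hit'>"
          "<td class='hitLink'><a href='?pn=12'>p. 12</a></td>"
          "<td>... text of the match ...</td>"
          "</tr>"
          "</table>")

dom = ET.fromstring(RESULT)
for tr in dom.findall(".//tr[@class='hit']"):
    # tr.hit becomes div.hit, which the new CSS rules style as a block
    tr.tag = 'div'
    # table cells become inline spans; span.hitLink keeps its class attribute
    for td in tr.findall('td'):
        td.tag = 'span'
    print(ET.tostring(tr))
# prints <div class="hit"><span class="hitLink">...</span><span>...</span></div>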