comparison MpiwgXmlTextServer.py @ 576:b2c7e272e075

new w-tag solution with etree. search works now.
author:   casties
date:     Wed, 17 Oct 2012 16:36:13 +0200
parents:  f0e5e9c6737f
children: 9251719154a3

@@ -6,10 +6,12 @@
 import re
 import logging
 import urllib
 import urlparse
 import base64
+
+from datetime import datetime
 
 from SrvTxtUtils import getInt, getText, getHttpData
 
 def serialize(node):
     """returns a string containing an XML snippet of node"""
@@ -184,10 +186,11 @@
 
     def getTextPage(self, mode="text", pn=1, docinfo=None, pageinfo=None):
         """returns single page from fulltext"""
 
         logging.debug("getTextPage mode=%s, pn=%s"%(mode,pn))
+        startTime = datetime.now()
         # check for cached text -- but ideally this shouldn't be called twice
         if pageinfo.has_key('textPage'):
             logging.debug("getTextPage: using cached text")
             return pageinfo['textPage']
 
@@ -206,11 +209,14 @@
 
         normMode = pageinfo.get('characterNormalization', 'reg')
         # TODO: change values in form
         if normMode == 'regPlusNorm':
             normMode = 'norm'
 
+        # TODO: this should not be necessary when the backend is fixed
+        textParams['normalization'] = normMode
+
         if not mode:
             # default is dict
             mode = 'text'
 
         modes = mode.split(',')
@@ -238,24 +244,21 @@
             modes.remove('pundit')
 
         # other modes don't combine
         if 'dict' in modes:
             textmode = 'dict'
-            textParams['mode'] = 'tokenized'
             textParams['outputFormat'] = 'html'
         elif 'xml' in modes:
             textmode = 'xml'
-            textParams['mode'] = 'untokenized'
             textParams['outputFormat'] = 'xmlDisplay'
-            textParams['normMode'] = 'orig'
+            normMode = 'orig'
         elif 'gis' in modes:
             #FIXME!
             textmode = 'gis'
         else:
             # text is default mode
             textmode = 'plain'
-            textParams['mode'] = 'untokenized'
             textParams['outputFormat'] = 'html'
 
         try:
             # fetch the page
             pagexml = self.getServerData("query/GetPage",urllib.urlencode(textParams))
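
Note on the change above: the tokenizer switch (textParams['mode']) is gone from all three branches, so the GetPage request now carries only the output format plus the normalization that is set explicitly further up. A minimal sketch of the resulting request encoding (Python 2); the 'document' and 'pn' keys are hypothetical stand-ins for parameters set elsewhere in the class:

import urllib

textParams = {'document': '/mpiwg/example.xml',  # hypothetical
              'pn': 1,                           # hypothetical
              'normalization': 'reg',
              'outputFormat': 'html'}
print urllib.urlencode(textParams)
# -> a query string like 'normalization=reg&outputFormat=html&pn=1&...'
#    (Python 2 dicts are unordered, so parameter order may vary)
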
@@ -270,11 +273,11 @@
             pagediv = dom.find(".//div[@class='text']")
             logging.debug("pagediv: %s"%repr(pagediv))
             if pagediv is not None:
                 # add textmode and normMode classes
                 pagediv.set('class', 'text %s %s'%(textmode, normMode))
-                #self._processWTags(textmode, normMode, pagediv)
+                self._processWTags(textmode, normMode, pagediv)
                 #self._processPbTag(pagediv, pageinfo)
                 self._processFigures(pagediv, docinfo)
                 #self._fixEmptyDivs(pagediv)
                 # get full url assuming documentViewer is parent
                 selfurl = self.getLink()
@@ -285,23 +288,19 @@
                     if href:
                         # is link with href
                         linkurl = urlparse.urlparse(href)
                         if linkurl.path.endswith('GetDictionaryEntries'):
                             #TODO: replace wordInfo page
-                            # is dictionary link - change href (keeping parameters)
-                            #l.set('href', href.replace('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/lt/wordInfo.xql','%s/template/viewer_wordinfo'%viewerurl))
                             # add target to open new page
                             l.set('target', '_blank')
-
-                        elif href.startswith('#note-'):
-                            # note link FIXME!
-                            l.set('href', href.replace('#note-',"%s#note-"%selfurl))
 
                 if punditMode:
                     self._addPunditAttributes(pagediv, pageinfo, docinfo)
 
-                return serialize(pagediv)
+                s = serialize(pagediv)
+                logging.debug("getTextPage done in %s"%(datetime.now()-startTime))
+                return s
 
         # xml mode
         elif textmode == "xml":
             # the text is in body
             pagediv = dom.find(".//body")
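
The startTime/debug pair added to getTextPage is a plain wall-clock bracket: subtracting two datetime values yields a timedelta, which %s formats directly. A minimal sketch of the pattern:

from datetime import datetime
import logging

logging.basicConfig(level=logging.DEBUG)

startTime = datetime.now()
# ... fetch and transform the page here ...
logging.debug("getTextPage done in %s" % (datetime.now() - startTime))
# logs e.g. "getTextPage done in 0:00:00.000042"
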
@@ -343,31 +342,50 @@
         return None
 
     def _processWTags(self, textMode, normMode, pagediv):
         """selects the necessary information from w-spans and removes the rest from pagediv"""
         logging.debug("processWTags(textMode=%s,norm=%s,pagediv"%(repr(textMode),repr(normMode)))
+        startTime = datetime.now()
         wtags = pagediv.findall(".//span[@class='w']")
         for wtag in wtags:
-            text = None
-            attr = None
             if textMode == 'dict':
-                # take a-tag and matching child
-                attr = wtag.find('a').items()
-                text = wtag.find("a/span[@class='%s']"%normMode).text
+                # delete non-a-tags
+                wtag.remove(wtag.find("span[@class='nodictionary orig']"))
+                wtag.remove(wtag.find("span[@class='nodictionary reg']"))
+                wtag.remove(wtag.find("span[@class='nodictionary norm']"))
+                # delete non-matching children of a-tag and suppress remaining tag name
+                atag = wtag.find("a[@class='dictionary']")
+                if normMode == 'orig':
+                    atag.remove(atag.find("span[@class='reg']"))
+                    atag.remove(atag.find("span[@class='norm']"))
+                    atag.find("span[@class='orig']").tag = None
+                elif normMode == 'reg':
+                    atag.remove(atag.find("span[@class='orig']"))
+                    atag.remove(atag.find("span[@class='norm']"))
+                    atag.find("span[@class='reg']").tag = None
+                elif normMode == 'norm':
+                    atag.remove(atag.find("span[@class='orig']"))
+                    atag.remove(atag.find("span[@class='reg']"))
+                    atag.find("span[@class='norm']").tag = None
+
             else:
-                # take matching child
-                text = wtag.find("span[@class='nodictionary %s']"%normMode).text
+                # delete a-tag
+                wtag.remove(wtag.find("a[@class='dictionary']"))
+                # delete non-matching children and suppress remaining tag name
+                if normMode == 'orig':
+                    wtag.remove(wtag.find("span[@class='nodictionary reg']"))
+                    wtag.remove(wtag.find("span[@class='nodictionary norm']"))
+                    wtag.find("span[@class='nodictionary orig']").tag = None
+                elif normMode == 'reg':
+                    wtag.remove(wtag.find("span[@class='nodictionary orig']"))
+                    wtag.remove(wtag.find("span[@class='nodictionary norm']"))
+                    wtag.find("span[@class='nodictionary reg']").tag = None
+                elif normMode == 'norm':
+                    wtag.remove(wtag.find("span[@class='nodictionary orig']"))
+                    wtag.remove(wtag.find("span[@class='nodictionary reg']"))
+                    wtag.find("span[@class='nodictionary norm']").tag = None
 
-            if text:
-                # replace wtag by new content
-                logging.debug("new w-tag attr=%s text=%s"%(attr,text))
-                wtag.clear()
-
-                if attr:
-                    # make dictionary link
-                    wtag.tag = 'a'
-                    wtag.attrib.update(dict(attr))
-
-                # text content
-                wtag.text = text
-
+            # suppress w-tag name
+            wtag.tag = None
+
+        logging.debug("processWTags in %s"%(datetime.now()-startTime))
         return pagediv
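
The w-span markup being pruned here is inferred from the selectors: each <span class='w'> is assumed to contain an <a class='dictionary'> with orig/reg/norm children plus three nodictionary spans. Setting .tag = None uses a long-standing (if little-documented) behavior of xml.etree's serializer: an element with a None tag is written as just its text and children, which unwraps the chosen form in place. A minimal, self-contained sketch for textMode='dict' and normMode='reg'; the sample markup and URL are assumptions, not backend output:

import xml.etree.ElementTree as ET

wtag = ET.fromstring(
    "<span class='w'>"
    "<a class='dictionary' href='http://example.org/dict?w=foo'>"
    "<span class='orig'>foo</span>"
    "<span class='reg'>Foo</span>"
    "<span class='norm'>FOO</span></a>"
    "<span class='nodictionary orig'>foo</span>"
    "<span class='nodictionary reg'>Foo</span>"
    "<span class='nodictionary norm'>FOO</span>"
    "</span>")

# delete the nodictionary alternatives
for cls in ('nodictionary orig', 'nodictionary reg', 'nodictionary norm'):
    wtag.remove(wtag.find("span[@class='%s']" % cls))
# keep only the 'reg' child of the dictionary link and unwrap it
atag = wtag.find("a[@class='dictionary']")
atag.remove(atag.find("span[@class='orig']"))
atag.remove(atag.find("span[@class='norm']"))
atag.find("span[@class='reg']").tag = None
# unwrap the w-span itself
wtag.tag = None

print ET.tostring(wtag)
# -> <a class="dictionary" href="http://example.org/dict?w=foo">Foo</a>
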
@@ -374,4 +392,4 @@
 
     def _processPbTag(self, pagediv, pageinfo):
         """extracts information from pb-tag and removes it from pagediv"""
         pbdiv = pagediv.find(".//span[@class='pb']")
@@ -407,14 +425,18 @@
 
         return pagediv
 
     def _processFigures(self, pagediv, docinfo):
         """processes figure-tags"""
-        divs = pagediv.findall(".//span[@class='figure']")
+        # unfortunately etree can not select class.startswith('figure')
+        divs = pagediv.findall(".//span[@class]")
         scalerUrl = docinfo['digilibScalerUrl']
         viewerUrl = docinfo['digilibViewerUrl']
         for d in divs:
+            if not d.get('class').startswith('figure'):
+                continue
+
             try:
                 a = d.find('a')
                 img = a.find('img')
                 imgsrc = img.get('src')
                 imgurl = urlparse.urlparse(imgsrc)
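
As the new comment says, ElementTree's abbreviated XPath has no starts-with() predicate, so _processFigures now selects every span that carries a class attribute and filters by prefix in Python. A minimal sketch of that pattern (sample markup assumed):

import xml.etree.ElementTree as ET

page = ET.fromstring(
    "<div>"
    "<span class='figure f1'><a href='#'><img src='i.jpg'/></a></span>"
    "<span class='caption'>not a figure</span>"
    "</div>")
figures = [s for s in page.findall(".//span[@class]")
           if s.get('class').startswith('figure')]
print len(figures)
# -> 1
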
@@ -482,12 +504,18 @@
             pagexml = self.getServerData("query/QueryDocument",urllib.urlencode(params))
         results = []
         try:
             dom = ET.fromstring(pagexml)
             # page content is currently in multiple <td align=left>
-            alldivs = dom.findall(".//td[@align='left']")
+            alldivs = dom.findall(".//tr[@class='hit']")
             for div in alldivs:
+                # change tr to div
+                div.tag = 'div'
+                # change td to span
+                for d in div.findall('td'):
+                    d.tag = 'span'
+
                 # TODO: can we put etree in the session?
                 results.append(div)
 
         except Exception, e:
             logging.error("GetSearchResults: Error parsing search result: %s"%e)
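
Retagging the hit rows in place (tr to div, td to span) converts the backend's table markup without copying subtrees; attributes, text, and children stay attached to the renamed element. A minimal sketch with assumed hit markup:

import xml.etree.ElementTree as ET

row = ET.fromstring("<tr class='hit'><td>1</td><td>some snippet</td></tr>")
row.tag = 'div'                 # change tr to div
for d in row.findall('td'):
    d.tag = 'span'              # change td to span
print ET.tostring(row)
# -> <div class="hit"><span>1</span><span>some snippet</span></div>
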
@@ -514,38 +542,48 @@
         size = pageinfo.get('resultPageSize', 10)
 
         if start is None:
             start = (pn - 1) * size
 
-        fullresult = ET.fromstring(resultxml)
-
-        if fullresult is not None:
+        #fullresult = ET.fromstring(resultxml)
+        #fullresult = resultxml
+        #logging.debug("resultxml=%s"%repr(resultxml))
+
+        if resultxml is not None:
             # paginate
             first = start-1
-            len = size
-            del fullresult[:first]
-            del fullresult[len:]
-            tocdivs = fullresult
-
-            # check all a-tags
-            links = tocdivs.findall(".//a")
-            for l in links:
-                href = l.get('href')
-                if href:
-                    # assume all links go to pages
-                    linkUrl = urlparse.urlparse(href)
-                    linkParams = urlparse.parse_qs(linkUrl.query)
-                    # take some parameters
-                    params = {'pn': linkParams['pn'],
-                              'highlightQuery': linkParams.get('highlightQuery',''),
-                              'highlightElement': linkParams.get('highlightElement',''),
-                              'highlightElementPos': linkParams.get('highlightElementPos','')
-                              }
-                    url = self.getLink(params=params)
-                    l.set('href', url)
+            last = first+size
+            tocdivs = resultxml[first:last]
+            #del fullresult[:first]
+            #del fullresult[len:]
+            #tocdivs = fullresult
+
+            toc = ET.Element('div', attrib={'class':'queryResultPage'})
+            for div in tocdivs:
+                # check all a-tags
+                links = div.findall(".//a")
+                for l in links:
+                    href = l.get('href')
+                    if href:
+                        # assume all links go to pages
+                        linkUrl = urlparse.urlparse(href)
+                        linkParams = urlparse.parse_qs(linkUrl.query)
+                        # take some parameters (make sure it works even if the link was already parsed)
+                        params = {'pn': linkParams.get('page',linkParams.get('pn', None)),
+                                  'highlightQuery': linkParams.get('highlightQuery',None),
+                                  'highlightElement': linkParams.get('highlightElem',linkParams.get('highlightElement',None)),
+                                  'highlightElementPos': linkParams.get('highlightElemPos',linkParams.get('highlightElementPos',None))
+                                  }
+                        if not params['pn']:
+                            logging.warn("getResultsPage: link has no page: %s"%href)
+
+                        url = self.getLink(params=params)
+                        l.set('href', url)
 
-            return serialize(tocdivs)
+                toc.append(div)
+
+            return serialize(toc)
 
         return "ERROR: no results!"
 
 
     def getToc(self, mode='text', docinfo=None):
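
The new pagination slices a window out of the cached hit list and appends it to a fresh container, where the commented-out del approach mutated fullresult in place (and shadowed the len builtin); slicing leaves the cached results intact for the next page. A minimal sketch, assuming a 0-based start offset and a plain list of hit elements:

import xml.etree.ElementTree as ET

hits = [ET.Element('div', {'class': 'hit', 'n': str(i)}) for i in range(10)]
pn, size = 2, 3
start = (pn - 1) * size             # 0-based offset of the first hit shown
toc = ET.Element('div', {'class': 'queryResultPage'})
for div in hits[start:start + size]:
    toc.append(div)
print [d.get('n') for d in toc]
# -> ['3', '4', '5']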