changeset 576:b2c7e272e075

New w-tag solution with etree. Search works now.
author casties
date Wed, 17 Oct 2012 16:36:13 +0200
parents f0e5e9c6737f
children 9251719154a3
files MpiwgXmlTextServer.py css/docuviewer.css
diffstat 2 files changed, 114 insertions(+), 64 deletions(-) [+]
line wrap: on
line diff
--- a/MpiwgXmlTextServer.py	Tue Oct 16 19:46:53 2012 +0200
+++ b/MpiwgXmlTextServer.py	Wed Oct 17 16:36:13 2012 +0200
@@ -9,6 +9,8 @@
 import urlparse
 import base64
 
+from datetime import datetime
+
 from SrvTxtUtils import getInt, getText, getHttpData
 
 def serialize(node):
@@ -186,6 +188,7 @@
         """returns single page from fulltext"""
         
         logging.debug("getTextPage mode=%s, pn=%s"%(mode,pn))
+        startTime = datetime.now()
         # check for cached text -- but ideally this shouldn't be called twice
         if pageinfo.has_key('textPage'):
             logging.debug("getTextPage: using cached text")
@@ -208,7 +211,10 @@
         # TODO: change values in form
         if normMode == 'regPlusNorm':
             normMode = 'norm'
-                        
+        
+        # TODO: this should not be necessary when the backend is fixed                
+        textParams['normalization'] = normMode
+        
         if not mode:
             # default is dict
             mode = 'text'
@@ -240,20 +246,17 @@
         # other modes don't combine
         if 'dict' in modes:
             textmode = 'dict'
-            textParams['mode'] = 'tokenized'
             textParams['outputFormat'] = 'html'
         elif 'xml' in modes:
             textmode = 'xml'
-            textParams['mode'] = 'untokenized'
             textParams['outputFormat'] = 'xmlDisplay'
-            textParams['normMode'] = 'orig'
+            normMode = 'orig'
         elif 'gis' in modes:
             #FIXME!
             textmode = 'gis'
         else:
             # text is default mode
             textmode = 'plain'
-            textParams['mode'] = 'untokenized'
             textParams['outputFormat'] = 'html'
         
         try:
@@ -272,7 +275,7 @@
             if pagediv is not None:
                 # add textmode and normMode classes
                 pagediv.set('class', 'text %s %s'%(textmode, normMode))
-                #self._processWTags(textmode, normMode, pagediv)
+                self._processWTags(textmode, normMode, pagediv)
                 #self._processPbTag(pagediv, pageinfo)
                 self._processFigures(pagediv, docinfo)
                 #self._fixEmptyDivs(pagediv)
@@ -287,19 +290,15 @@
                         linkurl = urlparse.urlparse(href)
                         if linkurl.path.endswith('GetDictionaryEntries'):
                             #TODO: replace wordInfo page
-                            # is dictionary link - change href (keeping parameters)
-                            #l.set('href', href.replace('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/lt/wordInfo.xql','%s/template/viewer_wordinfo'%viewerurl))
                             # add target to open new page
                             l.set('target', '_blank')
-                                                          
-                        elif href.startswith('#note-'):
-                            # note link FIXME!
-                            l.set('href', href.replace('#note-',"%s#note-"%selfurl))
                         
                 if punditMode:
                     self._addPunditAttributes(pagediv, pageinfo, docinfo)
-                    
-                return serialize(pagediv)
+                 
+                s = serialize(pagediv)
+                logging.debug("getTextPage done in %s"%(datetime.now()-startTime))    
+                return s
             
         # xml mode
         elif textmode == "xml":
@@ -345,31 +344,50 @@
     def _processWTags(self, textMode, normMode, pagediv):
         """selects the necessary information from w-spans and removes the rest from pagediv"""
         logging.debug("processWTags(textMode=%s,norm=%s,pagediv"%(repr(textMode),repr(normMode)))
+        startTime = datetime.now()
         wtags = pagediv.findall(".//span[@class='w']")
         for wtag in wtags:
-            text = None
-            attr = None
             if textMode == 'dict':
-                # take a-tag and matching child
-                attr = wtag.find('a').items()
-                text = wtag.find("a/span[@class='%s']"%normMode).text
+                # delete non-a-tags
+                wtag.remove(wtag.find("span[@class='nodictionary orig']"))
+                wtag.remove(wtag.find("span[@class='nodictionary reg']"))
+                wtag.remove(wtag.find("span[@class='nodictionary norm']"))
+                # delete non-matching children of a-tag and suppress remaining tag name
+                atag = wtag.find("a[@class='dictionary']")
+                if normMode == 'orig':
+                    atag.remove(atag.find("span[@class='reg']"))
+                    atag.remove(atag.find("span[@class='norm']"))
+                    atag.find("span[@class='orig']").tag = None
+                elif normMode == 'reg':
+                    atag.remove(atag.find("span[@class='orig']"))
+                    atag.remove(atag.find("span[@class='norm']"))
+                    atag.find("span[@class='reg']").tag = None
+                elif normMode == 'norm':
+                    atag.remove(atag.find("span[@class='orig']"))
+                    atag.remove(atag.find("span[@class='reg']"))
+                    atag.find("span[@class='norm']").tag = None
+                    
             else:
-                # take matching child
-                text = wtag.find("span[@class='nodictionary %s']"%normMode).text
+                # delete a-tag
+                wtag.remove(wtag.find("a[@class='dictionary']"))
+                # delete non-matching children and suppress remaining tag name
+                if normMode == 'orig':
+                    wtag.remove(wtag.find("span[@class='nodictionary reg']"))
+                    wtag.remove(wtag.find("span[@class='nodictionary norm']"))
+                    wtag.find("span[@class='nodictionary orig']").tag = None
+                elif normMode == 'reg':
+                    wtag.remove(wtag.find("span[@class='nodictionary orig']"))
+                    wtag.remove(wtag.find("span[@class='nodictionary norm']"))
+                    wtag.find("span[@class='nodictionary reg']").tag = None
+                elif normMode == 'norm':
+                    wtag.remove(wtag.find("span[@class='nodictionary orig']"))
+                    wtag.remove(wtag.find("span[@class='nodictionary reg']"))
+                    wtag.find("span[@class='nodictionary norm']").tag = None
                 
-            if text:
-                # replace wtag by new content
-                logging.debug("new w-tag attr=%s text=%s"%(attr,text))
-                wtag.clear()
-                    
-                if attr:
-                    # make dictionary link
-                    wtag.tag = 'a'
-                    wtag.attrib.update(dict(attr))
-                    
-                # text content
-                wtag.text = text
-                
+            # suppress w-tag name
+            wtag.tag = None
+            
+        logging.debug("processWTags in %s"%(datetime.now()-startTime))
         return pagediv
         
     def _processPbTag(self, pagediv, pageinfo):
@@ -409,10 +427,14 @@
 
     def _processFigures(self, pagediv, docinfo):
         """processes figure-tags"""
-        divs = pagediv.findall(".//span[@class='figure']")
+        # unfortunately etree cannot select class.startswith('figure')
+        divs = pagediv.findall(".//span[@class]")
         scalerUrl = docinfo['digilibScalerUrl']
         viewerUrl = docinfo['digilibViewerUrl']
         for d in divs:
+            if not d.get('class').startswith('figure'):
+                continue
+            
             try:
                 a = d.find('a')
                 img = a.find('img')
@@ -484,8 +506,14 @@
         try:
             dom = ET.fromstring(pagexml)
             # page content is currently in multiple <td align=left>
-            alldivs = dom.findall(".//td[@align='left']")
+            alldivs = dom.findall(".//tr[@class='hit']")
             for div in alldivs:
+                # change tr to div
+                div.tag = 'div'
+                # change td to span
+                for d in div.findall('td'):
+                    d.tag = 'span'
+                    
                 # TODO: can we put etree in the session?
                 results.append(div)
         
@@ -516,34 +544,44 @@
         if start is None:
             start = (pn - 1) * size
 
-        fullresult = ET.fromstring(resultxml)
+        #fullresult = ET.fromstring(resultxml)
+        #fullresult = resultxml
+        #logging.debug("resultxml=%s"%repr(resultxml))
         
-        if fullresult is not None:
+        if resultxml is not None:
             # paginate
             first = start-1
-            len = size
-            del fullresult[:first]
-            del fullresult[len:]
-            tocdivs = fullresult
+            last = first+size
+            tocdivs = resultxml[first:last]
+            #del fullresult[:first]
+            #del fullresult[len:]
+            #tocdivs = fullresult
             
-            # check all a-tags
-            links = tocdivs.findall(".//a")
-            for l in links:
-                href = l.get('href')
-                if href:
-                    # assume all links go to pages
-                    linkUrl = urlparse.urlparse(href)
-                    linkParams = urlparse.parse_qs(linkUrl.query)
-                    # take some parameters
-                    params = {'pn': linkParams['pn'],
-                              'highlightQuery': linkParams.get('highlightQuery',''),
-                              'highlightElement': linkParams.get('highlightElement',''),
-                              'highlightElementPos': linkParams.get('highlightElementPos','')
-                              }
-                    url = self.getLink(params=params)
-                    l.set('href', url)
+            toc = ET.Element('div', attrib={'class':'queryResultPage'})
+            for div in tocdivs:
+                # check all a-tags
+                links = div.findall(".//a")
+                for l in links:
+                    href = l.get('href')
+                    if href:
+                        # assume all links go to pages
+                        linkUrl = urlparse.urlparse(href)
+                        linkParams = urlparse.parse_qs(linkUrl.query)
+                        # take some parameters (make sure it works even if the link was already parsed)
+                        params = {'pn': linkParams.get('page',linkParams.get('pn', None)),
+                                  'highlightQuery': linkParams.get('highlightQuery',None),
+                                  'highlightElement': linkParams.get('highlightElem',linkParams.get('highlightElement',None)),
+                                  'highlightElementPos': linkParams.get('highlightElemPos',linkParams.get('highlightElementPos',None))
+                                  }
+                        if not params['pn']:
+                            logging.warn("getResultsPage: link has no page: %s"%href)
+                            
+                        url = self.getLink(params=params)
+                        l.set('href', url)
                         
-            return serialize(tocdivs)
+                toc.append(div)
+                        
+            return serialize(toc)
         
         return "ERROR: no results!"
 
--- a/css/docuviewer.css	Tue Oct 16 19:46:53 2012 +0200
+++ b/css/docuviewer.css	Wed Oct 17 16:36:13 2012 +0200
@@ -268,7 +268,7 @@
     margin-top: 0.5em;
     margin-bottom: 0.25em;
 }
-/* normalization forms */
+/* normalization forms *
 div.col.main div.content.text div.text.orig span.w span.reg,
 div.col.main div.content.text div.text.orig span.w span.norm {
     display: none;
@@ -281,13 +281,14 @@
 div.col.main div.content.text div.text.norm span.w span.reg {
     display: none;
 }
-/* dictionary forms */
+/* dictionary forms *
 div.col.main div.content.text div.text.plain span.w a.dictionary {
     display: none;
 }
 div.col.main div.content.text div.text.dict span.w span.nodictionary {
     display: none;
-}
+} 
+*/
 /* page break */
 div.col.main div.content.text span.pb span.n,
 div.col.main div.content.text span.pb span.o {
@@ -300,7 +301,7 @@
     margin-bottom: 1em;
 }
 /* note */
-div.col.main div.content.text span.note {
+div.col.main div.content.text span.note span.noteBody {
 	display: block;
 	/* float: left; */
     margin-top: 0.5em;
@@ -309,7 +310,9 @@
     border: 1px dashed silver;	
 }
 div.col.main div.content.text span.note span.noteSign {
-	display: none;
+    display: none;
+    /* font-size: 70%;
+	vertical-align: super; */
 }
 /* figure */
 div.col.main div.content.text span.figure {
@@ -354,6 +357,12 @@
 div.col.results div.query {
 	margin-bottom: 0.5em;
 }
+div.col.results div.content div.hit {
+    margin-bottom: 0.5em;
+}
+div.col.results div.content div.hit span.hitLink {
+    margin-right: 0.5em;
+}
 
 /*
  * index page
@@ -399,6 +408,9 @@
 	font-family: Monaco,Courier,monospace;
 	font-size: 12px;
 }
+div.col.main div.content.xml ul {
+    padding-left: 1em;
+}
 div.col.main div.content.xml div.pageHeaderTitle {
 	display: none;
 }