Context Navigation

← Previous Changeset
Next Changeset →

Changeset 576:b2c7e272e075 in documentViewer

Timestamp:

Oct 17, 2012, 2:36:13 PM (13 years ago)

Author:

casties

Branch:

default

Message:

new w-tag solution with etree. search works now.

Files:

2 edited

MpiwgXmlTextServer.py (modified) (11 diffs)
css/docuviewer.css (modified) (7 diffs)

Legend:

Unmodified (lines prefixed with a space)
Added (lines prefixed with "+")
Removed (lines prefixed with "-")

MpiwgXmlTextServer.py

-                      r575
+                      r576
 import urlparse
 import base64
+from datetime import datetime
 from SrvTxtUtils import getInt, getText, getHttpData
 …
         logging.debug("getTextPage mode=%s, pn=%s"%(mode,pn))
+        startTime = datetime.now()
         # check for cached text -- but ideally this shouldn't be called twice
         if pageinfo.has_key('textPage'):
 …
         if normMode == 'regPlusNorm':
             normMode = 'norm'
+        # TODO: this should not be necessary when the backend is fixed
+        textParams['normalization'] = normMode
         if not mode:
             # default is dict
 …
         if 'dict' in modes:
             textmode = 'dict'
-            textParams['mode'] = 'tokenized'
             textParams['outputFormat'] = 'html'
         elif 'xml' in modes:
             textmode = 'xml'
-            textParams['mode'] = 'untokenized'
             textParams['outputFormat'] = 'xmlDisplay'
             textParams['normMode'] = 'orig'
+            normMode = 'orig'
         elif 'gis' in modes:
             #FIXME!
 …
             # text is default mode
             textmode = 'plain'
-            textParams['mode'] = 'untokenized'
             textParams['outputFormat'] = 'html'
 …
                 # add textmode and normMode classes
                 pagediv.set('class', 'text %s %s'%(textmode, normMode))
                 #self._processWTags(textmode, normMode, pagediv)
+                self._processWTags(textmode, normMode, pagediv)
                 #self._processPbTag(pagediv, pageinfo)
                 self._processFigures(pagediv, docinfo)
 …
                         if linkurl.path.endswith('GetDictionaryEntries'):
                             #TODO: replace wordInfo page
-                            # is dictionary link - change href (keeping parameters)
-                            #l.set('href', href.replace('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/lt/wordInfo.xql','%s/template/viewer_wordinfo'%viewerurl))
                             # add target to open new page
                             l.set('target', '_blank')
-                        elif href.startswith('#note-'):
-                            # note link FIXME!
-                            l.set('href', href.replace('#note-',"%s#note-"%selfurl))
                 if punditMode:
                     self._addPunditAttributes(pagediv, pageinfo, docinfo)
+                return serialize(pagediv)
+                s = serialize(pagediv)
+                logging.debug("getTextPage done in %s"%(datetime.now()-startTime))
+                return s
         # xml mode
 …
         """selects the necessary information from w-spans and removes the rest from pagediv"""
         logging.debug("processWTags(textMode=%s,norm=%s,pagediv"%(repr(textMode),repr(normMode)))
+        startTime = datetime.now()
         wtags = pagediv.findall(".//span[@class='w']")
         for wtag in wtags:
-            text = None
-            attr = None
             if textMode == 'dict':
+                # take a-tag and matching child
+                attr = wtag.find('a').items()
+                text = wtag.find("a/span[@class='%s']"%normMode).text
+                # delete non-a-tags
+                wtag.remove(wtag.find("span[@class='nodictionary orig']"))
+                wtag.remove(wtag.find("span[@class='nodictionary reg']"))
+                wtag.remove(wtag.find("span[@class='nodictionary norm']"))
+                # delete non-matching children of a-tag and suppress remaining tag name
+                atag = wtag.find("a[@class='dictionary']")
+                if normMode == 'orig':
+                    atag.remove(atag.find("span[@class='reg']"))
+                    atag.remove(atag.find("span[@class='norm']"))
+                    atag.find("span[@class='orig']").tag = None
+                elif normMode == 'reg':
+                    atag.remove(atag.find("span[@class='orig']"))
+                    atag.remove(atag.find("span[@class='norm']"))
+                    atag.find("span[@class='reg']").tag = None
+                elif normMode == 'norm':
+                    atag.remove(atag.find("span[@class='orig']"))
+                    atag.remove(atag.find("span[@class='reg']"))
+                    atag.find("span[@class='norm']").tag = None
             else:
+                # take matching child
+                text = wtag.find("span[@class='nodictionary %s']"%normMode).text
+                # delete a-tag
+                wtag.remove(wtag.find("a[@class='dictionary']"))
+                # delete non-matching children and suppress remaining tag name
+                if normMode == 'orig':
+                    wtag.remove(wtag.find("span[@class='nodictionary reg']"))
+                    wtag.remove(wtag.find("span[@class='nodictionary norm']"))
+                    wtag.find("span[@class='nodictionary orig']").tag = None
+                elif normMode == 'reg':
+                    wtag.remove(wtag.find("span[@class='nodictionary orig']"))
+                    wtag.remove(wtag.find("span[@class='nodictionary norm']"))
+                    wtag.find("span[@class='nodictionary reg']").tag = None
+                elif normMode == 'norm':
+                    wtag.remove(wtag.find("span[@class='nodictionary orig']"))
+                    wtag.remove(wtag.find("span[@class='nodictionary reg']"))
+                    wtag.find("span[@class='nodictionary norm']").tag = None
+            if text:
+                # replace wtag by new content
+                logging.debug("new w-tag attr=%s text=%s"%(attr,text))
+                wtag.clear()
+                if attr:
+                    # make dictionary link
+                    wtag.tag = 'a'
+                    wtag.attrib.update(dict(attr))
+                # text content
+                wtag.text = text
+            # suppress w-tag name
+            wtag.tag = None
+        logging.debug("processWTags in %s"%(datetime.now()-startTime))
         return pagediv
 …
     def _processFigures(self, pagediv, docinfo):
         """processes figure-tags"""
+        divs = pagediv.findall(".//span[@class='figure']")
+        # unfortunately etree can not select class.startswith('figure')
+        divs = pagediv.findall(".//span[@class]")
         scalerUrl = docinfo['digilibScalerUrl']
         viewerUrl = docinfo['digilibViewerUrl']
         for d in divs:
+            if not d.get('class').startswith('figure'):
+                continue
             try:
                 a = d.find('a')
 …
             dom = ET.fromstring(pagexml)
             # page content is currently in multiple <td align=left>
             alldivs = dom.findall(".//td[@align='left']")
+            alldivs = dom.findall(".//tr[@class='hit']")
             for div in alldivs:
+                # change tr to div
+                div.tag = 'div'
+                # change td to span
+                for d in div.findall('td'):
+                    d.tag = 'span'
                 # TODO: can we put etree in the session?
                 results.append(div)
 …
             start = (pn - 1) * size
+        fullresult = ET.fromstring(resultxml)
+        if fullresult is not None:
+        #fullresult = ET.fromstring(resultxml)
+        #fullresult = resultxml
+        #logging.debug("resultxml=%s"%repr(resultxml))
+        if resultxml is not None:
             # paginate
             first = start-1
+            len = size
+            del fullresult[:first]
+            del fullresult[len:]
+            tocdivs = fullresult
+            # check all a-tags
+            links = tocdivs.findall(".//a")
+            for l in links:
+                href = l.get('href')
+                if href:
+                    # assume all links go to pages
+                    linkUrl = urlparse.urlparse(href)
+                    linkParams = urlparse.parse_qs(linkUrl.query)
+                    # take some parameters
+                    params = {'pn': linkParams['pn'],
+                              'highlightQuery': linkParams.get('highlightQuery',''),
+                              'highlightElement': linkParams.get('highlightElement',''),
+                              'highlightElementPos': linkParams.get('highlightElementPos','')
+                              }
+                    url = self.getLink(params=params)
+                    l.set('href', url)
+            last = first+size
+            tocdivs = resultxml[first:last]
+            #del fullresult[:first]
+            #del fullresult[len:]
+            #tocdivs = fullresult
+            toc = ET.Element('div', attrib={'class':'queryResultPage'})
+            for div in tocdivs:
+                # check all a-tags
+                links = div.findall(".//a")
+                for l in links:
+                    href = l.get('href')
+                    if href:
+                        # assume all links go to pages
+                        linkUrl = urlparse.urlparse(href)
+                        linkParams = urlparse.parse_qs(linkUrl.query)
+                        # take some parameters (make sure it works even if the link was already parsed)
+                        params = {'pn': linkParams.get('page',linkParams.get('pn', None)),
+                                  'highlightQuery': linkParams.get('highlightQuery',None),
+                                  'highlightElement': linkParams.get('highlightElem',linkParams.get('highlightElement',None)),
+                                  'highlightElementPos': linkParams.get('highlightElemPos',linkParams.get('highlightElementPos',None))
+                                  }
+                        if not params['pn']:
+                            logging.warn("getResultsPage: link has no page: %s"%href)
+                        url = self.getLink(params=params)
+                        l.set('href', url)
+            return serialize(tocdivs)
+                toc.append(div)
+            return serialize(toc)
         return "ERROR: no results!"

css/docuviewer.css

-                      r575
+                      r576
     margin-bottom: 0.25em;
+}
 /* normalization forms */
+/* normalization forms *
 div.col.main div.content.text div.text.orig span.w span.reg,
 div.col.main div.content.text div.text.orig span.w span.norm {
 …
     display: none;
+}
 /* dictionary forms */
+/* dictionary forms *
 div.col.main div.content.text div.text.plain span.w a.dictionary {
     display: none;
 …
 div.col.main div.content.text div.text.dict span.w span.nodictionary {
     display: none;
+}
+}
+*/
 /* page break */
 div.col.main div.content.text span.pb span.n,
 …
+}
 /* note */
 div.col.main div.content.text span.note {
+div.col.main div.content.text span.note span.noteBody {
         display: block;
         /* float: left; */
 …
+}
 div.col.main div.content.text span.note span.noteSign {
+        display: none;
+    display: none;
+    /* font-size: 70%;
+        vertical-align: super; */
+}
 /* figure */
 …
         margin-bottom: 0.5em;
+}
+div.col.results div.content div.hit {
+    margin-bottom: 0.5em;
+}
+div.col.results div.content div.hit span.hitLink {
+    margin-right: 0.5em;
+}
 /*
 …
         font-family: Monaco,Courier,monospace;
         font-size: 12px;
+}
+div.col.main div.content.xml ul {
+    padding-left: 1em;
+}
 div.col.main div.content.xml div.pageHeaderTitle {

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 576:b2c7e272e075 in documentViewer

Legend:

MpiwgXmlTextServer.py

css/docuviewer.css

Download in other formats: