Changeset 565:1b483194901c in documentViewer for MpiwgXmlTextServer.py
- Timestamp: Oct 9, 2012, 5:01:18 PM (12 years ago)
- Branch: default
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
MpiwgXmlTextServer.py
r564 r565 89 89 90 90 91 def getTextInfo(self, mode= '', docinfo=None):91 def getTextInfo(self, mode=None, docinfo=None): 92 92 """reads document info, including page concordance, from text server""" 93 93 logging.debug("getTextInfo mode=%s"%mode) 94 if mode not in ['toc', 'figures', '']: 95 mode = '' 94 95 field = '' 96 if mode in ['pages', 'toc', 'figures']: 97 # translate mode to field param 98 field = '&field=%s'%mode 99 else: 100 mode = None 101 96 102 # check cached info 97 103 if mode: … … 101 107 102 108 else: 103 # no toc-request109 # cached but no toc-request? 104 110 if 'numTextPages' in docinfo: 105 111 return docinfo … … 111 117 112 118 # fetch docinfo 113 pagexml = self.getServerData("query/GetDocInfo","docId=%s &field=%s"%(docpath,mode))119 pagexml = self.getServerData("query/GetDocInfo","docId=%s%s"%(docpath,field)) 114 120 dom = ET.fromstring(pagexml) 115 # all info in tag <doc ument>116 doc = dom .find("doc")121 # all info in tag <doc> 122 doc = dom 117 123 if doc is None: 118 124 logging.error("getTextInfo: unable to find document-tag!") 119 125 else: 120 # result is in list-tag 121 l = doc.find('list') 122 if l is not None: 123 lt = l.get('type') 124 # pageNumbers 125 if lt == 'pages': 126 # contains tags with page numbers 127 # <item n="14" o="2" o-norm="2" file="0014"/> 128 # n=scan number, o=original page no, on=normalized original page no 129 # pageNumbers is a dict indexed by scan number 130 pages = {} 131 for i in l: 132 page = {} 133 pn = getInt(i.get('n')) 134 page['pn'] = pn 135 no = getInt(i.get('o')) 136 page['no'] = no 137 non = getInt(i.get('o-norm')) 138 page['non'] = non 139 140 if pn > 0: 141 pages[pn] = page 126 if mode is None: 127 # get general info from system-tag 128 cp = doc.find('system/countPages') 129 if cp is not None: 130 docinfo['numTextPages'] = getInt(cp.text) 131 132 else: 133 # result is in list-tag 134 l = doc.find('list') 135 if l is not None: 136 lt = l.get('type') 137 # pageNumbers 138 if lt == 'pages': 139 # 
contains tags with page numbers 140 # <item n="14" o="2" o-norm="2" file="0014"/> 141 # n=scan number, o=original page no, on=normalized original page no 142 # pageNumbers is a dict indexed by scan number 143 pages = {} 144 for i in l: 145 page = {} 146 pn = getInt(i.get('n')) 147 page['pn'] = pn 148 no = getInt(i.get('o')) 149 page['no'] = no 150 non = getInt(i.get('o-norm')) 151 page['non'] = non 152 153 if pn > 0: 154 pages[pn] = page 155 156 docinfo['pageNumbers'] = pages 157 logging.debug("got pageNumbers=%s"%repr(pages)) 158 159 # toc 160 elif name == 'toc': 161 # contains tags with table of contents/figures 162 # <toc-entry><page>13</page><level>3</level><content>Chapter I</content><level-string>1.</level-string><real-level>1</real-level></toc-entry> 163 tocs = [] 164 for te in tag: 165 toc = {} 166 for t in te: 167 if t.tag == 'page': 168 toc['pn'] = getInt(t.text) 169 elif t.tag == 'level': 170 toc['level'] = t.text 171 elif t.tag == 'content': 172 toc['content'] = t.text 173 elif t.tag == 'level-string': 174 toc['level-string'] = t.text 175 elif t.tag == 'real-level': 176 toc['real-level'] = t.text 177 178 tocs.append(toc) 142 179 143 docinfo['numTextPages'] = len(pages) 144 docinfo['pageNumbers'] = pages 145 logging.debug("got pageNumbers=%s"%repr(pages)) 146 147 # toc 148 elif name == 'toc': 149 # contains tags with table of contents/figures 150 # <toc-entry><page>13</page><level>3</level><content>Chapter I</content><level-string>1.</level-string><real-level>1</real-level></toc-entry> 151 tocs = [] 152 for te in tag: 153 toc = {} 154 for t in te: 155 if t.tag == 'page': 156 toc['pn'] = getInt(t.text) 157 elif t.tag == 'level': 158 toc['level'] = t.text 159 elif t.tag == 'content': 160 toc['content'] = t.text 161 elif t.tag == 'level-string': 162 toc['level-string'] = t.text 163 elif t.tag == 'real-level': 164 toc['real-level'] = t.text 165 166 tocs.append(toc) 167 168 # save as full_toc/full_figures 169 docinfo['full_%s'%mode] = tocs 180 # save as 
full_toc/full_figures 181 docinfo['full_%s'%mode] = tocs 170 182 171 183 return docinfo … … 221 233 textParams = {'docId': docpath, 222 234 'page': pn} 235 223 236 if 'characterNormalization' in pageinfo: 224 textParams['normalization'] = pageinfo['characterNormalization'] 237 cn = pageinfo['characterNormalization'] 238 # TODO: change values in form 239 if cn == 'regPlusNorm': 240 cn = 'norm' 241 242 textParams['normalization'] = cn 225 243 226 244 if not mode: … … 273 291 # fetch the page 274 292 pagexml = self.getServerData("query/GetPage",urllib.urlencode(textParams)) 275 dom = ET.fromstring(pagexml) 276 # extract additional info 277 #self.processPageInfo(dom, docinfo, pageinfo) 278 # page content is in <div class="pageContent"> 293 try: 294 dom = ET.fromstring(pagexml) 295 except Exception, e: 296 logging.error("Error parsing page: %s"%e) 297 return None 298 279 299 pagediv = None 280 300 body = dom.find('.//body') … … 283 303 return None 284 304 285 # currently there's lots of divs... 
286 textspan = body.find('span/span') 287 divs = textspan.findall('div') 288 logging.debug("textdivs: %s"%repr(divs)) 289 pagediv = divs[0] 290 logging.debug("pagediv: %s"%serialize(pagediv)) 305 # the text is in div@class=text 306 pagediv = body.find(".//div[@class='text']") 307 logging.debug("pagediv: %s"%repr(pagediv)) 291 308 292 309 # plain text mode 293 310 if textmode == "text": 294 # get full url assuming documentViewer is parent295 selfurl = self.getLink()296 311 if pagediv is not None: 312 # handle pb-tag 313 self._extractPbTag(pagediv, pageinfo) 314 # get full url assuming documentViewer is parent 315 selfurl = self.getLink() 297 316 if punditMode: 298 pagediv = self.addPunditAttributes(pagediv, pageinfo, docinfo)317 self._addPunditAttributes(pagediv, pageinfo, docinfo) 299 318 300 319 # fix empty div tags 301 divs = pagediv.findall('.//div') 302 for d in divs: 303 if len(d) == 0 and not d.text: 304 # make empty divs non-empty 305 d.text = ' ' 306 320 self._fixEmptyDivs(pagediv) 307 321 # check all a-tags 308 322 links = pagediv.findall('.//a') 309 323 for l in links: 310 324 href = l.get('href') 325 # handle notes FIXME! 
311 326 if href and href.startswith('#note-'): 312 327 href = href.replace('#note-',"%s#note-"%selfurl) 313 328 l.set('href', href) 314 329 315 330 return serialize(pagediv) 316 331 … … 318 333 elif textmode == "dict": 319 334 if pagediv is not None: 335 # handle pb-div 336 self._extractPbTag(pagediv, pageinfo) 320 337 viewerurl = docinfo['viewerUrl'] 321 338 selfurl = self.getLink() … … 324 341 325 342 # fix empty div tags 326 divs = pagediv.findall('.//div') 327 for d in divs: 328 if len(d) == 0 and not d.text: 329 # make empty divs non-empty 330 d.text = ' ' 331 343 self._fixEmptyDivs(pagediv) 332 344 # check all a-tags 333 345 links = pagediv.findall(".//a") 334 346 for l in links: 335 347 href = l.get('href') 336 337 348 if href: 338 349 # is link with href … … 366 377 if pagediv is not None: 367 378 # fix empty div tags 368 divs = pagediv.findall('.//div') 369 for d in divs: 370 if len(d) == 0 and not d.text: 371 # make empty divs non-empty 372 d.text = ' ' 373 379 self._fixEmptyDivs(pagediv) 374 380 # check all a-tags 375 381 links = pagediv.findall(".//a") … … 387 393 388 394 return None 395 396 def _extractPbTag(self, pagediv, pageinfo): 397 """extracts information from pb-tag and removes it from pagediv""" 398 pbdiv = pagediv.find(".//span[@class='pb']") 399 if pbdiv is None: 400 logging.warning("getTextPage: no pb-span!") 401 return pagediv 402 403 # extract running head 404 rh = pbdiv.find(".//span[@class='rhead']") 405 if rh is not None: 406 pageinfo['pageHeaderTitle'] = getText(rh) 407 408 # remove pb-div from parent 409 ppdiv = pagediv.find(".//span[@class='pb']/..") 410 ppdiv.remove(pbdiv) 411 return pagediv 389 412 390 def addPunditAttributes(self, pagediv, pageinfo, docinfo):413 def _addPunditAttributes(self, pagediv, pageinfo, docinfo): 391 414 """add about attributes for pundit annotation tool""" 392 415 textid = docinfo.get('DRI', "fn=%s"%docinfo.get('documentPath', '???')) … … 404 427 405 428 return pagediv 429 430 def _fixEmptyDivs(self, 
pagediv): 431 """fixes empty div-tags by inserting a space""" 432 divs = pagediv.findall('.//div') 433 for d in divs: 434 if len(d) == 0 and not d.text: 435 # make empty divs non-empty 436 d.text = ' ' 437 438 return pagediv 439 406 440 407 441 def getSearchResults(self, mode, query=None, pageinfo=None, docinfo=None):
Note: See TracChangeset for help on using the changeset viewer.