Changeset 565:1b483194901c in documentViewer
- Timestamp: Oct 9, 2012, 5:01:18 PM (12 years ago)
- Branch: default
- Files: 3 edited
Legend:
- Unmodified
- Added
- Removed
MpiwgXmlTextServer.py
r564 r565 89 89 90 90 91 def getTextInfo(self, mode= '', docinfo=None):91 def getTextInfo(self, mode=None, docinfo=None): 92 92 """reads document info, including page concordance, from text server""" 93 93 logging.debug("getTextInfo mode=%s"%mode) 94 if mode not in ['toc', 'figures', '']: 95 mode = '' 94 95 field = '' 96 if mode in ['pages', 'toc', 'figures']: 97 # translate mode to field param 98 field = '&field=%s'%mode 99 else: 100 mode = None 101 96 102 # check cached info 97 103 if mode: … … 101 107 102 108 else: 103 # no toc-request109 # cached but no toc-request? 104 110 if 'numTextPages' in docinfo: 105 111 return docinfo … … 111 117 112 118 # fetch docinfo 113 pagexml = self.getServerData("query/GetDocInfo","docId=%s &field=%s"%(docpath,mode))119 pagexml = self.getServerData("query/GetDocInfo","docId=%s%s"%(docpath,field)) 114 120 dom = ET.fromstring(pagexml) 115 # all info in tag <doc ument>116 doc = dom .find("doc")121 # all info in tag <doc> 122 doc = dom 117 123 if doc is None: 118 124 logging.error("getTextInfo: unable to find document-tag!") 119 125 else: 120 # result is in list-tag 121 l = doc.find('list') 122 if l is not None: 123 lt = l.get('type') 124 # pageNumbers 125 if lt == 'pages': 126 # contains tags with page numbers 127 # <item n="14" o="2" o-norm="2" file="0014"/> 128 # n=scan number, o=original page no, on=normalized original page no 129 # pageNumbers is a dict indexed by scan number 130 pages = {} 131 for i in l: 132 page = {} 133 pn = getInt(i.get('n')) 134 page['pn'] = pn 135 no = getInt(i.get('o')) 136 page['no'] = no 137 non = getInt(i.get('o-norm')) 138 page['non'] = non 139 140 if pn > 0: 141 pages[pn] = page 126 if mode is None: 127 # get general info from system-tag 128 cp = doc.find('system/countPages') 129 if cp is not None: 130 docinfo['numTextPages'] = getInt(cp.text) 131 132 else: 133 # result is in list-tag 134 l = doc.find('list') 135 if l is not None: 136 lt = l.get('type') 137 # pageNumbers 138 if lt == 'pages': 139 # 
contains tags with page numbers 140 # <item n="14" o="2" o-norm="2" file="0014"/> 141 # n=scan number, o=original page no, on=normalized original page no 142 # pageNumbers is a dict indexed by scan number 143 pages = {} 144 for i in l: 145 page = {} 146 pn = getInt(i.get('n')) 147 page['pn'] = pn 148 no = getInt(i.get('o')) 149 page['no'] = no 150 non = getInt(i.get('o-norm')) 151 page['non'] = non 152 153 if pn > 0: 154 pages[pn] = page 155 156 docinfo['pageNumbers'] = pages 157 logging.debug("got pageNumbers=%s"%repr(pages)) 158 159 # toc 160 elif name == 'toc': 161 # contains tags with table of contents/figures 162 # <toc-entry><page>13</page><level>3</level><content>Chapter I</content><level-string>1.</level-string><real-level>1</real-level></toc-entry> 163 tocs = [] 164 for te in tag: 165 toc = {} 166 for t in te: 167 if t.tag == 'page': 168 toc['pn'] = getInt(t.text) 169 elif t.tag == 'level': 170 toc['level'] = t.text 171 elif t.tag == 'content': 172 toc['content'] = t.text 173 elif t.tag == 'level-string': 174 toc['level-string'] = t.text 175 elif t.tag == 'real-level': 176 toc['real-level'] = t.text 177 178 tocs.append(toc) 142 179 143 docinfo['numTextPages'] = len(pages) 144 docinfo['pageNumbers'] = pages 145 logging.debug("got pageNumbers=%s"%repr(pages)) 146 147 # toc 148 elif name == 'toc': 149 # contains tags with table of contents/figures 150 # <toc-entry><page>13</page><level>3</level><content>Chapter I</content><level-string>1.</level-string><real-level>1</real-level></toc-entry> 151 tocs = [] 152 for te in tag: 153 toc = {} 154 for t in te: 155 if t.tag == 'page': 156 toc['pn'] = getInt(t.text) 157 elif t.tag == 'level': 158 toc['level'] = t.text 159 elif t.tag == 'content': 160 toc['content'] = t.text 161 elif t.tag == 'level-string': 162 toc['level-string'] = t.text 163 elif t.tag == 'real-level': 164 toc['real-level'] = t.text 165 166 tocs.append(toc) 167 168 # save as full_toc/full_figures 169 docinfo['full_%s'%mode] = tocs 180 # save as 
full_toc/full_figures 181 docinfo['full_%s'%mode] = tocs 170 182 171 183 return docinfo … … 221 233 textParams = {'docId': docpath, 222 234 'page': pn} 235 223 236 if 'characterNormalization' in pageinfo: 224 textParams['normalization'] = pageinfo['characterNormalization'] 237 cn = pageinfo['characterNormalization'] 238 # TODO: change values in form 239 if cn == 'regPlusNorm': 240 cn = 'norm' 241 242 textParams['normalization'] = cn 225 243 226 244 if not mode: … … 273 291 # fetch the page 274 292 pagexml = self.getServerData("query/GetPage",urllib.urlencode(textParams)) 275 dom = ET.fromstring(pagexml) 276 # extract additional info 277 #self.processPageInfo(dom, docinfo, pageinfo) 278 # page content is in <div class="pageContent"> 293 try: 294 dom = ET.fromstring(pagexml) 295 except Exception, e: 296 logging.error("Error parsing page: %s"%e) 297 return None 298 279 299 pagediv = None 280 300 body = dom.find('.//body') … … 283 303 return None 284 304 285 # currently there's lots of divs... 
286 textspan = body.find('span/span') 287 divs = textspan.findall('div') 288 logging.debug("textdivs: %s"%repr(divs)) 289 pagediv = divs[0] 290 logging.debug("pagediv: %s"%serialize(pagediv)) 305 # the text is in div@class=text 306 pagediv = body.find(".//div[@class='text']") 307 logging.debug("pagediv: %s"%repr(pagediv)) 291 308 292 309 # plain text mode 293 310 if textmode == "text": 294 # get full url assuming documentViewer is parent295 selfurl = self.getLink()296 311 if pagediv is not None: 312 # handle pb-tag 313 self._extractPbTag(pagediv, pageinfo) 314 # get full url assuming documentViewer is parent 315 selfurl = self.getLink() 297 316 if punditMode: 298 pagediv = self.addPunditAttributes(pagediv, pageinfo, docinfo)317 self._addPunditAttributes(pagediv, pageinfo, docinfo) 299 318 300 319 # fix empty div tags 301 divs = pagediv.findall('.//div') 302 for d in divs: 303 if len(d) == 0 and not d.text: 304 # make empty divs non-empty 305 d.text = ' ' 306 320 self._fixEmptyDivs(pagediv) 307 321 # check all a-tags 308 322 links = pagediv.findall('.//a') 309 323 for l in links: 310 324 href = l.get('href') 325 # handle notes FIXME! 
311 326 if href and href.startswith('#note-'): 312 327 href = href.replace('#note-',"%s#note-"%selfurl) 313 328 l.set('href', href) 314 329 315 330 return serialize(pagediv) 316 331 … … 318 333 elif textmode == "dict": 319 334 if pagediv is not None: 335 # handle pb-div 336 self._extractPbTag(pagediv, pageinfo) 320 337 viewerurl = docinfo['viewerUrl'] 321 338 selfurl = self.getLink() … … 324 341 325 342 # fix empty div tags 326 divs = pagediv.findall('.//div') 327 for d in divs: 328 if len(d) == 0 and not d.text: 329 # make empty divs non-empty 330 d.text = ' ' 331 343 self._fixEmptyDivs(pagediv) 332 344 # check all a-tags 333 345 links = pagediv.findall(".//a") 334 346 for l in links: 335 347 href = l.get('href') 336 337 348 if href: 338 349 # is link with href … … 366 377 if pagediv is not None: 367 378 # fix empty div tags 368 divs = pagediv.findall('.//div') 369 for d in divs: 370 if len(d) == 0 and not d.text: 371 # make empty divs non-empty 372 d.text = ' ' 373 379 self._fixEmptyDivs(pagediv) 374 380 # check all a-tags 375 381 links = pagediv.findall(".//a") … … 387 393 388 394 return None 395 396 def _extractPbTag(self, pagediv, pageinfo): 397 """extracts information from pb-tag and removes it from pagediv""" 398 pbdiv = pagediv.find(".//span[@class='pb']") 399 if pbdiv is None: 400 logging.warning("getTextPage: no pb-span!") 401 return pagediv 402 403 # extract running head 404 rh = pbdiv.find(".//span[@class='rhead']") 405 if rh is not None: 406 pageinfo['pageHeaderTitle'] = getText(rh) 407 408 # remove pb-div from parent 409 ppdiv = pagediv.find(".//span[@class='pb']/..") 410 ppdiv.remove(pbdiv) 411 return pagediv 389 412 390 def addPunditAttributes(self, pagediv, pageinfo, docinfo):413 def _addPunditAttributes(self, pagediv, pageinfo, docinfo): 391 414 """add about attributes for pundit annotation tool""" 392 415 textid = docinfo.get('DRI', "fn=%s"%docinfo.get('documentPath', '???')) … … 404 427 405 428 return pagediv 429 430 def _fixEmptyDivs(self, 
pagediv): 431 """fixes empty div-tags by inserting a space""" 432 divs = pagediv.findall('.//div') 433 for d in divs: 434 if len(d) == 0 and not d.text: 435 # make empty divs non-empty 436 d.text = ' ' 437 438 return pagediv 439 406 440 407 441 def getSearchResults(self, mode, query=None, pageinfo=None, docinfo=None): -
css/docuviewer.css
r543 r565 257 257 font-size: 12px; 258 258 } 259 div.col.main div.content.text .bf { 260 font-weight: bold; 261 } 262 div.col.main div.content.text .head { 263 margin-top: 0.5em; 264 margin-bottom: 0.25em; 265 } 266 /* running head */ 267 div.col.main div.content.text div.pageHeaderTitle { 268 text-align: center; 269 margin-bottom: 1em; 270 } 271 /* figures */ 272 div.col.main div.content.text span.figure { 273 display: block; 274 width: 200px; 275 margin-top: 0.5em; 276 margin-bottom: 0.5em; 277 padding: 5px; 278 border: 1px dashed silver; 279 /* float: right; */ 280 /* text-align: center; */ 281 } 282 div.col.main div.content.text span.figure>a, 283 div.col.main div.content.text span.figure span.figureNumber, 284 div.col.main div.content.text span.figure span.caption, 285 div.col.main div.content.text span.figure span.description { 286 display:block; 287 } 288 div.col.main div.content.text span.figure span.figureNum { 289 display: none; 290 } 259 291 /* 260 292 * search results -
documentViewer.py
r564 r565 587 587 # document info (including toc) from full text 588 588 if docinfo.get('textURLPath', None): 589 docinfo = self.getTextInfo(mode= 'pages', docinfo=docinfo)589 docinfo = self.getTextInfo(mode=None, docinfo=docinfo) 590 590 591 591 # bib info
Note: See TracChangeset for help on using the changeset viewer.