Changeset 610:0488cd12355b in documentViewer
- Timestamp: Jan 21, 2013, 6:58:21 PM (12 years ago)
- Branch: default
- Files: 2 edited
Legend:
- Unmodified
- Added
- Removed
-
MpiwgXmlTextServer.py
r609 r610 13 13 14 14 from SrvTxtUtils import getInt, getText, getHttpData 15 16 # mapping of fields in the output of /mpiwg-mpdl-cms-web/query/GetDocInfo to documentViewer docinfo 17 textinfoFieldMap = { 18 'countPages' : 'numTextPages', 19 'countFigures' : 'numFigureEntries', 20 'countNotesHandwritten' : 'numHandwritten', 21 'countNotes' : 'numNotes', 22 'countPlaces' : 'numPlaces', 23 'countTocEntries' : 'numTocEntries' 24 } 15 25 16 26 def serialize(node): … … 73 83 def getPlacesOnPage(self, docinfo=None, pn=None): 74 84 """Returns list of GIS places of page pn""" 75 #FIXME! 85 logging.debug("getPlacesOnPage(pn=%s"%pn) 86 if not 'places' in docinfo: 87 self.getTextInfo('places', docinfo) 88 89 allplaces = docinfo.get('places', None) 90 if len(allplaces) == 0: 91 return [] 92 93 # search for places on this page TODO: is there a better way? 94 places = [p for p in allplaces if p['pn'] == pn] 95 return places 96 """OLD: 76 97 docpath = docinfo.get('textURLPath',None) 77 98 if not docpath: … … 88 109 places.append(place) 89 110 90 return places 111 return places""" 91 112 92 113 … … 96 117 97 118 field = '' 98 if mode in ['pages', 'toc', 'figures', 'notes', 'handwritten' ]:119 if mode in ['pages', 'toc', 'figures', 'notes', 'handwritten', 'places']: 99 120 # translate mode to field param 100 121 if mode == 'handwritten': … … 133 154 sys = doc.find('system') 134 155 if sys is not None: 135 docinfo['numTextPages'] = getInt(getText(sys.find('countPages'))) 136 docinfo['numFigureEntries'] = getInt(getText(sys.find('countFigures'))) 137 docinfo['numHandwritten'] = getInt(getText(sys.find('countNotesHandwritten'))) 138 docinfo['numNotes'] = getInt(getText(sys.find('countNotes'))) 139 docinfo['numPlaces'] = getInt(getText(sys.find('countPlaces'))) 140 docinfo['numTocEntries'] = getInt(getText(sys.find('countTocEntries'))) 156 for (k,v) in textinfoFieldMap.items(): 157 # copy into docinfo (even if empty) 158 docinfo[v] = getInt(getText(sys.find(k))) 141 159 142 160 else: … 
… 144 162 l = doc.find('list') 145 163 if l is not None: 164 # look for general info 165 for (k,v) in textinfoFieldMap.items(): 166 # copy into docinfo (only if not empty) 167 s = doc.find(k) 168 if s is not None: 169 docinfo[v] = getInt(getText(s)) 170 146 171 lt = l.get('type') 172 # 147 173 # pageNumbers 174 # 148 175 if lt == 'pages': 149 176 # contains tags with page numbers … … 165 192 166 193 docinfo['pageNumbers'] = pages 167 194 195 # 168 196 # toc 197 # 169 198 elif lt in ['toc', 'figures', 'notes', 'notesHandwritten']: 170 199 # contains tags with table of contents/figures … … 186 215 docinfo['full_%s'%mode] = tocs 187 216 217 # 218 # places 219 # 220 # 221 # toc 222 # 223 elif lt in ['places']: 224 # contains tags with place-ids 225 # <item id="N40004F-01"><ref>4</ref></item> 226 places = [] 227 for p in l: 228 if p.tag == 'item': 229 place = {} 230 place['id'] = p.get('id') 231 ref = p.find('ref') 232 place['pn'] = getInt(ref.text) 233 places.append(place) 234 235 docinfo['places'] = places 236 188 237 return docinfo 189 238 … … 225 274 logging.debug("getTextPage: more than one mode=%s"%mode) 226 275 276 # mode defaults 277 gisMode = False 278 punditMode = False 279 227 280 # search mode 228 281 if 'search' in modes: … … 238 291 239 292 # pundit mode 240 punditMode = False241 293 if 'pundit' in modes: 242 294 punditMode = True … … 253 305 normMode = 'orig' 254 306 elif 'gis' in modes: 255 #FIXME! 
256 textmode = 'gis' 307 gisMode = True 308 # gis mode uses plain text 309 textmode = 'plain' 310 textParams['outputFormat'] = 'html' 257 311 else: 258 312 # text is default mode … … 269 323 270 324 # plain text or text-with-links mode 271 if textmode == "plain" or textmode == "dict":325 if textmode == 'plain' or textmode == 'dict': 272 326 # the text is in div@class=text 273 327 pagediv = dom.find(".//div[@class='text']") … … 297 351 self._addPunditAttributes(pagediv, pageinfo, docinfo) 298 352 353 if gisMode: 354 self._addGisTags(pagediv, pageinfo, docinfo) 355 299 356 s = serialize(pagediv) 300 357 logging.debug("getTextPage done in %s"%(datetime.now()-startTime)) … … 309 366 return serialize(pagediv) 310 367 311 # pureXml mode WTF?312 elif textmode == "pureXml":313 # the text is in body314 pagediv = dom.find(".//body")315 logging.debug("pagediv: %s"%repr(pagediv))316 if pagediv is not None:317 return serialize(pagediv)318 319 # gis mode FIXME!320 elif textmode == "gis":321 # the text is in div@class=text322 pagediv = dom.find(".//div[@class='text']")323 logging.debug("pagediv: %s"%repr(pagediv))324 if pagediv is not None:325 # fix empty div tags326 self._fixEmptyDivs(pagediv)327 # check all a-tags328 links = pagediv.findall(".//a")329 # add our URL as backlink330 selfurl = self.getLink()331 doc = base64.b64encode(selfurl)332 for l in links:333 href = l.get('href')334 if href:335 if href.startswith('http://mappit.mpiwg-berlin.mpg.de'):336 l.set('href', re.sub(r'doc=[\w+/=]+', 'doc=%s'%doc, href))337 l.set('target', '_blank')338 339 return serialize(pagediv)340 341 368 logging.error("getTextPage: error in text mode %s or in text!"%(textmode)) 342 369 return None … … 409 436 410 437 def _addPunditAttributes(self, pagediv, pageinfo, docinfo): 411 """add about attributes for pundit annotation tool"""438 """add about-attributes to divs for pundit annotation tool""" 412 439 textid = docinfo.get('DRI', "fn=%s"%docinfo.get('documentPath', '???')) 413 440 pn = 
pageinfo.get('pn', '1') 414 # TODO: use pn as well?415 441 # check all div-tags 416 442 divs = pagediv.findall(".//div") … … 423 449 cls += ' pundit-content' 424 450 d.set('class', cls.strip()) 451 452 return pagediv 453 454 def _addGisTags(self, pagediv, pageinfo, docinfo): 455 """add links for gis places""" 456 # use last part of documentPath as db-id 457 docpath = docinfo.get('documentPath', '') 458 textid = docpath.split('/')[-1] 459 # add our URL as backlink 460 selfurl = self.getLink() 461 doc = base64.b64encode(selfurl) 462 # check all span@class=place 463 spans = pagediv.findall(".//span[@class='place']") 464 for s in spans: 465 id = s.get('id') 466 if id: 467 # make links like http://mappit.mpiwg-berlin.mpg.de/db/RESTdb/db/mpdl/songy_tiang_zh_1637?id=N400061-02&doc=aHR...&format=gis 468 s.tag = 'a' 469 # TODO: make links configurable 470 url = "http://mappit.mpiwg-berlin.mpg.de/db/RESTdb/db/mpdl/%s?id=%s&doc=%s&format=gis"%(textid,id,doc) 471 s.set('href', url) 472 s.set('target', '_blank') 425 473 426 474 return pagediv -
documentViewer.py
r609 r610 206 206 207 207 def getTextDownloadUrl(self, **args): 208 """get list of gis places on one page"""208 """get URL to download the full text""" 209 209 return self.template.fulltextclient.getTextDownloadUrl(**args) 210 210
Note: See TracChangeset for help on using the changeset viewer.