comparison MpiwgXmlTextServer.py @ 610:0488cd12355b

gis mode works again.
author casties
date Mon, 21 Jan 2013 19:58:21 +0100
parents 7962e6891d99
children c57d80a649ea
comparison
equal deleted inserted replaced
609:7962e6891d99 610:0488cd12355b
10 import base64 10 import base64
11 11
12 from datetime import datetime 12 from datetime import datetime
13 13
14 from SrvTxtUtils import getInt, getText, getHttpData 14 from SrvTxtUtils import getInt, getText, getHttpData
15
16 # mapping of fields in the output of /mpiwg-mpdl-cms-web/query/GetDocInfo to documentViewer docinfo
17 textinfoFieldMap = {
18 'countPages' : 'numTextPages',
19 'countFigures' : 'numFigureEntries',
20 'countNotesHandwritten' : 'numHandwritten',
21 'countNotes' : 'numNotes',
22 'countPlaces' : 'numPlaces',
23 'countTocEntries' : 'numTocEntries'
24 }
15 25
16 def serialize(node): 26 def serialize(node):
17 """returns a string containing an XML snippet of node""" 27 """returns a string containing an XML snippet of node"""
18 s = ET.tostring(node, 'UTF-8') 28 s = ET.tostring(node, 'UTF-8')
19 # snip off XML declaration 29 # snip off XML declaration
70 return url 80 return url
71 81
72 82
73 def getPlacesOnPage(self, docinfo=None, pn=None): 83 def getPlacesOnPage(self, docinfo=None, pn=None):
74 """Returns list of GIS places of page pn""" 84 """Returns list of GIS places of page pn"""
75 #FIXME! 85 logging.debug("getPlacesOnPage(pn=%s"%pn)
86 if not 'places' in docinfo:
87 self.getTextInfo('places', docinfo)
88
89 allplaces = docinfo.get('places', None)
90 if len(allplaces) == 0:
91 return []
92
93 # search for places on this page TODO: is there a better way?
94 places = [p for p in allplaces if p['pn'] == pn]
95 return places
96 """OLD:
76 docpath = docinfo.get('textURLPath',None) 97 docpath = docinfo.get('textURLPath',None)
77 if not docpath: 98 if not docpath:
78 return None 99 return None
79 100
80 places=[] 101 places=[]
85 id = l.get("id") 106 id = l.get("id")
86 name = l.text 107 name = l.text
87 place = {'id': id, 'name': name} 108 place = {'id': id, 'name': name}
88 places.append(place) 109 places.append(place)
89 110
90 return places 111 return places"""
91 112
92 113
93 def getTextInfo(self, mode=None, docinfo=None): 114 def getTextInfo(self, mode=None, docinfo=None):
94 """reads document info, including page concordance, from text server""" 115 """reads document info, including page concordance, from text server"""
95 logging.debug("getTextInfo mode=%s"%mode) 116 logging.debug("getTextInfo mode=%s"%mode)
96 117
97 field = '' 118 field = ''
98 if mode in ['pages', 'toc', 'figures', 'notes', 'handwritten']: 119 if mode in ['pages', 'toc', 'figures', 'notes', 'handwritten', 'places']:
99 # translate mode to field param 120 # translate mode to field param
100 if mode == 'handwritten': 121 if mode == 'handwritten':
101 field = '&field=notesHandwritten' 122 field = '&field=notesHandwritten'
102 else: 123 else:
103 field = '&field=%s'%mode 124 field = '&field=%s'%mode
130 else: 151 else:
131 if mode is None: 152 if mode is None:
132 # get general info from system-tag 153 # get general info from system-tag
133 sys = doc.find('system') 154 sys = doc.find('system')
134 if sys is not None: 155 if sys is not None:
135 docinfo['numTextPages'] = getInt(getText(sys.find('countPages'))) 156 for (k,v) in textinfoFieldMap.items():
136 docinfo['numFigureEntries'] = getInt(getText(sys.find('countFigures'))) 157 # copy into docinfo (even if empty)
137 docinfo['numHandwritten'] = getInt(getText(sys.find('countNotesHandwritten'))) 158 docinfo[v] = getInt(getText(sys.find(k)))
138 docinfo['numNotes'] = getInt(getText(sys.find('countNotes')))
139 docinfo['numPlaces'] = getInt(getText(sys.find('countPlaces')))
140 docinfo['numTocEntries'] = getInt(getText(sys.find('countTocEntries')))
141 159
142 else: 160 else:
143 # result is in list-tag 161 # result is in list-tag
144 l = doc.find('list') 162 l = doc.find('list')
145 if l is not None: 163 if l is not None:
164 # look for general info
165 for (k,v) in textinfoFieldMap.items():
166 # copy into docinfo (only if not empty)
167 s = doc.find(k)
168 if s is not None:
169 docinfo[v] = getInt(getText(s))
170
146 lt = l.get('type') 171 lt = l.get('type')
172 #
147 # pageNumbers 173 # pageNumbers
174 #
148 if lt == 'pages': 175 if lt == 'pages':
149 # contains tags with page numbers 176 # contains tags with page numbers
150 # <item n="14" o="2" o-norm="2" file="0014"/> 177 # <item n="14" o="2" o-norm="2" file="0014"/>
151 # n=scan number, o=original page no, on=normalized original page no 178 # n=scan number, o=original page no, on=normalized original page no
152 # pageNumbers is a dict indexed by scan number 179 # pageNumbers is a dict indexed by scan number
162 189
163 if pn > 0: 190 if pn > 0:
164 pages[pn] = page 191 pages[pn] = page
165 192
166 docinfo['pageNumbers'] = pages 193 docinfo['pageNumbers'] = pages
167 194
195 #
168 # toc 196 # toc
197 #
169 elif lt in ['toc', 'figures', 'notes', 'notesHandwritten']: 198 elif lt in ['toc', 'figures', 'notes', 'notesHandwritten']:
170 # contains tags with table of contents/figures 199 # contains tags with table of contents/figures
171 # <item n="2.1." lv="2">CAP.I. <ref o="119">132</ref></item> 200 # <item n="2.1." lv="2">CAP.I. <ref o="119">132</ref></item>
172 tocs = [] 201 tocs = []
173 for te in l: 202 for te in l:
183 tocs.append(toc) 212 tocs.append(toc)
184 213
185 # save as full_toc/full_figures 214 # save as full_toc/full_figures
186 docinfo['full_%s'%mode] = tocs 215 docinfo['full_%s'%mode] = tocs
187 216
217 #
218 # places
219 #
220 #
221 # toc
222 #
223 elif lt in ['places']:
224 # contains tags with place-ids
225 # <item id="N40004F-01"><ref>4</ref></item>
226 places = []
227 for p in l:
228 if p.tag == 'item':
229 place = {}
230 place['id'] = p.get('id')
231 ref = p.find('ref')
232 place['pn'] = getInt(ref.text)
233 places.append(place)
234
235 docinfo['places'] = places
236
188 return docinfo 237 return docinfo
189 238
190 239
191 def getTextPage(self, mode="text", pn=1, docinfo=None, pageinfo=None): 240 def getTextPage(self, mode="text", pn=1, docinfo=None, pageinfo=None):
192 """returns single page from fulltext""" 241 """returns single page from fulltext"""
222 modes = mode.split(',') 271 modes = mode.split(',')
223 # check for multiple layers 272 # check for multiple layers
224 if len(modes) > 1: 273 if len(modes) > 1:
225 logging.debug("getTextPage: more than one mode=%s"%mode) 274 logging.debug("getTextPage: more than one mode=%s"%mode)
226 275
276 # mode defaults
277 gisMode = False
278 punditMode = False
279
227 # search mode 280 # search mode
228 if 'search' in modes: 281 if 'search' in modes:
229 # add highlighting 282 # add highlighting
230 highlightQuery = pageinfo.get('highlightQuery', None) 283 highlightQuery = pageinfo.get('highlightQuery', None)
231 if highlightQuery: 284 if highlightQuery:
235 288
236 # ignore mode in the following 289 # ignore mode in the following
237 modes.remove('search') 290 modes.remove('search')
238 291
239 # pundit mode 292 # pundit mode
240 punditMode = False
241 if 'pundit' in modes: 293 if 'pundit' in modes:
242 punditMode = True 294 punditMode = True
243 # ignore mode in the following 295 # ignore mode in the following
244 modes.remove('pundit') 296 modes.remove('pundit')
245 297
250 elif 'xml' in modes: 302 elif 'xml' in modes:
251 textmode = 'xml' 303 textmode = 'xml'
252 textParams['outputFormat'] = 'xmlDisplay' 304 textParams['outputFormat'] = 'xmlDisplay'
253 normMode = 'orig' 305 normMode = 'orig'
254 elif 'gis' in modes: 306 elif 'gis' in modes:
255 #FIXME! 307 gisMode = True
256 textmode = 'gis' 308 # gis mode uses plain text
309 textmode = 'plain'
310 textParams['outputFormat'] = 'html'
257 else: 311 else:
258 # text is default mode 312 # text is default mode
259 textmode = 'plain' 313 textmode = 'plain'
260 textParams['outputFormat'] = 'html' 314 textParams['outputFormat'] = 'html'
261 315
266 except Exception, e: 320 except Exception, e:
267 logging.error("Error reading page: %s"%e) 321 logging.error("Error reading page: %s"%e)
268 return None 322 return None
269 323
270 # plain text or text-with-links mode 324 # plain text or text-with-links mode
271 if textmode == "plain" or textmode == "dict": 325 if textmode == 'plain' or textmode == 'dict':
272 # the text is in div@class=text 326 # the text is in div@class=text
273 pagediv = dom.find(".//div[@class='text']") 327 pagediv = dom.find(".//div[@class='text']")
274 logging.debug("pagediv: %s"%repr(pagediv)) 328 logging.debug("pagediv: %s"%repr(pagediv))
275 if pagediv is not None: 329 if pagediv is not None:
276 # add textmode and normMode classes 330 # add textmode and normMode classes
294 l.set('target', '_blank') 348 l.set('target', '_blank')
295 349
296 if punditMode: 350 if punditMode:
297 self._addPunditAttributes(pagediv, pageinfo, docinfo) 351 self._addPunditAttributes(pagediv, pageinfo, docinfo)
298 352
353 if gisMode:
354 self._addGisTags(pagediv, pageinfo, docinfo)
355
299 s = serialize(pagediv) 356 s = serialize(pagediv)
300 logging.debug("getTextPage done in %s"%(datetime.now()-startTime)) 357 logging.debug("getTextPage done in %s"%(datetime.now()-startTime))
301 return s 358 return s
302 359
303 # xml mode 360 # xml mode
306 pagediv = dom.find(".//body") 363 pagediv = dom.find(".//body")
307 logging.debug("pagediv: %s"%repr(pagediv)) 364 logging.debug("pagediv: %s"%repr(pagediv))
308 if pagediv is not None: 365 if pagediv is not None:
309 return serialize(pagediv) 366 return serialize(pagediv)
310 367
311 # pureXml mode WTF?
312 elif textmode == "pureXml":
313 # the text is in body
314 pagediv = dom.find(".//body")
315 logging.debug("pagediv: %s"%repr(pagediv))
316 if pagediv is not None:
317 return serialize(pagediv)
318
319 # gis mode FIXME!
320 elif textmode == "gis":
321 # the text is in div@class=text
322 pagediv = dom.find(".//div[@class='text']")
323 logging.debug("pagediv: %s"%repr(pagediv))
324 if pagediv is not None:
325 # fix empty div tags
326 self._fixEmptyDivs(pagediv)
327 # check all a-tags
328 links = pagediv.findall(".//a")
329 # add our URL as backlink
330 selfurl = self.getLink()
331 doc = base64.b64encode(selfurl)
332 for l in links:
333 href = l.get('href')
334 if href:
335 if href.startswith('http://mappit.mpiwg-berlin.mpg.de'):
336 l.set('href', re.sub(r'doc=[\w+/=]+', 'doc=%s'%doc, href))
337 l.set('target', '_blank')
338
339 return serialize(pagediv)
340
341 logging.error("getTextPage: error in text mode %s or in text!"%(textmode)) 368 logging.error("getTextPage: error in text mode %s or in text!"%(textmode))
342 return None 369 return None
343 370
344 def _processWTags(self, textMode, normMode, pagediv): 371 def _processWTags(self, textMode, normMode, pagediv):
345 """selects the necessary information from w-spans and removes the rest from pagediv""" 372 """selects the necessary information from w-spans and removes the rest from pagediv"""
406 ppdiv = pagediv.find(".//span[@class='pb']/..") 433 ppdiv = pagediv.find(".//span[@class='pb']/..")
407 ppdiv.remove(pbdiv) 434 ppdiv.remove(pbdiv)
408 return pagediv 435 return pagediv
409 436
410 def _addPunditAttributes(self, pagediv, pageinfo, docinfo): 437 def _addPunditAttributes(self, pagediv, pageinfo, docinfo):
411 """add about attributes for pundit annotation tool""" 438 """add about-attributes to divs for pundit annotation tool"""
412 textid = docinfo.get('DRI', "fn=%s"%docinfo.get('documentPath', '???')) 439 textid = docinfo.get('DRI', "fn=%s"%docinfo.get('documentPath', '???'))
413 pn = pageinfo.get('pn', '1') 440 pn = pageinfo.get('pn', '1')
414 # TODO: use pn as well?
415 # check all div-tags 441 # check all div-tags
416 divs = pagediv.findall(".//div") 442 divs = pagediv.findall(".//div")
417 for d in divs: 443 for d in divs:
418 id = d.get('id') 444 id = d.get('id')
419 if id: 445 if id:
420 # TODO: check path (cf RFC2396) 446 # TODO: check path (cf RFC2396)
421 d.set('about', "http://echo.mpiwg-berlin.mpg.de/%s/pn=%s/#%s"%(textid,pn,id)) 447 d.set('about', "http://echo.mpiwg-berlin.mpg.de/%s/pn=%s/#%s"%(textid,pn,id))
422 cls = d.get('class','') 448 cls = d.get('class','')
423 cls += ' pundit-content' 449 cls += ' pundit-content'
424 d.set('class', cls.strip()) 450 d.set('class', cls.strip())
451
452 return pagediv
453
454 def _addGisTags(self, pagediv, pageinfo, docinfo):
455 """add links for gis places"""
456 # use last part of documentPath as db-id
457 docpath = docinfo.get('documentPath', '')
458 textid = docpath.split('/')[-1]
459 # add our URL as backlink
460 selfurl = self.getLink()
461 doc = base64.b64encode(selfurl)
462 # check all span@class=place
463 spans = pagediv.findall(".//span[@class='place']")
464 for s in spans:
465 id = s.get('id')
466 if id:
467 # make links like http://mappit.mpiwg-berlin.mpg.de/db/RESTdb/db/mpdl/songy_tiang_zh_1637?id=N400061-02&doc=aHR...&format=gis
468 s.tag = 'a'
469 # TODO: make links configurable
470 url = "http://mappit.mpiwg-berlin.mpg.de/db/RESTdb/db/mpdl/%s?id=%s&doc=%s&format=gis"%(textid,id,doc)
471 s.set('href', url)
472 s.set('target', '_blank')
425 473
426 return pagediv 474 return pagediv
427 475
428 def _processFigures(self, pagediv, docinfo): 476 def _processFigures(self, pagediv, docinfo):
429 """processes figure-tags""" 477 """processes figure-tags"""