Mercurial > hg > documentViewer
comparison MpiwgXmlTextServer.py @ 610:0488cd12355b
gis mode works again.
author | casties |
---|---|
date | Mon, 21 Jan 2013 19:58:21 +0100 |
parents | 7962e6891d99 |
children | c57d80a649ea |
comparison
equal
deleted
inserted
replaced
609:7962e6891d99 | 610:0488cd12355b |
---|---|
10 import base64 | 10 import base64 |
11 | 11 |
12 from datetime import datetime | 12 from datetime import datetime |
13 | 13 |
14 from SrvTxtUtils import getInt, getText, getHttpData | 14 from SrvTxtUtils import getInt, getText, getHttpData |
15 | |
16 # mapping of fields in the output of /mpiwg-mpdl-cms-web/query/GetDocInfo to documentViewer docinfo | |
17 textinfoFieldMap = { | |
18 'countPages' : 'numTextPages', | |
19 'countFigures' : 'numFigureEntries', | |
20 'countNotesHandwritten' : 'numHandwritten', | |
21 'countNotes' : 'numNotes', | |
22 'countPlaces' : 'numPlaces', | |
23 'countTocEntries' : 'numTocEntries' | |
24 } | |
15 | 25 |
16 def serialize(node): | 26 def serialize(node): |
17 """returns a string containing an XML snippet of node""" | 27 """returns a string containing an XML snippet of node""" |
18 s = ET.tostring(node, 'UTF-8') | 28 s = ET.tostring(node, 'UTF-8') |
19 # snip off XML declaration | 29 # snip off XML declaration |
70 return url | 80 return url |
71 | 81 |
72 | 82 |
73 def getPlacesOnPage(self, docinfo=None, pn=None): | 83 def getPlacesOnPage(self, docinfo=None, pn=None): |
74 """Returns list of GIS places of page pn""" | 84 """Returns list of GIS places of page pn""" |
75 #FIXME! | 85 logging.debug("getPlacesOnPage(pn=%s"%pn) |
86 if not 'places' in docinfo: | |
87 self.getTextInfo('places', docinfo) | |
88 | |
89 allplaces = docinfo.get('places', None) | |
90 if len(allplaces) == 0: | |
91 return [] | |
92 | |
93 # search for places on this page TODO: is there a better way? | |
94 places = [p for p in allplaces if p['pn'] == pn] | |
95 return places | |
96 """OLD: | |
76 docpath = docinfo.get('textURLPath',None) | 97 docpath = docinfo.get('textURLPath',None) |
77 if not docpath: | 98 if not docpath: |
78 return None | 99 return None |
79 | 100 |
80 places=[] | 101 places=[] |
85 id = l.get("id") | 106 id = l.get("id") |
86 name = l.text | 107 name = l.text |
87 place = {'id': id, 'name': name} | 108 place = {'id': id, 'name': name} |
88 places.append(place) | 109 places.append(place) |
89 | 110 |
90 return places | 111 return places""" |
91 | 112 |
92 | 113 |
93 def getTextInfo(self, mode=None, docinfo=None): | 114 def getTextInfo(self, mode=None, docinfo=None): |
94 """reads document info, including page concordance, from text server""" | 115 """reads document info, including page concordance, from text server""" |
95 logging.debug("getTextInfo mode=%s"%mode) | 116 logging.debug("getTextInfo mode=%s"%mode) |
96 | 117 |
97 field = '' | 118 field = '' |
98 if mode in ['pages', 'toc', 'figures', 'notes', 'handwritten']: | 119 if mode in ['pages', 'toc', 'figures', 'notes', 'handwritten', 'places']: |
99 # translate mode to field param | 120 # translate mode to field param |
100 if mode == 'handwritten': | 121 if mode == 'handwritten': |
101 field = '&field=notesHandwritten' | 122 field = '&field=notesHandwritten' |
102 else: | 123 else: |
103 field = '&field=%s'%mode | 124 field = '&field=%s'%mode |
130 else: | 151 else: |
131 if mode is None: | 152 if mode is None: |
132 # get general info from system-tag | 153 # get general info from system-tag |
133 sys = doc.find('system') | 154 sys = doc.find('system') |
134 if sys is not None: | 155 if sys is not None: |
135 docinfo['numTextPages'] = getInt(getText(sys.find('countPages'))) | 156 for (k,v) in textinfoFieldMap.items(): |
136 docinfo['numFigureEntries'] = getInt(getText(sys.find('countFigures'))) | 157 # copy into docinfo (even if empty) |
137 docinfo['numHandwritten'] = getInt(getText(sys.find('countNotesHandwritten'))) | 158 docinfo[v] = getInt(getText(sys.find(k))) |
138 docinfo['numNotes'] = getInt(getText(sys.find('countNotes'))) | |
139 docinfo['numPlaces'] = getInt(getText(sys.find('countPlaces'))) | |
140 docinfo['numTocEntries'] = getInt(getText(sys.find('countTocEntries'))) | |
141 | 159 |
142 else: | 160 else: |
143 # result is in list-tag | 161 # result is in list-tag |
144 l = doc.find('list') | 162 l = doc.find('list') |
145 if l is not None: | 163 if l is not None: |
164 # look for general info | |
165 for (k,v) in textinfoFieldMap.items(): | |
166 # copy into docinfo (only if not empty) | |
167 s = doc.find(k) | |
168 if s is not None: | |
169 docinfo[v] = getInt(getText(s)) | |
170 | |
146 lt = l.get('type') | 171 lt = l.get('type') |
172 # | |
147 # pageNumbers | 173 # pageNumbers |
174 # | |
148 if lt == 'pages': | 175 if lt == 'pages': |
149 # contains tags with page numbers | 176 # contains tags with page numbers |
150 # <item n="14" o="2" o-norm="2" file="0014"/> | 177 # <item n="14" o="2" o-norm="2" file="0014"/> |
151 # n=scan number, o=original page no, o-norm=normalized original page no | 178 # n=scan number, o=original page no, o-norm=normalized original page no |
152 # pageNumbers is a dict indexed by scan number | 179 # pageNumbers is a dict indexed by scan number |
162 | 189 |
163 if pn > 0: | 190 if pn > 0: |
164 pages[pn] = page | 191 pages[pn] = page |
165 | 192 |
166 docinfo['pageNumbers'] = pages | 193 docinfo['pageNumbers'] = pages |
167 | 194 |
195 # | |
168 # toc | 196 # toc |
197 # | |
169 elif lt in ['toc', 'figures', 'notes', 'notesHandwritten']: | 198 elif lt in ['toc', 'figures', 'notes', 'notesHandwritten']: |
170 # contains tags with table of contents/figures | 199 # contains tags with table of contents/figures |
171 # <item n="2.1." lv="2">CAP.I. <ref o="119">132</ref></item> | 200 # <item n="2.1." lv="2">CAP.I. <ref o="119">132</ref></item> |
172 tocs = [] | 201 tocs = [] |
173 for te in l: | 202 for te in l: |
183 tocs.append(toc) | 212 tocs.append(toc) |
184 | 213 |
185 # save as full_toc/full_figures | 214 # save as full_toc/full_figures |
186 docinfo['full_%s'%mode] = tocs | 215 docinfo['full_%s'%mode] = tocs |
187 | 216 |
217 # | |
218 # places | |
219 # | |
220 # | |
221 # toc | |
222 # | |
223 elif lt in ['places']: | |
224 # contains tags with place-ids | |
225 # <item id="N40004F-01"><ref>4</ref></item> | |
226 places = [] | |
227 for p in l: | |
228 if p.tag == 'item': | |
229 place = {} | |
230 place['id'] = p.get('id') | |
231 ref = p.find('ref') | |
232 place['pn'] = getInt(ref.text) | |
233 places.append(place) | |
234 | |
235 docinfo['places'] = places | |
236 | |
188 return docinfo | 237 return docinfo |
189 | 238 |
190 | 239 |
191 def getTextPage(self, mode="text", pn=1, docinfo=None, pageinfo=None): | 240 def getTextPage(self, mode="text", pn=1, docinfo=None, pageinfo=None): |
192 """returns single page from fulltext""" | 241 """returns single page from fulltext""" |
222 modes = mode.split(',') | 271 modes = mode.split(',') |
223 # check for multiple layers | 272 # check for multiple layers |
224 if len(modes) > 1: | 273 if len(modes) > 1: |
225 logging.debug("getTextPage: more than one mode=%s"%mode) | 274 logging.debug("getTextPage: more than one mode=%s"%mode) |
226 | 275 |
276 # mode defaults | |
277 gisMode = False | |
278 punditMode = False | |
279 | |
227 # search mode | 280 # search mode |
228 if 'search' in modes: | 281 if 'search' in modes: |
229 # add highlighting | 282 # add highlighting |
230 highlightQuery = pageinfo.get('highlightQuery', None) | 283 highlightQuery = pageinfo.get('highlightQuery', None) |
231 if highlightQuery: | 284 if highlightQuery: |
235 | 288 |
236 # ignore mode in the following | 289 # ignore mode in the following |
237 modes.remove('search') | 290 modes.remove('search') |
238 | 291 |
239 # pundit mode | 292 # pundit mode |
240 punditMode = False | |
241 if 'pundit' in modes: | 293 if 'pundit' in modes: |
242 punditMode = True | 294 punditMode = True |
243 # ignore mode in the following | 295 # ignore mode in the following |
244 modes.remove('pundit') | 296 modes.remove('pundit') |
245 | 297 |
250 elif 'xml' in modes: | 302 elif 'xml' in modes: |
251 textmode = 'xml' | 303 textmode = 'xml' |
252 textParams['outputFormat'] = 'xmlDisplay' | 304 textParams['outputFormat'] = 'xmlDisplay' |
253 normMode = 'orig' | 305 normMode = 'orig' |
254 elif 'gis' in modes: | 306 elif 'gis' in modes: |
255 #FIXME! | 307 gisMode = True |
256 textmode = 'gis' | 308 # gis mode uses plain text |
309 textmode = 'plain' | |
310 textParams['outputFormat'] = 'html' | |
257 else: | 311 else: |
258 # text is default mode | 312 # text is default mode |
259 textmode = 'plain' | 313 textmode = 'plain' |
260 textParams['outputFormat'] = 'html' | 314 textParams['outputFormat'] = 'html' |
261 | 315 |
266 except Exception, e: | 320 except Exception, e: |
267 logging.error("Error reading page: %s"%e) | 321 logging.error("Error reading page: %s"%e) |
268 return None | 322 return None |
269 | 323 |
270 # plain text or text-with-links mode | 324 # plain text or text-with-links mode |
271 if textmode == "plain" or textmode == "dict": | 325 if textmode == 'plain' or textmode == 'dict': |
272 # the text is in div@class=text | 326 # the text is in div@class=text |
273 pagediv = dom.find(".//div[@class='text']") | 327 pagediv = dom.find(".//div[@class='text']") |
274 logging.debug("pagediv: %s"%repr(pagediv)) | 328 logging.debug("pagediv: %s"%repr(pagediv)) |
275 if pagediv is not None: | 329 if pagediv is not None: |
276 # add textmode and normMode classes | 330 # add textmode and normMode classes |
294 l.set('target', '_blank') | 348 l.set('target', '_blank') |
295 | 349 |
296 if punditMode: | 350 if punditMode: |
297 self._addPunditAttributes(pagediv, pageinfo, docinfo) | 351 self._addPunditAttributes(pagediv, pageinfo, docinfo) |
298 | 352 |
353 if gisMode: | |
354 self._addGisTags(pagediv, pageinfo, docinfo) | |
355 | |
299 s = serialize(pagediv) | 356 s = serialize(pagediv) |
300 logging.debug("getTextPage done in %s"%(datetime.now()-startTime)) | 357 logging.debug("getTextPage done in %s"%(datetime.now()-startTime)) |
301 return s | 358 return s |
302 | 359 |
303 # xml mode | 360 # xml mode |
306 pagediv = dom.find(".//body") | 363 pagediv = dom.find(".//body") |
307 logging.debug("pagediv: %s"%repr(pagediv)) | 364 logging.debug("pagediv: %s"%repr(pagediv)) |
308 if pagediv is not None: | 365 if pagediv is not None: |
309 return serialize(pagediv) | 366 return serialize(pagediv) |
310 | 367 |
311 # pureXml mode WTF? | |
312 elif textmode == "pureXml": | |
313 # the text is in body | |
314 pagediv = dom.find(".//body") | |
315 logging.debug("pagediv: %s"%repr(pagediv)) | |
316 if pagediv is not None: | |
317 return serialize(pagediv) | |
318 | |
319 # gis mode FIXME! | |
320 elif textmode == "gis": | |
321 # the text is in div@class=text | |
322 pagediv = dom.find(".//div[@class='text']") | |
323 logging.debug("pagediv: %s"%repr(pagediv)) | |
324 if pagediv is not None: | |
325 # fix empty div tags | |
326 self._fixEmptyDivs(pagediv) | |
327 # check all a-tags | |
328 links = pagediv.findall(".//a") | |
329 # add our URL as backlink | |
330 selfurl = self.getLink() | |
331 doc = base64.b64encode(selfurl) | |
332 for l in links: | |
333 href = l.get('href') | |
334 if href: | |
335 if href.startswith('http://mappit.mpiwg-berlin.mpg.de'): | |
336 l.set('href', re.sub(r'doc=[\w+/=]+', 'doc=%s'%doc, href)) | |
337 l.set('target', '_blank') | |
338 | |
339 return serialize(pagediv) | |
340 | |
341 logging.error("getTextPage: error in text mode %s or in text!"%(textmode)) | 368 logging.error("getTextPage: error in text mode %s or in text!"%(textmode)) |
342 return None | 369 return None |
343 | 370 |
344 def _processWTags(self, textMode, normMode, pagediv): | 371 def _processWTags(self, textMode, normMode, pagediv): |
345 """selects the necessary information from w-spans and removes the rest from pagediv""" | 372 """selects the necessary information from w-spans and removes the rest from pagediv""" |
406 ppdiv = pagediv.find(".//span[@class='pb']/..") | 433 ppdiv = pagediv.find(".//span[@class='pb']/..") |
407 ppdiv.remove(pbdiv) | 434 ppdiv.remove(pbdiv) |
408 return pagediv | 435 return pagediv |
409 | 436 |
410 def _addPunditAttributes(self, pagediv, pageinfo, docinfo): | 437 def _addPunditAttributes(self, pagediv, pageinfo, docinfo): |
411 """add about attributes for pundit annotation tool""" | 438 """add about-attributes to divs for pundit annotation tool""" |
412 textid = docinfo.get('DRI', "fn=%s"%docinfo.get('documentPath', '???')) | 439 textid = docinfo.get('DRI', "fn=%s"%docinfo.get('documentPath', '???')) |
413 pn = pageinfo.get('pn', '1') | 440 pn = pageinfo.get('pn', '1') |
414 # TODO: use pn as well? | |
415 # check all div-tags | 441 # check all div-tags |
416 divs = pagediv.findall(".//div") | 442 divs = pagediv.findall(".//div") |
417 for d in divs: | 443 for d in divs: |
418 id = d.get('id') | 444 id = d.get('id') |
419 if id: | 445 if id: |
420 # TODO: check path (cf RFC2396) | 446 # TODO: check path (cf RFC2396) |
421 d.set('about', "http://echo.mpiwg-berlin.mpg.de/%s/pn=%s/#%s"%(textid,pn,id)) | 447 d.set('about', "http://echo.mpiwg-berlin.mpg.de/%s/pn=%s/#%s"%(textid,pn,id)) |
422 cls = d.get('class','') | 448 cls = d.get('class','') |
423 cls += ' pundit-content' | 449 cls += ' pundit-content' |
424 d.set('class', cls.strip()) | 450 d.set('class', cls.strip()) |
451 | |
452 return pagediv | |
453 | |
454 def _addGisTags(self, pagediv, pageinfo, docinfo): | |
455 """add links for gis places""" | |
456 # use last part of documentPath as db-id | |
457 docpath = docinfo.get('documentPath', '') | |
458 textid = docpath.split('/')[-1] | |
459 # add our URL as backlink | |
460 selfurl = self.getLink() | |
461 doc = base64.b64encode(selfurl) | |
462 # check all span@class=place | |
463 spans = pagediv.findall(".//span[@class='place']") | |
464 for s in spans: | |
465 id = s.get('id') | |
466 if id: | |
467 # make links like http://mappit.mpiwg-berlin.mpg.de/db/RESTdb/db/mpdl/songy_tiang_zh_1637?id=N400061-02&doc=aHR...&format=gis | |
468 s.tag = 'a' | |
469 # TODO: make links configurable | |
470 url = "http://mappit.mpiwg-berlin.mpg.de/db/RESTdb/db/mpdl/%s?id=%s&doc=%s&format=gis"%(textid,id,doc) | |
471 s.set('href', url) | |
472 s.set('target', '_blank') | |
425 | 473 |
426 return pagediv | 474 return pagediv |
427 | 475 |
428 def _processFigures(self, pagediv, docinfo): | 476 def _processFigures(self, pagediv, docinfo): |
429 """processes figure-tags""" | 477 """processes figure-tags""" |