Changeset 458:48b135b089c8 in documentViewer for MpdlXmlTextServer.py
- Timestamp: Jul 19, 2011, 6:46:35 PM (13 years ago)
- Branch: elementtree
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
MpdlXmlTextServer.py
(Columns: line number in r456, line number in r458, "-" removed / "+" added, source line. Leading whitespace was lost in extraction; indentation below is reconstructed from the Python syntax and is relative within each hunk.)

r456 r458
  13   13     import logging
  14   14     import urllib
  15        - import documentViewer
  16        - #from documentViewer import getTextFromNode, serializeNode
  17        -
  18        - def intOr0(s, default=0):
  19        -     """convert s to int or return default"""
  20        -     try:
  21        -         return int(s)
  22        -     except:
  23        -         return default
  24        -
  25        - def getText(node):
  26        -     """get the cdata content of a node"""
  27        -     if node is None:
  28        -         return ""
  29        -     # ET:
  30        -     text = node.text or ""
  31        -     for e in node:
  32        -         text += gettext(e)
  33        -         if e.tail:
  34        -             text += e.tail
  35        -
  36        -     return text
       15   +
       16   + from SrvTxtUtils import getInt, getText, getHttpData
  37   17
  38   18     def serialize(node):
   …    …
  91   71
  92   72         def __init__(self,id,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/", serverName=None, timeout=40):
  93        -
  94   73             """constructor"""
  95   74             self.id=id
   …    …
 103   82         def getHttpData(self, url, data=None):
 104   83             """returns result from url+data HTTP request"""
 105        -         return documentViewer.getHttpData(url,data,timeout=self.timeout)
       84   +         return getHttpData(url,data,timeout=self.timeout)
 106   85
 107   86         def getServerData(self, method, data=None):
 108   87             """returns result from text server for method+data"""
 109   88             url = self.serverUrl+method
 110        -         return documentViewer.getHttpData(url,data,timeout=self.timeout)
       89   +         return getHttpData(url,data,timeout=self.timeout)
 111   90
 112   91         # WTF: what does this really do? can it be integrated in getPage?
   …    …
 269  248     # pageNumberOrigNorm
 270  249     elif dc == 'countFigureEntries':
 271        -     docinfo['countFigureEntries'] = intOr0(div.text)
      250   +     docinfo['countFigureEntries'] = getInt(div.text)
 272  251
 273  252     # pageNumberOrigNorm
 274  253     elif dc == 'countTocEntries':
 275  254         # WTF: s1 = int(s)/30+1
 276        -     docinfo['countTocEntries'] = intOr0(div.text)
      255   +     docinfo['countTocEntries'] = getInt(div.text)
 277  256
 278  257     # numTextPages
 279  258     elif dc == 'countPages':
 280        -     np = intOr0(div.text)
      259   +     np = getInt(div.text)
 281  260         if np > 0:
 282  261             docinfo['numTextPages'] = np
   …    …
 505  484
 506  485     elif dc == 'queryResultHits':
 507        -     docinfo['tocSize_%s'%mode] = intOr0(div.text)
      486   +     docinfo['tocSize_%s'%mode] = getInt(div.text)
 508  487
 509  488     if pagediv:
 510        -     # # split xml in chunks
 511        -     # tocs = []
 512        -     # tocdivs = pagediv.findall('div')
 513        -     # for p in zip(tocdivs[::2], tocdivs[1::2]):
 514        -     # toc = serialize(p[0])
 515        -     # toc += serialize(p[1])
 516        -     # tocs.append(toc)
 517        -     # logging.debug("pair: %s"%(toc))
 518  489         # store XML in docinfo
 519  490         docinfo['tocXML_%s'%mode] = ET.tostring(pagediv, 'UTF-8')
Note: See TracChangeset for help on using the changeset viewer.