Changeset 516:7d7b639d7be7 in documentViewer for MpdlXmlTextServer.py
- Timestamp: Mar 5, 2012, 5:04:49 PM (12 years ago)
- Branch: default
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
MpdlXmlTextServer.py
r513  r516
  71    71            return places
  72    72
        73  +
        74  +     def getTextInfo(self, docinfo=None):
        75  +         """reads document info, including page concordance, from text server"""
        76  +         logging.debug("getDocInfo")
        77  +         docpath = docinfo.get('textURLPath', None)
        78  +         if docpath is None:
        79  +             logging.error("getTextInfo: no textURLPath!")
        80  +             return docinfo
        81  +
        82  +         # we need to set a result set size
        83  +         pagesize = 10000
        84  +         pn = 1
        85  +         # fetch docinfo
        86  +         pagexml = self.getServerData("doc-info.xql","document=%s&pageSize=%s&pn=%s"%(docpath,pagesize,pn))
        87  +         dom = ET.fromstring(pagexml)
        88  +         # all info in tag <document>
        89  +         doc = dom.find("document")
        90  +         if doc is None:
        91  +             logging.error("getTextInfo: unable to find document-tag!")
        92  +         else:
        93  +             # go through all child elements
        94  +             for tag in doc:
        95  +                 name = tag.tag
        96  +                 # numTextPages
        97  +                 if name == 'countPages':
        98  +                     np = getInt(tag.text)
        99  +                     if np > 0:
       100  +                         docinfo['numTextPages'] = np
       101  +
       102  +                 # numFigureEntries
       103  +                 elif name == 'countFigureEntries':
       104  +                     docinfo['numFigureEntries'] = getInt(tag.text)
       105  +
       106  +                 # numTocEntries
       107  +                 elif name == 'countTocEntries':
       108  +                     # WTF: s1 = int(s)/30+1
       109  +                     docinfo['numTocEntries'] = getInt(tag.text)
       110  +
       111  +                 # numPlaces
       112  +                 elif name == 'countPlaces':
       113  +                     docinfo['numPlaces'] = getInt(tag.text)
       114  +
       115  +                 # pageNumbers
       116  +                 elif name == 'pageNumbers':
       117  +                     # contains tags with page numbers
       118  +                     # <pn><n>4</n><no>4</no><non/></pn>
       119  +                     # n=scan number, no=original page no, non=normalized original page no
       120  +                     # pageNumbers is a dict indexed by scan number
       121  +                     pages = {}
       122  +                     for pn in tag:
       123  +                         page = {}
       124  +                         n = 0
       125  +                         for p in pn:
       126  +                             if p.tag == 'n':
       127  +                                 n = getInt(p.text)
       128  +                                 page['n'] = n
       129  +                             elif p.tag == 'no':
       130  +                                 page['no'] = p.text
       131  +                             elif p.tag == 'non':
       132  +                                 page['non'] = p.text
       133  +
       134  +                         if n > 0:
       135  +                             pages[n] = page
       136  +
       137  +                     docinfo['pageNumbers'] = pages
       138  +                     logging.debug("got pageNumbers=%s"%repr(pages))
       139  +
       140  +         return docinfo
       141  +
  73   142
  74   143        def processPageInfo(self, dom, docinfo, pageinfo):
   …     …
 334   403            """returns single page from the table of contents"""
 335   404            logging.debug("getResultsPage mode=%s, pn=%s"%(mode,pn))
 336        -         # check for cached result
 337        -         if not 'resultXML' in docinfo:
 338        -             self.getSearchResults(mode=mode, query=query, pageinfo=pageinfo, docinfo=docinfo)
       405  +         # get (cached) result
       406  +         self.getSearchResults(mode=mode, query=query, pageinfo=pageinfo, docinfo=docinfo)
 339   407
 340   408            resultxml = docinfo.get('resultXML', None)
Note: See TracChangeset for help on using the changeset viewer.