# HG changeset patch
# User casties
# Date 1349972834 -7200
# Node ID 6949355741773d063ac6ec39c54d6d91303b42ea
# Parent 8b1e20bf300daeeebf2cf68e37ade3951f882c1d
More work on the new MpiwgXmlTextServer.
diff -r 8b1e20bf300d -r 694935574177 MpiwgXmlTextServer.py
--- a/MpiwgXmlTextServer.py Thu Oct 11 10:21:49 2012 +0200
+++ b/MpiwgXmlTextServer.py Thu Oct 11 18:27:14 2012 +0200
@@ -93,7 +93,7 @@
logging.debug("getTextInfo mode=%s"%mode)
field = ''
- if mode in ['pages', 'toc', 'figures']:
+ if mode in ['pages', 'toc', 'figures', 'handwritten']:
# translate mode to field param
field = '&field=%s'%mode
else:
@@ -125,9 +125,12 @@
else:
if mode is None:
# get general info from system-tag
- cp = doc.find('system/countPages')
- if cp is not None:
- docinfo['numTextPages'] = getInt(cp.text)
+ sys = doc.find('system')
+ if sys is not None:
+ docinfo['numTextPages'] = getInt(getText(sys.find('countPages')))
+ docinfo['numFigureEntries'] = getInt(getText(sys.find('countFigures')))
+ docinfo['numHandwritten'] = getInt(getText(sys.find('countHandwritten')))
+ docinfo['numTocEntries'] = getInt(getText(sys.find('countTocEntries')))
else:
# result is in list-tag
@@ -145,9 +148,9 @@
page = {}
pn = getInt(i.get('n'))
page['pn'] = pn
- no = getInt(i.get('o'))
+ no = i.get('o')
page['no'] = no
- non = getInt(i.get('o-norm'))
+ non = i.get('o-norm')
page['non'] = non
if pn > 0:
@@ -157,25 +160,21 @@
logging.debug("got pageNumbers=%s"%repr(pages))
# toc
- elif name == 'toc':
+ elif lt == 'toc' or lt == 'figures' or lt == 'handwritten':
# contains tags with table of contents/figures
- # 133Chapter I1.1
+ # - CAP.I.
[132]
tocs = []
- for te in tag:
- toc = {}
- for t in te:
- if t.tag == 'page':
- toc['pn'] = getInt(t.text)
- elif t.tag == 'level':
- toc['level'] = t.text
- elif t.tag == 'content':
- toc['content'] = t.text
- elif t.tag == 'level-string':
- toc['level-string'] = t.text
- elif t.tag == 'real-level':
- toc['real-level'] = t.text
-
- tocs.append(toc)
+ for te in l:
+ if te.tag == 'item':
+ toc = {}
+ toc['level-string'] = te.get('n')
+ toc['level'] = te.get('lv')
+ toc['content'] = te.text.strip()
+ ref = te.find('ref')
+ toc['pn'] = getInt(ref.text)
+ toc['no'] = ref.get('o')
+ toc['non'] = ref.get('o-norm')
+ tocs.append(toc)
# save as full_toc/full_figures
docinfo['full_%s'%mode] = tocs
@@ -183,34 +182,6 @@
return docinfo
- def processPageInfo(self, dom, docinfo, pageinfo):
- """processes page info divs from dom and stores in docinfo and pageinfo"""
- # assume first second level div is pageMeta
- alldivs = dom.find("div")
-
- if alldivs is None or alldivs.get('class', '') != 'pageMeta':
- logging.error("processPageInfo: pageMeta div not found!")
- return
-
- for div in alldivs:
- dc = div.get('class')
-
- # pageNumberOrig
- if dc == 'pageNumberOrig':
- pageinfo['pageNumberOrig'] = div.text
-
- # pageNumberOrigNorm
- elif dc == 'pageNumberOrigNorm':
- pageinfo['pageNumberOrigNorm'] = div.text
-
- # pageHeaderTitle
- elif dc == 'pageHeaderTitle':
- pageinfo['pageHeaderTitle'] = div.text
-
- #logging.debug("processPageInfo: pageinfo=%s"%repr(pageinfo))
- return
-
-
def getTextPage(self, mode="text", pn=1, docinfo=None, pageinfo=None):
"""returns single page from fulltext"""
@@ -451,6 +422,8 @@
if mode == "none":
return docinfo
+ #TODO: put mode into query
+
cachedQuery = docinfo.get('cachedQuery', None)
if cachedQuery is not None:
# cached search result
@@ -461,39 +434,34 @@
else:
# different query
del docinfo['resultSize']
- del docinfo['resultXML']
+ del docinfo['results']
# cache query
docinfo['cachedQuery'] = '%s_%s'%(mode,query)
# fetch full results
docpath = docinfo['textURLPath']
- params = {'document': docpath,
- 'mode': 'text',
- 'queryType': mode,
+ params = {'docId': docpath,
'query': query,
- 'queryResultPageSize': 1000,
- 'queryResultPN': 1,
- 'characterNormalization': pageinfo.get('characterNormalization', 'reg')}
- pagexml = self.getServerData("doc-query.xql",urllib.urlencode(params))
- #pagexml = self.getServerData("doc-query.xql","document=%s&mode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s&s=%s&viewMode=%s&characterNormalization=%s&highlightElementPos=%s&highlightElement=%s&highlightQuery=%s"%(docpath, 'text', queryType, urllib.quote(query), pagesize, pn, s, viewMode,characterNormalization, highlightElementPos, highlightElement, urllib.quote(highlightQuery)))
- dom = ET.fromstring(pagexml)
- # page content is in
- pagediv = None
- # ElementTree 1.2 in Python 2.6 can't do div[@class='queryResultPage']
- alldivs = dom.findall("div")
- for div in alldivs:
- dc = div.get('class')
- # page content div
- if dc == 'queryResultPage':
- pagediv = div
+ 'pageSize': 1000,
+ 'page': 1,
+ 'outputFormat': 'html'}
+ pagexml = self.getServerData("query/QueryDocument",urllib.urlencode(params))
+ results = []
+ try:
+ dom = ET.fromstring(pagexml)
+ # page content is currently in multiple
+ alldivs = dom.findall(".//td[@align='left']")
+ for div in alldivs:
+ # TODO: can we put etree in the session?
+ results.append(div)
+
+ except Exception, e:
+ logging.error("GetSearchResults: Error parsing search result: %s"%e)
- elif dc == 'queryResultHits':
- docinfo['resultSize'] = getInt(div.text)
-
- if pagediv is not None:
- # store XML in docinfo
- docinfo['resultXML'] = ET.tostring(pagediv, 'UTF-8')
+ # store results in docinfo
+ docinfo['resultSize'] = len(results)
+ docinfo['results'] = results
return docinfo
@@ -504,9 +472,9 @@
# get (cached) result
self.getSearchResults(mode=mode, query=query, pageinfo=pageinfo, docinfo=docinfo)
- resultxml = docinfo.get('resultXML', None)
+ resultxml = docinfo.get('results', None)
if not resultxml:
- logging.error("getResultPage: unable to find resultXML")
+ logging.error("getResultPage: unable to find results")
return "Error: no result!"
if size is None:
@@ -561,6 +529,7 @@
return docinfo.get('full_%s'%queryType, [])
+
def getTocPage(self, mode='text', pn=None, start=None, size=None, pageinfo=None, docinfo=None):
"""returns single page from the table of contents"""
logging.debug("getTocPage mode=%s, pn=%s start=%s size=%s"%(mode,repr(pn),repr(start),repr(size)))
@@ -583,8 +552,17 @@
for toc in tocs:
pageurl = self.getLink('pn', toc['pn'])
tp += ''
- tp += ' [%s %s] '%(toc['level-string'], toc['content'])
- tp += ' '%(pageurl, toc['pn'])
+ content = toc['content']
+ if content:
+ tp += ' [%s] %s '%(toc['level-string'], toc['content'])
+ else:
+ tp += ' [Figure %s] '%(toc['level-string'])
+
+ if toc.get('no', None):
+ tp += ' '%(pageurl, toc['pn'], toc['no'])
+ else:
+ tp += ' '%(pageurl, toc['pn'])
+
tp += ' \n'
tp += '\n'
diff -r 8b1e20bf300d -r 694935574177 css/docuviewer.css
--- a/css/docuviewer.css Thu Oct 11 10:21:49 2012 +0200
+++ b/css/docuviewer.css Thu Oct 11 18:27:14 2012 +0200
@@ -146,15 +146,16 @@
background-color: white;
}
-div.tocbody.text .toc,
-div.tocbody.figures .toc,
-div.tocbody.concordance .toc {
+div.tocbody.text .toc.name,
+div.tocbody.figures .toc.name,
+div.tocbody.concordance .toc.name {
float:left;
clear:right;
+ margin-right: 1em;
}
-div.tocbody.text .toc.float.right,
-div.tocbody.figures .toc.float.right,
-div.tocbody.concordance .toc.float.right {
+div.tocbody.text .toc.page,
+div.tocbody.figures .toc.page,
+div.tocbody.concordance .toc.page {
float:right;
}
@@ -272,11 +273,9 @@
div.col.main div.content.text span.pb span.o {
display: none;
}
+/* running head */
div.col.main div.content.text span.pb span.rhead {
- display: block;
-}
-/* running head */
-div.col.main div.content.text div.pageHeaderTitle {
+ display: block;
text-align: center;
margin-bottom: 1em;
}
diff -r 8b1e20bf300d -r 694935574177 documentViewer.py
--- a/documentViewer.py Thu Oct 11 10:21:49 2012 +0200
+++ b/documentViewer.py Thu Oct 11 18:27:14 2012 +0200
@@ -588,9 +588,11 @@
texttool = self.metadataService.getTexttoolData(dom=metaDom, recursive=1, all=True)
if texttool:
docinfo = self.getDocinfoFromTexttool(docinfo, texttool)
- # document info (including toc) from full text
+ # document info from full text server
if docinfo.get('textURLPath', None):
docinfo = self.getTextInfo(mode=None, docinfo=docinfo)
+ # include list of pages TODO: do we need this always?
+ docinfo = self.getTextInfo(mode='pages', docinfo=docinfo)
# bib info
bib = self.metadataService.getBibData(dom=metaDom)
|