version 1.238.2.2, 2011/07/15 19:34:41
|
version 1.238.2.5, 2011/07/29 18:36:04
|
Line 12 import xml.etree.ElementTree as ET
|
Line 12 import xml.etree.ElementTree as ET
|
import re |
import re |
import logging |
import logging |
import urllib |
import urllib |
import documentViewer |
|
#from documentViewer import getTextFromNode, serializeNode |
|
|
|
# NOTE(review): in this diff view the other revision shows, in place of this
# helper, the line "from SrvTxtUtils import getInt, getText, getHttpData".
def intOr0(s, default=0):
    """Convert s to int, or return default if it cannot be converted.

    s       -- any value accepted by int() (string, number, ...)
    default -- value returned when the conversion fails (0 if omitted)

    Only conversion failures are absorbed; unrelated exceptions such as
    KeyboardInterrupt or SystemExit propagate to the caller.
    """
    try:
        return int(s)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None / unsupported type; ValueError: non-numeric text
        # or NaN; OverflowError: infinite float.
        return default
|
|
|
def getText(node):
    """Return the character data of node and of all its descendants.

    node -- an ElementTree element, or None

    The result is node.text followed, for every child in document order,
    by the child's own text content and the child's tail. The tail of
    node itself is not included. Returns "" when node is None.
    """
    if node is None:
        return ""

    # ET: text before the first child lives in .text, text after a child
    # lives in that child's .tail.
    parts = [node.text or ""]
    for child in node:
        # recurse into the child (the original called an undefined
        # name "gettext" here)
        parts.append(getText(child))
        if child.tail:
            parts.append(child.tail)

    return "".join(parts)
|
|
|
def serialize(node): |
def serialize(node): |
"""returns a string containing an XML snippet of node""" |
"""returns a string containing an XML snippet of node""" |
Line 90 class MpdlXmlTextServer(SimpleItem):
|
Line 70 class MpdlXmlTextServer(SimpleItem):
|
manage_changeMpdlXmlTextServerForm = PageTemplateFile("zpt/manage_changeMpdlXmlTextServer", globals()) |
manage_changeMpdlXmlTextServerForm = PageTemplateFile("zpt/manage_changeMpdlXmlTextServer", globals()) |
|
|
def __init__(self,id,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/", serverName=None, timeout=40): |
def __init__(self,id,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/", serverName=None, timeout=40): |
|
|
"""constructor""" |
"""constructor""" |
self.id=id |
self.id=id |
self.title=title |
self.title=title |
Line 102 class MpdlXmlTextServer(SimpleItem):
|
Line 81 class MpdlXmlTextServer(SimpleItem):
|
|
|
# Diff rendering: each source line of this method appears twice, once per
# compared revision. The two copies differ only in the return line: one calls
# documentViewer.getHttpData, the other a module-level getHttpData
# (NOTE(review): presumably the name imported from SrvTxtUtils - confirm).
# Both pass url and data through unchanged and use self.timeout as timeout.
def getHttpData(self, url, data=None): |
def getHttpData(self, url, data=None): |
"""returns result from url+data HTTP request""" |
"""returns result from url+data HTTP request""" |
return documentViewer.getHttpData(url,data,timeout=self.timeout) |
return getHttpData(url,data,timeout=self.timeout) |
|
|
# Diff rendering: each source line of this method appears twice, once per
# compared revision. Both copies build the request URL by appending method
# to self.serverUrl and pass data and timeout=self.timeout along; they differ
# only in calling documentViewer.getHttpData versus a module-level getHttpData
# (NOTE(review): presumably the name imported from SrvTxtUtils - confirm).
def getServerData(self, method, data=None): |
def getServerData(self, method, data=None): |
"""returns result from text server for method+data""" |
"""returns result from text server for method+data""" |
url = self.serverUrl+method |
url = self.serverUrl+method |
return documentViewer.getHttpData(url,data,timeout=self.timeout) |
return getHttpData(url,data,timeout=self.timeout) |
|
|
# WTF: what does this really do? can it be integrated in getPage? |
# WTF: what does this really do? can it be integrated in getPage? |
def getSearch(self, pageinfo=None, docinfo=None): |
def getSearch(self, pageinfo=None, docinfo=None): |
Line 268 class MpdlXmlTextServer(SimpleItem):
|
Line 247 class MpdlXmlTextServer(SimpleItem):
|
|
|
# pageNumberOrigNorm |
# pageNumberOrigNorm |
elif dc == 'countFigureEntries': |
elif dc == 'countFigureEntries': |
docinfo['countFigureEntries'] = intOr0(div.text) |
docinfo['countFigureEntries'] = getInt(div.text) |
|
|
# pageNumberOrigNorm |
# pageNumberOrigNorm |
elif dc == 'countTocEntries': |
elif dc == 'countTocEntries': |
# WTF: s1 = int(s)/30+1 |
# WTF: s1 = int(s)/30+1 |
docinfo['countTocEntries'] = intOr0(div.text) |
docinfo['countTocEntries'] = getInt(div.text) |
|
|
# numTextPages |
# numTextPages |
elif dc == 'countPages': |
elif dc == 'countPages': |
np = intOr0(div.text) |
np = getInt(div.text) |
if np > 0: |
if np > 0: |
docinfo['numTextPages'] = np |
docinfo['numTextPages'] = np |
if docinfo.get('numPages', 0) == 0: |
if docinfo.get('numPages', 0) == 0: |
# seems to be text-only |
# seems to be text-only - update page count |
docinfo['numTextPages'] = np |
docinfo['numPages'] = np |
pageinfo['end'] = min(pageinfo['end'], np) |
pageinfo['end'] = min(pageinfo['end'], np) |
pageinfo['numgroups'] = int(np / pageinfo['groupsize']) |
pageinfo['numgroups'] = int(np / pageinfo['groupsize']) |
if np % pageinfo['groupsize'] > 0: |
if np % pageinfo['groupsize'] > 0: |
Line 504 class MpdlXmlTextServer(SimpleItem):
|
Line 483 class MpdlXmlTextServer(SimpleItem):
|
pagediv = div |
pagediv = div |
|
|
elif dc == 'queryResultHits': |
elif dc == 'queryResultHits': |
docinfo['tocSize_%s'%mode] = intOr0(div.text) |
docinfo['tocSize_%s'%mode] = getInt(div.text) |
|
|
if pagediv: |
if pagediv: |
# # split xml in chunks |
|
# tocs = [] |
|
# tocdivs = pagediv.findall('div') |
|
# for p in zip(tocdivs[::2], tocdivs[1::2]): |
|
# toc = serialize(p[0]) |
|
# toc += serialize(p[1]) |
|
# tocs.append(toc) |
|
# logging.debug("pair: %s"%(toc)) |
|
# store XML in docinfo |
# store XML in docinfo |
docinfo['tocXML_%s'%mode] = ET.tostring(pagediv, 'UTF-8') |
docinfo['tocXML_%s'%mode] = ET.tostring(pagediv, 'UTF-8') |
|
|
Line 544 class MpdlXmlTextServer(SimpleItem):
|
Line 515 class MpdlXmlTextServer(SimpleItem):
|
viewMode= pageinfo['viewMode'] |
viewMode= pageinfo['viewMode'] |
tocMode = pageinfo['tocMode'] |
tocMode = pageinfo['tocMode'] |
tocPN = int(pageinfo['tocPN']) |
tocPN = int(pageinfo['tocPN']) |
|
pn = tocPN |
|
|
fulltoc = ET.fromstring(tocxml) |
fulltoc = ET.fromstring(tocxml) |
|
|
if fulltoc: |
if fulltoc: |
# paginate |
# paginate |
#start = (pn - 1) * pagesize * 2 |
start = (pn - 1) * pagesize * 2 |
#end = start + pagesize * 2 |
len = pagesize * 2 |
#tocdivs = fulltoc[start:end] |
del fulltoc[:start] |
|
del fulltoc[len:] |
tocdivs = fulltoc |
tocdivs = fulltoc |
|
|
# check all a-tags |
# check all a-tags |