|
|
| version 1.238.2.2, 2011/07/15 19:34:41 | version 1.238.2.6, 2011/08/03 19:04:18 |
|---|---|
| Line 12 import xml.etree.ElementTree as ET | Line 12 import xml.etree.ElementTree as ET |
| import re | import re |
| import logging | import logging |
| import urllib | import urllib |
| import documentViewer | |
| #from documentViewer import getTextFromNode, serializeNode | |
| def intOr0(s, default=0): | from SrvTxtUtils import getInt, getText, getHttpData |
| """convert s to int or return default""" | |
| try: | |
| return int(s) | |
| except: | |
| return default | |
| def getText(node): | |
| """get the cdata content of a node""" | |
| if node is None: | |
| return "" | |
| # ET: | |
| text = node.text or "" | |
| for e in node: | |
| text += gettext(e) | |
| if e.tail: | |
| text += e.tail | |
| return text | |
| def serialize(node): | def serialize(node): |
| """returns a string containing an XML snippet of node""" | """returns a string containing an XML snippet of node""" |
| Line 90 class MpdlXmlTextServer(SimpleItem): | Line 70 class MpdlXmlTextServer(SimpleItem): |
| manage_changeMpdlXmlTextServerForm = PageTemplateFile("zpt/manage_changeMpdlXmlTextServer", globals()) | manage_changeMpdlXmlTextServerForm = PageTemplateFile("zpt/manage_changeMpdlXmlTextServer", globals()) |
| def __init__(self,id,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/", serverName=None, timeout=40): | def __init__(self,id,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/", serverName=None, timeout=40): |
| """constructor""" | """constructor""" |
| self.id=id | self.id=id |
| self.title=title | self.title=title |
| Line 102 class MpdlXmlTextServer(SimpleItem): | Line 81 class MpdlXmlTextServer(SimpleItem): |
| def getHttpData(self, url, data=None): | def getHttpData(self, url, data=None): |
| """returns result from url+data HTTP request""" | """returns result from url+data HTTP request""" |
| return documentViewer.getHttpData(url,data,timeout=self.timeout) | return getHttpData(url,data,timeout=self.timeout) |
| def getServerData(self, method, data=None): | def getServerData(self, method, data=None): |
| """returns result from text server for method+data""" | """returns result from text server for method+data""" |
| url = self.serverUrl+method | url = self.serverUrl+method |
| return documentViewer.getHttpData(url,data,timeout=self.timeout) | return getHttpData(url,data,timeout=self.timeout) |
| # WTF: what does this really do? can it be integrated in getPage? | # WTF: what does this really do? can it be integrated in getPage? |
| def getSearch(self, pageinfo=None, docinfo=None): | def getSearch(self, pageinfo=None, docinfo=None): |
| Line 248 class MpdlXmlTextServer(SimpleItem): | Line 227 class MpdlXmlTextServer(SimpleItem): |
| def processPageInfo(self, dom, docinfo, pageinfo): | def processPageInfo(self, dom, docinfo, pageinfo): |
| """processes page info divs from dom and stores in docinfo and pageinfo""" | """processes page info divs from dom and stores in docinfo and pageinfo""" |
| # process all toplevel divs | # assume first second level div is pageMeta |
| alldivs = dom.findall(".//div") | alldivs = dom.find("div") |
| pagediv = None | |
| for div in alldivs: | for div in alldivs: |
| dc = div.get('class') | dc = div.get('class') |
| # page content div | |
| if dc == 'pageContent': | |
| pagediv = div | |
| # pageNumberOrig | # pageNumberOrig |
| elif dc == 'pageNumberOrig': | if dc == 'pageNumberOrig': |
| pageinfo['pageNumberOrig'] = div.text | pageinfo['pageNumberOrig'] = div.text |
| # pageNumberOrigNorm | # pageNumberOrigNorm |
| Line 268 class MpdlXmlTextServer(SimpleItem): | Line 242 class MpdlXmlTextServer(SimpleItem): |
| # pageNumberOrigNorm | # pageNumberOrigNorm |
| elif dc == 'countFigureEntries': | elif dc == 'countFigureEntries': |
| docinfo['countFigureEntries'] = intOr0(div.text) | docinfo['numFigureEntries'] = getInt(div.text) |
| # pageNumberOrigNorm | # pageNumberOrigNorm |
| elif dc == 'countTocEntries': | elif dc == 'countTocEntries': |
| # WTF: s1 = int(s)/30+1 | # WTF: s1 = int(s)/30+1 |
| docinfo['countTocEntries'] = intOr0(div.text) | docinfo['numTocEntries'] = getInt(div.text) |
| # pageHeaderTitle | |
| elif dc == 'pageHeaderTitle': | |
| docinfo['pageHeaderTitle'] = div.text | |
| # numTextPages | # numTextPages |
| elif dc == 'countPages': | elif dc == 'countPages': |
| np = intOr0(div.text) | np = getInt(div.text) |
| if np > 0: | if np > 0: |
| docinfo['numTextPages'] = np | docinfo['numTextPages'] = np |
| if docinfo.get('numPages', 0) == 0: | if docinfo.get('numPages', 0) == 0: |
| # seems to be text-only | # seems to be text-only - update page count |
| docinfo['numTextPages'] = np | docinfo['numPages'] = np |
| pageinfo['end'] = min(pageinfo['end'], np) | pageinfo['end'] = min(pageinfo['end'], np) |
| pageinfo['numgroups'] = int(np / pageinfo['groupsize']) | pageinfo['numgroups'] = int(np / pageinfo['groupsize']) |
| if np % pageinfo['groupsize'] > 0: | if np % pageinfo['groupsize'] > 0: |
| Line 332 class MpdlXmlTextServer(SimpleItem): | Line 310 class MpdlXmlTextServer(SimpleItem): |
| # page content is in <div class="pageContent"> | # page content is in <div class="pageContent"> |
| pagediv = None | pagediv = None |
| # ElementTree 1.2 in Python 2.6 can't do div[@class='pageContent'] | # ElementTree 1.2 in Python 2.6 can't do div[@class='pageContent'] |
| alldivs = dom.findall(".//div") | # so we look at the second level divs |
| alldivs = dom.findall("div") | |
| for div in alldivs: | for div in alldivs: |
| dc = div.get('class') | dc = div.get('class') |
| # page content div | # page content div |
| Line 504 class MpdlXmlTextServer(SimpleItem): | Line 483 class MpdlXmlTextServer(SimpleItem): |
| pagediv = div | pagediv = div |
| elif dc == 'queryResultHits': | elif dc == 'queryResultHits': |
| docinfo['tocSize_%s'%mode] = intOr0(div.text) | docinfo['tocSize_%s'%mode] = getInt(div.text) |
| if pagediv: | if pagediv: |
| # # split xml in chunks | |
| # tocs = [] | |
| # tocdivs = pagediv.findall('div') | |
| # for p in zip(tocdivs[::2], tocdivs[1::2]): | |
| # toc = serialize(p[0]) | |
| # toc += serialize(p[1]) | |
| # tocs.append(toc) | |
| # logging.debug("pair: %s"%(toc)) | |
| # store XML in docinfo | # store XML in docinfo |
| docinfo['tocXML_%s'%mode] = ET.tostring(pagediv, 'UTF-8') | docinfo['tocXML_%s'%mode] = ET.tostring(pagediv, 'UTF-8') |
| Line 544 class MpdlXmlTextServer(SimpleItem): | Line 515 class MpdlXmlTextServer(SimpleItem): |
| viewMode= pageinfo['viewMode'] | viewMode= pageinfo['viewMode'] |
| tocMode = pageinfo['tocMode'] | tocMode = pageinfo['tocMode'] |
| tocPN = int(pageinfo['tocPN']) | tocPN = int(pageinfo['tocPN']) |
| pn = tocPN | |
| fulltoc = ET.fromstring(tocxml) | fulltoc = ET.fromstring(tocxml) |
| if fulltoc: | if fulltoc: |
| # paginate | # paginate |
| #start = (pn - 1) * pagesize * 2 | start = (pn - 1) * pagesize * 2 |
| #end = start + pagesize * 2 | len = pagesize * 2 |
| #tocdivs = fulltoc[start:end] | del fulltoc[:start] |
| del fulltoc[len:] | |
| tocdivs = fulltoc | tocdivs = fulltoc |
| # check all a-tags | # check all a-tags |