# HG changeset patch # User casties # Date 1330374412 -3600 # Node ID d5a47f82e75525678017cf0e7f01d67318e881b5 # Parent 3c01e8f4e72b60a5c72c10d1c11d9a5d30cae505 more cleanup. search works mostly now. layers work better now. diff -r 3c01e8f4e72b -r d5a47f82e755 MpdlXmlTextServer.py --- a/MpdlXmlTextServer.py Tue Feb 21 19:23:52 2012 +0100 +++ b/MpdlXmlTextServer.py Mon Feb 27 21:26:52 2012 +0100 @@ -126,6 +126,7 @@ def getTextPage(self, mode="text", pn=1, docinfo=None, pageinfo=None): """returns single page from fulltext""" + logging.debug("getTextPage mode=%s, pn=%s"%(mode,pn)) # check for cached text -- but ideally this shouldn't be called twice if pageinfo.has_key('textPage'): @@ -138,29 +139,43 @@ logging.warning("getTextPage: current!=pn!") # stuff for constructing full urls - url = docinfo['url'] - urlmode = docinfo['mode'] - sn = pageinfo.get('sn', None) - highlightQuery = pageinfo.get('highlightQuery', None) - tocMode = pageinfo.get('tocMode', None) - tocPN = pageinfo.get('tocPN',None) characterNormalization = pageinfo.get('characterNormalization', None) - + moreTextParam = '' selfurl = docinfo['viewerUrl'] - if mode == "dict" or mode == "text_dict": + if not mode: + # default is dict + mode = 'text' + + modes = mode.split(',') + # check for multiple layers + if len(modes) > 1: + logging.debug("getTextPage: more than one mode=%s"%mode) + + # search mode + if 'search' in modes: + # add highlighting + highlightQuery = pageinfo.get('highlightQuery', None) + sn = pageinfo.get('sn', None) + if highlightQuery and sn: + moreTextParam +="&highlightQuery=%s&sn=%s"%(urllib.quote(highlightQuery),sn) + + # remove mode + modes.remove('search') + + # other modes don't combine + if 'dict' in modes: # dict is called textPollux in the backend - textmode = "textPollux" - elif not mode: - # default is text - mode = "text" - textmode = "text" + textmode = 'textPollux' + elif len(modes) == 0: + # text is default mode + textmode = 'text' else: - textmode = mode + # just take first mode + textmode = modes[0] textParam = "document=%s&mode=%s&pn=%s&characterNormalization=%s"%(docpath,textmode,pn,characterNormalization) - if highlightQuery: - textParam +="&highlightQuery=%s&sn=%s"%(urllib.quote(highlightQuery),sn) + textParam += moreTextParam # fetch the page pagexml = self.getServerData("page-fragment.xql",textParam) @@ -180,7 +195,7 @@ break # plain text mode - if mode == "text": + if textmode == "text": # get full url assuming documentViewer is parent selfurl = self.getLink() if pagediv is not None: @@ -194,7 +209,7 @@ return serialize(pagediv) # text-with-links mode - elif mode == "dict": + elif textmode == "textPollux": if pagediv is not None: viewerurl = docinfo['viewerUrl'] selfurl = self.getLink() @@ -226,17 +241,17 @@ return serialize(pagediv) # xml mode - elif mode == "xml": + elif textmode == "xml": if pagediv is not None: return serialize(pagediv) # pureXml mode - elif mode == "pureXml": + elif textmode == "pureXml": if pagediv is not None: return serialize(pagediv) # gis mode - elif mode == "gis": + elif textmode == "gis": if pagediv is not None: # check all a-tags links = pagediv.findall(".//a") @@ -255,6 +270,108 @@ return None + def getSearchResults(self, mode, query=None, docinfo=None): + """loads list of search results and stores XML in docinfo""" + logging.debug("getSearchResults mode=%s query=%s"%(mode, query)) + if mode == "none": + return docinfo + + if 'resultSize_%s_%s'%(mode,query) in docinfo: + # cached result + return docinfo + + docpath = docinfo['textURLPath'] + # we need to set a result set size + pagesize = 1000 + pn = 1 + # fetch full results + params = {'document': docpath, + 'mode': 'text', + 'queryType': mode, + 'query': query, + 'queryResultPageSize': 1000, + 'queryResultPN': 1} + pagexml = self.getServerData("doc-query.xql",urllib.urlencode(params)) + #pagexml = self.getServerData("doc-query.xql","document=%s&mode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s&s=%s&viewMode=%s&characterNormalization=%s&highlightElementPos=%s&highlightElement=%s&highlightQuery=%s"%(docpath, 'text', queryType, urllib.quote(query), pagesize, pn, s, viewMode,characterNormalization, highlightElementPos, highlightElement, urllib.quote(highlightQuery))) + dom = ET.fromstring(pagexml) + # page content is in
+ pagediv = None + # ElementTree 1.2 in Python 2.6 can't do div[@class='queryResultPage'] + alldivs = dom.findall("div") + for div in alldivs: + dc = div.get('class') + # page content div + if dc == 'queryResultPage': + pagediv = div + + elif dc == 'queryResultHits': + docinfo['resultSize_%s_%s'%(mode,query)] = getInt(div.text) + + if pagediv: + # store XML in docinfo + docinfo['resultXML_%s_%s'%(mode,query)] = ET.tostring(pagediv, 'UTF-8') + + logging.debug("getSearchResults: pagediv=%s"%pagediv) + return docinfo + + + def getResultsPage(self, mode="text", query=None, pn=None, start=None, size=None, pageinfo=None, docinfo=None): + """returns single page from the table of contents""" + logging.debug("getResultsPage mode=%s, pn=%s"%(mode,pn)) + # check for cached TOC + #TODO: cache only one search + if not docinfo.has_key('resultXML_%s_%s'%(mode,query)): + self.getSearchResults(mode=mode, query=query, docinfo=docinfo) + + resultxml = docinfo.get('resultXML_%s_%s'%(mode,query), None) + if not resultxml: + logging.error("getResultPage: unable to find resultXML") + return "Error: no result!" + + if size is None: + size = pageinfo.get('searchResultPageSize', 20) + + if start is None: + start = (pn - 1) * size + + fullresult = ET.fromstring(resultxml) + + if fullresult: + # paginate + first = start + len = size + del fullresult[:first] + del fullresult[len:] + tocdivs = fullresult + + # check all a-tags + links = tocdivs.findall(".//a") + for l in links: + href = l.get('href') + if href: + # take pn from href + m = re.match(r'page-fragment\.xql.*pn=(\d+)', href) + if m is not None: + # and create new url (assuming parent is documentViewer) + #TODO: add highlighting params + url = self.getLink('pn', m.group(1)) + l.set('href', url) + else: + logging.warning("getResultPage: Problem with link=%s"%href) + + # fix two-divs-per-row with containing div +# newtoc = ET.Element('div', {'class':'queryResultPage'}) +# for (d1,d2) in zip(tocdivs[::2],tocdivs[1::2]): +# e = ET.Element('div',{'class':'tocline'}) +# e.append(d1) +# e.append(d2) +# newtoc.append(e) + + return serialize(tocdivs) + + return "ERROR: no results!" + + def getToc(self, mode="text", docinfo=None): """loads table of contents and stores XML in docinfo""" logging.debug("getToc mode=%s"%mode) diff -r 3c01e8f4e72b -r d5a47f82e755 MpdlXmlTextServer_old.py --- a/MpdlXmlTextServer_old.py Tue Feb 21 19:23:52 2012 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,520 +0,0 @@ - -from OFS.SimpleItem import SimpleItem -from Products.PageTemplates.PageTemplateFile import PageTemplateFile -from Ft.Xml import EMPTY_NAMESPACE, Parse -from Ft.Xml.Domlette import NonvalidatingReader - -import md5 -import sys -import logging -import urllib -import documentViewer -from documentViewer import getTextFromNode, serializeNode - -class MpdlXmlTextServer(SimpleItem): - """TextServer implementation for MPDL-XML eXist server""" - meta_type="MPDL-XML TextServer" - - manage_options=( - {'label':'Config','action':'manage_changeMpdlXmlTextServerForm'}, - )+SimpleItem.manage_options - - manage_changeMpdlXmlTextServerForm = PageTemplateFile("zpt/manage_changeMpdlXmlTextServer", globals()) - - def __init__(self,id,title="",serverUrl="http://mpdl-system.mpiwg-berlin.mpg.de/mpdl/interface/", serverName=None, timeout=40): - #def __init__(self,id,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de:30030/mpdl/interface/", serverName=None, timeout=40): - - """constructor""" - self.id=id - self.title=title - self.timeout = timeout - if serverName is None: - self.serverUrl = serverUrl - else: - self.serverUrl = "http://%s/mpdl/interface/"%serverName - - def getHttpData(self, url, data=None): - """returns result from url+data HTTP request""" - return documentViewer.getHttpData(url,data,timeout=self.timeout) - - def getServerData(self, method, data=None): - """returns result from text server for method+data""" - url = self.serverUrl+method - return documentViewer.getHttpData(url,data,timeout=self.timeout) - - def getSearch(self, pageinfo=None, docinfo=None): - """get search list""" - docpath = docinfo['textURLPath'] - url = docinfo['url'] - pagesize = pageinfo['queryPageSize'] - pn = pageinfo.get('searchPN',1) - sn = pageinfo['sn'] - highlightQuery = pageinfo['highlightQuery'] - query =pageinfo['query'] - queryType =pageinfo['queryType'] - viewMode= pageinfo['viewMode'] - tocMode = pageinfo['tocMode'] - characterNormalization = pageinfo['characterNormalization'] - #optionToggle = pageinfo['optionToggle'] - tocPN = pageinfo['tocPN'] - selfurl = self.absolute_url() - data = self.getServerData("doc-query.xql","document=%s&mode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s&sn=%s&viewMode=%s&characterNormalization=%s&highlightQuery=%s"%(docpath, 'text', queryType, urllib.quote(query), pagesize, pn, sn, viewMode,characterNormalization, urllib.quote(highlightQuery))) - pagexml = data.replace('?document=%s'%str(docpath),'?url=%s'%url) - pagedom = Parse(pagexml) - - """ - pagedivs = pagedom.xpath("//div[@class='queryResultHits']") - if (pagedivs == pagedom.xpath("//div[@class='queryResultHits']")): - if len(pagedivs)>0: - docinfo['queryResultHits'] = int(getTextFromNode(pagedivs[0])) - s = getTextFromNode(pagedivs[0]) - s1 = int(s)/10+1 - try: - docinfo['queryResultHits'] = int(s1) - logging.debug("SEARCH ENTRIES: %s"%(s1)) - except: - docinfo['queryResultHits'] = 0 - """ - if (queryType=="fulltext")or(queryType=="xpath")or(queryType=="xquery")or(queryType=="fulltextMorphLemma"): - pagedivs = pagedom.xpath("//div[@class='queryResultPage']") - if len(pagedivs)>0: - pagenode=pagedivs[0] - links=pagenode.xpath("//a") - for l in links: - hrefNode = l.getAttributeNodeNS(None, u"href") - if hrefNode: - href = hrefNode.nodeValue - if href.startswith('page-fragment.xql'): - selfurl = self.absolute_url() - pagexml=href.replace('mode=text','mode=texttool&viewMode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s&tocMode=%s&searchPN=%s&tocPN=%s&characterNormalization=%s'%(viewMode,queryType,urllib.quote(query),pagesize,pn,tocMode,pn,tocPN, characterNormalization)) - hrefNode.nodeValue = pagexml.replace('page-fragment.xql','%s'%selfurl) - #logging.debug("PUREXML :%s"%(serializeNode(pagenode))) - return serializeNode(pagenode) - if (queryType=="fulltextMorph"): - pagedivs = pagedom.xpath("//div[@class='queryResult']") - if len(pagedivs)>0: - pagenode=pagedivs[0] - links=pagenode.xpath("//a") - for l in links: - hrefNode = l.getAttributeNodeNS(None, u"href") - if hrefNode: - href = hrefNode.nodeValue - if href.startswith('page-fragment.xql'): - selfurl = self.absolute_url() - pagexml=href.replace('mode=text','mode=texttool&viewMode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s&tocMode=%s&searchPN=%s&tocPN=%s&characterNormalization=%s'%(viewMode,queryType,urllib.quote(query),pagesize,pn,tocMode,pn,tocPN,characterNormalization)) - hrefNode.nodeValue = pagexml.replace('page-fragment.xql','%s'%selfurl) - if href.startswith('../lt/lemma.xql'): - hrefNode.nodeValue = href.replace('../lt/lemma.xql','%s/template/head_main_query'%(selfurl)) - l.setAttributeNS(None, 'target', '_blank') - l.setAttributeNS(None, 'onClick',"popupWin = window.open(this.href, 'contacts', 'location,width=500,height=600,top=180, left=400, scrollbars=1'); return false;") - l.setAttributeNS(None, 'onDblclick', 'popupWin.focus();') - pagedivs = pagedom.xpath("//div[@class='queryResultMorphExpansion']") - return serializeNode(pagenode) - if (queryType=="ftIndex")or(queryType=="ftIndexMorph"): - pagedivs= pagedom.xpath("//div[@class='queryResultPage']") - if len(pagedivs)>0: - pagenode=pagedivs[0] - links=pagenode.xpath("//a") - for l in links: - hrefNode = l.getAttributeNodeNS(None, u"href") - if hrefNode: - href = hrefNode.nodeValue - hrefNode.nodeValue=href.replace('mode=text','mode=texttool&viewMode=%s&tocMode=%s&tocPN=%s&pn=%s&characterNormalization=%s'%(viewMode,tocMode,tocPN,pn,characterNormalization)) - if href.startswith('../lt/lex.xql'): - hrefNode.nodeValue = href.replace('../lt/lex.xql','%s/template/head_main_lex'%selfurl) - l.setAttributeNS(None, 'target', '_blank') - l.setAttributeNS(None, 'onClick',"popupWin = window.open(this.href, 'contacts', 'location,width=500,height=600,top=180, left=400, scrollbars=1'); return false;") - l.setAttributeNS(None, 'onDblclick', 'popupWin.focus();') - if href.startswith('../lt/lemma.xql'): - hrefNode.nodeValue = href.replace('../lt/lemma.xql','%s/template/head_main_lemma'%(selfurl)) - l.setAttributeNS(None, 'target', '_blank') - l.setAttributeNS(None, 'onClick',"popupWin = window.open(this.href, 'contacts', 'location,width=500,height=600,top=180, left=400, scrollbars=1'); return false;") - l.setAttributeNS(None, 'onDblclick', 'popupWin.focus();') - return serializeNode(pagenode) - return "no text here" - - def getGisPlaces(self, docinfo=None, pageinfo=None): - """ Show all Gis Places of whole Page""" - xpath='//place' - docpath = docinfo.get('textURLPath',None) - if not docpath: - return None - - url = docinfo['url'] - selfurl = self.absolute_url() - pn = pageinfo['current'] - hrefList=[] - myList= "" - text=self.getServerData("xpath.xql", "document=%s&xpath=%s&pn=%s"%(docinfo['textURLPath'],xpath,pn)) - dom = Parse(text) - result = dom.xpath("//result/resultPage/place") - for l in result: - hrefNode= l.getAttributeNodeNS(None, u"id") - href= hrefNode.nodeValue - hrefList.append(href) - myList = ",".join(hrefList) - #logging.debug("getGisPlaces :%s"%(myList)) - return myList - - def getAllGisPlaces (self, docinfo=None, pageinfo=None): - """Show all Gis Places of whole Book """ - xpath ='//echo:place' - docpath =docinfo['textURLPath'] - url = docinfo['url'] - selfurl =self.absolute_url() - pn =pageinfo['current'] - hrefList=[] - myList="" - text=self.getServerData("xpath.xql", "document=%s&xpath=%s"%(docinfo['textURLPath'],xpath)) - dom =Parse(text) - result = dom.xpath("//result/resultPage/place") - - for l in result: - hrefNode = l.getAttributeNodeNS(None, u"id") - href= hrefNode.nodeValue - hrefList.append(href) - myList = ",".join(hrefList) - #logging.debug("getALLGisPlaces :%s"%(myList)) - return myList - - - def getTextPage(self, mode="text_dict", pn=1, docinfo=None, pageinfo=None): - """returns single page from fulltext""" - docpath = docinfo['textURLPath'] - path = docinfo['textURLPath'] - url = docinfo.get('url',None) - name = docinfo.get('name',None) - pn =pageinfo['current'] - sn = pageinfo['sn'] - #optionToggle =pageinfo ['optionToggle'] - highlightQuery = pageinfo['highlightQuery'] - #mode = pageinfo ['viewMode'] - tocMode = pageinfo['tocMode'] - characterNormalization=pageinfo['characterNormalization'] - tocPN = pageinfo['tocPN'] - selfurl = self.absolute_url() - if mode == "text_dict": - textmode = "textPollux" - else: - textmode = mode - - textParam = "document=%s&mode=%s&pn=%s&characterNormalization=%s"%(docpath,textmode,pn,characterNormalization) - if highlightQuery is not None: - textParam +="&highlightQuery=%s&sn=%s"%(urllib.quote(highlightQuery),sn) - - pagexml = self.getServerData("page-fragment.xql",textParam) - dom = Parse(pagexml) - #dom = NonvalidatingReader.parseStream(pagexml) - - #original Pages - pagedivs = dom.xpath("//div[@class='pageNumberOrig']") - - """if pagedivs == dom.xpath("//div[@class='pageNumberOrig']"): - if len(pagedivs)>0: - docinfo['pageNumberOrig']= getTextFromNode(pagedivs[0]) - logging.debug("ORIGINAL PAGE: %s"%(docinfo['pageNumberOrig'])) - - #original Pages Norm - pagedivs = dom.xpath("//div[@class='pageNumberOrigNorm']") - if pagedivs == dom.xpath("//div[@class='pageNumberOrigNorm']"): - if len(pagedivs)>0: - docinfo['pageNumberOrigNorm']= getTextFromNode(pagedivs[0]) - logging.debug("ORIGINAL PAGE NORM: %s"%(docinfo['pageNumberOrigNorm'])) - """ - #figureEntries - pagedivs = dom.xpath("//div[@class='countFigureEntries']") - if pagedivs == dom.xpath("//div[@class='countFigureEntries']"): - if len(pagedivs)>0: - docinfo['countFigureEntries'] = getTextFromNode(pagedivs[0]) - s = getTextFromNode(pagedivs[0]) - if s=='0': - try: - docinfo['countFigureEntries'] = int(s) - except: - docinfo['countFigureEntries'] = 0 - else: - s1 = int(s)/30+1 - try: - docinfo['countFigureEntries'] = int(s1) - except: - docinfo['countFigureEntries'] = 0 - - #allPlaces - pagedivs = dom.xpath("//div[@class='countPlaces']") - if pagedivs == dom.xpath("//div[@class='countPlaces']"): - if len(pagedivs)>0: - docinfo['countPlaces']= getTextFromNode(pagedivs[0]) - s = getTextFromNode(pagedivs[0]) - try: - docinfo['countPlaces'] = int(s) - except: - docinfo['countPlaces'] = 0 - - #tocEntries - pagedivs = dom.xpath("//div[@class='countTocEntries']") - if pagedivs == dom.xpath("//div[@class='countTocEntries']"): - if len(pagedivs)>0: - docinfo['countTocEntries'] = int(getTextFromNode(pagedivs[0])) - s = getTextFromNode(pagedivs[0]) - if s=='0': - try: - docinfo['countTocEntries'] = int(s) - except: - docinfo['countTocEntries'] = 0 - else: - s1 = int(s)/30+1 - try: - docinfo['countTocEntries'] = int(s1) - except: - docinfo['countTocEntries'] = 0 - - #numTextPages - pagedivs = dom.xpath("//div[@class='countPages']") - if pagedivs == dom.xpath("//div[@class='countPages']"): - if len(pagedivs)>0: - docinfo['numPages'] = getTextFromNode(pagedivs[0]) - s = getTextFromNode(pagedivs[0]) - - try: - docinfo['numPages'] = int(s) - #logging.debug("PAGE NUMBER: %s"%(s)) - - np = docinfo['numPages'] - pageinfo['end'] = min(pageinfo['end'], np) - pageinfo['numgroups'] = int(np / pageinfo['groupsize']) - if np % pageinfo['groupsize'] > 0: - pageinfo['numgroups'] += 1 - except: - docinfo['numPages'] = 0 - - else: - #no full text -- init to 0 - docinfo['pageNumberOrig'] = 0 - docinfo['countFigureEntries'] = 0 - docinfo['countPlaces'] = 0 - docinfo['countTocEntries'] = 0 - docinfo['numPages'] = 0 - docinfo['pageNumberOrigNorm'] = 0 - #return docinfo - - # plain text mode - if mode == "text": - # first div contains text - pagedivs = dom.xpath("/div") - if len(pagedivs) > 0: - pagenode = pagedivs[0] - links = pagenode.xpath("//a") - for l in links: - hrefNode = l.getAttributeNodeNS(None, u"href") - if hrefNode: - href= hrefNode.nodeValue - if href.startswith('#note-'): - hrefNode.nodeValue = href.replace('#note-',"?url=%s&viewMode=text&tocMode=%s&tocPN=%s&pn=%s#note-"%(url,tocMode,tocPN,pn)) - return serializeNode(pagenode) - if mode == "xml": - # first div contains text - pagedivs = dom.xpath("/div") - if len(pagedivs) > 0: - pagenode = pagedivs[0] - return serializeNode(pagenode) - if mode == "gis": - # first div contains text - pagedivs = dom.xpath("/div") - if len(pagedivs) > 0: - pagenode = pagedivs[0] - links =pagenode.xpath("//a") - for l in links: - hrefNode =l.getAttributeNodeNS(None, u"href") - if hrefNode: - href=hrefNode.nodeValue - if href.startswith('http://chinagis.mpiwg-berlin.mpg.de'): - hrefNode.nodeValue =href.replace('chinagis_REST/REST/db/chgis/mpdl','chinagis/REST/db/mpdl/%s'%name) - l.setAttributeNS(None, 'target', '_blank') - return serializeNode(pagenode) - - if mode == "pureXml": - # first div contains text - pagedivs = dom.xpath("/div") - if len(pagedivs) > 0: - pagenode = pagedivs[0] - return serializeNode(pagenode) - # text-with-links mode - if mode == "text_dict": - # first div contains text - #mode = pageinfo ['viewMode'] - pagedivs = dom.xpath("/div") - if len(pagedivs) > 0: - pagenode = pagedivs[0] - # check all a-tags - links = pagenode.xpath("//a") - - for l in links: - hrefNode = l.getAttributeNodeNS(None, u"href") - - if hrefNode: - # is link with href - href = hrefNode.nodeValue - if href.startswith('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/lt/wordInfo.xql'): - # is pollux link - selfurl = self.absolute_url() - # change href - hrefNode.nodeValue = href.replace('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/lt/wordInfo.xql','%s/head_main_voc'%selfurl) - # add target - l.setAttributeNS(None, 'target', '_blank') - #l.setAttributeNS(None, 'onclick',"popupWin = window.open(this.href, 'InfoWindow', 'menubar=no, location,width=500,height=600,top=180, left=700, toolbar=no, scrollbars=1'); return false;") - #l.setAttributeNS(None, "ondblclick", "popupWin.focus();") - #window.open("this.href, 'InfoWindow', 'menubar=no, location,width=500,height=600,top=180, left=700, toolbar=yes, scrollbars=1'"); return false;") - - if href.startswith('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/lt/lemma.xql'): - selfurl = self.absolute_url() - hrefNode.nodeValue = href.replace('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/lt/lemma.xql','%s/head_main_lemma'%selfurl) - l.setAttributeNS(None, 'target', '_blank') - l.setAttributeNS(None, 'onclick',"popupWin = window.open(this.href, 'InfoWindow', 'menubar=no, location,width=500,height=600,top=180, left=700, toolbar=no, scrollbars=1'); return false;") - l.setAttributeNS(None, 'ondblclick', 'popupWin.focus();') - - if href.startswith('#note-'): - hrefNode.nodeValue = href.replace('#note-',"?url=%s&viewMode=text_dict&tocMode=%s&tocPN=%s&pn=%s#note-"%(url,tocMode,tocPN,pn)) - - return serializeNode(pagenode) - return "no text here" - - def getOrigPages(self, docinfo=None, pageinfo=None): - docpath = docinfo['textURLPath'] - pn =pageinfo['current'] - selfurl = self.absolute_url() - pagexml = self.getServerData("page-fragment.xql","document=%s&pn=%s"%(docpath, pn)) - dom = Parse(pagexml) - pagedivs = dom.xpath("//div[@class='pageNumberOrig']") - if pagedivs == dom.xpath("//div[@class='pageNumberOrig']"): - if len(pagedivs)>0: - docinfo['pageNumberOrig']= getTextFromNode(pagedivs[0]) - return docinfo['pageNumberOrig'] - - def getOrigPagesNorm(self, docinfo=None, pageinfo=None): - docpath = docinfo['textURLPath'] - pn =pageinfo['current'] - selfurl = self.absolute_url() - pagexml = self.getServerData("page-fragment.xql","document=%s&pn=%s"%(docpath, pn)) - dom = Parse(pagexml) - pagedivs = dom.xpath("//div[@class='pageNumberOrigNorm']") - if pagedivs == dom.xpath("//div[@class='pageNumberOrigNorm']"): - if len(pagedivs)>0: - docinfo['pageNumberOrigNorm']= getTextFromNode(pagedivs[0]) - return docinfo['pageNumberOrigNorm'] - - - def getTranslate(self, word=None, language=None): - """translate into another languages""" - data = self.getServerData("lt/wordInfo.xql","language="+str(language)+"&word="+urllib.quote(word)+"&output=html") - #pagexml=self.template.fulltextclient.eval("/mpdl/interface/lt/lex.xql","document=&language="+str(language)+"&query="+url_quote(str(query))) - return data - - def getLemma(self, lemma=None, language=None): - """simular words lemma """ - data = self.getServerData("lt/lemma.xql","language="+str(language)+"&lemma="+urllib.quote(lemma)+"&output=html") - return data - - def getLemmaQuery(self, query=None, language=None): - """simular words lemma """ - data = self.getServerData("lt/lemma.xql","language="+str(language)+"&query="+urllib.quote(query)+"&output=html") - return data - - def getLex(self, query=None, language=None): - #simular words lemma - data = self.getServerData("lt/lex.xql","document=&language="+str(language)+"&query="+urllib.quote(query)) - return data - - def getQuery (self, docinfo=None, pageinfo=None, query=None, queryType=None, pn=1): - #number of - docpath = docinfo['textURLPath'] - pagesize = pageinfo['queryPageSize'] - pn = pageinfo['searchPN'] - query =pageinfo['query'] - queryType =pageinfo['queryType'] - tocSearch = 0 - tocDiv = None - - pagexml = self.getServerData("doc-query.xql","document=%s&mode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s"%(docpath, 'text', queryType, urllib.quote(query), pagesize, pn)) - pagedom = Parse(pagexml) - numdivs = pagedom.xpath("//div[@class='queryResultHits']") - tocSearch = int(getTextFromNode(numdivs[0])) - tc=int((tocSearch/10)+1) - return tc - - def getToc(self, mode="text", docinfo=None): - """loads table of contents and stores in docinfo""" - if mode == "none": - return docinfo - if 'tocSize_%s'%mode in docinfo: - # cached toc - return docinfo - - docpath = docinfo['textURLPath'] - # we need to set a result set size - pagesize = 1000 - pn = 1 - if mode == "text": - queryType = "toc" - else: - queryType = mode - # number of entries in toc - tocSize = 0 - tocDiv = None - - pagexml = self.getServerData("doc-query.xql","document=%s&queryType=%s&queryResultPageSize=%s&queryResultPN=%s"%(docpath,queryType, pagesize, pn)) - - # post-processing downloaded xml - pagedom = Parse(pagexml) - # get number of entries - numdivs = pagedom.xpath("//div[@class='queryResultHits']") - if len(numdivs) > 0: - tocSize = int(getTextFromNode(numdivs[0])) - docinfo['tocSize_%s'%mode] = tocSize - return docinfo - - def getTocPage(self, mode="text", pn=1, pageinfo=None, docinfo=None): - """returns single page from the table of contents""" - # TODO: this should use the cached TOC - if mode == "text": - queryType = "toc" - else: - queryType = mode - docpath = docinfo['textURLPath'] - path = docinfo['textURLPath'] - pagesize = pageinfo['tocPageSize'] - pn = pageinfo['tocPN'] - url = docinfo['url'] - selfurl = self.absolute_url() - viewMode= pageinfo['viewMode'] - characterNormalization = pageinfo ['characterNormalization'] - #optionToggle =pageinfo ['optionToggle'] - tocMode = pageinfo['tocMode'] - tocPN = pageinfo['tocPN'] - - data = self.getServerData("doc-query.xql","document=%s&queryType=%s&queryResultPageSize=%s&queryResultPN=%s&characterNormalization=regPlusNorm"%(docpath,queryType, pagesize, pn)) - page = data.replace('page-fragment.xql?document=%s'%str(path),'%s?url=%s&viewMode=%s&tocMode=%s&tocPN=%s'%(selfurl,url, viewMode, tocMode, tocPN)) - text = page.replace('mode=image','mode=texttool') - return text - - def manage_changeMpdlXmlTextServer(self,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/",timeout=40,RESPONSE=None): - #def manage_changeMpdlXmlTextServer(self,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de:30030/mpdl/interface/",timeout=40,RESPONSE=None): - """change settings""" - self.title=title - self.timeout = timeout - self.serverUrl = serverUrl - if RESPONSE is not None: - RESPONSE.redirect('manage_main') - -# management methods -def manage_addMpdlXmlTextServerForm(self): - """Form for adding""" - pt = PageTemplateFile("zpt/manage_addMpdlXmlTextServer", globals()).__of__(self) - return pt() - -def manage_addMpdlXmlTextServer(self,id,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/",timeout=40,RESPONSE=None): -#def manage_addMpdlXmlTextServer(self,id,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de:30030/mpdl/interface/",timeout=40,RESPONSE=None): - """add zogiimage""" - newObj = MpdlXmlTextServer(id,title,serverUrl,timeout) - self.Destination()._setObject(id, newObj) - if RESPONSE is not None: - RESPONSE.redirect('manage_main') \ No newline at end of file diff -r 3c01e8f4e72b -r d5a47f82e755 documentViewer.py --- a/documentViewer.py Tue Feb 21 19:23:52 2012 +0100 +++ b/documentViewer.py Mon Feb 27 21:26:52 2012 +0100 @@ -174,8 +174,16 @@ """returns full text content of page""" return self.template.fulltextclient.getTextPage(**args) + def getSearchResults(self, **args): + """loads list of search results and stores XML in docinfo""" + return self.template.fulltextclient.getSearchResults(**args) + + def getResultsPage(self, **args): + """returns one page of the search results""" + return self.template.fulltextclient.getResultsPage(**args) + def getToc(self, **args): - """returns the full table of contents (in internal format)""" + """loads table of contents and stores XML in docinfo""" return self.template.fulltextclient.getToc(**args) def getTocPage(self, **args): @@ -247,13 +255,14 @@ if tocMode != "thumbs": # get table of contents - docinfo = self.getToc(mode=tocMode, docinfo=docinfo) + self.getToc(mode=tocMode, docinfo=docinfo) # auto viewMode: text if there is a text else images if viewMode=="auto": if docinfo.get('textURL', None) or docinfo.get('textURLPath', None): viewMode = "text" - viewLayer = "dict" + if viewLayer is None: + viewLayer = "dict" else: viewMode = "images" @@ -262,11 +271,6 @@ viewMode = "text" viewLayer = "dict" - # stringify viewLayer - if isinstance(viewLayer, list): - logging.debug("index_html: viewLayer is list:%s"%viewLayer) - viewLayer = ','.join([t for t in viewLayer if t]) - pageinfo = self.getPageinfo(start=start, current=pn, docinfo=docinfo, viewMode=viewMode, viewLayer=viewLayer, tocMode=tocMode) # get template /template/viewer_$viewMode @@ -674,6 +678,22 @@ logging.debug("getPageInfo(current=%s, start=%s, rows=%s, cols=%s, viewMode=%s, viewLayer=%s, tocMode=%s)"%(current,start,rows,cols,viewMode,viewLayer,tocMode)) pageinfo = {} pageinfo['viewMode'] = viewMode + # split viewLayer if necessary + if isinstance(viewLayer,basestring): + viewLayer = viewLayer.split(',') + + if isinstance(viewLayer, list): + logging.debug("getPageinfo: viewLayer is list:%s"%viewLayer) + # save (unique) list in viewLayers + seen = set() + viewLayers = [l for l in viewLayer if l and l not in seen and not seen.add(l)] + pageinfo['viewLayers'] = viewLayers + # stringify viewLayer + viewLayer = ','.join(viewLayers) + else: + #create list + pageinfo['viewLayers'] = [viewLayer] + pageinfo['viewLayer'] = viewLayer pageinfo['tocMode'] = tocMode diff -r 3c01e8f4e72b -r d5a47f82e755 documentViewer_old.py --- a/documentViewer_old.py Tue Feb 21 19:23:52 2012 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,989 +0,0 @@ - -from OFS.Folder import Folder -from Products.PageTemplates.ZopePageTemplate import ZopePageTemplate -from Products.PageTemplates.PageTemplateFile import PageTemplateFile -from AccessControl import ClassSecurityInfo -from AccessControl import getSecurityManager -from Globals import package_home -from Products.zogiLib.zogiLib import browserCheck - -from Ft.Xml import EMPTY_NAMESPACE, Parse -import Ft.Xml.Domlette -import os.path -import sys -import urllib -import urllib2 -import logging -import math -import urlparse -import cStringIO -import re -import string - -def logger(txt,method,txt2): - """logging""" - logging.info(txt+ txt2) - - -def getInt(number, default=0): - """returns always an int (0 in case of problems)""" - try: - return int(number) - except: - return int(default) - -def getTextFromNode(nodename): - """get the cdata content of a node""" - if nodename is None: - return "" - nodelist=nodename.childNodes - rc = "" - for node in nodelist: - if node.nodeType == node.TEXT_NODE: - rc = rc + node.data - return rc - -def serializeNode(node, encoding="utf-8"): - """returns a string containing node as XML""" - stream = cStringIO.StringIO() - #logging.debug("BUF: %s"%(stream)) - Ft.Xml.Domlette.Print(node, stream=stream, encoding=encoding) - s = stream.getvalue() - #logging.debug("BUF: %s"%(s)) - stream.close() - return s - -def browserCheck(self): - """check the browsers request to find out the browser type""" - bt = {} - ua = self.REQUEST.get_header("HTTP_USER_AGENT") - bt['ua'] = ua - bt['isIE'] = False - bt['isN4'] = False - bt['versFirefox']="" - bt['versIE']="" - bt['versSafariChrome']="" - bt['versOpera']="" - - if string.find(ua, 'MSIE') > -1: - bt['isIE'] = True - else: - bt['isN4'] = (string.find(ua, 'Mozilla/4.') > -1) - # Safari oder Chrome identification - try: - nav = ua[string.find(ua, '('):] - nav1=ua[string.find(ua,')'):] - nav2=nav1[string.find(nav1,'('):] - nav3=nav2[string.find(nav2,')'):] - ie = string.split(nav, "; ")[1] - ie1 =string.split(nav1, " ")[2] - ie2 =string.split(nav3, " ")[1] - ie3 =string.split(nav3, " ")[2] - if string.find(ie3, "Safari") >-1: - bt['versSafariChrome']=string.split(ie2, "/")[1] - except: pass - # IE identification - try: - nav = ua[string.find(ua, '('):] - ie = string.split(nav, "; ")[1] - if string.find(ie, "MSIE") > -1: - bt['versIE'] = string.split(ie, " ")[1] - except:pass - # Firefox identification - try: - nav = ua[string.find(ua, '('):] - nav1=ua[string.find(ua,')'):] - if string.find(ie1, "Firefox") >-1: - nav5= string.split(ie1, "/")[1] - logging.debug("FIREFOX: %s"%(nav5)) - bt['versFirefox']=nav5[0:3] - except:pass - #Opera identification - try: - if string.find(ua,"Opera") >-1: - nav = ua[string.find(ua, '('):] - nav1=nav[string.find(nav,')'):] - bt['versOpera']=string.split(nav1,"/")[2] - except:pass - - bt['isMac'] = string.find(ua, 'Macintosh') > -1 - bt['isWin'] = string.find(ua, 'Windows') > -1 - bt['isIEWin'] = bt['isIE'] and bt['isWin'] - bt['isIEMac'] = bt['isIE'] and bt['isMac'] - bt['staticHTML'] = False - - return bt - - -def getParentDir(path): - """returns pathname shortened by one""" - return '/'.join(path.split('/')[0:-1]) - - -def getHttpData(url, data=None, num_tries=3, timeout=10): - """returns result from url+data HTTP request""" - # we do GET (by appending data to url) - if isinstance(data, str) or isinstance(data, unicode): - # if data is string then append - url = "%s?%s"%(url,data) - elif isinstance(data, dict) or isinstance(data, list) or isinstance(data, tuple): - # urlencode - url = "%s?%s"%(url,urllib.urlencode(data)) - - response = None - errmsg = None - for cnt in range(num_tries): - try: - logging.debug("getHttpData(#%s %ss) url=%s"%(cnt+1,timeout,url)) - if sys.version_info < (2, 6): - # set timeout on socket -- ugly :-( - import socket - socket.setdefaulttimeout(float(timeout)) - response = urllib2.urlopen(url) - else: - response = urllib2.urlopen(url,timeout=float(timeout)) - # check result? - break - except urllib2.HTTPError, e: - logging.error("getHttpData: HTTP error(%s): %s"%(e.code,e)) - errmsg = str(e) - # stop trying - break - except urllib2.URLError, e: - logging.error("getHttpData: URLLIB error(%s): %s"%(e.reason,e)) - errmsg = str(e) - # stop trying - #break - - if response is not None: - data = response.read() - response.close() - return data - - raise IOError("ERROR fetching HTTP data from %s: %s"%(url,errmsg)) - #return None - -## -## documentViewer class -## -class documentViewer(Folder): - """document viewer""" - meta_type="Document viewer" - - security=ClassSecurityInfo() - manage_options=Folder.manage_options+( - {'label':'main config','action':'changeDocumentViewerForm'}, - ) - - # templates and forms - viewer_main = PageTemplateFile('zpt/viewer_main', globals()) - toc_thumbs = PageTemplateFile('zpt/toc_thumbs', globals()) - toc_text = PageTemplateFile('zpt/toc_text', globals()) - toc_figures = PageTemplateFile('zpt/toc_figures', globals()) - page_main_images = PageTemplateFile('zpt/page_main_images', globals()) - page_main_double = PageTemplateFile('zpt/page_main_double', globals()) - page_main_text = PageTemplateFile('zpt/page_main_text', globals()) - page_main_text_dict = PageTemplateFile('zpt/page_main_text_dict', globals()) - page_main_gis =PageTemplateFile ('zpt/page_main_gis', globals()) - page_main_xml = PageTemplateFile('zpt/page_main_xml', globals()) - page_main_pureXml = PageTemplateFile('zpt/page_main_pureXml', globals()) - head_main = PageTemplateFile('zpt/head_main', globals()) - docuviewer_css = PageTemplateFile('css/docuviewer.css', globals()) - info_xml = PageTemplateFile('zpt/info_xml', globals()) - - - thumbs_main_rss = PageTemplateFile('zpt/thumbs_main_rss', globals()) - security.declareProtected('View management screens','changeDocumentViewerForm') - changeDocumentViewerForm = PageTemplateFile('zpt/changeDocumentViewer', globals()) - - - def __init__(self,id,imageScalerUrl=None,textServerName=None,title="",digilibBaseUrl=None,thumbcols=2,thumbrows=5,authgroups="mpiwg"): - """init document viewer""" - self.id=id - self.title=title - self.thumbcols = thumbcols - self.thumbrows = thumbrows - # authgroups is list of authorized groups (delimited by ,) - self.authgroups = [s.strip().lower() for s in authgroups.split(',')] - # create template folder so we can always use template.something - - templateFolder = Folder('template') - #self['template'] = templateFolder # Zope-2.12 style - self._setObject('template',templateFolder) # old style - try: - import MpdlXmlTextServer - textServer = MpdlXmlTextServer.MpdlXmlTextServer(id='fulltextclient',serverName=textServerName) - #templateFolder['fulltextclient'] = xmlRpcClient - templateFolder._setObject('fulltextclient',textServer) - except Exception, e: - logging.error("Unable to create MpdlXmlTextServer for fulltextclient: "+str(e)) - try: - from Products.zogiLib.zogiLib import zogiLib - zogilib = zogiLib(id="zogilib", title="zogilib for docuviewer", dlServerURL=imageScalerUrl, layout="book") - #templateFolder['zogilib'] = zogilib - templateFolder._setObject('zogilib',zogilib) - except Exception, e: - logging.error("Unable to create zogiLib for zogilib: "+str(e)) - - - # proxy text server methods to fulltextclient - def getTextPage(self, **args): - """get page""" - return self.template.fulltextclient.getTextPage(**args) - - def getOrigPages(self, **args): - """get page""" - return self.template.fulltextclient.getOrigPages(**args) - - def getOrigPagesNorm(self, **args): - """get page""" - return self.template.fulltextclient.getOrigPagesNorm(**args) - - def getQuery(self, **args): - """get query in search""" - return self.template.fulltextclient.getQuery(**args) - - def getSearch(self, **args): - """get search""" - return self.template.fulltextclient.getSearch(**args) - - def getGisPlaces(self, **args): - """get gis places""" - return self.template.fulltextclient.getGisPlaces(**args) - - def getAllGisPlaces(self, **args): - """get all gis places """ - return self.template.fulltextclient.getAllGisPlaces(**args) - - def getTranslate(self, **args): - """get translate""" - return self.template.fulltextclient.getTranslate(**args) - - def getLemma(self, **args): - """get lemma""" - return self.template.fulltextclient.getLemma(**args) - - def getLemmaQuery(self, **args): - """get query""" - return self.template.fulltextclient.getLemmaQuery(**args) - - def getLex(self, **args): - """get lex""" - return self.template.fulltextclient.getLex(**args) - - def getToc(self, **args): - """get toc""" - return self.template.fulltextclient.getToc(**args) - - def getTocPage(self, **args): - """get tocpage""" - return self.template.fulltextclient.getTocPage(**args) - - - security.declareProtected('View','thumbs_rss') - def thumbs_rss(self,mode,url,viewMode="auto",start=None,pn=1): - ''' - view it - @param mode: defines how to access the document behind url - @param url: url which contains display information - @param viewMode: if images display images, if text display text, default is images (text,images or auto) - - ''' - logging.debug("HHHHHHHHHHHHHH:load the rss") - logger("documentViewer (index)", logging.INFO, "mode: %s url:%s start:%s pn:%s"%(mode,url,start,pn)) - - if not hasattr(self, 'template'): - # create template folder if it doesn't exist - self.manage_addFolder('template') - - if not self.digilibBaseUrl: - self.digilibBaseUrl = self.findDigilibUrl() or "http://nausikaa.mpiwg-berlin.mpg.de/digitallibrary" - - docinfo = self.getDocinfo(mode=mode,url=url) - #pageinfo = self.getPageinfo(start=start,current=pn,docinfo=docinfo) - pageinfo = self.getPageinfo(start=start,current=pn, docinfo=docinfo) - ''' ZDES ''' - pt = getattr(self.template, 'thumbs_main_rss') - - if viewMode=="auto": # automodus gewaehlt - if docinfo.has_key("textURL") or docinfo.get('textURLPath',None): #texturl gesetzt und textViewer konfiguriert - viewMode="text" - else: - viewMode="images" - - return pt(docinfo=docinfo,pageinfo=pageinfo,viewMode=viewMode) - - security.declareProtected('View','index_html') - def index_html(self,url,mode="texttool",viewMode="auto",tocMode="thumbs",start=None,pn=1,mk=None): - ''' - view it - @param mode: defines how to access the document behind url - @param url: url which contains display information - @param viewMode: if images display images, if text display text, default is auto (text,images or auto) - @param tocMode: type of 'table of contents' for navigation (thumbs, text, figures, none) - @param characterNormalization type of text display (reg, norm, none) - @param querySearch: type of different search modes (fulltext, fulltextMorph, xpath, xquery, ftIndex, ftIndexMorph, fulltextMorphLemma) - ''' - - logging.debug("documentViewer (index) mode: %s url:%s start:%s pn:%s"%(mode,url,start,pn)) - - if not hasattr(self, 'template'): - # this won't work - logging.error("template folder missing!") - return "ERROR: template folder missing!" - - if not getattr(self, 'digilibBaseUrl', None): - self.digilibBaseUrl = self.findDigilibUrl() or "http://digilib.mpiwg-berlin.mpg.de/digitallibrary" - - docinfo = self.getDocinfo(mode=mode,url=url) - - if tocMode != "thumbs": - # get table of contents - docinfo = self.getToc(mode=tocMode, docinfo=docinfo) - - if viewMode=="auto": # automodus gewaehlt - if docinfo.has_key('textURL') or docinfo.get('textURLPath',None): #texturl gesetzt und textViewer konfiguriert - viewMode="text_dict" - else: - viewMode="images" - - pageinfo = self.getPageinfo(start=start,current=pn, docinfo=docinfo,viewMode=viewMode,tocMode=tocMode) - - if (docinfo.get('textURLPath',None)): - page = self.getTextPage(docinfo=docinfo, pageinfo=pageinfo) - pageinfo['textPage'] = page - tt = getattr(self, 'template') - pt = getattr(tt, 'viewer_main') - return pt(docinfo=docinfo,pageinfo=pageinfo,viewMode=viewMode,mk=self.generateMarks(mk)) - - def generateMarks(self,mk): - ret="" - if mk is None: - return "" - if not isinstance(mk, list): - mk=[mk] - for m in mk: - ret+="mk=%s"%m - return ret - - - def getBrowser(self): - """getBrowser the version of browser """ - bt = browserCheck(self) - logging.debug("BROWSER VERSION: %s"%(bt)) - return bt - - def findDigilibUrl(self): - """try to get the digilib URL from zogilib""" - url = self.template.zogilib.getDLBaseUrl() - return url - - def getDocumentViewerURL(self): - """returns the URL of this instance""" - return self.absolute_url() - - def getStyle(self, idx, selected, style=""): - """returns a string with the given style and append 'sel' if path == selected.""" - #logger("documentViewer (getstyle)", logging.INFO, "idx: %s selected: %s style: %s"%(idx,selected,style)) - if idx == selected: - return style + 'sel' - else: - return style - - def getLink(self, param=None, val=None, params=None, baseUrl=None, paramSep='&'): - """returns URL to documentviewer with parameter param set to val or from dict params""" - # copy existing request params - urlParams=self.REQUEST.form.copy() - # change single param - if param is not None: - if val is None: - if urlParams.has_key(param): - del urlParams[param] - else: - urlParams[param] = str(val) - - # change more params - if params is not None: - for k in params.keys(): - v = params[k] - if v is None: - # val=None removes param - if urlParams.has_key(k): - del urlParams[k] - - else: - urlParams[k] = v - - # FIXME: does this belong here? - if urlParams.get("mode", None) == "filepath": #wenn beim erst Aufruf filepath gesetzt wurde aendere das nun zu imagepath - urlParams["mode"] = "imagepath" - urlParams["url"] = getParentDir(urlParams["url"]) - - # quote values and assemble into query string (not escaping '/') - ps = paramSep.join(["%s=%s"%(k,urllib.quote_plus(v,'/')) for (k, v) in urlParams.items()]) - #ps = urllib.urlencode(urlParams) - if baseUrl is None: - baseUrl = self.REQUEST['URL1'] - - url = "%s?%s"%(baseUrl, ps) - return url - - - def getLinkAmp(self, param=None, val=None, params=None, baseUrl=None): - """link to documentviewer with parameter param set to val""" - return self.getLink(param, val, params, baseUrl, '&') - - def getInfo_xml(self,url,mode): - """returns info about the document as XML""" - - if not self.digilibBaseUrl: - self.digilibBaseUrl = self.findDigilibUrl() or "http://nausikaa.mpiwg-berlin.mpg.de/digitallibrary" - - docinfo = self.getDocinfo(mode=mode,url=url) - pt = getattr(self.template, 'info_xml') - return pt(docinfo=docinfo) - - def getOptionToggle(self, newState=None, optionName='text_options_open', initialState=True): - """returns new option state""" - if not self.REQUEST.SESSION.has_key(optionName): - # not in session -- initial - opt = {'lastState': newState, 'state': initialState} - else: - opt = self.REQUEST.SESSION.get(optionName) - if opt['lastState'] != newState: - # state in session has changed -- toggle - opt['state'] = not opt['state'] - opt['lastState'] = newState - - self.REQUEST.SESSION[optionName] = opt - return opt['state'] - - def isAccessible(self, docinfo): - """returns if access to the resource is granted""" - access = docinfo.get('accessType', None) - logging.debug("documentViewer (accessOK) access type %s"%access) - if access is not None and access == 'free': - logging.debug("documentViewer (accessOK) access is free") - return True - elif access is None or access in self.authgroups: - # only local access -- only logged in users - user = getSecurityManager().getUser() - logging.debug("documentViewer (accessOK) user=%s ip=%s"%(user,self.REQUEST.getClientAddr())) - if user is not None: - #print "user: ", user - return (user.getUserName() != "Anonymous User") - else: - return False - - logging.error("documentViewer (accessOK) unknown access type %s"%access) - return False - - - def getDirinfoFromDigilib(self,path,docinfo=None,cut=0): - """gibt param von dlInfo aus""" - if docinfo is None: - docinfo = {} - - for x in range(cut): - - path=getParentDir(path) - - infoUrl=self.digilibBaseUrl+"/dirInfo-xml.jsp?mo=dir&fn="+path - - logging.debug("documentViewer (getparamfromdigilib) dirInfo from %s"%(infoUrl)) - - txt = getHttpData(infoUrl) - if txt is None: - raise IOError("Unable to get dir-info from %s"%(infoUrl)) - - dom = Parse(txt) - sizes=dom.xpath("//dir/size") - logging.debug("documentViewer (getparamfromdigilib) dirInfo:size"%sizes) - - if sizes: - docinfo['numPages'] = int(getTextFromNode(sizes[0])) - else: - docinfo['numPages'] = 0 - - # TODO: produce and keep list of image names and numbers - - return docinfo - - def getIndexMetaPath(self,url): - """gib nur den Pfad zurueck""" - regexp = re.compile(r".*(experimental|permanent)/(.*)") - regpath = regexp.match(url) - if (regpath==None): - return "" - logging.debug("(getDomFromIndexMeta): URLXAXA: %s"%regpath.group(2)) - return ("/mpiwg/online/"+regpath.group(1)+"/"+regpath.group(2)) - - - - def getIndexMetaUrl(self,url): - """returns utr of index.meta document at url""" - - metaUrl = None - if url.startswith("http://"): - # real URL - metaUrl = url - else: - # online path - server=self.digilibBaseUrl+"/servlet/Texter?fn=" - metaUrl=server+url.replace("/mpiwg/online","") - if not metaUrl.endswith("index.meta"): - metaUrl += "/index.meta" - - return metaUrl - - def getDomFromIndexMeta(self, url): - """get dom from index meta""" - dom = None - metaUrl = self.getIndexMetaUrl(url) - - logging.debug("(getDomFromIndexMeta): METAURL: %s"%metaUrl) - txt=getHttpData(metaUrl) - if txt is None: - raise IOError("Unable to read index meta from %s"%(url)) - - dom = Parse(txt) - return dom - - def getPresentationInfoXML(self, url): - """returns dom of info.xml document at url""" - dom = None - metaUrl = None - if url.startswith("http://"): - # real URL - metaUrl = url - else: - # online path - server=self.digilibBaseUrl+"/servlet/Texter?fn=" - metaUrl=server+url.replace("/mpiwg/online","") - - txt=getHttpData(metaUrl) - if txt is None: - raise IOError("Unable to read infoXMLfrom %s"%(url)) - - dom = Parse(txt) - return dom - - - def getAuthinfoFromIndexMeta(self,path,docinfo=None,dom=None,cut=0): - """gets authorization info from the index.meta file at path or given by dom""" - logging.debug("documentViewer (getauthinfofromindexmeta) path: %s"%(path)) - - access = None - - if docinfo is None: - docinfo = {} - - if dom is None: - for x in range(cut): - path=getParentDir(path) - dom = self.getDomFromIndexMeta(path) - - acctype = dom.xpath("//access-conditions/access/@type") - if acctype and (len(acctype)>0): - access=acctype[0].value - if access in ['group', 'institution']: - access = getTextFromNode(dom.xpath("//access-conditions/access/name")[0]).lower() - - docinfo['accessType'] = access - return docinfo - - - def getBibinfoFromIndexMeta(self,path,docinfo=None,dom=None,cut=0): - """gets bibliographical info from the index.meta file at path or given by dom""" - logging.debug("documentViewer (getbibinfofromindexmeta) path: %s"%(path)) - - if docinfo is None: - docinfo = {} - - if dom is None: - for x in range(cut): - path=getParentDir(path) - dom = self.getDomFromIndexMeta(path) - - docinfo['indexMetaPath']=self.getIndexMetaPath(path); - - logging.debug("documentViewer (getbibinfofromindexmeta cutted) path: %s"%(path)) - # put in all raw bib fields as dict "bib" - bib = dom.xpath("//bib/*") - if bib and len(bib)>0: - bibinfo = {} - for e in bib: - bibinfo[e.localName] = getTextFromNode(e) - docinfo['bib'] = bibinfo - - # extract some fields (author, title, year) according to their mapping - metaData=self.metadata.main.meta.bib - bibtype=dom.xpath("//bib/@type") - if bibtype and (len(bibtype)>0): - bibtype=bibtype[0].value - else: - bibtype="generic" - - bibtype=bibtype.replace("-"," ") # wrong typesiin index meta "-" instead of " " (not wrong! ROC) - docinfo['bib_type'] = bibtype - bibmap=metaData.generateMappingForType(bibtype) - logging.debug("documentViewer (getbibinfofromindexmeta) bibmap:"+repr(bibmap)) - logging.debug("documentViewer (getbibinfofromindexmeta) bibtype:"+repr(bibtype)) - # if there is no mapping bibmap is empty (mapping sometimes has empty fields) - if len(bibmap) > 0 and len(bibmap['author'][0]) > 0: - try: - docinfo['author']=getTextFromNode(dom.xpath("//bib/%s"%bibmap['author'][0])[0]) - except: pass - try: - docinfo['title']=getTextFromNode(dom.xpath("//bib/%s"%bibmap['title'][0])[0]) - except: pass - try: - docinfo['year']=getTextFromNode(dom.xpath("//bib/%s"%bibmap['year'][0])[0]) - except: pass - logging.debug("documentViewer (getbibinfofromindexmeta) using mapping for %s"%bibtype) - try: - docinfo['lang']=getTextFromNode(dom.xpath("//bib/lang")[0]) - except: - docinfo['lang']='' - try: - docinfo['city']=getTextFromNode(dom.xpath("//bib/city")[0]) - except: - docinfo['city']='' - try: - docinfo['number_of_pages']=getTextFromNode(dom.xpath("//bib/number_of_pages")[0]) - except: - docinfo['number_of_pages']='' - try: - docinfo['series_volume']=getTextFromNode(dom.xpath("//bib/series_volume")[0]) - except: - docinfo['series_volume']='' - try: - docinfo['number_of_volumes']=getTextFromNode(dom.xpath("//bib/number_of_volumes")[0]) - except: - docinfo['number_of_volumes']='' - try: - docinfo['translator']=getTextFromNode(dom.xpath("//bib/translator")[0]) - except: - docinfo['translator']='' - try: - docinfo['edition']=getTextFromNode(dom.xpath("//bib/edition")[0]) - except: - docinfo['edition']='' - try: - docinfo['series_author']=getTextFromNode(dom.xpath("//bib/series_author")[0]) - except: - docinfo['series_author']='' - try: - docinfo['publisher']=getTextFromNode(dom.xpath("//bib/publisher")[0]) - except: - docinfo['publisher']='' - try: - docinfo['series_title']=getTextFromNode(dom.xpath("//bib/series_title")[0]) - except: - docinfo['series_title']='' - try: - docinfo['isbn_issn']=getTextFromNode(dom.xpath("//bib/isbn_issn")[0]) - except: - docinfo['isbn_issn']='' - return docinfo - - - def getNameFromIndexMeta(self,path,docinfo=None,dom=None,cut=0): - """gets name info from the index.meta file at path or given by dom""" - if docinfo is None: - docinfo = {} - - if dom is None: - for x in range(cut): - path=getParentDir(path) - dom = self.getDomFromIndexMeta(path) - - docinfo['name']=getTextFromNode(dom.xpath("/resource/name")[0]) - logging.debug("documentViewer docinfo[name] %s"%docinfo['name']) - return docinfo - - def getDocinfoFromTextTool(self, url, dom=None, docinfo=None): - """parse texttool tag in index meta""" - logging.debug("documentViewer (getdocinfofromtexttool) url: %s" % (url)) - if docinfo is None: - docinfo = {} - if docinfo.get('lang', None) is None: - docinfo['lang'] = '' # default keine Sprache gesetzt - if dom is None: - dom = self.getDomFromIndexMeta(url) - - archivePath = None - archiveName = None - - archiveNames = dom.xpath("//resource/name") - if archiveNames and (len(archiveNames) > 0): - archiveName = getTextFromNode(archiveNames[0]) - else: - logging.warning("documentViewer (getdocinfofromtexttool) resource/name missing in: %s" % (url)) - - archivePaths = dom.xpath("//resource/archive-path") - if archivePaths and (len(archivePaths) > 0): - archivePath = getTextFromNode(archivePaths[0]) - # clean up archive path - if archivePath[0] != '/': - archivePath = '/' + archivePath - if archiveName and (not archivePath.endswith(archiveName)): - archivePath += "/" + archiveName - else: - # try to get archive-path from url - logging.warning("documentViewer (getdocinfofromtexttool) resource/archive-path missing in: %s" % (url)) - if (not url.startswith('http')): - archivePath = url.replace('index.meta', '') - - if archivePath is None: - # we balk without archive-path - raise IOError("Missing archive-path (for text-tool) in %s" % (url)) - - imageDirs = dom.xpath("//texttool/image") - if imageDirs and (len(imageDirs) > 0): - imageDir = getTextFromNode(imageDirs[0]) - - else: - # we balk with no image tag / not necessary anymore because textmode is now standard - #raise IOError("No text-tool info in %s"%(url)) - imageDir = "" - #xquery="//pb" - docinfo['imagePath'] = "" # keine Bilder - docinfo['imageURL'] = "" - - if imageDir and archivePath: - #print "image: ", imageDir, " archivepath: ", archivePath - imageDir = os.path.join(archivePath, imageDir) - imageDir = imageDir.replace("/mpiwg/online", '') - docinfo = self.getDirinfoFromDigilib(imageDir, docinfo=docinfo) - docinfo['imagePath'] = imageDir - - docinfo['imageURL'] = self.digilibBaseUrl + "/servlet/Scaler?fn=" + imageDir - - viewerUrls = dom.xpath("//texttool/digiliburlprefix") - if viewerUrls and (len(viewerUrls) > 0): - viewerUrl = getTextFromNode(viewerUrls[0]) - docinfo['viewerURL'] = viewerUrl - - # old style text URL - textUrls = dom.xpath("//texttool/text") - if textUrls and (len(textUrls) > 0): - textUrl = getTextFromNode(textUrls[0]) - if urlparse.urlparse(textUrl)[0] == "": #keine url - textUrl = os.path.join(archivePath, textUrl) - # fix URLs starting with /mpiwg/online - if textUrl.startswith("/mpiwg/online"): - textUrl = textUrl.replace("/mpiwg/online", '', 1) - - docinfo['textURL'] = textUrl - - # new style text-url-path - textUrls = dom.xpath("//texttool/text-url-path") - if textUrls and (len(textUrls) > 0): - textUrl = getTextFromNode(textUrls[0]) - docinfo['textURLPath'] = textUrl - textUrlkurz = string.split(textUrl, ".")[0] - docinfo['textURLPathkurz'] = textUrlkurz - #if not docinfo['imagePath']: - # text-only, no page images - #docinfo = self.getNumTextPages(docinfo) - - - presentationUrls = dom.xpath("//texttool/presentation") - docinfo = self.getBibinfoFromIndexMeta(url, docinfo=docinfo, dom=dom) # get info von bib tag - docinfo = self.getNameFromIndexMeta(url, docinfo=docinfo, dom=dom) - - - if presentationUrls and (len(presentationUrls) > 0): # ueberschreibe diese durch presentation informationen - # presentation url ergiebt sich ersetzen von index.meta in der url der fuer die Metadaten - # durch den relativen Pfad auf die presentation infos - presentationPath = getTextFromNode(presentationUrls[0]) - if url.endswith("index.meta"): - presentationUrl = url.replace('index.meta', presentationPath) - else: - presentationUrl = url + "/" + presentationPath - - docinfo = self.getBibinfoFromTextToolPresentation(presentationUrl, docinfo=docinfo, dom=dom) - - docinfo = self.getAuthinfoFromIndexMeta(url, docinfo=docinfo, dom=dom) # get access info - - return docinfo - - - def getBibinfoFromTextToolPresentation(self,url,docinfo=None,dom=None): - """gets the bibliographical information from the preseantion entry in texttools - """ - dom=self.getPresentationInfoXML(url) - try: - docinfo['author']=getTextFromNode(dom.xpath("//author")[0]) - except: - pass - try: - docinfo['title']=getTextFromNode(dom.xpath("//title")[0]) - except: - pass - try: - docinfo['year']=getTextFromNode(dom.xpath("//date")[0]) - except: - pass - return docinfo - - def getDocinfoFromImagePath(self,path,docinfo=None,cut=0): - """path ist the path to the images it assumes that the index.meta file is one level higher.""" - logging.debug("documentViewer (getdocinfofromimagepath) path: %s"%(path)) - if docinfo is None: - docinfo = {} - path=path.replace("/mpiwg/online","") - docinfo['imagePath'] = path - docinfo=self.getDirinfoFromDigilib(path,docinfo=docinfo,cut=cut) - - pathorig=path - for x in range(cut): - path=getParentDir(path) - logging.debug("documentViewer (getdocinfofromimagepath) PATH:"+path) - imageUrl=self.digilibBaseUrl+"/servlet/Scaler?fn="+path - docinfo['imageURL'] = imageUrl - - #path ist the path to the images it assumes that the index.meta file is one level higher. - docinfo = self.getBibinfoFromIndexMeta(pathorig,docinfo=docinfo,cut=cut+1) - docinfo = self.getAuthinfoFromIndexMeta(pathorig,docinfo=docinfo,cut=cut+1) - return docinfo - - - def getDocinfo(self, mode, url): - """returns docinfo depending on mode""" - logging.debug("documentViewer (getdocinfo) mode: %s, url: %s"%(mode,url)) - # look for cached docinfo in session - if self.REQUEST.SESSION.has_key('docinfo'): - docinfo = self.REQUEST.SESSION['docinfo'] - # check if its still current - if docinfo is not None and docinfo.get('mode') == mode and docinfo.get('url') == url: - logging.debug("documentViewer (getdocinfo) docinfo in session: %s"%docinfo) - return docinfo - # new docinfo - docinfo = {'mode': mode, 'url': url} - if mode=="texttool": #index.meta with texttool information - docinfo = self.getDocinfoFromTextTool(url, docinfo=docinfo) - elif mode=="imagepath": - docinfo = self.getDocinfoFromImagePath(url, docinfo=docinfo) - elif mode=="filepath": - docinfo = self.getDocinfoFromImagePath(url, docinfo=docinfo,cut=1) - else: - logging.error("documentViewer (getdocinfo) unknown mode: %s!"%mode) - raise ValueError("Unknown mode %s! Has to be one of 'texttool','imagepath','filepath'."%(mode)) - - # FIXME: fake texturlpath - if not docinfo.has_key('textURLPath'): - docinfo['textURLPath'] = None - - logging.debug("documentViewer (getdocinfo) docinfo: %s"%docinfo) - #logging.debug("documentViewer (getdocinfo) docinfo: %s"%) - self.REQUEST.SESSION['docinfo'] = docinfo - return docinfo - - def getPageinfo(self, current, start=None, rows=None, cols=None, docinfo=None, viewMode=None, tocMode=None): - """returns pageinfo with the given parameters""" - pageinfo = {} - current = getInt(current) - - pageinfo['current'] = current - rows = int(rows or self.thumbrows) - pageinfo['rows'] = rows - cols = int(cols or self.thumbcols) - pageinfo['cols'] = cols - grpsize = cols * rows - pageinfo['groupsize'] = grpsize - start = getInt(start, default=(math.ceil(float(current)/float(grpsize))*grpsize-(grpsize-1))) - # int(current / grpsize) * grpsize +1)) - pageinfo['start'] = start - pageinfo['end'] = start + grpsize - if (docinfo is not None) and ('numPages' in docinfo): - np = int(docinfo['numPages']) - pageinfo['end'] = min(pageinfo['end'], np) - pageinfo['numgroups'] = int(np / grpsize) - if np % grpsize > 0: - pageinfo['numgroups'] += 1 - pageinfo['viewMode'] = viewMode - pageinfo['tocMode'] = tocMode - pageinfo['characterNormalization'] = self.REQUEST.get('characterNormalization','reg') - #pageinfo['optionToggle'] = self.REQUEST.get('optionToggle','1') - pageinfo['query'] = self.REQUEST.get('query','') - pageinfo['queryType'] = self.REQUEST.get('queryType','') - pageinfo['querySearch'] =self.REQUEST.get('querySearch', 'fulltext') - pageinfo['textPN'] = self.REQUEST.get('textPN','1') - pageinfo['highlightQuery'] = self.REQUEST.get('highlightQuery','') - pageinfo['tocPageSize'] = self.REQUEST.get('tocPageSize', '30') - pageinfo['queryPageSize'] =self.REQUEST.get('queryPageSize', '10') - pageinfo['tocPN'] = self.REQUEST.get('tocPN', '1') - toc = int (pageinfo['tocPN']) - pageinfo['textPages'] =int (toc) - - if 'tocSize_%s'%tocMode in docinfo: - tocSize = int(docinfo['tocSize_%s'%tocMode]) - tocPageSize = int(pageinfo['tocPageSize']) - # cached toc - if tocSize%tocPageSize>0: - tocPages=tocSize/tocPageSize+1 - else: - tocPages=tocSize/tocPageSize - pageinfo['tocPN'] = min (tocPages,toc) - pageinfo['searchPN'] =self.REQUEST.get('searchPN','1') - pageinfo['sn'] =self.REQUEST.get('sn','') - return pageinfo - -def changeDocumentViewer(self,title="",digilibBaseUrl=None,thumbrows=2,thumbcols=5,authgroups='mpiwg',RESPONSE=None): - """init document viewer""" - self.title=title - self.digilibBaseUrl = digilibBaseUrl - self.thumbrows = thumbrows - self.thumbcols = thumbcols - self.authgroups = [s.strip().lower() for s in authgroups.split(',')] - if RESPONSE is not None: - RESPONSE.redirect('manage_main') - -def manage_AddDocumentViewerForm(self): - """add the viewer form""" - pt=PageTemplateFile('zpt/addDocumentViewer', globals()).__of__(self) - return pt() - -def manage_AddDocumentViewer(self,id,imageScalerUrl="",textServerName="",title="",RESPONSE=None): - """add the viewer""" - newObj=documentViewer(id,imageScalerUrl=imageScalerUrl,title=title,textServerName=textServerName) - self._setObject(id,newObj) - - if RESPONSE is not None: - RESPONSE.redirect('manage_main') - -## DocumentViewerTemplate class -class DocumentViewerTemplate(ZopePageTemplate): - """Template for document viewer""" - meta_type="DocumentViewer Template" - - -def manage_addDocumentViewerTemplateForm(self): - """Form for adding""" - pt=PageTemplateFile('zpt/addDocumentViewerTemplate', globals()).__of__(self) - return pt() - -def manage_addDocumentViewerTemplate(self, id='viewer_main', title=None, text=None, - REQUEST=None, submit=None): - "Add a Page Template with optional file content." - - self._setObject(id, DocumentViewerTemplate(id)) - ob = getattr(self, id) - txt=file(os.path.join(package_home(globals()),'zpt/viewer_main.zpt'),'r').read() - logging.info("txt %s:"%txt) - ob.pt_edit(txt,"text/html") - if title: - ob.pt_setTitle(title) - try: - u = self.DestinationURL() - except AttributeError: - u = REQUEST['URL1'] - - u = "%s/%s" % (u, urllib.quote(id)) - REQUEST.RESPONSE.redirect(u+'/manage_main') - return '' - - - diff -r 3c01e8f4e72b -r d5a47f82e755 zpt/viewer_text.zpt --- a/zpt/viewer_text.zpt Tue Feb 21 19:23:52 2012 +0100 +++ b/zpt/viewer_text.zpt Mon Feb 27 21:26:52 2012 +0100 @@ -2,7 +2,8 @@ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> @@ -28,6 +29,8 @@ tal:define="docpath docinfo/textURLPath; pn pageinfo/pn; flowLtr python:pageinfo.get('pageFlow','ltr')!='rtl'; + query python:request.get('query', None); + queryType python:request.get('queryType','fulltextMorph'); textPage python:here.getTextPage(mode=viewLayer, pn=pn, docinfo=docinfo, pageinfo=pageinfo) or '[no text here]';">
@@ -57,75 +60,152 @@
- +

Text display

+
           
Text
   Dictionary
-
-    Places
-
XML
+ tal:attributes="name param; value python:params[param]" /> +
    +
  • + Text +
      +
    • + + Dictionary +
    • +
    • + Search hits +
    • +
    • + Places
      +
    • +
    +
  • +
  • + XML
    +
  • +
+ +
+

Search

+
+ + + + + + + + + +
    +
  • + Exact +
  • +
  • + All forms +
  • +
  • + Fulltext index +
  • +
  • + Morphological index +
  • +
+
+
+ +

Text size

-
-   S M L -
+
-
+

Dictionary view

- Tab
Window
+
    +
  • + Tab +
  • +
  • + Window +
  • +
-
+

Text normalization

Original
Regularized
- Normalized
+ tal:attributes="name param; value python:params[param]" /> +
    +
  • + Original +
  • +
  • + Regularized +
  • +
  • + + Normalized +
  • +
+
-
+
+ +
+

Search results

+
+
+
+