annotate MpdlXmlTextServer.py @ 559:eabfbad6aeb4

"extended" layer for index view and some bugfixes.
author casties
date Fri, 28 Sep 2012 18:50:59 +0200
parents c56bc63436de
children 9255acc4518d
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
129
9404b6c37920 more modular version with separate object MpdlXmlTextServer
casties
parents:
diff changeset
1 from OFS.SimpleItem import SimpleItem
9404b6c37920 more modular version with separate object MpdlXmlTextServer
casties
parents:
diff changeset
2 from Products.PageTemplates.PageTemplateFile import PageTemplateFile
455
0a53fea83df7 more work renovating
casties
parents: 453
diff changeset
3
453
beb7ccb92564 first version using elementtree instead of 4suite xml
casties
parents: 407
diff changeset
4 import xml.etree.ElementTree as ET
beb7ccb92564 first version using elementtree instead of 4suite xml
casties
parents: 407
diff changeset
5
455
0a53fea83df7 more work renovating
casties
parents: 453
diff changeset
6 import re
129
9404b6c37920 more modular version with separate object MpdlXmlTextServer
casties
parents:
diff changeset
7 import logging
134
6a33aa624ba4 fixed more oopsies
casties
parents: 133
diff changeset
8 import urllib
511
551ca1641a5e more cleanup.
casties
parents: 510
diff changeset
9 import urlparse
506
67014399894d cleaned out all 4suite code and weird methods.
casties
parents: 501
diff changeset
10 import base64
453
beb7ccb92564 first version using elementtree instead of 4suite xml
casties
parents: 407
diff changeset
11
458
48b135b089c8 more renovation
casties
parents: 456
diff changeset
12 from SrvTxtUtils import getInt, getText, getHttpData
453
beb7ccb92564 first version using elementtree instead of 4suite xml
casties
parents: 407
diff changeset
13
beb7ccb92564 first version using elementtree instead of 4suite xml
casties
parents: 407
diff changeset
def serialize(node):
    """Return an XML snippet for node as a UTF-8 encoded byte string.

    The node is serialized with ET.tostring() and a leading XML
    declaration, if present, is removed together with the whitespace
    that follows it, so the result starts with the element itself.
    """
    s = ET.tostring(node, 'UTF-8')
    # snip off XML declaration; byte literals keep the comparison valid
    # whether tostring() hands back a Python 2 str or Python 3 bytes
    if s.startswith(b'<?xml'):
        end = s.find(b'?>')
        if end >= 0:
            # do not rely on a fixed number of characters after "?>"
            return s[end + 2:].lstrip()

    return s
beb7ccb92564 first version using elementtree instead of 4suite xml
casties
parents: 407
diff changeset
23
beb7ccb92564 first version using elementtree instead of 4suite xml
casties
parents: 407
diff changeset
24
129
9404b6c37920 more modular version with separate object MpdlXmlTextServer
casties
parents:
diff changeset
25 class MpdlXmlTextServer(SimpleItem):
9404b6c37920 more modular version with separate object MpdlXmlTextServer
casties
parents:
diff changeset
26 """TextServer implementation for MPDL-XML eXist server"""
9404b6c37920 more modular version with separate object MpdlXmlTextServer
casties
parents:
diff changeset
27 meta_type="MPDL-XML TextServer"
9404b6c37920 more modular version with separate object MpdlXmlTextServer
casties
parents:
diff changeset
28
9404b6c37920 more modular version with separate object MpdlXmlTextServer
casties
parents:
diff changeset
29 manage_options=(
9404b6c37920 more modular version with separate object MpdlXmlTextServer
casties
parents:
diff changeset
30 {'label':'Config','action':'manage_changeMpdlXmlTextServerForm'},
9404b6c37920 more modular version with separate object MpdlXmlTextServer
casties
parents:
diff changeset
31 )+SimpleItem.manage_options
9404b6c37920 more modular version with separate object MpdlXmlTextServer
casties
parents:
diff changeset
32
9404b6c37920 more modular version with separate object MpdlXmlTextServer
casties
parents:
diff changeset
33 manage_changeMpdlXmlTextServerForm = PageTemplateFile("zpt/manage_changeMpdlXmlTextServer", globals())
9404b6c37920 more modular version with separate object MpdlXmlTextServer
casties
parents:
diff changeset
34
455
0a53fea83df7 more work renovating
casties
parents: 453
diff changeset
def __init__(self,id,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/", serverName=None, timeout=40):
    """Set up id, title, request timeout and the text server URL.

    When serverName is given it takes precedence over serverUrl: the
    interface URL is then built from that host name.
    """
    if serverName is not None:
        # derive the interface URL from the bare host name
        serverUrl = "http://%s/mpdl/interface/" % serverName

    self.id = id
    self.title = title
    self.timeout = timeout
    self.serverUrl = serverUrl
129
9404b6c37920 more modular version with separate object MpdlXmlTextServer
casties
parents:
diff changeset
44
9404b6c37920 more modular version with separate object MpdlXmlTextServer
casties
parents:
diff changeset
def getHttpData(self, url, data=None):
    """Return the result of the HTTP request for url (+data).

    Delegates to the module-level getHttpData helper imported from
    SrvTxtUtils, passing this server's configured timeout.
    """
    timeout = self.timeout
    return getHttpData(url, data, timeout=timeout)
129
9404b6c37920 more modular version with separate object MpdlXmlTextServer
casties
parents:
diff changeset
48
9404b6c37920 more modular version with separate object MpdlXmlTextServer
casties
parents:
diff changeset
def getServerData(self, method, data=None):
    """Return the text server's answer for method (+data).

    The request URL is the configured serverUrl with method appended;
    the request uses this server's configured timeout.
    """
    return getHttpData(self.serverUrl + method, data, timeout=self.timeout)
129
9404b6c37920 more modular version with separate object MpdlXmlTextServer
casties
parents:
diff changeset
53
506
67014399894d cleaned out all 4suite code and weird methods.
casties
parents: 501
diff changeset
54
559
eabfbad6aeb4 "extended" layer for index view and some bugfixes.
casties
parents: 554
diff changeset
def getTextDownloadUrl(self, type='xml', docinfo=None):
    """Return a URL to download the current text, or None.

    type is the file extension that replaces '.xml' in the document
    path taken from docinfo['textURLPath']. None is returned when
    docinfo is missing or carries no text path.
    """
    # docinfo defaults to None: treat that like a missing text path
    # instead of failing on None.get()
    docpath = docinfo.get('textURLPath', None) if docinfo is not None else None
    if not docpath:
        return None

    docpath = docpath.replace('.xml', '.' + type)
    # getDoc lives next to the interface/ directory on the server
    baseUrl = self.serverUrl.replace('interface/', '')
    return '%sgetDoc?doc=%s' % (baseUrl, docpath)
eabfbad6aeb4 "extended" layer for index view and some bugfixes.
casties
parents: 554
diff changeset
64
eabfbad6aeb4 "extended" layer for index view and some bugfixes.
casties
parents: 554
diff changeset
65
506
67014399894d cleaned out all 4suite code and weird methods.
casties
parents: 501
diff changeset
def getPlacesOnPage(self, docinfo=None, pn=None):
    """Return the list of GIS places on page pn.

    Each place is a dict with the keys 'id' (the id attribute of the
    place element) and 'name' (its text). None is returned when docinfo
    is missing or carries no 'textURLPath'.
    """
    # docinfo defaults to None: treat that like a missing text path
    docpath = docinfo.get('textURLPath', None) if docinfo is not None else None
    if not docpath:
        return None

    text = self.getServerData("xpath.xql", "document=%s&xpath=//place&pn=%s" % (docpath, pn))
    dom = ET.fromstring(text)
    return [{'id': place.get("id"), 'name': place.text}
            for place in dom.findall(".//resultPage/place")]
307
ec5e920a61e6 *** empty log message ***
abukhman
parents: 306
diff changeset
83
453
beb7ccb92564 first version using elementtree instead of 4suite xml
casties
parents: 407
diff changeset
84
517
aaacdf551f6f remove global info from processPageInfo.
casties
parents: 516
diff changeset
def getTextInfo(self, mode='', docinfo=None):
    """Read document info, including page concordance, from the text server.

    mode selects the extra listing to request: 'toc', 'figures' or ''
    (none); any other value is treated as ''. The parsed values are
    stored in docinfo under the keys numTextPages, numFigureEntries,
    numTocEntries, numPlaces, pageNumbers and full_<mode>, and docinfo
    is returned.

    No request is made when the wanted info is already in docinfo
    (full_<mode> for a listing request, numTextPages otherwise) or when
    docinfo has no 'textURLPath'.
    """
    logging.debug("getTextInfo mode=%s", mode)
    if mode not in ('toc', 'figures', ''):
        mode = ''

    if docinfo is None:
        # the default must not blow up in the membership tests below
        docinfo = {}

    # check cached info: listing requests cache under full_<mode>,
    # plain requests are recognised by the page count
    cacheKey = ('full_%s' % mode) if mode else 'numTextPages'
    if cacheKey in docinfo:
        return docinfo

    docpath = docinfo.get('textURLPath', None)
    if not docpath:
        # an empty path is as useless as a missing one
        logging.error("getTextInfo: no textURLPath!")
        return docinfo

    # we need to set a result set size
    pagesize = 10000
    firstPage = 1
    # fetch docinfo
    pagexml = self.getServerData("doc-info.xql", "document=%s&info=%s&pageSize=%s&pn=%s" % (docpath, mode, pagesize, firstPage))
    dom = ET.fromstring(pagexml)
    # all info in tag <document>
    doc = dom.find("document")
    if doc is None:
        logging.error("getTextInfo: unable to find document-tag!")
        return docinfo

    # count tags that are copied 1:1 into docinfo (tag name -> docinfo key)
    simpleCounts = {
        'countFigureEntries': 'numFigureEntries',
        'countTocEntries': 'numTocEntries',
        'countPlaces': 'numPlaces',
    }
    # toc-entry children that are stored as text under their own tag name
    tocTextFields = ('level', 'content', 'level-string', 'real-level')

    # go through all child elements
    for tag in doc:
        name = tag.tag
        if name == 'countPages':
            # numTextPages is only stored when positive
            np = getInt(tag.text)
            if np > 0:
                docinfo['numTextPages'] = np

        elif name in simpleCounts:
            docinfo[simpleCounts[name]] = getInt(tag.text)

        elif name == 'pageNumbers':
            # contains tags with page numbers
            # <pn><n>4</n><no>4</no><non/></pn>
            # n=scan number, no=original page no, non=normalized original page no
            # pageNumbers is a dict indexed by scan number
            pages = {}
            for pageTag in tag:
                page = {}
                n = 0
                for p in pageTag:
                    if p.tag == 'n':
                        n = getInt(p.text)
                        page['pn'] = n
                    elif p.tag in ('no', 'non'):
                        page[p.tag] = p.text

                # entries without a positive scan number are dropped
                if n > 0:
                    pages[n] = page

            docinfo['pageNumbers'] = pages

        elif name == 'toc':
            # contains tags with table of contents/figures
            # <toc-entry><page>13</page><level>3</level><content>Chapter I</content><level-string>1.</level-string><real-level>1</real-level></toc-entry>
            tocs = []
            for entry in tag:
                toc = {}
                for t in entry:
                    if t.tag == 'page':
                        toc['pn'] = getInt(t.text)
                    elif t.tag in tocTextFields:
                        toc[t.tag] = t.text

                tocs.append(toc)

            # save as full_toc/full_figures
            docinfo['full_%s' % mode] = tocs

    return docinfo
7d7b639d7be7 add methods to use doc-info.xql.
casties
parents: 513
diff changeset
189
7d7b639d7be7 add methods to use doc-info.xql.
casties
parents: 513
diff changeset
190
455
0a53fea83df7 more work renovating
casties
parents: 453
diff changeset
def processPageInfo(self, dom, docinfo, pageinfo):
    """Copy page metadata from the pageMeta div of dom into pageinfo.

    The first div child of dom must carry class="pageMeta"; otherwise an
    error is logged and nothing is stored. The text of its children with
    class pageNumberOrig, pageNumberOrigNorm or pageHeaderTitle is saved
    in pageinfo under the class name. docinfo is accepted but not
    touched here.
    """
    # assume first second level div is pageMeta
    metaDiv = dom.find("div")
    isPageMeta = metaDiv is not None and metaDiv.get('class', '') == 'pageMeta'
    if not isPageMeta:
        logging.error("processPageInfo: pageMeta div not found!")
        return

    wanted = ('pageNumberOrig', 'pageNumberOrigNorm', 'pageHeaderTitle')
    for child in metaDiv:
        cls = child.get('class')
        if cls in wanted:
            pageinfo[cls] = child.text

    return
0a53fea83df7 more work renovating
casties
parents: 453
diff changeset
217
388
0265ab93716a *** empty log message ***
abukhman
parents: 386
diff changeset
218
471
415a7026eeda split viewMode in viewMode and viewType
casties
parents: 469
diff changeset
219 def getTextPage(self, mode="text", pn=1, docinfo=None, pageinfo=None):
129
9404b6c37920 more modular version with separate object MpdlXmlTextServer
casties
parents:
diff changeset
220 """returns single page from fulltext"""
508
d5a47f82e755 more cleanup.
casties
parents: 506
diff changeset
221
455
0a53fea83df7 more work renovating
casties
parents: 453
diff changeset
222 logging.debug("getTextPage mode=%s, pn=%s"%(mode,pn))
478
cd37d6f8d5e8 more cleanup
casties
parents: 477
diff changeset
223 # check for cached text -- but ideally this shouldn't be called twice
455
0a53fea83df7 more work renovating
casties
parents: 453
diff changeset
224 if pageinfo.has_key('textPage'):
0a53fea83df7 more work renovating
casties
parents: 453
diff changeset
225 logging.debug("getTextPage: using cached text")
0a53fea83df7 more work renovating
casties
parents: 453
diff changeset
226 return pageinfo['textPage']
0a53fea83df7 more work renovating
casties
parents: 453
diff changeset
227
530
5c7433c2515c fix problems with texttool/text
casties
parents: 529
diff changeset
228 docpath = docinfo.get('textURLPath', None)
5c7433c2515c fix problems with texttool/text
casties
parents: 529
diff changeset
229 if not docpath:
5c7433c2515c fix problems with texttool/text
casties
parents: 529
diff changeset
230 return None
5c7433c2515c fix problems with texttool/text
casties
parents: 529
diff changeset
231
455
0a53fea83df7 more work renovating
casties
parents: 453
diff changeset
232 # just checking
0a53fea83df7 more work renovating
casties
parents: 453
diff changeset
233 if pageinfo['current'] != pn:
0a53fea83df7 more work renovating
casties
parents: 453
diff changeset
234 logging.warning("getTextPage: current!=pn!")
0a53fea83df7 more work renovating
casties
parents: 453
diff changeset
235
0a53fea83df7 more work renovating
casties
parents: 453
diff changeset
236 # stuff for constructing full urls
478
cd37d6f8d5e8 more cleanup
casties
parents: 477
diff changeset
237 selfurl = docinfo['viewerUrl']
511
551ca1641a5e more cleanup.
casties
parents: 510
diff changeset
238 textParams = {'document': docpath,
551ca1641a5e more cleanup.
casties
parents: 510
diff changeset
239 'pn': pn}
551ca1641a5e more cleanup.
casties
parents: 510
diff changeset
240 if 'characterNormalization' in pageinfo:
551ca1641a5e more cleanup.
casties
parents: 510
diff changeset
241 textParams['characterNormalization'] = pageinfo['characterNormalization']
455
0a53fea83df7 more work renovating
casties
parents: 453
diff changeset
242
508
d5a47f82e755 more cleanup.
casties
parents: 506
diff changeset
243 if not mode:
d5a47f82e755 more cleanup.
casties
parents: 506
diff changeset
244 # default is dict
d5a47f82e755 more cleanup.
casties
parents: 506
diff changeset
245 mode = 'text'
d5a47f82e755 more cleanup.
casties
parents: 506
diff changeset
246
d5a47f82e755 more cleanup.
casties
parents: 506
diff changeset
247 modes = mode.split(',')
d5a47f82e755 more cleanup.
casties
parents: 506
diff changeset
248 # check for multiple layers
d5a47f82e755 more cleanup.
casties
parents: 506
diff changeset
249 if len(modes) > 1:
d5a47f82e755 more cleanup.
casties
parents: 506
diff changeset
250 logging.debug("getTextPage: more than one mode=%s"%mode)
527
652cc8d3f1a9 fixed bug with text when layer is only annotator.
casties
parents: 519
diff changeset
251
508
d5a47f82e755 more cleanup.
casties
parents: 506
diff changeset
252 # search mode
d5a47f82e755 more cleanup.
casties
parents: 506
diff changeset
253 if 'search' in modes:
d5a47f82e755 more cleanup.
casties
parents: 506
diff changeset
254 # add highlighting
d5a47f82e755 more cleanup.
casties
parents: 506
diff changeset
255 highlightQuery = pageinfo.get('highlightQuery', None)
511
551ca1641a5e more cleanup.
casties
parents: 510
diff changeset
256 if highlightQuery:
551ca1641a5e more cleanup.
casties
parents: 510
diff changeset
257 textParams['highlightQuery'] = highlightQuery
551ca1641a5e more cleanup.
casties
parents: 510
diff changeset
258 textParams['highlightElement'] = pageinfo.get('highlightElement', '')
551ca1641a5e more cleanup.
casties
parents: 510
diff changeset
259 textParams['highlightElementPos'] = pageinfo.get('highlightElementPos', '')
508
d5a47f82e755 more cleanup.
casties
parents: 506
diff changeset
260
511
551ca1641a5e more cleanup.
casties
parents: 510
diff changeset
261 # ignore mode in the following
508
d5a47f82e755 more cleanup.
casties
parents: 506
diff changeset
262 modes.remove('search')
d5a47f82e755 more cleanup.
casties
parents: 506
diff changeset
263
551
f558624d3f73 add attributes for pundit.
casties
parents: 544
diff changeset
264 # pundit mode
f558624d3f73 add attributes for pundit.
casties
parents: 544
diff changeset
265 punditMode = False
f558624d3f73 add attributes for pundit.
casties
parents: 544
diff changeset
266 if 'pundit' in modes:
f558624d3f73 add attributes for pundit.
casties
parents: 544
diff changeset
267 punditMode = True
f558624d3f73 add attributes for pundit.
casties
parents: 544
diff changeset
268 # ignore mode in the following
f558624d3f73 add attributes for pundit.
casties
parents: 544
diff changeset
269 modes.remove('pundit')
f558624d3f73 add attributes for pundit.
casties
parents: 544
diff changeset
270
508
d5a47f82e755 more cleanup.
casties
parents: 506
diff changeset
271 # other modes don't combine
d5a47f82e755 more cleanup.
casties
parents: 506
diff changeset
272 if 'dict' in modes:
473
74e9e74277e9 smaller improvements
casties
parents: 471
diff changeset
273 # dict is called textPollux in the backend
508
d5a47f82e755 more cleanup.
casties
parents: 506
diff changeset
274 textmode = 'textPollux'
529
f0e28d31ebc6 fixed bug with viewmode xml.
casties
parents: 527
diff changeset
275 elif 'xml' in modes:
f0e28d31ebc6 fixed bug with viewmode xml.
casties
parents: 527
diff changeset
276 # xml mode
f0e28d31ebc6 fixed bug with viewmode xml.
casties
parents: 527
diff changeset
277 textmode = 'xml'
f0e28d31ebc6 fixed bug with viewmode xml.
casties
parents: 527
diff changeset
278 textParams['characterNormalization'] = 'orig'
544
6c529ec1b295 added gis mode back.
casties
parents: 530
diff changeset
279 elif 'gis' in modes:
6c529ec1b295 added gis mode back.
casties
parents: 530
diff changeset
280 textmode = 'gis'
527
652cc8d3f1a9 fixed bug with text when layer is only annotator.
casties
parents: 519
diff changeset
281 else:
508
d5a47f82e755 more cleanup.
casties
parents: 506
diff changeset
282 # text is default mode
d5a47f82e755 more cleanup.
casties
parents: 506
diff changeset
283 textmode = 'text'
453
beb7ccb92564 first version using elementtree instead of 4suite xml
casties
parents: 407
diff changeset
284
511
551ca1641a5e more cleanup.
casties
parents: 510
diff changeset
285 textParams['mode'] = textmode
453
beb7ccb92564 first version using elementtree instead of 4suite xml
casties
parents: 407
diff changeset
286
455
0a53fea83df7 more work renovating
casties
parents: 453
diff changeset
287 # fetch the page
511
551ca1641a5e more cleanup.
casties
parents: 510
diff changeset
288 pagexml = self.getServerData("page-fragment.xql",urllib.urlencode(textParams))
453
beb7ccb92564 first version using elementtree instead of 4suite xml
casties
parents: 407
diff changeset
289 dom = ET.fromstring(pagexml)
455
0a53fea83df7 more work renovating
casties
parents: 453
diff changeset
290 # extract additional info
0a53fea83df7 more work renovating
casties
parents: 453
diff changeset
291 self.processPageInfo(dom, docinfo, pageinfo)
0a53fea83df7 more work renovating
casties
parents: 453
diff changeset
292 # page content is in <div class="pageContent">
0a53fea83df7 more work renovating
casties
parents: 453
diff changeset
293 pagediv = None
0a53fea83df7 more work renovating
casties
parents: 453
diff changeset
294 # ElementTree 1.2 in Python 2.6 can't do div[@class='pageContent']
469
15394486ab75 working with new templates
casties
parents: 465
diff changeset
295 # so we look at the second level divs
554
c56bc63436de fixed problem with empty div tags in fulltext.
casties
parents: 551
diff changeset
296 alldivs = dom.findall('div')
453
beb7ccb92564 first version using elementtree instead of 4suite xml
casties
parents: 407
diff changeset
297 for div in alldivs:
beb7ccb92564 first version using elementtree instead of 4suite xml
casties
parents: 407
diff changeset
298 dc = div.get('class')
455
0a53fea83df7 more work renovating
casties
parents: 453
diff changeset
299 # page content div
453
beb7ccb92564 first version using elementtree instead of 4suite xml
casties
parents: 407
diff changeset
300 if dc == 'pageContent':
beb7ccb92564 first version using elementtree instead of 4suite xml
casties
parents: 407
diff changeset
301 pagediv = div
beb7ccb92564 first version using elementtree instead of 4suite xml
casties
parents: 407
diff changeset
302 break
beb7ccb92564 first version using elementtree instead of 4suite xml
casties
parents: 407
diff changeset
303
129
9404b6c37920 more modular version with separate object MpdlXmlTextServer
casties
parents:
diff changeset
304 # plain text mode
508
d5a47f82e755 more cleanup.
casties
parents: 506
diff changeset
305 if textmode == "text":
478
cd37d6f8d5e8 more cleanup
casties
parents: 477
diff changeset
306 # get full url assuming documentViewer is parent
cd37d6f8d5e8 more cleanup
casties
parents: 477
diff changeset
307 selfurl = self.getLink()
473
74e9e74277e9 smaller improvements
casties
parents: 471
diff changeset
308 if pagediv is not None:
551
f558624d3f73 add attributes for pundit.
casties
parents: 544
diff changeset
309 if punditMode:
f558624d3f73 add attributes for pundit.
casties
parents: 544
diff changeset
310 pagediv = self.addPunditAttributes(pagediv, pageinfo, docinfo)
f558624d3f73 add attributes for pundit.
casties
parents: 544
diff changeset
311
554
c56bc63436de fixed problem with empty div tags in fulltext.
casties
parents: 551
diff changeset
312 # fix empty div tags
c56bc63436de fixed problem with empty div tags in fulltext.
casties
parents: 551
diff changeset
313 divs = pagediv.findall('.//div')
c56bc63436de fixed problem with empty div tags in fulltext.
casties
parents: 551
diff changeset
314 for d in divs:
c56bc63436de fixed problem with empty div tags in fulltext.
casties
parents: 551
diff changeset
315 if len(d) == 0 and not d.text:
c56bc63436de fixed problem with empty div tags in fulltext.
casties
parents: 551
diff changeset
316 # make empty divs non-empty
c56bc63436de fixed problem with empty div tags in fulltext.
casties
parents: 551
diff changeset
317 d.text = ' '
c56bc63436de fixed problem with empty div tags in fulltext.
casties
parents: 551
diff changeset
318
551
f558624d3f73 add attributes for pundit.
casties
parents: 544
diff changeset
319 # check all a-tags
554
c56bc63436de fixed problem with empty div tags in fulltext.
casties
parents: 551
diff changeset
320 links = pagediv.findall('.//a')
129
9404b6c37920 more modular version with separate object MpdlXmlTextServer
casties
parents:
diff changeset
321 for l in links:
453
beb7ccb92564 first version using elementtree instead of 4suite xml
casties
parents: 407
diff changeset
322 href = l.get('href')
beb7ccb92564 first version using elementtree instead of 4suite xml
casties
parents: 407
diff changeset
323 if href and href.startswith('#note-'):
478
cd37d6f8d5e8 more cleanup
casties
parents: 477
diff changeset
324 href = href.replace('#note-',"%s#note-"%selfurl)
453
beb7ccb92564 first version using elementtree instead of 4suite xml
casties
parents: 407
diff changeset
325 l.set('href', href)
455
0a53fea83df7 more work renovating
casties
parents: 453
diff changeset
326
453
beb7ccb92564 first version using elementtree instead of 4suite xml
casties
parents: 407
diff changeset
327 return serialize(pagediv)
beb7ccb92564 first version using elementtree instead of 4suite xml
casties
parents: 407
diff changeset
328
129
9404b6c37920 more modular version with separate object MpdlXmlTextServer
casties
parents:
diff changeset
329 # text-with-links mode
508
d5a47f82e755 more cleanup.
casties
parents: 506
diff changeset
330 elif textmode == "textPollux":
473
74e9e74277e9 smaller improvements
casties
parents: 471
diff changeset
331 if pagediv is not None:
478
cd37d6f8d5e8 more cleanup
casties
parents: 477
diff changeset
332 viewerurl = docinfo['viewerUrl']
cd37d6f8d5e8 more cleanup
casties
parents: 477
diff changeset
333 selfurl = self.getLink()
551
f558624d3f73 add attributes for pundit.
casties
parents: 544
diff changeset
334 if punditMode:
f558624d3f73 add attributes for pundit.
casties
parents: 544
diff changeset
335 pagediv = self.addPunditAttributes(pagediv, pageinfo, docinfo)
f558624d3f73 add attributes for pundit.
casties
parents: 544
diff changeset
336
554
c56bc63436de fixed problem with empty div tags in fulltext.
casties
parents: 551
diff changeset
337 # fix empty div tags
c56bc63436de fixed problem with empty div tags in fulltext.
casties
parents: 551
diff changeset
338 divs = pagediv.findall('.//div')
c56bc63436de fixed problem with empty div tags in fulltext.
casties
parents: 551
diff changeset
339 for d in divs:
c56bc63436de fixed problem with empty div tags in fulltext.
casties
parents: 551
diff changeset
340 if len(d) == 0 and not d.text:
c56bc63436de fixed problem with empty div tags in fulltext.
casties
parents: 551
diff changeset
341 # make empty divs non-empty
c56bc63436de fixed problem with empty div tags in fulltext.
casties
parents: 551
diff changeset
342 d.text = ' '
c56bc63436de fixed problem with empty div tags in fulltext.
casties
parents: 551
diff changeset
343
129
9404b6c37920 more modular version with separate object MpdlXmlTextServer
casties
parents:
diff changeset
344 # check all a-tags
453
beb7ccb92564 first version using elementtree instead of 4suite xml
casties
parents: 407
diff changeset
345 links = pagediv.findall(".//a")
129
9404b6c37920 more modular version with separate object MpdlXmlTextServer
casties
parents:
diff changeset
346 for l in links:
453
beb7ccb92564 first version using elementtree instead of 4suite xml
casties
parents: 407
diff changeset
347 href = l.get('href')
beb7ccb92564 first version using elementtree instead of 4suite xml
casties
parents: 407
diff changeset
348
beb7ccb92564 first version using elementtree instead of 4suite xml
casties
parents: 407
diff changeset
349 if href:
129
9404b6c37920 more modular version with separate object MpdlXmlTextServer
casties
parents:
diff changeset
350 # is link with href
511
551ca1641a5e more cleanup.
casties
parents: 510
diff changeset
351 linkurl = urlparse.urlparse(href)
551ca1641a5e more cleanup.
casties
parents: 510
diff changeset
352 #logging.debug("getTextPage: linkurl=%s"%repr(linkurl))
551ca1641a5e more cleanup.
casties
parents: 510
diff changeset
353 if linkurl.path.endswith('GetDictionaryEntries'):
551ca1641a5e more cleanup.
casties
parents: 510
diff changeset
354 #TODO: replace wordInfo page
478
cd37d6f8d5e8 more cleanup
casties
parents: 477
diff changeset
355 # is dictionary link - change href (keeping parameters)
511
551ca1641a5e more cleanup.
casties
parents: 510
diff changeset
356 #l.set('href', href.replace('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/lt/wordInfo.xql','%s/template/viewer_wordinfo'%viewerurl))
478
cd37d6f8d5e8 more cleanup
casties
parents: 477
diff changeset
357 # add target to open new page
453
beb7ccb92564 first version using elementtree instead of 4suite xml
casties
parents: 407
diff changeset
358 l.set('target', '_blank')
beb7ccb92564 first version using elementtree instead of 4suite xml
casties
parents: 407
diff changeset
359
129
9404b6c37920 more modular version with separate object MpdlXmlTextServer
casties
parents:
diff changeset
360 if href.startswith('#note-'):
478
cd37d6f8d5e8 more cleanup
casties
parents: 477
diff changeset
361 # note link
cd37d6f8d5e8 more cleanup
casties
parents: 477
diff changeset
362 l.set('href', href.replace('#note-',"%s#note-"%selfurl))
453
beb7ccb92564 first version using elementtree instead of 4suite xml
casties
parents: 407
diff changeset
363
beb7ccb92564 first version using elementtree instead of 4suite xml
casties
parents: 407
diff changeset
364 return serialize(pagediv)
beb7ccb92564 first version using elementtree instead of 4suite xml
casties
parents: 407
diff changeset
365
455
0a53fea83df7 more work renovating
casties
parents: 453
diff changeset
366 # xml mode
508
d5a47f82e755 more cleanup.
casties
parents: 506
diff changeset
367 elif textmode == "xml":
473
74e9e74277e9 smaller improvements
casties
parents: 471
diff changeset
368 if pagediv is not None:
455
0a53fea83df7 more work renovating
casties
parents: 453
diff changeset
369 return serialize(pagediv)
0a53fea83df7 more work renovating
casties
parents: 453
diff changeset
370
544
6c529ec1b295 added gis mode back.
casties
parents: 530
diff changeset
371 # pureXml mode WTF?
508
d5a47f82e755 more cleanup.
casties
parents: 506
diff changeset
372 elif textmode == "pureXml":
473
74e9e74277e9 smaller improvements
casties
parents: 471
diff changeset
373 if pagediv is not None:
455
0a53fea83df7 more work renovating
casties
parents: 453
diff changeset
374 return serialize(pagediv)
0a53fea83df7 more work renovating
casties
parents: 453
diff changeset
375
0a53fea83df7 more work renovating
casties
parents: 453
diff changeset
376 # gis mode
508
d5a47f82e755 more cleanup.
casties
parents: 506
diff changeset
377 elif textmode == "gis":
473
74e9e74277e9 smaller improvements
casties
parents: 471
diff changeset
378 if pagediv is not None:
554
c56bc63436de fixed problem with empty div tags in fulltext.
casties
parents: 551
diff changeset
379 # fix empty div tags
c56bc63436de fixed problem with empty div tags in fulltext.
casties
parents: 551
diff changeset
380 divs = pagediv.findall('.//div')
c56bc63436de fixed problem with empty div tags in fulltext.
casties
parents: 551
diff changeset
381 for d in divs:
c56bc63436de fixed problem with empty div tags in fulltext.
casties
parents: 551
diff changeset
382 if len(d) == 0 and not d.text:
c56bc63436de fixed problem with empty div tags in fulltext.
casties
parents: 551
diff changeset
383 # make empty divs non-empty
c56bc63436de fixed problem with empty div tags in fulltext.
casties
parents: 551
diff changeset
384 d.text = ' '
c56bc63436de fixed problem with empty div tags in fulltext.
casties
parents: 551
diff changeset
385
455
0a53fea83df7 more work renovating
casties
parents: 453
diff changeset
386 # check all a-tags
0a53fea83df7 more work renovating
casties
parents: 453
diff changeset
387 links = pagediv.findall(".//a")
506
67014399894d cleaned out all 4suite code and weird methods.
casties
parents: 501
diff changeset
388 # add our URL as backlink
67014399894d cleaned out all 4suite code and weird methods.
casties
parents: 501
diff changeset
389 selfurl = self.getLink()
67014399894d cleaned out all 4suite code and weird methods.
casties
parents: 501
diff changeset
390 doc = base64.b64encode(selfurl)
455
0a53fea83df7 more work renovating
casties
parents: 453
diff changeset
391 for l in links:
0a53fea83df7 more work renovating
casties
parents: 453
diff changeset
392 href = l.get('href')
0a53fea83df7 more work renovating
casties
parents: 453
diff changeset
393 if href:
506
67014399894d cleaned out all 4suite code and weird methods.
casties
parents: 501
diff changeset
394 if href.startswith('http://mappit.mpiwg-berlin.mpg.de'):
67014399894d cleaned out all 4suite code and weird methods.
casties
parents: 501
diff changeset
395 l.set('href', re.sub(r'doc=[\w+/=]+', 'doc=%s'%doc, href))
67014399894d cleaned out all 4suite code and weird methods.
casties
parents: 501
diff changeset
396 l.set('target', '_blank')
455
0a53fea83df7 more work renovating
casties
parents: 453
diff changeset
397
0a53fea83df7 more work renovating
casties
parents: 453
diff changeset
398 return serialize(pagediv)
0a53fea83df7 more work renovating
casties
parents: 453
diff changeset
399
501
29c6d09a506c more cleanup.
casties
parents: 482
diff changeset
400 return None
453
beb7ccb92564 first version using elementtree instead of 4suite xml
casties
parents: 407
diff changeset
401
551
f558624d3f73 add attributes for pundit.
casties
parents: 544
diff changeset
def addPunditAttributes(self, pagediv, pageinfo, docinfo):
    """Add the attributes the Pundit annotation tool looks for.

    Every <div> below pagediv that carries a non-empty 'id' attribute gets
    an 'about' attribute (a URL built from the text id, the page number
    and the div id) and has 'pundit-content' appended to its 'class'
    attribute.  Divs without an id are left untouched.

    pagediv  -- ElementTree element that is searched (modified in place)
    pageinfo -- dict; 'pn' is read as page number (default '1')
    docinfo  -- dict; 'DRI' is used as text id, falling back to
                "fn=" + docinfo['documentPath'] (or "fn=???")

    Returns the same pagediv element.
    """
    fallbackTextId = "fn=%s" % docinfo.get('documentPath', '???')
    textid = docinfo.get('DRI', fallbackTextId)
    pn = pageinfo.get('pn', '1')
    # TODO: use pn as well?
    for elem in pagediv.findall(".//div"):
        divId = elem.get('id')
        if not divId:
            # only identified divs can be addressed by an annotation
            continue

        elem.set('about', "http://echo.mpiwg-berlin.mpg.de/%s/pn=%s/#%s" % (textid, pn, divId))
        # append to any existing classes; strip() covers the no-class case
        elem.set('class', (elem.get('class', '') + ' pundit-content').strip())

    return pagediv
455
0a53fea83df7 more work renovating
casties
parents: 453
diff changeset
418
509
9d05befdd462 try to get characterNormalization in search result working.
casties
parents: 508
diff changeset
def getSearchResults(self, mode, query=None, pageinfo=None, docinfo=None):
    """Load the full list of search results and store it in docinfo.

    mode     -- query type sent to the server as 'queryType'; the special
                value "none" returns docinfo unchanged
    query    -- query string
    pageinfo -- dict; 'characterNormalization' is read (default 'reg')
    docinfo  -- dict; 'textURLPath' names the document.  Results are
                written to 'resultXML' (serialized queryResultPage div)
                and 'resultSize' (content of the queryResultHits div);
                'cachedQuery' remembers which query they belong to.

    Returns docinfo.  If docinfo already holds results for the same mode
    and query, the server is not contacted again.
    """
    logging.debug("getSearchResults mode=%s query=%s"%(mode, query))
    if mode == "none":
        return docinfo

    queryKey = '%s_%s'%(mode,query)
    cachedQuery = docinfo.get('cachedQuery', None)
    if cachedQuery is not None:
        if cachedQuery == queryKey:
            # same query: results are already in docinfo
            return docinfo

        # different query: drop everything belonging to the old one.
        # pop() instead of del because an earlier query may have stored
        # no results at all (no hits div, no result div, failed fetch).
        for staleKey in ('cachedQuery', 'resultSize', 'resultXML'):
            docinfo.pop(staleKey, None)

    if pageinfo is None:
        pageinfo = {}

    # fetch full results
    docpath = docinfo['textURLPath']
    params = {'document': docpath,
              'mode': 'text',
              'queryType': mode,
              'query': query,
              'queryResultPageSize': 1000,
              'queryResultPN': 1,
              'characterNormalization': pageinfo.get('characterNormalization', 'reg')}
    pagexml = self.getServerData("doc-query.xql",urllib.urlencode(params))
    dom = ET.fromstring(pagexml)
    # page content is in <div class="queryResultPage">
    pagediv = None
    # ElementTree 1.2 in Python 2.6 can't do div[@class='queryResultPage']
    # so we look at the class of every first-level div
    for div in dom.findall("div"):
        dc = div.get('class')
        if dc == 'queryResultPage':
            # page content div
            pagediv = div

        elif dc == 'queryResultHits':
            docinfo['resultSize'] = getInt(div.text)

    if pagediv is not None:
        # store XML in docinfo
        docinfo['resultXML'] = ET.tostring(pagediv, 'UTF-8')

    # remember the query only after fetching and parsing succeeded, so a
    # failed request is retried instead of being treated as cached
    docinfo['cachedQuery'] = queryKey
    return docinfo
d5a47f82e755 more cleanup.
casties
parents: 506
diff changeset
471
d5a47f82e755 more cleanup.
casties
parents: 506
diff changeset
472
d5a47f82e755 more cleanup.
casties
parents: 506
diff changeset
def getResultsPage(self, mode="text", query=None, pn=None, start=None, size=None, pageinfo=None, docinfo=None):
    """Return a single page of the search results as serialized XML.

    mode, query -- passed on to getSearchResults
    pn          -- result page number, counted from 1; only used when
                   start is not given (missing pn means first page)
    start       -- number of the first hit on the page, counted from 1
    size        -- hits per page (default pageinfo['resultPageSize'],
                   else 10)
    pageinfo    -- dict with page settings
    docinfo     -- dict; search results are read from 'resultXML'

    The href of every link in the page is replaced by the value of
    self.getLink() called with the link's 'pn' and highlight parameters.

    Returns the serialized page, or the string "Error: no result!" when
    docinfo holds no result XML.
    """
    logging.debug("getResultsPage mode=%s, pn=%s"%(mode,pn))
    # get (cached) result
    self.getSearchResults(mode=mode, query=query, pageinfo=pageinfo, docinfo=docinfo)

    resultxml = docinfo.get('resultXML', None)
    if not resultxml:
        logging.error("getResultPage: unable to find resultXML")
        return "Error: no result!"

    if size is None:
        size = (pageinfo or {}).get('resultPageSize', 10)

    if start is None:
        # start counts from 1, so page 1 starts at hit 1 (not 0)
        start = ((pn or 1) - 1) * size + 1

    # fromstring() either returns an element or raises
    tocdivs = ET.fromstring(resultxml)

    # paginate: drop the hits before and after the requested window.
    # A negative index would count from the end, so clamp at 0.
    first = max(start - 1, 0)
    del tocdivs[:first]
    del tocdivs[size:]

    # check all a-tags
    for link in tocdivs.findall(".//a"):
        href = link.get('href')
        if not href:
            continue

        # assume all links go to pages
        linkUrl = urlparse.urlparse(href)
        linkParams = urlparse.parse_qs(linkUrl.query)
        # take some parameters (parse_qs values are lists)
        params = {'pn': linkParams['pn'],
                  'highlightQuery': linkParams.get('highlightQuery',''),
                  'highlightElement': linkParams.get('highlightElement',''),
                  'highlightElementPos': linkParams.get('highlightElementPos','')
                  }
        link.set('href', self.getLink(params=params))

    return serialize(tocdivs)
d5a47f82e755 more cleanup.
casties
parents: 506
diff changeset
520
d5a47f82e755 more cleanup.
casties
parents: 506
diff changeset
521
518
91051b36b9cc uses xml info from doc-info.xql for table of contents now.
casties
parents: 517
diff changeset
def getToc(self, mode='text', docinfo=None):
    """Return the complete table-of-contents list kept in docinfo.

    mode    -- kind of table; 'text' is looked up as query type 'toc',
               any other value is used as query type unchanged
    docinfo -- dict; the list is read from key 'full_<queryType>'

    If the key is missing, docinfo is replaced by the result of
    self.getTextInfo(queryType, docinfo) (presumably filling in the
    key -- verify against getTextInfo).  Returns an empty list if the
    key is still missing afterwards.
    """
    logging.debug("getToc mode=%s"%mode)
    queryType = 'toc' if mode == 'text' else mode
    tocKey = 'full_%s'%queryType

    if tocKey not in docinfo:
        # get new toc
        docinfo = self.getTextInfo(queryType, docinfo)

    return docinfo.get(tocKey, [])
91051b36b9cc uses xml info from doc-info.xql for table of contents now.
casties
parents: 517
diff changeset
535
91051b36b9cc uses xml info from doc-info.xql for table of contents now.
casties
parents: 517
diff changeset
def getTocPage(self, mode='text', pn=None, start=None, size=None, pageinfo=None, docinfo=None):
    """Return a single page of the table of contents as an HTML string.

    mode     -- kind of table, passed on to getToc
    pn       -- toc page number, counted from 1; only used when start is
                not given (missing pn means first page)
    start    -- number of the first entry on the page, counted from 1
    size     -- entries per page (default pageinfo['tocPageSize'],
                else 30)
    pageinfo -- dict with page settings
    docinfo  -- dict, passed on to getToc

    Each entry is rendered from its 'level-string', 'content' and 'pn'
    values, with a link obtained from self.getLink('pn', <pn>).

    Returns the HTML, or the string "Error: no table of contents!" when
    getToc returns nothing.
    """
    logging.debug("getTocPage mode=%s, pn=%s start=%s size=%s"%(mode,repr(pn),repr(start),repr(size)))
    fulltoc = self.getToc(mode=mode, docinfo=docinfo)
    if len(fulltoc) < 1:
        logging.error("getTocPage: unable to find toc!")
        return "Error: no table of contents!"

    if size is None:
        size = (pageinfo or {}).get('tocPageSize', 30)

    if start is None:
        # start counts from 1, so page 1 starts at entry 1 (not 0)
        start = ((pn or 1) - 1) * size + 1

    # paginate; a negative index would count from the end, so clamp at 0
    first = max(start - 1, 0)
    last = first + size

    parts = ['<div>']
    for toc in fulltoc[first:last]:
        pageurl = self.getLink('pn', toc['pn'])
        parts.append('<div class="tocline">')
        parts.append('<div class="toc name">[%s %s]</div>'%(toc['level-string'], toc['content']))
        parts.append('<div class="toc float right page"><a href="%s">Page: %s</a></div>'%(pageurl, toc['pn']))
        parts.append('</div>\n')

    parts.append('</div>\n')

    return ''.join(parts)
519
9a3cc3732194 uses xml from doc-info.xql for table of contents now.
casties
parents: 518
diff changeset
565
129
9404b6c37920 more modular version with separate object MpdlXmlTextServer
casties
parents:
diff changeset
566
453
beb7ccb92564 first version using elementtree instead of 4suite xml
casties
parents: 407
diff changeset
def manage_changeMpdlXmlTextServer(self, title="", serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/", timeout=40, RESPONSE=None):
    """Change the settings of this object.

    Stores title, timeout and serverUrl as attributes of self. When a
    RESPONSE object is passed in, it is redirected to 'manage_main'
    after the attributes have been set.
    """
    # Same assignment order as before: title, timeout, serverUrl.
    self.title, self.timeout, self.serverUrl = title, timeout, serverUrl
    if RESPONSE is None:
        # No response object supplied; nothing to redirect.
        return
    RESPONSE.redirect('manage_main')
9404b6c37920 more modular version with separate object MpdlXmlTextServer
casties
parents:
diff changeset
574
9404b6c37920 more modular version with separate object MpdlXmlTextServer
casties
parents:
diff changeset
575 # management methods
9404b6c37920 more modular version with separate object MpdlXmlTextServer
casties
parents:
diff changeset
def manage_addMpdlXmlTextServerForm(self):
    """Render the form for adding an MpdlXmlTextServer.

    Loads the page template "zpt/manage_addMpdlXmlTextServer", wraps it
    in the context of self and returns the result of calling it.
    """
    template = PageTemplateFile("zpt/manage_addMpdlXmlTextServer", globals())
    bound_template = template.__of__(self)
    return bound_template()
9404b6c37920 more modular version with separate object MpdlXmlTextServer
casties
parents:
diff changeset
580
453
beb7ccb92564 first version using elementtree instead of 4suite xml
casties
parents: 407
diff changeset
def manage_addMpdlXmlTextServer(self, id, title="", serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/", timeout=40, RESPONSE=None):
    """Add a new MpdlXmlTextServer object.

    id, title, serverUrl and timeout are handed to the MpdlXmlTextServer
    constructor unchanged. The new object is stored under id in the
    container returned by self.Destination(). When a RESPONSE object is
    passed in, it is redirected to 'manage_main' afterwards.
    """
    # NOTE: the parameter name `id` shadows the builtin, but it is part of
    # the public signature and therefore kept as is.
    newObj = MpdlXmlTextServer(id, title, serverUrl, timeout)
    self.Destination()._setObject(id, newObj)
    if RESPONSE is not None:
        RESPONSE.redirect('manage_main')
0a53fea83df7 more work renovating
casties
parents: 453
diff changeset
588
0a53fea83df7 more work renovating
casties
parents: 453
diff changeset
589