Context Navigation

source: documentViewer/documentViewer.py @ 617:7aefbddddaf9

Last change on this file since 617:7aefbddddaf9 was 617:7aefbddddaf9, checked in by dwinter, 10 years ago
alpha of hocr server support
File size: 47.4 KB

Line
1	from OFS.Folder import Folder
2	from Products.PageTemplates.ZopePageTemplate import ZopePageTemplate
3	from Products.PageTemplates.PageTemplateFile import PageTemplateFile
4	from App.ImageFile import ImageFile
5	from AccessControl import ClassSecurityInfo
6	from AccessControl import getSecurityManager
7
8	import xml.etree.ElementTree as ET
9
10	import os
11	import urllib
12	import logging
13	import math
14	import urlparse
15	import json
16
17	from Products.MetaDataProvider import MetaDataFolder
18
19	from SrvTxtUtils import getInt, utf8ify, getText, getHttpData, refreshingImageFileIndexHtml
20
21
def getMDText(node):
    """Return the '@text' content of a MetaDataProvider metadata node.

    node may be:
      * a dict  -- its '@text' entry is returned (None if missing),
      * a list  -- the text of the first entry that carries no '@attr'
                   is returned, or None when every entry has attributes,
      * anything else -- returned unchanged.
    """
    if isinstance(node, dict):
        return node.get('@text', None)

    if isinstance(node, list):
        # more than one text entry: don't choose one that has an attribute
        for nodeInList in node:
            if not isinstance(nodeInList, dict):
                # NOTE(review): assumes a plain (non-dict) entry is its own
                # text, mirroring the fall-through case below -- confirm.
                return nodeInList

            if nodeInList.get('@attr', None) is None:
                # read the text from the entry itself; the enclosing list
                # has no .get() method
                return nodeInList.get('@text', None)

        return None

    return node
40
def getParentPath(path, cnt=1):
    """Return path with its last cnt '/'-separated segments removed.

    Trailing slashes are stripped before the path is split.
    """
    segments = path.rstrip('/').split('/')
    # drop the last cnt segments and glue the remainder back together
    kept = segments[:-cnt]
    return '/'.join(kept)
47
def getPnForPf(docinfo, pf, default=0):
    """Return the image number for image file name pf, or default.

    Looks pf up in docinfo['imgFileNames']; when the full name is unknown
    the lookup is repeated with the file extension cut off.
    """
    if 'imgFileNames' not in docinfo:
        return default

    nameIndex = docinfo['imgFileNames']
    pageNo = nameIndex.get(pf, None)
    if pageNo is not None:
        return pageNo

    # not found under the full name: try again without the extension
    dotPos = pf.rfind('.')
    if dotPos > 0:
        return nameIndex.get(pf[:dotPos], default)

    # no extension to cut
    return default
66
def getPfForPn(docinfo, pn, default=None):
    """Return the image file name for image number pn, or default.

    The lookup uses docinfo['imgFileIndexes']; default is returned when
    that index or the page number is missing.
    """
    if 'imgFileIndexes' not in docinfo:
        return default

    return docinfo['imgFileIndexes'].get(pn, default)
74
75
76	##
77	## documentViewer class
78	##
class documentViewer(Folder):
    """Document viewer.

    Zope Folder holding the page templates, layer definitions and static
    files used to display a document.
    """
    meta_type = "Document viewer"

    security = ClassSecurityInfo()
    manage_options = Folder.manage_options + (
        {'label': 'Configuration', 'action': 'changeDocumentViewerForm'},
        )

    # MetaDataFolder instance
    metadataService = None

    #
    # templates and forms
    #
    # viewMode templates
    viewer_text = PageTemplateFile('zpt/viewer/viewer_text', globals())
    viewer_hocr = PageTemplateFile('zpt/viewer/viewer_hocr', globals())
    viewer_xml = PageTemplateFile('zpt/viewer/viewer_xml', globals())
    viewer_image = PageTemplateFile('zpt/viewer/viewer_image', globals())
    viewer_index = PageTemplateFile('zpt/viewer/viewer_index', globals())
    viewer_thumbs = PageTemplateFile('zpt/viewer/viewer_thumbs', globals())
    viewer_indexonly = PageTemplateFile('zpt/viewer/viewer_indexonly', globals())
    # available layer types per viewMode (annotator not default)
    builtinLayers = {'text': ['dict', 'search', 'gis'],
                     'xml': None, 'image': None, 'index': ['extended']}
    # NOTE: this is the same dict object as builtinLayers, not a copy
    availableLayers = builtinLayers
    # layer templates
    layer_text_dict = PageTemplateFile('zpt/viewer/layer_text_dict', globals())
    layer_text_search = PageTemplateFile('zpt/viewer/layer_text_search', globals())
    layer_text_annotator = PageTemplateFile('zpt/viewer/layer_text_annotator', globals())
    layer_text_gis = PageTemplateFile('zpt/viewer/layer_text_gis', globals())
    layer_text_pundit = PageTemplateFile('zpt/viewer/layer_text_pundit', globals())
    layer_image_annotator = PageTemplateFile('zpt/viewer/layer_image_annotator', globals())
    layer_image_search = PageTemplateFile('zpt/viewer/layer_image_search', globals())
    layer_index_extended = PageTemplateFile('zpt/viewer/layer_index_extended', globals())
    # toc templates
    toc_thumbs = PageTemplateFile('zpt/viewer/toc_thumbs', globals())
    toc_text = PageTemplateFile('zpt/viewer/toc_text', globals())
    toc_figures = PageTemplateFile('zpt/viewer/toc_figures', globals())
    toc_concordance = PageTemplateFile('zpt/viewer/toc_concordance', globals())
    toc_notes = PageTemplateFile('zpt/viewer/toc_notes', globals())
    toc_handwritten = PageTemplateFile('zpt/viewer/toc_handwritten', globals())
    toc_none = PageTemplateFile('zpt/viewer/toc_none', globals())
    # other templates
    common_template = PageTemplateFile('zpt/viewer/common_template', globals())
    info_xml = PageTemplateFile('zpt/viewer/info_xml', globals())
    docuviewer_css = ImageFile('css/docuviewer.css', globals())
    # make docuviewer_css refreshable for development
    docuviewer_css.index_html = refreshingImageFileIndexHtml
    docuviewer_ie_css = ImageFile('css/docuviewer_ie.css', globals())
    # make docuviewer_ie_css refreshable for development (currently disabled)
    #docuviewer_ie_css.index_html = refreshingImageFileIndexHtml
    jquery_js = ImageFile('js/jquery.js', globals())
134
135
def __init__(self,id,imageScalerUrl=None,textServerName=None,title="",digilibBaseUrl=None,thumbcols=2,thumbrows=5,authgroups="mpiwg"):
    """Initialise the document viewer.

    Creates the 'template' sub-folder and tries to populate it with a
    full-text client ('fulltextclient') and a zogiLib instance ('zogilib').
    Failures while creating these helpers are logged, not raised.

    authgroups is a comma-separated string of authorized group names; it
    is stored as a list of stripped, lower-cased names.
    The digilib URLs are only set when digilibBaseUrl is given.
    """
    self.id = id
    self.title = title
    self.thumbcols = thumbcols
    self.thumbrows = thumbrows
    # authgroups is list of authorized groups (delimited by ,)
    self.authgroups = [group.strip().lower() for group in authgroups.split(',')]

    # create template folder so we can always use template.something
    templateFolder = Folder('template')
    self['template'] = templateFolder  # Zope-2.12 style

    try:
        import MpdlXmlTextServer
        textServer = MpdlXmlTextServer.MpdlXmlTextServer(id='fulltextclient',serverName=textServerName)
        templateFolder['fulltextclient'] = textServer
    except Exception as e:
        logging.error("Unable to create MpdlXmlTextServer for fulltextclient: "+str(e))

    try:
        from Products.zogiLib.zogiLib import zogiLib
        zogilib = zogiLib(id="zogilib", title="zogilib for docuviewer", dlServerURL=imageScalerUrl, layout="book")
        templateFolder['zogilib'] = zogilib
    except Exception as e:
        logging.error("Unable to create zogiLib for 'zogilib': "+str(e))

    try:
        # assume MetaDataFolder instance is called metadata
        self.metadataService = getattr(self, 'metadata')
    except Exception as e:
        logging.error("Unable to find MetaDataFolder 'metadata': "+str(e))

    if digilibBaseUrl is not None:
        self.digilibBaseUrl = digilibBaseUrl
        self.digilibScalerUrl = digilibBaseUrl + '/servlet/Scaler'
        self.digilibViewerUrl = digilibBaseUrl + '/jquery/digilib.html'
175
176
177	# proxy text server methods to fulltextclient
def getTextPage(self, **args):
    """Return the full text content of a page.

    Thin proxy: all keyword arguments are forwarded to
    self.template.fulltextclient.getTextPage().
    """
    textServer = self.template.fulltextclient
    return textServer.getTextPage(**args)
182
183
184
185
def getSearchResults(self, **args):
    """Load the list of search results.

    Thin proxy: all keyword arguments are forwarded to
    self.template.fulltextclient.getSearchResults().
    """
    textServer = self.template.fulltextclient
    return textServer.getSearchResults(**args)
189
def getResultsPage(self, **args):
    """Return one page of the search results.

    Thin proxy: all keyword arguments are forwarded to
    self.template.fulltextclient.getResultsPage().
    """
    textServer = self.template.fulltextclient
    return textServer.getResultsPage(**args)
193
def getTextInfo(self, **args):
    """Return document info from the text server.

    Thin proxy: all keyword arguments are forwarded to
    self.template.fulltextclient.getTextInfo().
    """
    textServer = self.template.fulltextclient
    return textServer.getTextInfo(**args)
197
def getToc(self, **args):
    """Load the table of contents.

    Thin proxy: all keyword arguments are forwarded to
    self.template.fulltextclient.getToc().
    """
    textServer = self.template.fulltextclient
    return textServer.getToc(**args)
201
def getTocPage(self, **args):
    """Return one page of the table of contents.

    Thin proxy: all keyword arguments are forwarded to
    self.template.fulltextclient.getTocPage().
    """
    textServer = self.template.fulltextclient
    return textServer.getTocPage(**args)
205
def getRepositoryType(self, **args):
    """Return the repository type.

    Thin proxy: all keyword arguments are forwarded to
    self.template.fulltextclient.getRepositoryType().
    """
    textServer = self.template.fulltextclient
    return textServer.getRepositoryType(**args)
209
def getTextDownloadUrl(self, **args):
    """Return the URL to download the full text.

    Thin proxy: all keyword arguments are forwarded to
    self.template.fulltextclient.getTextDownloadUrl().
    """
    textServer = self.template.fulltextclient
    return textServer.getTextDownloadUrl(**args)
213
def getPlacesOnPage(self, **args):
    """Return the list of gis places on one page.

    Thin proxy: all keyword arguments are forwarded to
    self.template.fulltextclient.getPlacesOnPage().
    """
    textServer = self.template.fulltextclient
    return textServer.getPlacesOnPage(**args)
217
218	# Thumb list for CoolIris Plugin
219	thumbs_main_rss = PageTemplateFile('zpt/thumbs_main_rss', globals())
220	security.declareProtected('View','thumbs_rss')
def thumbs_rss(self,mode,url,viewMode="auto",start=None,pn=1):
    """Render the thumbnail list with the 'thumbs_main_rss' template.

    @param mode: defines how to access the document behind url
    @param url: url which contains display information
    @param viewMode: image: display images, text: display text,
        default is auto (text if the docinfo has a text URL, else image)
    @param start: passed through to getPageinfo
    @param pn: page number, passed through to getPageinfo

    Returns the rendered template, or an error string when the template
    folder is missing.
    """
    if not hasattr(self, 'template'):
        # this won't work
        logging.error("template folder missing!")
        return "ERROR: template folder missing!"

    # digilibBaseUrl is only set by __init__ when a base URL was given, so
    # read it defensively (same guard as index_html)
    # NOTE(review): this fallback host differs from the one used in
    # index_html and getInfo_xml -- confirm which one is current.
    if not getattr(self, 'digilibBaseUrl', None):
        self.digilibBaseUrl = self.findDigilibUrl() or "http://nausikaa.mpiwg-berlin.mpg.de/digitallibrary"

    docinfo = self.getDocinfo(mode=mode, url=url)
    pageinfo = self.getPageinfo(start=start, pn=pn, docinfo=docinfo)
    pt = getattr(self.template, 'thumbs_main_rss')

    if viewMode == "auto":
        # auto mode selected: text if a text URL is set, else images
        if "textURL" in docinfo or docinfo.get('textURLPath', None):
            viewMode = "text"
        else:
            viewMode = "image"

    return pt(docinfo=docinfo, pageinfo=pageinfo, viewMode=viewMode)
251
252
253	security.declareProtected('View','index_html')
def index_html(self, url, mode="texttool", viewMode="auto", viewLayer=None, tocMode=None, start=None, pn=None, pf=None):
    """Show a page of the document.

    @param url: url which contains display information
    @param mode: defines how to access the document behind url
    @param viewMode: 'image': display images, 'text': display text,
        'xml': display xml, 'hocr': hocr format, default is 'auto'
        (text if the document has a text URL path, else image)
    @param viewLayer: sub-type of viewMode, e.g. layer 'dict' for viewMode='text'
    @param tocMode: type of 'table of contents' for navigation
        (thumbs, text, figures, none)
    @param start, pn, pf: passed through to getPageinfo

    Returns the rendered 'viewer_<viewMode>' template, or an error string
    when the template folder or the viewMode template is missing.
    """
    # local import: only needed for the error message below
    from xml.sax.saxutils import escape

    logging.debug("documentViewer(index_html) mode=%s url=%s viewMode=%s viewLayer=%s start=%s pn=%s pf=%s"%(mode,url,viewMode,viewLayer,start,pn,pf))

    if not hasattr(self, 'template'):
        # this won't work
        logging.error("template folder missing!")
        return "ERROR: template folder missing!"

    if not getattr(self, 'digilibBaseUrl', None):
        self.digilibBaseUrl = self.findDigilibUrl() or "http://digilib.mpiwg-berlin.mpg.de/digitallibrary"

    # mode=filepath should not have toc-thumbs
    if tocMode is None:
        if mode == "filepath":
            tocMode = "none"
        else:
            tocMode = "thumbs"

    # docinfo: information about document (cached)
    docinfo = self.getDocinfo(mode=mode, url=url, tocMode=tocMode)

    # userinfo: user settings (cached)
    userinfo = self.getUserinfo()

    # auto viewMode: text if there is a text else images
    if viewMode == "auto":
        if docinfo.get('textURLPath', None):
            # docinfo.get('textURL', None) not implemented yet
            viewMode = "text"
            if viewLayer is None and 'viewLayer' not in userinfo:
                # use layer dict as default
                viewLayer = "dict"
        else:
            viewMode = "image"

    elif viewMode == "text_dict":
        # legacy fix
        viewMode = "text"
        viewLayer = "dict"

    elif viewMode == 'images':
        # legacy fix
        viewMode = 'image'
        self.REQUEST['viewMode'] = 'image'

    # save viewLayer in userinfo
    userinfo['viewLayer'] = viewLayer

    # pageinfo: information about page (not cached)
    pageinfo = self.getPageinfo(start=start, pn=pn, pf=pf, docinfo=docinfo, userinfo=userinfo, viewMode=viewMode, viewLayer=viewLayer, tocMode=tocMode)

    # get template /template/viewer_$viewMode
    pt = getattr(self.template, 'viewer_%s'%viewMode, None)
    if pt is None:
        logging.error("No template for viewMode=%s!"%viewMode)
        # TODO: error page?
        # viewMode is a request parameter: escape it before echoing it
        # back so it cannot inject markup into the response
        return "No template for viewMode=%s!"%escape(str(viewMode))

    # and execute with parameters
    return pt(docinfo=docinfo, pageinfo=pageinfo)
328
def getAvailableLayers(self):
    """Return the dict with the list of available layers per viewMode."""
    layers = self.availableLayers
    return layers
332
def findDigilibUrl(self):
    """Return the digilib base URL reported by template.zogilib."""
    return self.template.zogilib.getDLBaseUrl()
337
def getScalerUrl(self, fn=None, pn=None, dw=100, dh=100, docinfo=None):
    """Return a URL to the digilib Scaler with parameters.

    The base is docinfo['imageURL'] when present (it already carries the
    'fn' parameter); otherwise it is built from self.digilibScalerUrl and
    fn, where fn falls back to docinfo['imagePath'].

    @param fn: image file or directory name
    @param pn: page number, only added when truthy
    @param dw, dh: destination width and height
    @param docinfo: optional docinfo dict
    """
    url = None
    if docinfo is not None:
        url = docinfo.get('imageURL', None)

    if url is None:
        url = self.digilibScalerUrl
        if fn is None and docinfo is not None:
            fn = docinfo.get('imagePath', '')

        if fn is None:
            # nothing known: don't emit the literal string "None"
            fn = ''

        # the scaler URL is stored without query separator (the rest of
        # this class builds "%s?fn=%s"), so add the proper one here
        if '?' not in url:
            url += '?'
        elif not url.endswith(('?', '&')):
            url += '&'

        url += "fn=%s"%fn

    if pn:
        url += "&pn=%s"%pn

    url += "&dw=%s&dh=%s"%(dw,dh)
    return url
356
def getDocumentViewerURL(self):
    """Return the URL of this instance (its absolute_url())."""
    viewerUrl = self.absolute_url()
    return viewerUrl
360
def getStyle(self, idx, selected, style=""):
    """Return style, with 'sel' appended when idx equals selected."""
    return style + 'sel' if idx == selected else style
368
def getParams(self, param=None, val=None, params=None, duplicates=None):
    """Return a dict with URL parameters.

    Starts from a copy of the current request parameters
    (self.REQUEST.form) and applies param=val and/or the dict params.
    A value of None deletes the key.

    @param param: name of a single parameter to change
    @param val: new value for param (stored as str), None removes it
    @param params: dict of further parameters; None values remove keys
    @param duplicates: how to collapse list values (coming from duplicate
        keys): 'comma' joins the non-empty entries with ',', 'first'
        takes the first non-empty entry ('' if there is none); any other
        truthy value leaves lists untouched.
    """
    # copy existing request params
    newParams = self.REQUEST.form.copy()
    # change single param
    if param is not None:
        if val is None:
            newParams.pop(param, None)
        else:
            newParams[param] = str(val)

    # change more params
    if params is not None:
        for (k, v) in params.items():
            if v is None:
                # val=None removes param
                newParams.pop(k, None)
            else:
                newParams[k] = v

    if duplicates:
        # eliminate lists (coming from duplicate keys)
        for (k, v) in list(newParams.items()):
            if not isinstance(v, list):
                continue

            nonEmpty = [t for t in v if t]
            if duplicates == 'comma':
                # make comma-separated list of non-empty entries
                newParams[k] = ','.join(nonEmpty)
            elif duplicates == 'first':
                # take first non-empty entry; a list with only empty
                # entries collapses to '' (same as 'comma') instead of
                # raising IndexError
                newParams[k] = nonEmpty[0] if nonEmpty else ''

    return newParams
407
def getLink(self, param=None, val=None, params=None, baseUrl=None, paramSep='&', duplicates='comma'):
    """Return a URL to the document viewer with changed parameters.

    The parameters come from getParams(param, val, params, duplicates).
    Values are quoted (leaving '/' unescaped) and joined with paramSep;
    baseUrl defaults to the URL of this viewer.
    """
    urlParams = self.getParams(param=param, val=val, params=params, duplicates=duplicates)
    # quote values and assemble into query string (not escaping '/')
    pairs = []
    for (key, value) in urlParams.items():
        pairs.append("%s=%s"%(key, urllib.quote_plus(utf8ify(value), '/')))

    query = paramSep.join(pairs)
    if baseUrl is None:
        baseUrl = self.getDocumentViewerURL()

    return "%s?%s"%(baseUrl, query)
418
def getLinkAmp(self, param=None, val=None, params=None, baseUrl=None, duplicates='comma'):
    """Return a link to the document viewer with param set to val.

    Same as getLink(), but the parameters are separated by the HTML
    entity '&amp;' so the result can be embedded in markup as-is.
    """
    # NOTE(review): with a plain '&' this method was identical to
    # getLink(); the entity was most likely decoded when the file was
    # exported -- confirm against the repository.
    return self.getLink(param=param, val=val, params=params, baseUrl=baseUrl, paramSep='&amp;', duplicates=duplicates)
422
423
def setAvailableLayers(self, newLayerString=None):
    """Set availableLayers from newLayerString or autodetect the layers.

    newLayerString is parsed as JSON and accepted when it is an object
    containing at least the keys 'text' and 'image'. Otherwise (or when
    it is None) the layers are autodetected: starting from builtinLayers,
    every entry of self.template named layer_{m}_{l} adds layer l to
    mode m.
    """
    if newLayerString is not None:
        try:
            layers = json.loads(newLayerString)
        except (ValueError, TypeError):
            # not valid JSON (or not a string at all)
            layers = None

        if isinstance(layers, dict) and 'text' in layers and 'image' in layers:
            self.availableLayers = layers
            return

        logging.error("invalid layers=%s! autodetecting..."%repr(newLayerString))

    # start with builtin layers; copy the lists too so that appending
    # below never modifies the shared builtinLayers
    layers = {}
    for (mode, names) in self.builtinLayers.items():
        if names is None:
            layers[mode] = None
        else:
            layers[mode] = list(names)

    # add layers from templates
    for t in self.template:
        if not t.startswith('layer_'):
            continue

        # at most 3 parts: 'layer', mode, layer name (which may itself
        # contain underscores)
        parts = t.split('_', 2)
        if len(parts) != 3:
            continue

        (x, m, l) = parts
        if not layers.get(m):
            # mode m doesn't exist or has no layers yet -> new list
            layers[m] = [l]
        elif l not in layers[m]:
            # m exists -> append
            layers[m].append(l)

    self.availableLayers = layers
457
def getAvailableLayersJson(self):
    """Return the available layers serialised as a JSON string."""
    layers = self.availableLayers
    return json.dumps(layers)
461
462
def getInfo_xml(self,url,mode):
    """Return info about the document as XML.

    Renders the 'info_xml' template with the docinfo for (mode, url).
    """
    # digilibBaseUrl is only set by __init__ when a base URL was given, so
    # read it defensively (same guard as index_html)
    if not getattr(self, 'digilibBaseUrl', None):
        self.digilibBaseUrl = self.findDigilibUrl() or "http://digilib.mpiwg-berlin.mpg.de/digitallibrary"

    docinfo = self.getDocinfo(mode=mode, url=url)
    pt = getattr(self.template, 'info_xml')
    return pt(docinfo=docinfo)
471
def getAuthenticatedUser(self, anon=None):
    """Return the authenticated user object, or anon.

    Zope's "Anonymous User" is not treated as authenticated.
    """
    user = getSecurityManager().getUser()
    if user is None or user.getUserName() == "Anonymous User":
        return anon

    return user
479
def isAccessible(self, docinfo):
    """Return whether access to the resource is granted.

    Access type 'free' is always granted. A missing access type or one
    listed in self.authgroups requires an authenticated user. Any other
    access type is logged as unknown and denied.
    """
    access = docinfo.get('accessType', None)
    logging.debug("documentViewer (accessOK) access type %s"%access)
    if access == 'free':
        logging.debug("documentViewer (accessOK) access is free")
        return True

    if access is not None and access not in self.authgroups:
        logging.error("documentViewer (accessOK) unknown access type %s"%access)
        return False

    # only local access -- only logged in users
    user = self.getAuthenticatedUser()
    logging.debug("documentViewer (accessOK) user=%s ip=%s"%(user,self.REQUEST.getClientAddr()))
    return user is not None
496
def getUserinfo(self):
    """Return the userinfo dict kept in the session.

    An empty dict is created and stored under the session key 'userinfo'
    when none exists yet.
    """
    logging.debug("getUserinfo")
    session = self.REQUEST.SESSION
    if session.has_key('userinfo'):
        # cached userinfo (check if its still current?)
        return session['userinfo']

    # nothing cached: store a fresh one in the session
    fresh = {}
    session['userinfo'] = fresh
    return fresh
510
def getDocinfoJSON(self, mode, url, tocMode=None):
    """Return the docinfo for (mode, url) serialised as a JSON string."""
    docinfo = self.getDocinfo(mode, url, tocMode)
    return json.dumps(docinfo)
518
519
def getDocinfo(self, mode, url, tocMode=None):
    """Return the docinfo dict for the document behind url.

    A docinfo cached in the session is reused when its mode and url
    match; otherwise a new one is assembled from index.meta (via
    self.metadataService), the text server and digilib, stored in the
    session and returned.

    @param mode: how url is to be interpreted:
        'texttool'  -- url points to the document dir or index.meta
        'imagepath' -- url points to a folder with images
        'hocr'      -- like imagepath, url also serves as text URL path
        'filepath'  -- url points to a single image file
    @param url: path or URL of the document
    @param tocMode: not used when building the docinfo

    Raises IOError when mode is 'texttool' and no index.meta is found,
    ValueError for an unknown mode.
    """
    logging.debug("getDocinfo: mode=%s, url=%s"%(mode,url))
    # look for cached docinfo in session
    if self.REQUEST.SESSION.has_key('docinfo'):
        docinfo = self.REQUEST.SESSION['docinfo']
        # check if its still current
        if docinfo is not None and docinfo.get('mode', None) == mode and docinfo.get('url', None) == url:
            logging.debug("getDocinfo: docinfo in session. keys=%s"%docinfo.keys())
            return docinfo

    # new docinfo
    docinfo = {'mode': mode, 'url': url}
    # add self url
    docinfo['viewerUrl'] = self.getDocumentViewerURL()
    docinfo['digilibBaseUrl'] = self.digilibBaseUrl
    docinfo['digilibScalerUrl'] = self.digilibScalerUrl
    docinfo['digilibViewerUrl'] = self.digilibViewerUrl
    # get index.meta DOM
    docUrl = None
    metaDom = None
    if mode == "texttool":
        # url points to document dir or index.meta
        metaDom = self.metadataService.getDomFromPathOrUrl(url)
        if metaDom is None:
            raise IOError("Unable to find index.meta for mode=texttool!")

        docUrl = url.replace('/index.meta', '')
        if docUrl.startswith('/mpiwg/online/'):
            # strip the prefix from docUrl (not from url) so the
            # index.meta removal above is not thrown away
            docUrl = docUrl.replace('/mpiwg/online/', '', 1)

    elif mode == "imagepath":
        # url points to folder with images, index.meta optional
        # assume index.meta in parent dir
        docUrl = getParentPath(url)
        metaDom = self.metadataService.getDomFromPathOrUrl(docUrl)
        docinfo['imagePath'] = url.replace('/mpiwg/online', '', 1)

    elif mode == "hocr":
        # url points to folder with images, index.meta optional
        # assume index.meta in parent dir
        docUrl = getParentPath(url)
        metaDom = self.metadataService.getDomFromPathOrUrl(docUrl)
        docinfo['imagePath'] = url.replace('/mpiwg/online', '', 1)
        docinfo['textURLPath'] = url.replace('/mpiwg/online', '', 1)
        if docinfo.get("creator", None) is None:
            docinfo['creator'] = ""

        if docinfo.get("title", None) is None:
            docinfo['title'] = ""

        if docinfo.get("documentPath", None) is None:
            # strip both the online prefix and the pages folder; the two
            # replacements were separate assignments that overwrote each
            # other
            docPath = url.replace('/mpiwg/online', '', 1)
            docinfo['documentPath'] = docPath.replace('/pages', '', 1)

    elif mode == "filepath":
        # url points to image file, index.meta optional
        docinfo['imageURL'] = "%s?fn=%s"%(self.digilibScalerUrl, url)
        docinfo['numPages'] = 1
        # assume index.meta is two path segments up
        docUrl = getParentPath(url, 2)
        metaDom = self.metadataService.getDomFromPathOrUrl(docUrl)

    else:
        logging.error("documentViewer (getdocinfo) unknown mode: %s!"%mode)
        raise ValueError("Unknown mode %s! Has to be one of 'texttool','imagepath','hocr','filepath'."%(mode))

    docinfo['documentUrl'] = docUrl
    # process index.meta contents
    if metaDom is not None and metaDom.tag == 'resource':
        # document directory name and path
        resource = self.metadataService.getResourceData(dom=metaDom, recursive=1)
        if resource:
            docinfo = self.getDocinfoFromResource(docinfo, resource)

        # texttool info
        texttool = self.metadataService.getTexttoolData(dom=metaDom, recursive=1, all=True)
        if texttool:
            docinfo = self.getDocinfoFromTexttool(docinfo, texttool)
            # document info from full text server
            if docinfo.get('textURLPath', None):
                docinfo = self.getTextInfo(mode=None, docinfo=docinfo)
                # include list of pages TODO: do we need this always?
                docinfo = self.getTextInfo(mode='pages', docinfo=docinfo)

        # bib info
        bib = self.metadataService.getBibData(dom=metaDom)
        if bib:
            # save extended version as 'bibx' TODO: ugly
            bibx = self.metadataService.getBibData(dom=metaDom, all=True, recursive=1)
            if len(bibx) == 1:
                # unwrap list if possible
                bibx = bibx[0]

            docinfo['bibx'] = bibx
            docinfo = self.getDocinfoFromBib(docinfo, bib, bibx)
        else:
            # no bib - try info.xml
            docinfo = self.getDocinfoFromPresentationInfoXml(docinfo)

        # auth info
        access = self.metadataService.getAccessData(dom=metaDom)
        if access:
            docinfo = self.getDocinfoFromAccess(docinfo, access)

        # attribution info
        attribution = self.metadataService.getAttributionData(dom=metaDom)
        if attribution:
            logging.debug("getDocinfo: attribution=%s"%repr(attribution))
            docinfo['attribution'] = attribution

        # copyright info
        copyright = self.metadataService.getCopyrightData(dom=metaDom)
        if copyright:
            logging.debug("getDocinfo: copyright=%s"%repr(copyright))
            docinfo['copyright'] = copyright

        # DRI (permanent ID)
        dri = self.metadataService.getDRI(dom=metaDom, type='mpiwg')
        if dri:
            docinfo['DRI'] = dri

        # (presentation) context
        ctx = self.metadataService.getContextData(dom=metaDom, all=True)
        if ctx:
            logging.debug("getcontext: ctx=%s"%repr(ctx))
            docinfo['presentationContext'] = ctx

    # image path
    if mode != 'texttool':
        # override image path from texttool with url parameter TODO: how about mode=auto?
        docinfo['imagePath'] = url.replace('/mpiwg/online', '', 1)

    # check numPages
    if docinfo.get('numPages', 0) == 0:
        # number of images from digilib
        if docinfo.get('imagePath', None):
            imgpath = docinfo['imagePath'].replace('/mpiwg/online', '', 1)
            logging.debug("imgpath=%s"%imgpath)
            docinfo['imageURL'] = "%s?fn=%s"%(self.digilibScalerUrl, imgpath)
            docinfo = self.getDocinfoFromDigilib(docinfo, imgpath)
        else:
            # imagePath still missing? try "./pageimg"
            imgPath = os.path.join(docUrl, 'pageimg')
            docinfo = self.getDocinfoFromDigilib(docinfo, imgPath)
            if docinfo.get('numPages', 0) > 0:
                # there are pages
                docinfo['imagePath'] = imgPath
                docinfo['imageURL'] = "%s?fn=%s"%(self.digilibScalerUrl, docinfo['imagePath'])

    # check numPages
    if docinfo.get('numPages', 0) == 0:
        if docinfo.get('numTextPages', 0) > 0:
            # replace with numTextPages (text-only?)
            docinfo['numPages'] = docinfo['numTextPages']

    # min and max page no
    docinfo['minPageNo'] = docinfo.get('minPageNo', 1)
    # numPages is missing when digilib could not be queried: fall back to
    # 0 instead of raising KeyError
    docinfo['maxPageNo'] = docinfo.get('maxPageNo', docinfo.get('numPages', 0))

    # part-of information
    partOfPath = docinfo.get('partOfPath', None)
    if partOfPath is not None:
        partOfDom = self.metadataService.getDomFromPathOrUrl(partOfPath)
        if partOfDom is not None:
            docinfo['partOfLabel'] = self.metadataService.getBibFormattedLabel(dom=partOfDom)
            docinfo['partOfUrl'] = "%s?url=%s"%(self.getDocumentViewerURL(), partOfPath)
            logging.debug("partOfLabel=%s partOfUrl=%s"%(docinfo['partOfLabel'],docinfo['partOfUrl']))

    # normalize path
    if 'imagePath' in docinfo and not docinfo['imagePath'].startswith('/'):
        docinfo['imagePath'] = '/' + docinfo['imagePath']

    logging.debug("documentViewer (getdocinfo) docinfo: keys=%s"%docinfo.keys())
    # store in session
    self.REQUEST.SESSION['docinfo'] = docinfo
    return docinfo
697
698
def getDocinfoFromResource(self, docinfo, resource):
    """Read the contents of the resource element into docinfo.

    Sets docinfo['documentName'] and docinfo['documentPath'] (from the
    resource's 'archive-path', falling back to docinfo['documentUrl']
    when that is not an http URL) and, when the resource is part of
    another one, docinfo['partOfPath']. Returns docinfo.
    """
    logging.debug("getDocinfoFromResource: resource=%s"%(repr(resource)))
    docName = getMDText(resource.get('name', None))
    docinfo['documentName'] = docName
    docPath = getMDText(resource.get('archive-path', None))
    if docPath:
        # clean up document path
        if docPath[0] != '/':
            docPath = '/' + docPath

        if docName and (not docPath.endswith(docName)):
            docPath += "/" + docName

    else:
        # use docUrl as docPath; getDocinfo stores it under the key
        # 'documentUrl' (it was read as 'documentURL', a KeyError)
        docUrl = docinfo.get('documentUrl', None)
        if docUrl and not docUrl.startswith('http:'):
            docPath = docUrl

    if docPath:
        # fix URLs starting with /mpiwg/online
        docPath = docPath.replace('/mpiwg/online', '', 1)

    docinfo['documentPath'] = docPath

    # is this part-of?
    partOf = resource.get('is-part-of', None)
    if partOf is not None:
        partOf = getMDText(partOf.get('archive-path', None))
        if partOf is not None:
            docinfo['partOfPath'] = partOf.strip()

    return docinfo
733
def getDocinfoFromTexttool(self, docinfo, texttool):
    """Read the contents of the texttool element into docinfo.

    Fills in (where the texttool data provides them) imagePath,
    minPageNo, maxPageNo, textURL, textURLPath, textURLRepository,
    pageFlow, oddPage, titlePage and presentationUrl, and makes sure
    creator, title and date exist. Returns docinfo.

    @param texttool: texttool metadata dict, or a list whose first
        element is used
    """
    logging.debug("texttool=%s"%repr(texttool))
    # unpack list if necessary
    if isinstance(texttool, list):
        texttool = texttool[0]

    # image dir
    imageDir = getMDText(texttool.get('image', None))
    docPath = getMDText(docinfo.get('documentPath', None))
    if imageDir:
        if imageDir.startswith('/'):
            # absolute path
            imageDir = imageDir.replace('/mpiwg/online', '', 1)
            docinfo['imagePath'] = imageDir

        elif docPath:
            # relative path
            imageDir = os.path.join(docPath, imageDir)
            imageDir = imageDir.replace('/mpiwg/online', '', 1)
            docinfo['imagePath'] = imageDir

    # start and end page (for subdocuments of other documents)
    imgStartNo = getMDText(texttool.get('image-start-no', None))
    minPageNo = getInt(imgStartNo, 1)
    docinfo['minPageNo'] = minPageNo

    imgEndNo = getMDText(texttool.get('image-end-no', None))
    if imgEndNo:
        docinfo['maxPageNo'] = getInt(imgEndNo)

    # old style text URL
    textUrl = getMDText(texttool.get('text', None))
    if textUrl and docPath:
        if urlparse.urlparse(textUrl)[0] == "":
            # no scheme, i.e. not a URL: relative to the document path
            textUrl = os.path.join(docPath, textUrl)

        docinfo['textURL'] = textUrl

    # new style text-url-path (can be more than one with "repository" attribute)
    textUrlNode = texttool.get('text-url-path', None)
    if not isinstance(textUrlNode, list):
        textUrlNode = [textUrlNode]

    for tun in textUrlNode:
        textUrl = getMDText(tun)
        if textUrl:
            # a plain (non-dict) node carries no attributes; calling
            # .get() on it would fail
            textUrlAtts = tun.get('@attr') if isinstance(tun, dict) else None
            if (textUrlAtts and 'repository' in textUrlAtts):
                textRepo = textUrlAtts['repository']
                # use matching repository
                if self.getRepositoryType() == textRepo:
                    docinfo['textURLPath'] = textUrl
                    docinfo['textURLRepository'] = textRepo
                    break

            else:
                # no repo attribute - use always
                docinfo['textURLPath'] = textUrl

    # page flow
    docinfo['pageFlow'] = getMDText(texttool.get('page-flow', 'ltr'))

    # odd pages are left
    docinfo['oddPage'] = getMDText(texttool.get('odd-scan-position', 'left'))

    # number of title page (defaults to minPageNo)
    docinfo['titlePage'] = getMDText(texttool.get('title-scan-no', minPageNo))

    # old presentation stuff
    presentation = getMDText(texttool.get('presentation', None))
    if presentation and docPath:
        if presentation.startswith('http:'):
            docinfo['presentationUrl'] = presentation
        else:
            docinfo['presentationUrl'] = os.path.join(docPath, presentation)

    # make sure we have at least fake DC data
    if 'creator' not in docinfo:
        docinfo['creator'] = '[no author found]'

    if 'title' not in docinfo:
        docinfo['title'] = '[no title found]'

    if 'date' not in docinfo:
        docinfo['date'] = '[no date found]'

    return docinfo
826
def getDocinfoFromBib(self, docinfo, bib, bibx=None):
    """Read the contents of the bib element into docinfo.

    Stores the raw bib dict as docinfo['bib'], its '@type' as
    docinfo['bibType'] and the DC-mapped creator, title and date
    ('' when missing). bibx is accepted but not used here.
    Returns docinfo.
    """
    logging.debug("getDocinfoFromBib bib=%s"%repr(bib))
    # put all raw bib fields in dict "bib"
    docinfo['bib'] = bib
    docinfo['bibType'] = bib.get('@type', None)
    # also store DC metadata for convenience
    dc = self.metadataService.getDCMappedData(bib)
    for field in ('creator', 'title', 'date'):
        docinfo[field] = dc.get(field, '')

    return docinfo
840
def getDocinfoFromAccess(self, docinfo, acc):
    """Read the contents of the access element into docinfo.

    Sets docinfo['accessType'] to the access element's type attribute;
    for the types 'group' and 'institution' the lower-cased name of the
    group is used instead. Malformed access data leaves docinfo
    unchanged. Returns docinfo.
    """
    #TODO: also read resource type
    logging.debug("getDocinfoFromAccess acc=%s"%repr(acc))
    try:
        acctype = acc['@attr']['type']
        if acctype:
            access = acctype
            if access in ['group', 'institution']:
                access = acc['name'].lower()

            docinfo['accessType'] = access

    except (KeyError, TypeError, AttributeError, IndexError) as e:
        # best effort: incomplete or unexpected access data is ignored,
        # but no longer hides unrelated errors
        logging.debug("getDocinfoFromAccess: unable to read access data: %s"%repr(e))

    return docinfo
858
def getDocinfoFromDigilib(self, docinfo, path):
    """Read the directory info for path from digilib into docinfo.

    Fetches <digilibBaseUrl>/dirInfo-xml.jsp?fn=<path> and stores the
    number of pages (docinfo['numPages']) and two lookup tables,
    docinfo['imgFileNames'] (file name -> index) and
    docinfo['imgFileIndexes'] (index -> file name).
    docinfo is returned unchanged when nothing could be fetched.
    """
    infoUrl = self.digilibBaseUrl+"/dirInfo-xml.jsp?fn="+path
    # fetch data
    txt = getHttpData(infoUrl)
    if not txt:
        logging.error("Unable to get dir-info from %s"%(infoUrl))
        return docinfo

    dirDom = ET.fromstring(txt)
    # save size
    size = dirDom.findtext('size')
    logging.debug("getDocinfoFromDigilib: size=%s"%size)
    if size:
        docinfo['numPages'] = int(size)
    else:
        docinfo['numPages'] = 0
        return docinfo

    # save list of image names and numbers
    imgNames = {}
    imgIndexes = {}
    for f in dirDom:
        fn = f.findtext('name')
        if fn is None:
            # not a file entry (e.g. the <size> element itself): skip it
            # instead of recording a bogus None file name
            continue

        pn = getInt(f.findtext('index'))
        imgNames[fn] = pn
        imgIndexes[pn] = fn

    docinfo['imgFileNames'] = imgNames
    docinfo['imgFileIndexes'] = imgIndexes
    return docinfo
890
891
def getDocinfoFromPresentationInfoXml(self,docinfo):
    """Get DC-like bibliographical information from the presentation
    entry in texttools.

    Reads the info.xml referenced by docinfo['presentationUrl'] (directly
    when it is an http URL, else through digilib's Texter servlet) and
    sets docinfo['creator'], ['title'] and ['date'] from its author,
    title and date elements. docinfo is returned unchanged when there is
    no URL or nothing could be read.
    """
    url = docinfo.get('presentationUrl', None)
    if not url:
        logging.error("getDocinfoFromPresentation: no URL!")
        return docinfo

    if url.startswith("http://"):
        # real URL
        metaUrl = url
    else:
        # online path
        server = self.digilibBaseUrl+"/servlet/Texter?fn="
        metaUrl = server+url

    txt = getHttpData(metaUrl)
    if not txt:
        # also covers an empty response, which the XML parser would
        # reject (same check as in getDocinfoFromDigilib)
        logging.error("Unable to read info.xml from %s"%(url))
        return docinfo

    dom = ET.fromstring(txt)
    docinfo['creator'] = getText(dom.find(".//author"))
    docinfo['title'] = getText(dom.find(".//title"))
    docinfo['date'] = getText(dom.find(".//date"))
    return docinfo
919
920
def getPageinfo(self, pn=None, pf=None, start=None, rows=None, cols=None, docinfo=None, userinfo=None, viewMode=None, viewLayer=None, tocMode=None):
    """Return the pageinfo dict for the given request parameters.

    :param pn: page number; only used if pf is empty
    :param pf: image file name; takes precedence over pn
    :param start: first page of the thumbnail batch; derived from pn if empty
    :param rows: number of thumbnail rows (default self.thumbrows)
    :param cols: number of thumbnail columns (default self.thumbcols)
    :param docinfo: document info dict; its 'numPages' entry is filled from
        'numTextPages' when it is missing or 0
    :param userinfo: not used by this method
    :param viewMode: stored as pageinfo['viewMode']; a search is only run
        for viewMode 'text'
    :param viewLayer: layer name(s), comma-separated string or list
    :param tocMode: stored as pageinfo['tocMode']
    :returns: dict pageinfo
    """
    logging.debug("getPageInfo(pn=%s, pf=%s, start=%s, rows=%s, cols=%s, viewMode=%s, viewLayer=%s, tocMode=%s)"%(pn,pf,start,rows,cols,viewMode,viewLayer,tocMode))
    if docinfo is None:
        # the signature allows omitting docinfo; work on an empty one
        docinfo = {}

    pageinfo = {}
    pageinfo['viewMode'] = viewMode
    # split viewLayer if necessary
    if isinstance(viewLayer,basestring):
        viewLayer = viewLayer.split(',')

    if isinstance(viewLayer, list):
        logging.debug("getPageinfo: viewLayer is list:%s"%viewLayer)
        # save (unique) list in viewLayers, keeping the first occurrence
        viewLayers = []
        for layer in viewLayer:
            if layer and layer not in viewLayers:
                viewLayers.append(layer)

        pageinfo['viewLayers'] = viewLayers
        # stringify viewLayer
        viewLayer = ','.join(viewLayers)
    else:
        # create list
        pageinfo['viewLayers'] = [viewLayer]

    pageinfo['viewLayer'] = viewLayer
    pageinfo['tocMode'] = tocMode

    minPageNo = docinfo.get('minPageNo', 1)

    # pf takes precedence over pn
    if pf:
        pageinfo['pf'] = pf
        pn = getPnForPf(docinfo, pf)
        # replace pf in request params (used for creating new URLs)
        self.REQUEST.form.pop('pf', None)
        self.REQUEST.form['pn'] = pn
    else:
        pn = getInt(pn, minPageNo)
        pf = getPfForPn(docinfo, pn)
        pageinfo['pf'] = pf

    pageinfo['pn'] = pn
    # at least one row and column, otherwise the group size would be 0
    rows = max(int(rows or self.thumbrows), 1)
    pageinfo['rows'] = rows
    cols = max(int(cols or self.thumbcols), 1)
    pageinfo['cols'] = cols
    grpsize = cols * rows
    pageinfo['groupsize'] = grpsize
    # if start is empty use the first page of the group around pn
    grouppn = int(math.ceil(float(pn) / float(grpsize))) * grpsize - (grpsize - 1)
    # but not smaller than minPageNo
    start = getInt(start, max(grouppn, minPageNo))
    pageinfo['start'] = start
    # get number of pages
    numPages = int(docinfo.get('numPages') or 0)
    if numPages == 0:
        # try numTextPages
        numPages = int(docinfo.get('numTextPages') or 0)
        if numPages != 0:
            docinfo['numPages'] = numPages

    maxPageNo = docinfo.get('maxPageNo', numPages)
    logging.debug("minPageNo=%s maxPageNo=%s start=%s numPages=%s"%(minPageNo,maxPageNo,start,numPages))
    np = maxPageNo

    # cache table of contents
    pageinfo['tocPageSize'] = getInt(self.REQUEST.get('tocPageSize', 30))
    pageinfo['numgroups'] = int(np / grpsize)
    if np % grpsize > 0:
        # partially filled last group
        pageinfo['numgroups'] += 1

    pageFlowLtr = docinfo.get('pageFlow', 'ltr') != 'rtl'
    oddScanLeft = docinfo.get('oddPage', 'left') != 'right'
    # add zeroth page for two columns
    pageZero = (cols == 2 and (pageFlowLtr != oddScanLeft))
    pageinfo['pageZero'] = pageZero
    pageinfo['pageBatch'] = self.getPageBatch(start=start, rows=rows, cols=cols, pageFlowLtr=pageFlowLtr, pageZero=pageZero, minIdx=minPageNo, maxIdx=np)
    # more page parameters
    pageinfo['characterNormalization'] = self.REQUEST.get('characterNormalization','reg')
    if docinfo.get('pageNumbers'):
        # get original page numbers
        pageNumber = docinfo['pageNumbers'].get(pn, None)
        if pageNumber is not None:
            pageinfo['pageNumberOrig'] = pageNumber['no']
            pageinfo['pageNumberOrigNorm'] = pageNumber['non']

    # cache search results
    query = self.REQUEST.get('query',None)
    pageinfo['query'] = query
    if query and viewMode == 'text':
        pageinfo['resultPageSize'] = getInt(self.REQUEST.get('resultPageSize', 10))
        queryType = self.REQUEST.get('queryType', 'fulltextMorph')
        pageinfo['queryType'] = queryType
        pageinfo['resultStart'] = getInt(self.REQUEST.get('resultStart', '1'))
        self.getSearchResults(mode=queryType, query=query, pageinfo=pageinfo, docinfo=docinfo)

    # highlighting
    highlightQuery = self.REQUEST.get('highlightQuery', None)
    if highlightQuery:
        pageinfo['highlightQuery'] = highlightQuery
        pageinfo['highlightElement'] = self.REQUEST.get('highlightElement', '')
        pageinfo['highlightElementPos'] = self.REQUEST.get('highlightElementPos', '')

    return pageinfo
1022
1023
def getPageBatch(self, start=1, rows=10, cols=2, pageFlowLtr=True, pageZero=False, minIdx=1, maxIdx=0):
    """Return dict with array of page information for one screenfull of thumbnails.

    :param start: index of current page
    :param rows: number of rows in one batch
    :param cols: number of columns in one batch
    :param pageFlowLtr: do indexes increase from left to right
    :param pageZero: is there a zeroth non-visible page
    :param minIdx: minimum index to use
    :param maxIdx: maximum index to use (0: use start + rows * cols)
    :returns: dict with
        first: first page index
        last: last page index
        batches: list of all possible batches(dict: 'start': index, 'end': index)
        pages: list for current batch of rows(list of cols(list of pages(dict: 'idx': index)))
        nextStart: first index of next batch (None if there is none)
        prevStart: first index of previous batch (None if there is none)
    """
    logging.debug("getPageBatch start=%s minIdx=%s maxIdx=%s"%(start,minIdx,maxIdx))
    batch = {}
    grpsize = rows * cols
    if maxIdx == 0:
        maxIdx = start + grpsize

    numPages = maxIdx - minIdx + 1
    if pageZero:
        # correct number of pages for batching
        numPages += 1

    numBatches = int(math.ceil(numPages / float(grpsize)))

    # the zeroth page shifts all batch boundaries down by one
    if pageZero:
        ofs = minIdx - 1
    else:
        ofs = minIdx

    # list of all batch start and end points
    batches = []
    for i in range(numBatches):
        s = i * grpsize + ofs
        e = min(s + grpsize - 1, maxIdx)
        batches.append({'start':s, 'end':e})

    batch['batches'] = batches

    # index of the first cell on the current screen
    if pageZero and start == minIdx:
        # correct beginning: the first cell is the invisible zeroth page
        firstIdx = minIdx - 1
    else:
        firstIdx = start

    # list of pages for current screen
    pages = []
    idx = firstIdx
    for r in range(rows):
        row = []
        for c in range(cols):
            if idx < minIdx or idx > maxIdx:
                # cell outside of the document stays empty
                page = {'idx':None}
            else:
                page = {'idx':idx}

            idx += 1
            if pageFlowLtr:
                row.append(page)
            else:
                # right-to-left: fill the row from the right
                row.insert(0, page)

        pages.append(row)

    if start > minIdx:
        batch['prevStart'] = max(start - grpsize, minIdx)
    else:
        batch['prevStart'] = None

    # The next batch starts right after the last cell of this screen. This
    # takes the zeroth-page shift into account for any minIdx and agrees
    # with the boundaries in 'batches'.
    nextStart = firstIdx + grpsize
    if nextStart <= maxIdx:
        batch['nextStart'] = nextStart
    else:
        batch['nextStart'] = None

    batch['pages'] = pages
    batch['first'] = minIdx
    batch['last'] = maxIdx
    logging.debug("batch: %s"%repr(batch))
    return batch
1112
1113
def getBatch(self, start=1, size=10, end=0, data=None, fullData=True):
    """Return dict with information for one screenfull of data.

    :param start: index of the first element of this batch
    :param size: number of elements in one batch
    :param end: index of the last element, inclusive (0: use start + size)
    :param data: dict of elements; if empty, ``index + 1`` is used as element
    :param fullData: if true, data is keyed by absolute index; otherwise it
        only holds this batch and is keyed from 0
    :returns: dict with
        batches: list of all batches(dict: 'start': index, 'end': index)
        this: list of the elements in this batch
        prevStart: first index of previous batch (None if there is none)
        nextStart: first index of next batch (None if there is none)
        first: start
        last: end
    """
    batch = {}
    if end == 0:
        end = start + size

    numBatches = int(math.ceil(end / float(size)))
    # list of all batch start and end points
    batches = []
    for i in range(numBatches):
        s = i * size + 1
        e = min((i + 1) * size, end)
        batches.append({'start':s, 'end':e})

    batch['batches'] = batches
    # list of elements in this batch
    this = []
    j = 0
    for i in range(start, min(start + size, end + 1)):
        if data:
            if fullData:
                d = data.get(i, None)
            else:
                # data only holds the current batch, counted from 0
                d = data.get(j, None)
                j += 1

        else:
            d = i+1

        this.append(d)

    batch['this'] = this
    if start > 1:
        batch['prevStart'] = max(start - size, 1)
    else:
        batch['prevStart'] = None

    # end is inclusive, so an element at index start + size still needs a
    # next batch
    if start + size <= end:
        batch['nextStart'] = start + size
    else:
        batch['nextStart'] = None

    batch['first'] = start
    batch['last'] = end
    return batch
1159
1160
def getAnnotatorGroupsForUser(self, user, annotationServerUrl="http://tuxserve03.mpiwg-berlin.mpg.de/AnnotationManager"):
    """Return list of groups {id:, name:, uri:} on the annotation server for the user.

    The lookup is best effort: if the server delivers no data or something
    other than a JSON object with a 'rows' list, an empty list is returned.

    :param user: user name put into the ``user`` query parameter
    :param annotationServerUrl: base URL of the annotation server
    :returns: list of dicts with the keys 'id', 'name' and 'uri'
    """
    groups = []
    # NOTE(review): user is inserted into the query string without URL
    # quoting - confirm that user names cannot contain reserved characters.
    groupsUrl = "%s/annotator/groups?user=%s"%(annotationServerUrl,user)
    data = getHttpData(url=groupsUrl, noExceptions=True)
    if not data:
        return groups

    try:
        res = json.loads(data)
    except ValueError as e:
        # the fetch itself is exception-free, so the parsing should be too
        logging.error("getAnnotatorGroupsForUser: invalid JSON from %s: %s"%(groupsUrl, e))
        return groups

    if not isinstance(res, dict):
        logging.error("getAnnotatorGroupsForUser: unexpected answer from %s"%(groupsUrl))
        return groups

    rows = res.get('rows', None)
    if rows is None:
        return groups

    for r in rows:
        groups.append({'id': r.get('id', None), 'name': r.get('name', None), 'uri': r.get('uri', None)})

    return groups
1175
1176
# management form for the viewer configuration (template zpt/changeDocumentViewer)
changeDocumentViewerForm = PageTemplateFile('zpt/changeDocumentViewer', globals())
# the form is declared protected by the 'View management screens' permission
security.declareProtected('View management screens', 'changeDocumentViewerForm')
1179
def changeDocumentViewer(self,title="",digilibBaseUrl=None,thumbrows=2,thumbcols=5,authgroups='mpiwg',availableLayers=None,RESPONSE=None):
    """Change the configuration of the document viewer.

    :param title: new title
    :param digilibBaseUrl: base URL of the digilib server; the scaler and
        viewer URLs are derived from it. If None, the digilib URLs are
        left as they are.
    :param thumbrows: default number of thumbnail rows
    :param thumbcols: default number of thumbnail columns
    :param authgroups: comma-separated group names; stored stripped and
        lower-cased as a list
    :param availableLayers: passed on to self.setAvailableLayers
    :param RESPONSE: if given, redirects to 'manage_main' afterwards
    """
    self.title=title
    if digilibBaseUrl is not None:
        # concatenating with None would raise after a partial update
        self.digilibBaseUrl = digilibBaseUrl
        self.digilibScalerUrl = digilibBaseUrl + '/servlet/Scaler'
        self.digilibViewerUrl = digilibBaseUrl + '/jquery/digilib.html'

    self.thumbrows = thumbrows
    self.thumbcols = thumbcols
    self.authgroups = [s.strip().lower() for s in authgroups.split(',')]
    try:
        # assume MetaDataFolder instance is called metadata
        self.metadataService = getattr(self, 'metadata')
    except Exception as e:
        logging.error("Unable to find MetaDataFolder 'metadata': "+str(e))

    self.setAvailableLayers(availableLayers)

    if RESPONSE is not None:
        RESPONSE.redirect('manage_main')
1199
def manage_AddDocumentViewerForm(self):
    """Render the form for adding a document viewer."""
    template = PageTemplateFile('zpt/addDocumentViewer', globals())
    # wrap the template in the context of self before rendering it
    return template.__of__(self)()
1204
def manage_AddDocumentViewer(self,id,imageScalerUrl="",textServerName="",title="",RESPONSE=None):
    """Create a documentViewer with the given id and add it to self.

    Redirects to 'manage_main' if a RESPONSE is given.
    """
    viewer = documentViewer(
        id,
        imageScalerUrl=imageScalerUrl,
        title=title,
        textServerName=textServerName,
    )
    self._setObject(id, viewer)

    if RESPONSE is None:
        return

    RESPONSE.redirect('manage_main')

Note: See TracBrowser for help on using the repository browser.

Download in other formats: