1: """Metadata search interface
2: ROC 2004, itgroup
3:
4: """
5:
6: from AccessControl import ClassSecurityInfo
7: from Globals import InitializeClass
8: from Globals import Persistent, package_home
9: from Products.PageTemplates.PageTemplateFile import PageTemplateFile
10: from Products.PageTemplates.PageTemplate import PageTemplate
11: from Products.PageTemplates.ZopePageTemplate import ZopePageTemplate
12: from OFS.Folder import Folder
13: from OFS.SimpleItem import SimpleItem
14: try:
15: import psycopg2 as PgSQL
16: except:
17: try:
18: import psycopg as PgSQL
19: except:
20: from pyPgSQL import PgSQL
21:
22: import re
23: import os
24:
# upper bound on the number of search hits fetched from the database
MAXHITS = 1000
26:
27: class OSAS_search(Folder):
28: """Object for global metadata search"""
29:
30: meta_type="OSAS_search"
31:
32: manage_options=Folder.manage_options+(
33: {'label':'Main config','action':'manage_ChangeOSAS_searchForm'},
34: )
35:
36:
37: def __init__(self,id,title,dsn=None):
38: """init"""
39: self.id=id
40: self.title=title
41: if dsn:
42: self.dsn = dsn
43: else:
44: self.dsn = "host=foxridge.mpiwg-berlin.mpg.de dbname=storage user=archiveread password=archiveread"
45: # volatile database connection object
46: self._v_dbCon = None
47: self._v_tryCon = 0
48:
49:
50: def dbCursor(self):
51: """returns new SQL cursor object"""
52: curs = None
53: if hasattr(self, '_v_dbCon') and self._v_dbCon is not None:
54: try:
55: curs = self._v_dbCon.cursor()
56: self._v_tryCon = 0
57: except:
58: # in case of problems reset dbCon
59: self._v_dbCon = None
60: self._v_tryCon += 1
61: else:
62: self._v_dbCon = None
63: self._v_tryCon = 0
64:
65: if not curs and self._v_tryCon < 3:
66: self._v_dbCon = PgSQL.connect(self.dsn, serialize=0)
67: # call ourself with the new connection
68: curs = self.dbCursor()
69:
70: assert curs, "AIIEE no db cursor!!"
71: return curs
72:
73: def getDBFileMeta(self, fileid):
74: """returns an array with all meta entries of fileid"""
75:
76: metacache = {}
77: # try in cache
78: if self.REQUEST.SESSION.has_key('dbMeta'):
79: metacache = self.REQUEST.SESSION['dbMeta']
80: if metacache.has_key(fileid):
81: res = metacache[fileid]
82: #print "meta from cache "
83: return res
84:
85: curs = self.dbCursor()
86:
87: sql = 'SELECT idx,tags,content,attributes FROM meta WHERE fileid=%(id)s ORDER BY idx'
88: print sql, " -> ", fileid
89: curs.execute(sql, {'id':fileid})
90: print "done"
91:
92: res = curs.fetchall()
93: #print "res:", res
94: curs.close()
95: # store info in cache
96: metacache[fileid] = res
97: self.REQUEST.SESSION['dbMeta'] = metacache
98:
99: return res
100:
101: def getDBFile(self, fileid):
102: """returns the file information of fileid"""
103:
104: filecache = {}
105: # try in cache
106: if self.REQUEST.SESSION.has_key('dbFiles'):
107: filecache = self.REQUEST.SESSION['dbFiles']
108: if filecache.has_key(fileid):
109: res = filecache[fileid]
110: #print "file from cache "
111: return res
112:
113: curs = self.dbCursor()
114:
115: sql = 'select filename,mtime from files where id=%(id)s'
116: print 'DBFILE: ', sql, " -> ", fileid
117: curs.execute(sql, {'id':fileid})
118: print "DBFILE: done"
119:
120: res = curs.fetchone()
121: #print "DBFILE: res:", res
122: curs.close()
123: # store info in cache
124: filecache[fileid] = res
125: self.REQUEST.SESSION['dbFiles'] = filecache
126:
127: return res
128:
129:
130: def dbSearch(self, query, type):
131: """search DB for query and return result set"""
132: results = []
133: restypes = {}
134: if not query:
135: # empty query
136: return results
137:
138: curs = self.dbCursor()
139: if type == 'equals':
140: qs = query
141: elif type == 'startswith':
142: qs = query + "%"
143: elif type == 'contains':
144: qs = "%" + query + "%"
145:
146: sql = 'select fileid,idx,tags,content from meta where lower(content) like lower(%(qs)s)'
147: print sql, " -> ", qs
148: curs.execute(sql, {'qs':qs})
149: print "done"
150: res = curs.fetchone()
151: rescnt = 1
152: #print "res0:", res
153: while res and rescnt < MAXHITS:
154: #print "res:", res
155: result = self.getResult(res)
156: if result:
157: results.append(result)
158: restypes[result.type] = result.type
159:
160: res = curs.fetchone()
161: rescnt += 1
162:
163: curs.close()
164: #self.dbCon = None
165:
166: #print "SEARCH: ", rescnt, " results"
167: restypelist = restypes.keys()
168: return (results, restypelist)
169:
170:
171: def getResult(self, db_result, rank=0):
172: """factory for result objects"""
173:
174: (fileid, tagidx, tags, content) = db_result
175: res = None
176:
177: if tags.find('/meta/bib/') > -1:
178: res = BibResult(self, db_result, rank)
179: elif tags.find('/meta/archimedes/') > -1:
180: res = ArchimResult(self, db_result, rank)
181: else:
182: res = AnyResult(self, db_result, rank)
183:
184: return res
185:
186:
187: def renderResult(self, result):
188: """returns HTML rendering of a search result"""
189:
190: return result.render(self)
191:
192:
193: def filterResults(self, results, start, end, restypefilter=None):
194: """returns list of results that match a filter"""
195: # filter types first
196: if restypefilter:
197: res = []
198: for r in results:
199: if r.type == restypefilter:
200: res.append(r)
201: else:
202: res = results
203: # new total count (because of filter)
204: rescnt = len(res)
205: # filter on count
206: resgroup = res[start:end]
207:
208: return (resgroup, rescnt)
209:
210:
211: #
212: # Web page stuff
213: #
214:
215: def index_html(self):
216: """metadata search"""
217: pt=PageTemplateFile(os.path.join(package_home(globals()), "zpt/OSAS_search.zpt")).__of__(self)
218: return pt()
219:
220:
221: def search(self, searchstring=None, searchtype='startswith', start=1, count=10, restypefilter=None):
222: """search and create result"""
223: sres = int(start) -1
224: lres = sres + count
225: try:
226: oldsearch = self.REQUEST.SESSION['searchstring']
227: oldtype = self.REQUEST.SESSION['searchtype']
228: except:
229: oldsearch = ""
230: oldtype = ""
231:
232: if not searchstring:
233: searchstring = oldsearch
234: searchtype = oldtype
235:
236: if not oldsearch or searchstring != oldsearch or searchtype != oldtype:
237: # new search
238: (res, restypes) = self.dbSearch(searchstring, searchtype)
239: # sort the result
240: res.sort(ranksort)
241: # store it
242: self.REQUEST.SESSION['results'] = res
243: self.REQUEST.SESSION['searchstring'] = searchstring
244: self.REQUEST.SESSION['searchtype'] = searchtype
245: self.REQUEST.SESSION['resulttypes'] = restypes
246:
247: (resgroup, nres) = self.filterResults(self.REQUEST.SESSION['results'], sres, lres, restypefilter)
248: lres = min(lres, nres)
249: sres = min(sres, nres)
250: self.REQUEST.SESSION['resultgroup'] = resgroup
251: self.REQUEST.SESSION['res_indexes'] = (sres+1, lres, nres, int(count))
252: self.REQUEST.SESSION['res_type_filter'] = restypefilter
253: if nres > 0:
254: zpt = "zpt/searchResult.zpt"
255: else:
256: zpt = "zpt/searchResult_none.zpt"
257:
258: pt=PageTemplateFile(os.path.join(package_home(globals()), zpt)).__of__(self)
259: return pt()
260:
261:
262: def getSearchType(self):
263: """returns the last search type"""
264: try:
265: ret = self.REQUEST.SESSION['searchtype']
266: except:
267: ret = ""
268:
269: return ret
270:
271: def getSearchString(self):
272: """returns the last search string"""
273: try:
274: ret = self.REQUEST.SESSION['searchstring']
275: except:
276: ret = ""
277:
278: return ret
279:
280:
281: def hasNextResults(self):
282: """returns if there are more results"""
283: try:
284: (first, last, total, count) = self.REQUEST.SESSION['res_indexes']
285: return (first + count < total)
286: except:
287: return False
288:
289: def hasPrevResults(self):
290: """returns if there are previous results"""
291: try:
292: (first, last, total, count) = self.REQUEST.SESSION['res_indexes']
293: return (first > 1)
294: except:
295: return False
296:
297:
298: def nextResults(self):
299: """returns more results"""
300: try:
301: (first, last, total, count) = self.REQUEST.SESSION['res_indexes']
302: first = first + count
303: last = last + count
304: if first > total:
305: first = total
306: if last > total:
307: last = total
308: except:
309: print "OUCH: no next results!"
310: return self.search()
311:
312: return self.search(start=first, count=count)
313:
314:
315: def prevResults(self):
316: """returns more results"""
317: try:
318: (first, last, total, count) = self.REQUEST.SESSION['res_indexes']
319: first = first - count
320: last = last - count
321: if first < 1:
322: first = 1
323: if last < 1:
324: last = 1
325: except:
326: print "OUCH: no prev results!"
327: return self.search()
328:
329: return self.search(start=first, count=count)
330:
331:
332: def manage_ChangeOSAS_searchForm(self):
333: """create Search form"""
334: pt=PageTemplateFile(os.path.join(package_home(globals()), "zpt/ChangeOSAS_search.zpt")).__of__(self)
335: return pt()
336:
337: def manage_ChangeOSAS_search(self,id,title=None,dsn=None,RESPONSE=None):
338: """add the OSAS_root"""
339: self.id = id
340: self.title = title
341: self.dsn = dsn
342: if RESPONSE is not None:
343: RESPONSE.redirect('manage_main')
344:
345:
346: def manage_AddOSAS_searchForm(self):
347: """create Search form"""
348: pt=PageTemplateFile(os.path.join(package_home(globals()), "zpt/AddOSAS_search.zpt")).__of__(self)
349: return pt()
350:
351: def manage_AddOSAS_search(self,id,title=None,dsn=None,RESPONSE=None):
352: """add the OSAS_root"""
353: newObj=OSAS_search(id,title,dsn)
354: self._setObject(id,newObj)
355: if RESPONSE is not None:
356: RESPONSE.redirect('manage_main')
357:
358:
359:
360:
class SearchResult(SimpleItem):
    """Base class for search result objects."""

    def __init__(self, type='unknown', file=None, url=None, content=None, rank=0):
        """Initialize the result with type, file, urls, content and rank."""
        # result type (e.g. "bib", "archim")
        self.type = type
        # index file name
        self.file = file
        # urls for this result: list of (url, name) pairs
        self.urls = url or []
        # actual content (list of tuples)
        self.content = content
        # document status (e.g. "online", "archive")
        self.status = None
        # result rank for presentation
        self.rank = rank
381:
class AnyResult(SearchResult):
    """Catch-all result object for hits of no special type."""

    def __init__(self, zope, db_result, rank):
        """Build a generic result from a (fileid, idx, tags, content) row."""
        SearchResult.__init__(self)
        self.type = 'unknown'
        self.zptFile = os.path.join(package_home(globals()), "zpt/searchResult_any.zpt")

        (db_fileid, db_tagidx, db_tags, db_content) = db_result
        # tag path of the database hit
        self.hitTag = db_tags

        # fetch the file information from the database
        self.fileinfo = zope.getDBFile(db_fileid)
        assert self.fileinfo

        # single content entry: tag path -> content
        self.content = {db_tags: db_content}
        self.file = self.fileinfo[0]
        self.status = statusForFile(self.file)
        self.rank = rank

    def getContentList(self):
        """Return the content dict as a list of (tag, content) tuples."""
        return list(self.content.items())

    def render(self, zope):
        """Render this result object as HTML."""
        # the page template reads the result from the session
        zope.REQUEST.SESSION['result'] = self
        template = PageTemplateFile(self.zptFile).__of__(zope)
        return template()
419:
420:
class MetaResult(AnyResult):
    """Result object that additionally collects all metadata of its file."""

    def __init__(self, zope, db_result, rank):
        """Build the base result and fetch all metadata rows for the file."""
        AnyResult.__init__(self, zope, db_result, rank)

        (fileid, tagidx, tags, content) = db_result

        # all metadata rows (idx, tags, content, attributes) for this file
        self.metainfo = zope.getDBFileMeta(fileid)
        assert self.metainfo

    def checkContext(self, tags, content, ctxurl):
        """Update the [link, name] context pair from a context meta entry."""
        if tags.endswith('/context/link') and content:
            # context link URL
            ctxurl[0] = content
        elif tags.endswith('/context/name') and content:
            # context link label
            ctxurl[1] = content

        return ctxurl
448:
449:
class BibResult(MetaResult):
    """Result object for bibliographic (/meta/bib/*) metadata."""

    def __init__(self, zope, db_result, rank):
        """Collect the /meta/bib/* entries of the hit file."""
        MetaResult.__init__(self, zope, db_result, rank)
        self.type = "bib"
        self.zptFile = os.path.join(package_home(globals()), "zpt/searchResult_bib.zpt")
        storage = storageURL(self.file)
        if storage:
            self.urls.append(storage)
        (fileid, tagidx, tags, content) = db_result

        btype = ""
        bitems = {}
        ctxurl = ['', '']

        for entry in self.metainfo:
            (m_idx, m_tags, m_content, m_attributes) = entry
            # update context url from context tags
            ctxurl = self.checkContext(m_tags, m_content, ctxurl)
            # the /meta/bib tag itself carries the bib type attribute
            if m_tags.endswith('/meta/bib'):
                match = re.search('type="([^"]*)"', m_attributes)
                if match:
                    btype = match.group(1)

                if not btype:
                    btype = "*unknown*"

                bitems['type'] = btype
                continue

            # skip everything before the bib type was seen
            if not btype:
                continue

            # collect bib/something entries
            match = re.search('/meta/bib/(.*)', m_tags)
            if match:
                key = match.group(1)
                bitems[key] = m_content
                # remember the entry that was the search hit
                if m_tags == self.hitTag:
                    self.hitTag = key
                continue

        self.content = bitems
        # default label for the context link
        if not ctxurl[1]:
            ctxurl[1] = "View"
        # only store a context that has a link
        if ctxurl[0]:
            self.urls.append(ctxurl)

        # bib results rank above generic results
        self.rank += 100

    def getContentList(self):
        """Return content as a list of (key, value) tuples, preferred keys first."""
        remaining = self.content.copy()
        ordered = []
        # preferred items first
        for key in ('author', 'title', 'journal', 'year'):
            if key in remaining:
                ordered.append((key, remaining.pop(key)))

        # the type entry is not shown
        del remaining['type']
        # append the rest
        for key in remaining.keys():
            ordered.append((key, remaining[key]))

        return ordered
525:
526:
class ArchimResult(MetaResult):
    """Result object for Archimedes (/meta/archimedes/*) metadata."""

    def __init__(self, zope, db_result, rank):
        """Collect the /meta/archimedes/* entries of the hit file."""
        MetaResult.__init__(self, zope, db_result, rank)
        self.type = "archim"
        self.zptFile = os.path.join(package_home(globals()), "zpt/searchResult_archim.zpt")
        storage = storageURL(self.file)
        if storage:
            self.urls.append(storage)

        (fileid, tagidx, tags, content) = db_result

        # collect metadata entries and the context link
        bitems = {}
        ctxurl = ['', '']
        for entry in self.metainfo:
            (m_idx, m_tags, m_content, m_attributes) = entry
            # update context url from context tags
            ctxurl = self.checkContext(m_tags, m_content, ctxurl)
            # collect archimedes/something entries
            match = re.search('/meta/archimedes/(.*)', m_tags)
            if match:
                key = match.group(1)
                bitems[key] = m_content
                # remember the entry that was the search hit
                if m_tags == self.hitTag:
                    self.hitTag = key

        self.content = bitems
        # archimedes results rank above generic results
        self.rank += 100
        # default label for the context link
        if not ctxurl[1]:
            ctxurl[1] = "View"
        # only store a context that has a link
        if ctxurl[0]:
            self.urls.append(ctxurl)


    def getContentList(self):
        """Return content as a list of (key, value) tuples, preferred keys first."""
        remaining = self.content.copy()
        ordered = []
        # preferred items first
        for key in ('author', 'title', 'date', 'place'):
            if key in remaining:
                ordered.append((key, remaining.pop(key)))

        # append the rest
        for key in remaining.keys():
            ordered.append((key, remaining[key]))

        return ordered
585:
586:
587:
588:
def ranksort(res1, res2):
    """Comparison function: sort results by descending rank.

    Returns a negative, zero or positive integer like the old builtin
    cmp(res2.rank, res1.rank). Written without cmp() so it also works
    on Python 3 (cmp() was removed there).
    """
    return (res2.rank > res1.rank) - (res2.rank < res1.rank)
592:
593:
def statusForFile(filename):
    """Heuristic: map an index file name to a document status.

    Returns "online", "archive", "database" or None.
    """
    # known filename prefixes and their status
    prefixes = (
        ('/mpiwg/online/', "online"),
        ('/mpiwg/archive/', "archive"),
        ('http://', "database"),
        )
    for (prefix, status) in prefixes:
        if filename.startswith(prefix):
            return status

    return None
605:
def storageURL(filename):
    """Heuristic: map an index file name to a (url, name) pair.

    Returns None when no URL can be derived.
    """
    if filename.startswith('/mpiwg/online/'):
        # strip the trailing /index.meta to get the document path
        match = re.search('^(.*)/index.meta', filename)
        if match:
            url = "http://content.mpiwg-berlin.mpg.de/mpistorage/storage/ShowOnline/index_html?path=%s"%match.group(1)
            return (url, "Storage System")

    elif filename.startswith('http://'):
        # the file name already is an URL
        return (filename, "Online Database")

    return None
626:
# FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>