Annotation of OSAS/OSA_system/OSAS_search.py, revision 1.5
1.1 casties 1: """Metadata search interface
2: ROC 2004, itgroup
3:
4: """
5:
6: from AccessControl import ClassSecurityInfo
7: from Globals import InitializeClass
8: from Globals import Persistent, package_home
9: from Products.PageTemplates.PageTemplateFile import PageTemplateFile
10: from Products.PageTemplates.PageTemplate import PageTemplate
11: from OFS.SimpleItem import SimpleItem
12: #from pyPgSQL import PgSQL
13: import psycopg as PgSQL
14:
15: import re
16: import os
17:
18: MAXHITS = 1000
19:
class OSAS_search(SimpleItem):
    """Object for global metadata search.

    Runs LIKE queries against the ``meta`` table of the database named by
    ``self.dsn``, wraps each hit in a result object (BibResult, ArchimResult
    or AnyResult) and keeps the search state (results, paging indexes,
    per-file caches) in ``self.REQUEST.SESSION``.
    """

    meta_type="OSAS_search"

    def __init__(self,id,title,dsn=None):
        """Store id, title and the database connection string.

        id    -- object id
        title -- object title
        dsn   -- connection string handed to PgSQL.connect(); when empty
                 the built-in default below is used
        """
        self.id=id
        self.title=title
        if dsn:
            self.dsn = dsn
        else:
            # NOTE(review): the default credentials are hard-coded in the
            # source; consider moving them to configuration.
            self.dsn = "host=foxridge.mpiwg-berlin.mpg.de dbname=storage user=archiveread password=archiveread"
        # volatile database connection object
        self._v_dbCon = None
        # number of consecutive failed attempts to obtain a cursor
        self._v_tryCon = 0


    def dbCursor(self):
        """Return a new SQL cursor, (re)connecting when necessary.

        A cached connection in ``self._v_dbCon`` is reused.  When creating
        a cursor on it fails, the connection is dropped and a new one is
        opened; after three consecutive failures an AssertionError is
        raised.  Exceptions raised by PgSQL.connect() itself propagate.
        """
        if getattr(self, '_v_dbCon', None) is None:
            # no usable connection yet: start with a clean retry counter
            self._v_dbCon = None
            self._v_tryCon = 0

        while True:
            if self._v_dbCon is not None:
                try:
                    curs = self._v_dbCon.cursor()
                except Exception:
                    # in case of problems reset dbCon
                    self._v_dbCon = None
                    self._v_tryCon = getattr(self, '_v_tryCon', 0) + 1
                else:
                    self._v_tryCon = 0
                    return curs

            if self._v_tryCon >= 3:
                # explicit raise (not ``assert``) so the check survives -O
                raise AssertionError("AIIEE no db cursor!!")

            self._v_dbCon = PgSQL.connect(self.dsn, serialize=0)


    def getDBFileMeta(self, fileid):
        """Return all meta rows (idx, tags, content, attributes) of fileid.

        Rows are ordered by idx.  Results are cached per file id in the
        session under the key 'dbMeta'.
        """
        session = self.REQUEST.SESSION
        metacache = {}
        # try in cache
        if session.has_key('dbMeta'):
            metacache = session['dbMeta']
            if fileid in metacache:
                return metacache[fileid]

        sql = 'SELECT idx,tags,content,attributes FROM meta WHERE fileid=%(id)s ORDER BY idx'
        curs = self.dbCursor()
        try:
            print("%s  ->  %s" % (sql, fileid))
            curs.execute(sql, {'id':fileid})
            print("done")
            res = curs.fetchall()
        finally:
            # release the cursor even when the query fails
            curs.close()

        # store info in cache (re-assign so the session notices the change)
        metacache[fileid] = res
        session['dbMeta'] = metacache
        return res


    def getDBFile(self, fileid):
        """Return the (filename, mtime) row of fileid.

        The value is whatever fetchone() yields for the query, so it is
        None when no row matches.  Results are cached per file id in the
        session under the key 'dbFiles'.
        """
        session = self.REQUEST.SESSION
        filecache = {}
        # try in cache
        if session.has_key('dbFiles'):
            filecache = session['dbFiles']
            if fileid in filecache:
                return filecache[fileid]

        sql = 'select filename,mtime from files where id=%(id)s'
        curs = self.dbCursor()
        try:
            print("DBFILE:  %s  ->  %s" % (sql, fileid))
            curs.execute(sql, {'id':fileid})
            print("DBFILE: done")
            res = curs.fetchone()
        finally:
            # release the cursor even when the query fails
            curs.close()

        # store info in cache (re-assign so the session notices the change)
        filecache[fileid] = res
        session['dbFiles'] = filecache
        return res


    def dbSearch(self, query, type):
        """Search the meta table for query and return (results, restypes).

        query -- search text; an empty query yields ([], [])
        type  -- 'equals', 'startswith' or 'contains'; selects how the
                 LIKE pattern is built from query

        results is a list of result objects, restypes the list of distinct
        result.type values among them.  At most MAXHITS rows are read.
        Raises ValueError for an unknown type.
        """
        results = []
        restypes = {}
        if not query:
            # empty query: same (results, types) shape as a real search so
            # that callers can always unpack the return value
            return (results, [])

        if type == 'equals':
            qs = query
        elif type == 'startswith':
            qs = query + "%"
        elif type == 'contains':
            qs = "%" + query + "%"
        else:
            raise ValueError("unknown search type: %r" % (type,))

        sql = 'select fileid,idx,tags,content from meta where lower(content) like lower(%(qs)s)'
        curs = self.dbCursor()
        try:
            print("%s  ->  %s" % (sql, qs))
            curs.execute(sql, {'qs':qs})
            print("done")
            rescnt = 0
            res = curs.fetchone()
            while res and rescnt < MAXHITS:
                result = self.getResult(res)
                if result:
                    results.append(result)
                    restypes[result.type] = result.type

                rescnt += 1
                res = curs.fetchone()
        finally:
            # release the cursor even when a row could not be processed
            curs.close()

        return (results, list(restypes.keys()))


    def getResult(self, db_result, rank=0):
        """Factory for result objects.

        db_result is a (fileid, idx, tags, content) row; the result class
        is chosen from the tag path of the hit.
        """
        (fileid, tagidx, tags, content) = db_result

        if tags.find('/meta/bib/') > -1:
            return BibResult(self, db_result, rank)
        if tags.find('/meta/archimedes/') > -1:
            return ArchimResult(self, db_result, rank)
        return AnyResult(self, db_result, rank)


    def renderResult(self, result):
        """Return the HTML rendering of a search result."""
        return result.render(self)


    def filterResults(self, results, start, end, restypefilter=None):
        """Return (resgroup, rescnt) for a type filter and a slice.

        results       -- list of result objects
        start, end    -- slice bounds applied after filtering
        restypefilter -- container of accepted result.type values; when
                         empty or None all results are kept

        resgroup is the requested slice, rescnt the number of results that
        passed the type filter (before slicing).
        """
        # filter types first
        if restypefilter:
            res = [r for r in results if r.type in restypefilter]
        else:
            res = results
        # total count is taken after filtering, the slice after that
        return (res[start:end], len(res))


    #
    # Web page stuff
    #

    def index_html(self):
        """Render the metadata search page."""
        pt=PageTemplateFile(os.path.join(package_home(globals()), "zpt/OSAS_search.zpt")).__of__(self)
        return pt()


    def search(self, searchstring=None, searchtype='startswith', start=1, count=10, restypefilter=None):
        """Search (or page through the stored search) and render the result.

        searchstring  -- text to search for; when empty the search string
                         and type stored in the session are reused
        searchtype    -- 'equals', 'startswith' or 'contains'
        start         -- 1-based index of the first result to show
        count         -- number of results per page
        restypefilter -- container of result types to show, or None

        A database search is only run when the search string or type
        differs from the one stored in the session.  The session keys
        'results', 'searchstring', 'searchtype', 'resulttypes',
        'resultgroup', 'res_indexes' and 'res_type_filter' are updated.
        """
        session = self.REQUEST.SESSION
        # request parameters may arrive as strings
        count = int(count)
        # a start below 1 would become a negative slice index
        sres = max(int(start) - 1, 0)
        lres = sres + count
        try:
            oldsearch = session['searchstring']
            oldtype = session['searchtype']
        except KeyError:
            oldsearch = ""
            oldtype = ""

        if not searchstring:
            searchstring = oldsearch
            searchtype = oldtype

        if not oldsearch or searchstring != oldsearch or searchtype != oldtype:
            # new search
            (res, restypes) = self.dbSearch(searchstring, searchtype)
            # sort the result
            res.sort(ranksort)
            # store it
            session['results'] = res
            session['searchstring'] = searchstring
            session['searchtype'] = searchtype
            session['resulttypes'] = restypes

        (resgroup, nres) = self.filterResults(session['results'], sres, lres, restypefilter)
        lres = min(lres, nres)
        sres = min(sres, nres)
        session['resultgroup'] = resgroup
        # (first, last, total, count) with a 1-based first index
        session['res_indexes'] = (sres+1, lres, nres, count)
        session['res_type_filter'] = restypefilter
        if nres > 0:
            zpt = "zpt/searchResult.zpt"
        else:
            zpt = "zpt/searchResult_none.zpt"

        pt=PageTemplateFile(os.path.join(package_home(globals()), zpt)).__of__(self)
        return pt()


    def getSearchType(self):
        """Return the last search type, or "" when none is stored."""
        try:
            return self.REQUEST.SESSION['searchtype']
        except (KeyError, AttributeError):
            return ""


    def getSearchString(self):
        """Return the last search string, or "" when none is stored."""
        try:
            return self.REQUEST.SESSION['searchstring']
        except (KeyError, AttributeError):
            return ""


    def _resIndexes(self):
        """Return the stored (first, last, total, count) tuple or None."""
        try:
            (first, last, total, count) = self.REQUEST.SESSION['res_indexes']
        except (KeyError, AttributeError, TypeError, ValueError):
            # nothing stored yet, or not a 4-tuple
            return None
        return (first, last, total, count)


    def hasNextResults(self):
        """Return whether there are results after the current page."""
        idx = self._resIndexes()
        if idx is None:
            return False
        (first, last, total, count) = idx
        # last is the index of the last result shown on the current page
        return (last < total)


    def hasPrevResults(self):
        """Return whether there are results before the current page."""
        idx = self._resIndexes()
        if idx is None:
            return False
        (first, last, total, count) = idx
        return (first > 1)


    def nextResults(self):
        """Render the next page of results."""
        idx = self._resIndexes()
        if idx is None:
            print("OUCH: no next results!")
            return self.search()

        (first, last, total, count) = idx
        first = first + count
        if first > total:
            first = total

        # NOTE(review): the stored 'res_type_filter' is not passed on, so
        # paging shows unfiltered results -- confirm this is intended.
        return self.search(start=first, count=count)


    def prevResults(self):
        """Render the previous page of results."""
        idx = self._resIndexes()
        if idx is None:
            print("OUCH: no prev results!")
            return self.search()

        (first, last, total, count) = idx
        first = first - count
        if first < 1:
            first = 1

        # NOTE(review): the stored 'res_type_filter' is not passed on, so
        # paging shows unfiltered results -- confirm this is intended.
        return self.search(start=first, count=count)
1.1 casties 320:
1.2 casties 321:
322:
def manage_AddOSAS_searchForm(self):
    """Render the form for adding an OSAS_search object."""
    zptPath = os.path.join(package_home(globals()), "zpt/AddOSAS_search.zpt")
    form = PageTemplateFile(zptPath).__of__(self)
    return form()
327:
def manage_AddOSAS_search(self,id,title=None,dsn=None,RESPONSE=None):
    """Create an OSAS_search object and store it in self under id.

    When RESPONSE is given, it is redirected to 'manage_main' afterwards.
    """
    self._setObject(id, OSAS_search(id,title,dsn))
    if RESPONSE is None:
        return
    RESPONSE.redirect('manage_main')
334:
335:
336:
337:
class SearchResult(SimpleItem):
    """Base class of all search result objects."""

    def __init__(self, type='unknown', file=None, url=None, content=None, rank=0):
        """Initialise the common result attributes.

        type    -- result type (e.g. "bib", "archim")
        file    -- index file name
        url     -- urls for the result (list of pairs); an empty list is
                   used when nothing is given
        content -- actual content
        rank    -- result rank for presentation
        """
        self.type = type
        self.file = file
        self.urls = url or []
        self.content = content
        # document status (e.g. "online", "archive"), set by subclasses
        self.status = None
        self.rank = rank
358:
class AnyResult(SearchResult):
    """Catch-all result object for hits of no special type."""

    def __init__(self, zope, db_result, rank):
        """Build the result from one (fileid, idx, tags, content) row.

        zope is the search object; its getDBFile() supplies the file
        information for the hit.
        """
        SearchResult.__init__(self)
        self.type='unknown'
        self.zptFile = os.path.join(package_home(globals()), "zpt/searchResult_any.zpt")

        (db_fileid, db_tagidx, db_tags, db_content) = db_result
        # tag path of the entry that matched the query
        self.hitTag = db_tags

        # get full info from db
        self.fileinfo = zope.getDBFile(db_fileid)
        assert self.fileinfo

        # the only content of a catch-all result is the matching entry
        self.content = {db_tags: db_content}
        self.file = self.fileinfo[0]
        self.status = statusForFile(self.file)
        self.rank = rank

    def getContentList(self):
        """Return the content as a list of (key, value) tuples."""
        return [(key, self.content[key]) for key in self.content.keys()]

    def render(self, zope):
        """Render this result with its page template.

        The result is put into the session under 'result' first.
        """
        zope.REQUEST.SESSION['result'] = self
        template = PageTemplateFile(self.zptFile)
        return template.__of__(zope)()
396:
397:
class MetaResult(AnyResult):
    """Result object that also collects all metadata of the hit's file."""

    def __init__(self, zope, db_result, rank):
        """Build the result and load the file's meta rows into metainfo."""
        AnyResult.__init__(self, zope, db_result, rank)

        fileid = db_result[0]
        # sanity check: the row must have exactly four columns
        (fileid, tagidx, tags, content) = db_result

        # get full info from db
        self.metainfo = zope.getDBFileMeta(fileid)
        assert self.metainfo

    def checkContext(self, tags, content, ctxurl):
        """Update ctxurl from a context meta entry and return it.

        ctxurl is a two element list [link, name]; it is changed in place
        when tags ends in '/context/link' or '/context/name' and content
        is not empty.
        """
        if tags.endswith('/context/link'):
            slot = 0
        elif tags.endswith('/context/name'):
            slot = 1
        else:
            # not a context entry
            return ctxurl

        if content:
            ctxurl[slot] = content

        return ctxurl
1.1 casties 425:
426:
class BibResult(MetaResult):
    """Result object for hits in bibliographic (/meta/bib) metadata."""

    def __init__(self, zope, db_result, rank):
        """Build the result from one (fileid, idx, tags, content) row.

        Collects every /meta/bib/... entry that follows the /meta/bib
        entry in the file's meta rows into self.content, keyed by the
        path below /meta/bib/, plus the key 'type' holding the bib type
        attribute ("*unknown*" when it cannot be found).  Context link
        and name are added to self.urls and the rank is raised by 100.
        """
        MetaResult.__init__(self, zope, db_result, rank)
        self.type = "bib"
        self.zptFile = os.path.join(package_home(globals()), "zpt/searchResult_bib.zpt")
        url = storageURL(self.file)
        if url:
            self.urls.append(url)
        (fileid, tagidx, tags, content) = db_result

        btype = ""
        bitems = {}
        ctxurl = ['', '']

        for me in self.metainfo:
            (m_idx, m_tags, m_content, m_attributes) = me
            # context tag
            ctxurl = self.checkContext(m_tags, m_content, ctxurl)
            # first tag with bib type attribute
            if m_tags.endswith('/meta/bib'):
                # the attributes column may be empty (None)
                r = re.search('type="([^"]*)"', m_attributes or '')
                if r:
                    btype = r.group(1)

                if not btype:
                    btype = "*unknown*"

                bitems['type'] = btype
                continue

            # skip other tags until the bib type is known
            if not btype: continue

            # collect bib/something
            r = re.search('/meta/bib/(.*)', m_tags)
            if r:
                k = r.group(1)
                bitems[k] = m_content
                # remember hit tag (shortened to the key used in content)
                if m_tags == self.hitTag:
                    self.hitTag = k

        self.content = bitems
        # store context
        if not ctxurl[1]:
            ctxurl[1] = "View"
        # must have link
        if ctxurl[0]:
            self.urls.append(ctxurl)

        self.rank += 100

    def getContentList(self):
        """Return the content as list of tuples in preferred order.

        'author', 'title', 'journal' and 'year' come first (when present),
        the remaining entries follow; the 'type' entry is left out.
        """
        l = []
        c = self.content.copy()
        # preferred items first
        for k in ('author', 'title', 'journal', 'year'):
            if k in c:
                l.append((k, c[k]))
                del c[k]

        # no type; the key is missing when no /meta/bib entry was found
        if 'type' in c:
            del c['type']
        # copy the rest
        for k in c.keys():
            l.append((k, c[k]))

        return l
1.1 casties 502:
503:
class ArchimResult(MetaResult):
    """Result object for hits in archimedes (/meta/archimedes) metadata."""

    def __init__(self, zope, db_result, rank):
        """Build the result from one (fileid, idx, tags, content) row.

        Collects every /meta/archimedes/... entry of the file into
        self.content, keyed by the path below /meta/archimedes/.  Context
        link and name are added to self.urls and the rank is raised by
        100.
        """
        MetaResult.__init__(self, zope, db_result, rank)
        self.type = "archim"
        self.zptFile = os.path.join(package_home(globals()), "zpt/searchResult_archim.zpt")
        storage = storageURL(self.file)
        if storage:
            self.urls.append(storage)

        # sanity check: the row must have exactly four columns
        (fileid, tagidx, tags, content) = db_result

        # process info
        collected = {}
        context = ['', '']
        for (m_idx, m_tags, m_content, m_attributes) in self.metainfo:
            # context tag
            context = self.checkContext(m_tags, m_content, context)
            # collect archimedes/something
            match = re.search('/meta/archimedes/(.*)', m_tags)
            if not match:
                continue
            key = match.group(1)
            collected[key] = m_content
            # remember hit tag (shortened to the key used in content)
            if m_tags == self.hitTag:
                self.hitTag = key

        self.content = collected
        self.rank += 100
        # store context
        if not context[1]:
            context[1] = "View"
        # must have link
        if context[0]:
            self.urls.append(context)


    def getContentList(self):
        """Return the content as list of tuples in preferred order.

        'author', 'title', 'date' and 'place' come first (when present),
        the remaining entries follow.
        """
        remaining = self.content.copy()
        ordered = []
        # preferred items first
        for key in ('author', 'title', 'date', 'place'):
            if key in remaining:
                ordered.append((key, remaining[key]))
                del remaining[key]

        # copy the rest
        for key in remaining.keys():
            ordered.append((key, remaining[key]))

        return ordered
1.1 casties 562:
563:
564:
565:
def ranksort(res1, res2):
    """Comparison function that sorts results on rank, highest first.

    Returns a negative number, zero or a positive number when res1 has to
    be sorted before, equal to or after res2.
    """
    # same result as cmp(res2.rank, res1.rank), without relying on the
    # cmp builtin
    return (res2.rank > res1.rank) - (res2.rank < res1.rank)
569:
570:
def statusForFile(filename):
    """Heuristic: return the document status for an index file name.

    The status is derived from the prefix of filename: "online",
    "archive" or "database"; None when no known prefix matches.
    """
    known = (
        ('/mpiwg/online/', "online"),
        ('/mpiwg/archive/', "archive"),
        ('http://', "database"),
    )
    for (prefix, status) in known:
        if filename.startswith(prefix):
            return status

    return None
582:
def storageURL(filename):
    """Heuristic: return an (url, name) pair for an index file name.

    Files below /mpiwg/online/ that contain '/index.meta' get a link to
    the storage system viewer, http:// names are returned as they are.
    Returns None when no URL can be derived.
    """
    if filename.startswith('/mpiwg/online/'):
        found = re.search('^(.*)/index.meta', filename)
        if found is None:
            return None
        link = "http://content.mpiwg-berlin.mpg.de/mpistorage/storage/ShowOnline/index_html?path=%s"%found.group(1)
        return (link, "Storage System")

    if filename.startswith('http://'):
        return (filename, "Online Database")

    return None
1.1 casties 603:
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>