Annotation of OSAS/OSA_system/OSAS_search.py, revision 1.3
1.1 casties 1: """Metadata search interface
2: ROC 2004, itgroup
3:
4: """
5:
6: from AccessControl import ClassSecurityInfo
7: from Globals import InitializeClass
8: from Globals import Persistent, package_home
9: from Products.PageTemplates.PageTemplateFile import PageTemplateFile
10: from Products.PageTemplates.PageTemplate import PageTemplate
11: from OFS.SimpleItem import SimpleItem
12: #from pyPgSQL import PgSQL
13: import psycopg as PgSQL
14:
15: import re
16: import os
17:
# upper bound used by OSAS_search.dbSearch when reading rows of a query result
MAXHITS = 1000
19:
class OSAS_search(SimpleItem):
    """Object for global metadata search.

    Queries the ``meta`` and ``files`` tables through a PgSQL connection
    and renders the hits with page templates.  Search state (query string,
    search type, results, paging indexes) is kept in ``REQUEST.SESSION``.
    """

    meta_type = "OSAS_search"

    def __init__(self, id, title, dsn=None):
        """Initialise the search object.

        id    -- Zope object id
        title -- Zope object title
        dsn   -- database connection string; the built-in default below is
                 used when it is empty
        """
        self.id = id
        self.title = title
        if dsn:
            self.dsn = dsn
        else:
            # NOTE(review): host and credentials are hard-coded here;
            # consider always passing a dsn instead.
            self.dsn = "host=foxridge.mpiwg-berlin.mpg.de dbname=storage user=archiveread password=archiveread"
        # volatile database connection object
        self._v_dbCon = None
        self._v_tryCon = 0

    def dbCursor(self):
        """Return a new SQL cursor object.

        Re-uses the volatile connection when there is one and reconnects
        when creating a cursor fails.  After three failed attempts in a row
        an AssertionError is raised.
        """
        if getattr(self, '_v_dbCon', None) is None:
            # no (usable) connection yet: start from a clean state
            self._v_dbCon = None
            self._v_tryCon = 0

        while True:
            if self._v_dbCon is not None:
                try:
                    curs = self._v_dbCon.cursor()
                except Exception:
                    # in case of problems reset dbCon and count the failure
                    self._v_dbCon = None
                    self._v_tryCon += 1
                else:
                    self._v_tryCon = 0
                    return curs

            if self._v_tryCon >= 3:
                # explicit raise (same exception type as the former assert)
                # so that the check also works when asserts are disabled
                raise AssertionError("AIIEE no db cursor!!")

            # (re)connect and try again with the new connection
            self._v_dbCon = PgSQL.connect(self.dsn, serialize=0)

    def getDBFileMeta(self, fileid):
        """Return all meta entries of fileid.

        The result of ``fetchall()`` (rows of idx, tags, content,
        attributes ordered by idx) is cached per fileid in the session
        under 'dbMeta'.
        """
        session = self.REQUEST.SESSION
        metacache = {}
        # try in cache
        if session.has_key('dbMeta'):
            metacache = session['dbMeta']
            if fileid in metacache:
                return metacache[fileid]

        sql = 'SELECT idx,tags,content,attributes FROM meta WHERE fileid=%(id)s ORDER BY idx'
        curs = self.dbCursor()
        try:
            # debug trace on stdout (kept from the original)
            print("%s  ->  %s" % (sql, fileid))
            curs.execute(sql, {'id': fileid})
            print("done")
            res = curs.fetchall()
        finally:
            # release the cursor even when the query fails
            curs.close()

        # store info in cache
        metacache[fileid] = res
        session['dbMeta'] = metacache
        return res

    def getDBFile(self, fileid):
        """Return the file information of fileid.

        The result of ``fetchone()`` (a row of filename, mtime) is cached
        per fileid in the session under 'dbFiles'.
        """
        session = self.REQUEST.SESSION
        filecache = {}
        # try in cache
        if session.has_key('dbFiles'):
            filecache = session['dbFiles']
            if fileid in filecache:
                return filecache[fileid]

        sql = 'select filename,mtime from files where id=%(id)s'
        curs = self.dbCursor()
        try:
            # debug trace on stdout (kept from the original)
            print("DBFILE:  %s  ->  %s" % (sql, fileid))
            curs.execute(sql, {'id': fileid})
            print("DBFILE: done")
            res = curs.fetchone()
        finally:
            # release the cursor even when the query fails
            curs.close()

        # store info in cache
        filecache[fileid] = res
        session['dbFiles'] = filecache
        return res

    def dbSearch(self, query, type):
        """Search the DB for query and return the result set.

        query -- search text; it is used as a LIKE pattern, so '%' and '_'
                 inside it act as wildcards
        type  -- 'equals', 'startswith' or 'contains'

        Always returns a tuple (results, restypelist): the list of result
        objects and the list of distinct result types among them.  An empty
        query yields ([], []).  Raises ValueError for an unknown type.
        """
        results = []
        restypes = {}
        if not query:
            # empty query: same tuple shape as a real search
            return (results, [])

        if type == 'equals':
            qs = query
        elif type == 'startswith':
            qs = query + "%"
        elif type == 'contains':
            qs = "%" + query + "%"
        else:
            raise ValueError("unknown search type: %r" % (type,))

        sql = 'select fileid,idx,tags,content from meta where lower(content) like lower(%(qs)s)'
        curs = self.dbCursor()
        try:
            # debug trace on stdout (kept from the original)
            print("%s  ->  %s" % (sql, qs))
            curs.execute(sql, {'qs': qs})
            print("done")
            hits = 0
            res = curs.fetchone()
            # process at most MAXHITS rows
            while res and hits < MAXHITS:
                result = self.getResult(res)
                if result:
                    results.append(result)
                    restypes[result.type] = result.type
                hits += 1
                res = curs.fetchone()
        finally:
            # release the cursor even when the query or a result fails
            curs.close()

        return (results, list(restypes.keys()))

    def getResult(self, db_result, rank=0):
        """Factory for result objects.

        db_result is a row (fileid, idx, tags, content); the result class
        is chosen from the tag path of the hit.
        """
        (fileid, tagidx, tags, content) = db_result
        if '/meta/bib/' in tags:
            return BibResult(self, db_result, rank)
        if '/meta/archimedes/' in tags:
            return ArchimResult(self, db_result, rank)
        return AnyResult(self, db_result, rank)

    def renderResult(self, result):
        """Return the HTML rendering of a search result."""
        return result.render(self)

    def filterResults(self, results, start, end, restypefilter=None):
        """Return the results that match a filter.

        Keeps only results whose type is in restypefilter (all results if
        the filter is empty) and returns (resgroup, rescnt): the slice
        [start:end] of the filtered list and the filtered total count.
        """
        # filter types first
        if restypefilter:
            res = [r for r in results if r.type in restypefilter]
        else:
            res = results
        # then cut out the requested group; the count is the new total
        return (res[start:end], len(res))

    #
    # Web page stuff
    #

    def index_html(self):
        """metadata search"""
        pt = PageTemplateFile(os.path.join(package_home(globals()), "zpt/OSAS_search.zpt")).__of__(self)
        return pt()

    def search(self, searchstring=None, searchtype='startswith', start=1, count=10, restypefilter=None):
        """Search and create the result page.

        searchstring  -- query; the last query from the session is re-used
                         (with its type) when empty
        searchtype    -- 'equals', 'startswith' or 'contains'
        start         -- 1-based index of the first result to show
        count         -- number of results per page
        restypefilter -- optional collection of result types to show

        The database is only queried when the query or type differs from
        the one stored in the session.
        """
        # start and count may arrive as strings
        count = int(count)
        sres = max(int(start) - 1, 0)
        lres = sres + count
        session = self.REQUEST.SESSION
        try:
            oldsearch = session['searchstring']
            oldtype = session['searchtype']
        except KeyError:
            oldsearch = ""
            oldtype = ""

        if not searchstring:
            searchstring = oldsearch
            searchtype = oldtype

        if not oldsearch or searchstring != oldsearch or searchtype != oldtype:
            # new search
            (res, restypes) = self.dbSearch(searchstring, searchtype)
            # sort the result
            res.sort(ranksort)
            # store it
            session['results'] = res
            session['searchstring'] = searchstring
            session['searchtype'] = searchtype
            session['resulttypes'] = restypes

        (resgroup, nres) = self.filterResults(session['results'], sres, lres, restypefilter)
        lres = min(lres, nres)
        session['resultgroup'] = resgroup
        session['res_indexes'] = (sres + 1, lres, nres, count)
        session['res_type_filter'] = restypefilter
        if nres > 0:
            zpt = "zpt/searchResult.zpt"
        else:
            zpt = "zpt/searchResult_none.zpt"

        pt = PageTemplateFile(os.path.join(package_home(globals()), zpt)).__of__(self)
        return pt()

    def getSearchType(self):
        """Return the last search type ("" if there is none)."""
        try:
            return self.REQUEST.SESSION['searchtype']
        except (KeyError, AttributeError):
            return ""

    def getSearchString(self):
        """Return the last search string ("" if there is none)."""
        try:
            return self.REQUEST.SESSION['searchstring']
        except (KeyError, AttributeError):
            return ""

    def hasNextResults(self):
        """Return whether there are results after the current page."""
        try:
            (first, last, total, count) = self.REQUEST.SESSION['res_indexes']
        except (KeyError, AttributeError):
            return False
        # 'last' is the index of the last result shown on this page
        return last < total

    def hasPrevResults(self):
        """Return whether there are results before the current page."""
        try:
            (first, last, total, count) = self.REQUEST.SESSION['res_indexes']
        except (KeyError, AttributeError):
            return False
        return first > 1

    def nextResults(self):
        """Return the page with the next group of results."""
        try:
            (first, last, total, count) = self.REQUEST.SESSION['res_indexes']
        except (KeyError, AttributeError):
            print("OUCH: no next results!")
            return self.search()

        # do not step past the last result
        first = min(first + count, total)
        return self.search(start=first, count=count)

    def prevResults(self):
        """Return the page with the previous group of results."""
        try:
            (first, last, total, count) = self.REQUEST.SESSION['res_indexes']
        except (KeyError, AttributeError):
            print("OUCH: no prev results!")
            return self.search()

        # do not step before the first result
        first = max(first - count, 1)
        return self.search(start=first, count=count)
1.1 casties 319:
1.2 casties 320:
321:
def manage_AddOSAS_searchForm(self):
    """Render the form for adding an OSAS_search object."""
    template_path = os.path.join(package_home(globals()), "zpt/AddOSAS_search.zpt")
    template = PageTemplateFile(template_path).__of__(self)
    return template()
326:
def manage_AddOSAS_search(self, id, title=None, dsn=None, RESPONSE=None):
    """Create an OSAS_search object and store it under id.

    Redirects to 'manage_main' when a RESPONSE is given.
    """
    self._setObject(id, OSAS_search(id, title, dsn))
    if RESPONSE is None:
        return
    RESPONSE.redirect('manage_main')
333:
334:
335:
336:
class SearchResult(SimpleItem):
    """Common base class of all search result objects."""

    def __init__(self, type='unknown', file=None, url=None, content=None, rank=0):
        """Set up the attributes shared by all result types."""
        # kind of result (e.g. "bib", "archim")
        self.type = type
        # name of the index file
        self.file = file
        # urls for the result (list of pairs); an empty value gives a new list
        self.urls = url or []
        # the actual content (list of tuples)
        self.content = content
        # status of the document (e.g. "online", "archive")
        self.status = None
        # rank used for presentation order
        self.rank = rank
357:
class AnyResult(SearchResult):
    """Fallback result object for hits of no particular type."""

    def __init__(self, zope, db_result, rank):
        """Build a catch-all result from a row (fileid, idx, tags, content).

        zope is the object used to look up the file information
        (its getDBFile method is called).
        """
        SearchResult.__init__(self)
        self.type = 'unknown'
        self.zptFile = os.path.join(package_home(globals()), "zpt/searchResult_any.zpt")

        db_fileid, db_tagidx, db_tags, db_content = db_result
        self.hitTag = db_tags

        # full file info comes from the db
        self.fileinfo = zope.getDBFile(db_fileid)
        assert self.fileinfo

        # the only content is the tag that was hit
        self.content = {db_tags: db_content}
        self.file = self.fileinfo[0]
        self.status = statusForFile(self.file)
        self.rank = rank

    def getContentList(self):
        """Return the content as a list of (key, value) tuples."""
        return [(key, self.content[key]) for key in self.content.keys()]

    def render(self, zope):
        """Render this result with its page template.

        The result is put into the session as 'result' before rendering.
        """
        zope.REQUEST.SESSION['result'] = self
        template = PageTemplateFile(self.zptFile).__of__(zope)
        return template()
395:
396:
class MetaResult(AnyResult):
    """Result object that also loads all metadata of the file."""

    def __init__(self, zope, db_result, rank):
        """Build the result and fetch the meta entries of its file."""
        AnyResult.__init__(self, zope, db_result, rank)

        fileid = db_result[0]
        # all meta entries of the file come from the db
        self.metainfo = zope.getDBFileMeta(fileid)
        assert self.metainfo

    def checkContext(self, tags, content, ctxurl):
        """Update ctxurl from a context meta entry and return it.

        A '/context/link' entry fills ctxurl[0], a '/context/name' entry
        fills ctxurl[1]; empty content and other tags change nothing.
        """
        if tags.endswith('/context/link'):
            slot = 0
        elif tags.endswith('/context/name'):
            slot = 1
        else:
            slot = None

        if slot is not None and content:
            ctxurl[slot] = content

        return ctxurl
1.1 casties 424:
425:
class BibResult(MetaResult):
    """bib type result object"""

    def __init__(self, zope, db_result, rank):
        """Build a bib result from a row (fileid, idx, tags, content).

        Collects all '/meta/bib/...' entries of the file as content (plus
        the bib type under the key 'type'), adds the storage and context
        urls and raises the rank by 100.
        """
        MetaResult.__init__(self, zope, db_result, rank)
        self.type = "bib"
        self.zptFile = os.path.join(package_home(globals()), "zpt/searchResult_bib.zpt")
        url = storageURL(self.file)
        if url:
            self.urls.append(url)

        btype = ""
        bitems = {}
        ctxurl = ['', '']

        for (m_idx, m_tags, m_content, m_attributes) in self.metainfo:
            # context tag
            ctxurl = self.checkContext(m_tags, m_content, ctxurl)
            # first tag with bib type attribute
            if m_tags.endswith('/meta/bib'):
                # tolerate a missing (None) attributes value
                r = re.search('type="([^"]*)"', m_attributes or '')
                if r:
                    btype = r.group(1)

                if not btype:
                    btype = "*unknown*"

                bitems['type'] = btype
                continue

            # skip other tags until the bib tag has been seen
            if not btype:
                continue

            # collect bib/something
            r = re.search('/meta/bib/(.*)', m_tags)
            if r:
                k = r.group(1)
                bitems[k] = m_content
                # remember hit tag by its short name
                if m_tags == self.hitTag:
                    self.hitTag = k

        self.content = bitems
        # store context
        if not ctxurl[1]:
            ctxurl[1] = "View"
        # must have link
        if ctxurl[0]:
            self.urls.append(ctxurl)

        self.rank += 100

    def getContentList(self):
        """Return the content as a list of tuples in preferred order.

        'author', 'title', 'journal' and 'year' come first (when present),
        the 'type' entry is left out, the remaining items follow.
        """
        l = []
        c = self.content.copy()
        # preferred items first
        for k in ('author', 'title', 'journal', 'year'):
            if k in c:
                l.append((k, c.pop(k)))

        # no type; it is absent when the file had no '/meta/bib' entry
        c.pop('type', None)
        # copy the rest
        for k in c.keys():
            l.append((k, c[k]))

        return l
1.1 casties 501:
502:
class ArchimResult(MetaResult):
    """Result object for archimedes type metadata."""

    def __init__(self, zope, db_result, rank):
        """Build an archimedes result from a row (fileid, idx, tags, content).

        Collects all '/meta/archimedes/...' entries of the file as content,
        adds the storage and context urls and raises the rank by 100.
        """
        MetaResult.__init__(self, zope, db_result, rank)
        self.type = "archim"
        self.zptFile = os.path.join(package_home(globals()), "zpt/searchResult_archim.zpt")
        storage_link = storageURL(self.file)
        if storage_link:
            self.urls.append(storage_link)

        # walk over all meta entries of the file
        collected = {}
        context = ['', '']
        for m_idx, m_tags, m_content, m_attributes in self.metainfo:
            # context entries update the context url
            context = self.checkContext(m_tags, m_content, context)
            # archimedes/something entries become content
            found = re.search('/meta/archimedes/(.*)', m_tags)
            if not found:
                continue
            short_tag = found.group(1)
            collected[short_tag] = m_content
            # the hit tag is remembered by its short name
            if m_tags == self.hitTag:
                self.hitTag = short_tag

        self.content = collected
        self.rank += 100
        # the context needs a name ...
        if not context[1]:
            context[1] = "View"
        # ... and is only stored when it has a link
        if context[0]:
            self.urls.append(context)

    def getContentList(self):
        """Return the content as a list of tuples in preferred order.

        'author', 'title', 'date' and 'place' come first (when present),
        the remaining items follow.
        """
        remaining = self.content.copy()
        ordered = []
        # preferred items first
        for key in ('author', 'title', 'date', 'place'):
            if key in remaining:
                ordered.append((key, remaining.pop(key)))

        # then everything that is left
        ordered.extend(remaining.items())
        return ordered
1.1 casties 561:
562:
563:
564:
def ranksort(res1, res2):
    """Comparison function that orders results by descending rank.

    Returns 1, 0 or -1 as res2.rank is greater than, equal to or less
    than res1.rank.
    """
    first, second = res2.rank, res1.rank
    return (first > second) - (first < second)
568:
569:
def statusForFile(filename):
    """Heuristic: return the status for an index file name.

    The status is chosen from the start of the name: "online", "archive"
    or "database"; None when no known prefix matches.
    """
    known_prefixes = (
        ('/mpiwg/online/', "online"),
        ('/mpiwg/archive/', "archive"),
        ('http://', "database"),
    )
    for prefix, status in known_prefixes:
        if filename.startswith(prefix):
            return status

    return None
! 581:
def storageURL(filename):
    """Heuristic: return an URL for an index file name.

    For a name that starts with '/mpiwg/online/' and contains
    '/index.meta' the result is a pair (url, name) pointing to the
    directory of the index file in the storage system.  In all other
    cases None is returned.
    """
    if not filename.startswith('/mpiwg/online/'):
        return None

    # the dot is escaped so that only a literal 'index.meta' matches
    r = re.search(r'^(.*)/index\.meta', filename)
    if not r:
        return None

    url = "http://content.mpiwg-berlin.mpg.de/mpistorage/storage/ShowOnline/index_html?path=%s" % r.group(1)
    return (url, "Storage System")
1.1 casties 597:
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>