File:  [Repository] / OSAS / OSA_system / OSAS_search.py
Revision 1.3: download - view: text, annotated - select for diffs - revision graph
Fri Jul 9 16:55:19 2004 UTC (19 years, 11 months ago) by casties
Branches: MAIN
CVS tags: HEAD
more refinements

    1: """Metadata search interface
    2: ROC 2004, itgroup
    3: 
    4: """
    5: 
    6: from AccessControl import ClassSecurityInfo
    7: from Globals import InitializeClass
    8: from Globals import Persistent, package_home
    9: from Products.PageTemplates.PageTemplateFile import PageTemplateFile
   10: from Products.PageTemplates.PageTemplate import PageTemplate
   11: from OFS.SimpleItem import SimpleItem
   12: #from pyPgSQL import PgSQL
   13: import psycopg as PgSQL
   14: 
   15: import re
   16: import os
   17: 
# upper bound on the number of rows fetched for one search (see dbSearch)
MAXHITS = 1000
   19: 
class OSAS_search(SimpleItem):
    """Object for global metadata search.

    Searches the ``meta`` table of the storage database (via psycopg)
    and presents the hits as SearchResult objects with paging and
    result-type filtering.  Per-visitor state (query, results, paging
    window, DB row caches) is kept in the Zope session.
    """

    # Zope meta_type under which this product appears
    meta_type="OSAS_search"

    

    def __init__(self,id,title,dsn=None):
        """init

        id    -- Zope object id
        title -- display title
        dsn   -- PostgreSQL connection string; defaults to the
                 read-only storage database on foxridge
        """
        self.id=id
        self.title=title
        if dsn:
            self.dsn = dsn
        else:
            self.dsn = "host=foxridge.mpiwg-berlin.mpg.de dbname=storage user=archiveread password=archiveread"
        # volatile database connection object
        # (attributes with the _v_ prefix are not persisted by Zope)
        self._v_dbCon = None
        self._v_tryCon = 0


    def dbCursor(self):
        """returns new SQL cursor object

        Reuses the volatile connection when possible; on any cursor
        error the connection is dropped and reopened (at most 3
        consecutive attempts, counted in _v_tryCon) by calling this
        method recursively with the fresh connection.
        """
        curs = None
        if hasattr(self, '_v_dbCon') and self._v_dbCon is not None:
            try:
                curs = self._v_dbCon.cursor()
                self._v_tryCon = 0
            except:
                # in case of problems reset dbCon
                self._v_dbCon = None
                self._v_tryCon += 1
        else:
            self._v_dbCon = None
            self._v_tryCon = 0
                
        if not curs and self._v_tryCon < 3:
            self._v_dbCon = PgSQL.connect(self.dsn, serialize=0)
            # call ourself with the new connection
            curs = self.dbCursor()

        # NOTE(review): assert is stripped under "python -O"; raising an
        # explicit exception would be safer here
        assert curs, "AIIEE no db cursor!!"
        return curs

    def getDBFileMeta(self, fileid):
        """returns an array with all meta entries of fileid

        Rows are (idx, tags, content, attributes) tuples ordered by idx.
        Results are cached per session under the 'dbMeta' key.
        """

        metacache = {}
        # try in cache
        if self.REQUEST.SESSION.has_key('dbMeta'):
            metacache = self.REQUEST.SESSION['dbMeta']
            if metacache.has_key(fileid):
                res = metacache[fileid]
                #print "meta from cache "
                return res

        curs = self.dbCursor()

        sql = 'SELECT idx,tags,content,attributes FROM meta WHERE fileid=%(id)s ORDER BY idx'
        print sql, " -> ", fileid
        curs.execute(sql, {'id':fileid})
        print "done"

        res = curs.fetchall()
        #print "res:", res
        curs.close()
        # store info in cache
        metacache[fileid] = res
        self.REQUEST.SESSION['dbMeta'] = metacache

        return res

    def getDBFile(self, fileid):
        """returns the file information of fileid

        Returns one (filename, mtime) row from the files table.
        Results are cached per session under the 'dbFiles' key.
        """

        filecache = {}
        # try in cache
        if self.REQUEST.SESSION.has_key('dbFiles'):
            filecache = self.REQUEST.SESSION['dbFiles']
            if filecache.has_key(fileid):
                res = filecache[fileid]
                #print "file from cache "
                return res

        curs = self.dbCursor()

        sql = 'select filename,mtime from files where id=%(id)s'
        print 'DBFILE: ', sql, " -> ", fileid
        curs.execute(sql, {'id':fileid})
        print "DBFILE: done"

        res = curs.fetchone()
        #print "DBFILE: res:", res
        curs.close()
        # store info in cache
        filecache[fileid] = res
        self.REQUEST.SESSION['dbFiles'] = filecache

        return res


    def dbSearch(self, query, type):
        """search DB for query and return result set

        query -- search string (matched case-insensitively)
        type  -- match type: 'equals', 'startswith' or 'contains'

        Returns a tuple (results, restypelist): at most MAXHITS
        SearchResult objects plus the list of distinct result types.
        NOTE(review): for an empty query a bare list is returned, which
        would break the tuple unpacking in search() -- confirm callers.
        """
        results = []
        restypes = {}
        if not query:
            # empty query
            return results
        
        curs = self.dbCursor()
        # build the SQL LIKE pattern from the match type
        # NOTE(review): qs stays unbound if type is none of the three
        # values below, which would raise a NameError at execute time
        if type == 'equals':
            qs = query
        elif type == 'startswith':
            qs = query + "%"
        elif type == 'contains':
            qs = "%" + query + "%"
            
        sql = 'select fileid,idx,tags,content from meta where lower(content) like lower(%(qs)s)'
        print sql, " -> ", qs
        curs.execute(sql, {'qs':qs})
        print "done"
        res = curs.fetchone()
        rescnt = 1
        #print "res0:", res
        while res and rescnt < MAXHITS:
            #print "res:", res
            result = self.getResult(res)
            if result:
                results.append(result)
                # collect the distinct result types (dict used as a set)
                restypes[result.type] = result.type
                
            res = curs.fetchone()
            rescnt += 1

        curs.close()
        #self.dbCon = None

        #print "SEARCH: ", rescnt, " results"
        restypelist = restypes.keys()
        return (results, restypelist)

        
    def getResult(self, db_result, rank=0):
        """factory for result objects

        Chooses the result class from the tag path of the hit row
        (fileid, idx, tags, content).
        """

        (fileid, tagidx, tags, content) = db_result
        res = None

        if tags.find('/meta/bib/') > -1:
            res = BibResult(self, db_result, rank)
        elif tags.find('/meta/archimedes/') > -1:
            res = ArchimResult(self, db_result, rank)
        else:
            res = AnyResult(self, db_result, rank)

        return res


    def renderResult(self, result):
        """returns HTML rendering of a search result"""

        return result.render(self)


    def filterResults(self, results, start, end, restypefilter=None):
        """returns list of results that match a filter

        results       -- full result list
        start, end    -- slice bounds of the filtered list to return
        restypefilter -- optional list of result types to keep

        Returns (resgroup, rescnt): the requested slice and the total
        count of the filtered list.
        """
        # filter types first
        if restypefilter:
            res = []
            for r in results:
                if r.type in restypefilter:
                    res.append(r)
        else:
            res = results
        # filter on count
        resgroup = res[start:end]
        # new total count (because of filter)
        rescnt = len(res)

        return (resgroup, rescnt)
    

    #
    # Web page stuff
    #

    def index_html(self):
        """metadata search"""
        pt=PageTemplateFile(os.path.join(package_home(globals()), "zpt/OSAS_search.zpt")).__of__(self)
        return pt()


    def search(self, searchstring=None, searchtype='startswith', start=1, count=10, restypefilter=None):
        """search and create result

        searchstring  -- query; when empty the last session query is reused
        searchtype    -- 'equals', 'startswith' or 'contains'
        start         -- 1-based index of the first result to display
        count         -- page size
        restypefilter -- optional list of result types to display

        Runs a new DB search only when query or type changed, stores
        everything in the session and renders the result template.
        """
        sres = int(start) -1
        # NOTE(review): count may arrive as a string from the request;
        # it is only converted with int() further down -- confirm
        lres = sres + count
        try:
            oldsearch = self.REQUEST.SESSION['searchstring']
            oldtype = self.REQUEST.SESSION['searchtype']
        except:
            oldsearch = ""
            oldtype = ""
            
        if not searchstring:
            # no new search string -- repeat the previous search
            searchstring = oldsearch
            searchtype = oldtype
            
        if not oldsearch or searchstring != oldsearch or searchtype != oldtype:
            # new search
            (res, restypes) = self.dbSearch(searchstring, searchtype)
            # sort the result
            res.sort(ranksort)
            # store it
            self.REQUEST.SESSION['results'] = res
            self.REQUEST.SESSION['searchstring'] = searchstring
            self.REQUEST.SESSION['searchtype'] = searchtype
            self.REQUEST.SESSION['resulttypes'] = restypes

        (resgroup, nres) = self.filterResults(self.REQUEST.SESSION['results'], sres, lres, restypefilter)
        lres = min(lres, nres)
        self.REQUEST.SESSION['resultgroup'] = resgroup
        self.REQUEST.SESSION['res_indexes'] = (sres+1, lres, nres, int(count))
        self.REQUEST.SESSION['res_type_filter'] = restypefilter
        if nres > 0:
            zpt = "zpt/searchResult.zpt"
        else:
            zpt = "zpt/searchResult_none.zpt"
            
        pt=PageTemplateFile(os.path.join(package_home(globals()), zpt)).__of__(self)
        return pt()


    def getSearchType(self):
        """returns the last search type (empty string outside a session)"""
        try:
            ret = self.REQUEST.SESSION['searchtype']
        except:
            ret = ""

        return ret
    
    def getSearchString(self):
        """returns the last search string (empty string outside a session)"""
        try:
            ret = self.REQUEST.SESSION['searchstring']
        except:
            ret = ""

        return ret
    

    def hasNextResults(self):
        """returns if there are more results after the current page"""
        try:
            (first, last, total, count) = self.REQUEST.SESSION['res_indexes']
            return (first + count < total)
        except:
            return False

    def hasPrevResults(self):
        """returns if there are previous results before the current page"""
        try:
            (first, last, total, count) = self.REQUEST.SESSION['res_indexes']
            return (first > 1)
        except:
            return False


    def nextResults(self):
        """returns more results

        Advances the paging window by one page (clipped to the total)
        and re-renders via search().
        """
        try:
            (first, last, total, count) = self.REQUEST.SESSION['res_indexes']
            first = first + count
            last = last + count
            if first > total:
                first = total
            if last > total:
                last = total
        except:
            print "OUCH: no next results!"
            return self.search()

        return self.search(start=first, count=count)

        
    def prevResults(self):
        """returns the previous page of results

        Moves the paging window back by one page (clipped at 1) and
        re-renders via search().
        """
        try:
            (first, last, total, count) = self.REQUEST.SESSION['res_indexes']
            first = first - count
            last = last - count
            if first < 1:
                first = 1
            if last < 1:
                last = 1
        except:
            print "OUCH: no prev results!"
            return self.search()           

        return self.search(start=first, count=count)
  320: 
  321: 
  322: def manage_AddOSAS_searchForm(self):
  323:     """create Search form"""
  324:     pt=PageTemplateFile(os.path.join(package_home(globals()), "zpt/AddOSAS_search.zpt")).__of__(self)
  325:     return pt()
  326: 
  327: def manage_AddOSAS_search(self,id,title=None,dsn=None,RESPONSE=None):
  328:     """add the OSAS_root"""
  329:     newObj=OSAS_search(id,title,dsn)
  330:     self._setObject(id,newObj)
  331:     if RESPONSE is not None:
  332:         RESPONSE.redirect('manage_main')
  333: 
  334: 
  335: 
  336: 
  337: class SearchResult(SimpleItem):
  338:     """base search result object"""
  339: 
  340:     def __init__(self, type='unknown', file=None, url=None, content=None, rank=0):
  341:         """init"""
  342:         # result type (e.g. "bib", "archim")
  343:         self.type = type
  344:         # index file name
  345:         self.file = file
  346:         # url for result (list of pairs)
  347:         if url:
  348:             self.urls = url
  349:         else:
  350:             self.urls = []
  351:         # actual content (list of tuples)
  352:         self.content = content
  353:         # document status (e.g. "online", "archive")
  354:         self.status = None
  355:         # result rank for presentation
  356:         self.rank = rank
  357: 
  358: class AnyResult(SearchResult):
  359:     """catch-all type result object"""
  360: 
  361:     def __init__(self, zope, db_result, rank):
  362:         """returns a catch-all type result"""
  363:         SearchResult.__init__(self)
  364:         #print "NEW ANY RESULT!"
  365:         self.type='unknown'
  366:         self.zptFile = os.path.join(package_home(globals()), "zpt/searchResult_any.zpt")
  367:         
  368:         (db_fileid, db_tagidx, db_tags, db_content) = db_result
  369:         self.hitTag = db_tags
  370: 
  371:         # get full info from db
  372:         self.fileinfo = zope.getDBFile(db_fileid)
  373:         assert self.fileinfo
  374: 
  375:         items = {}
  376:         items[db_tags] = db_content
  377:         self.content = items
  378:         self.file = self.fileinfo[0]
  379:         self.status = statusForFile(self.file)
  380:         self.rank = rank
  381: 
  382:     def getContentList(self):
  383:         """returns content as list of tuples in preferred order"""
  384:         l = []
  385:         for k in self.content.keys():
  386:             l.append((k, self.content[k]))
  387: 
  388:         return l
  389: 
  390:     def render(self, zope):
  391:         """render this result object"""
  392:         zope.REQUEST.SESSION['result'] = self
  393:         pt=PageTemplateFile(self.zptFile).__of__(zope)
  394:         return pt()
  395: 
  396: 
  397: class MetaResult(AnyResult):
  398:     """result object that collects metadata"""
  399: 
  400:     def __init__(self, zope, db_result, rank):
  401:         """contructor"""
  402:         AnyResult.__init__(self, zope, db_result, rank)
  403:         #print "NEW META RESULT!"
  404: 
  405:         (fileid, tagidx, tags, content) = db_result
  406: 
  407:         # get full info from db
  408:         self.metainfo = zope.getDBFileMeta(fileid)
  409:         assert self.metainfo
  410:         
  411:     def checkContext(self, tags, content, ctxurl):
  412:         """takes meta entry and updates url from context tags"""
  413:         if tags.endswith('/context/link'):
  414:             if content:
  415:                 #print "CTXlink: ", content
  416:                 ctxurl[0] = content
  417:             
  418:         elif tags.endswith('/context/name'):
  419:             if content:
  420:                 #print "CTXname: ", content
  421:                 ctxurl[1] = content
  422: 
  423:         return ctxurl
  424: 
  425: 
  426: class BibResult(MetaResult):
  427:     """bib type result object"""
  428: 
  429:     def __init__(self, zope, db_result, rank):
  430:         """constructor"""
  431:         MetaResult.__init__(self, zope, db_result, rank)
  432:         #print "NEW BIB RESULT!", self
  433:         self.type = "bib"
  434:         self.zptFile = os.path.join(package_home(globals()), "zpt/searchResult_bib.zpt")
  435:         url = storageURL(self.file)
  436:         if url:
  437:             self.urls.append(url)
  438:         (fileid, tagidx, tags, content) = db_result
  439: 
  440:         btype = ""
  441:         bitems = {}
  442:         ctxurl = ['', '']
  443: 
  444:         for me in self.metainfo:
  445:             (m_idx, m_tags, m_content, m_attributes) = me
  446:             # context tag
  447:             ctxurl = self.checkContext(m_tags, m_content, ctxurl)
  448:             # first tag with bib type attribute
  449:             if m_tags.endswith('/meta/bib'):
  450:                 r = re.search('type="([^"]*)"', m_attributes)
  451:                 if r:
  452:                     btype = r.group(1)
  453: 
  454:                 if not btype:
  455:                     btype = "*unknown*"
  456: 
  457:                 bitems['type'] = btype
  458:                 continue
  459: 
  460:             # skip other tags
  461:             if not btype: continue
  462: 
  463:             # collect bib/something
  464:             r = re.search('/meta/bib/(.*)', m_tags)
  465:             if r:
  466:                 k = r.group(1)
  467:                 #print "CONTENT: ", m_content
  468:                 bitems[k] = m_content
  469:                 # remember hit tag
  470:                 if m_tags == self.hitTag:
  471:                     self.hitTag = k
  472:                 continue
  473: 
  474:         self.content = bitems
  475:         # store context
  476:         if not ctxurl[1]:
  477:             ctxurl[1] = "View"
  478:         # must have link
  479:         if ctxurl[0]:
  480:             self.urls.append(ctxurl)
  481:                 
  482:         self.rank += 100
  483: 
  484:     def getContentList(self):
  485:         """returns content as list of tuples in preferred order"""
  486:         l = []
  487:         c = self.content.copy()
  488:         # preferred items first
  489:         for k in ('author', 'title', 'journal', 'year'):
  490:             if c.has_key(k):
  491:                 l.append((k, c[k]))
  492:                 del c[k]
  493: 
  494:         # no type
  495:         del c['type']
  496:         # copy the rest
  497:         for k in c.keys():
  498:             l.append((k, c[k]))
  499: 
  500:         return l
  501: 
  502: 
  503: class ArchimResult(MetaResult):
  504:     """archimedes type result object"""
  505: 
  506:     def __init__(self, zope, db_result, rank):
  507:         """constructor"""
  508:         MetaResult.__init__(self, zope, db_result, rank)
  509:         #print "NEW ARCHIM RESULT!", self
  510:         self.type = "archim"
  511:         self.zptFile = os.path.join(package_home(globals()), "zpt/searchResult_archim.zpt")
  512:         url = storageURL(self.file)
  513:         if url:
  514:             self.urls.append(url)
  515:             
  516:         (fileid, tagidx, tags, content) = db_result
  517: 
  518:         # process info
  519:         bitems = {}
  520:         ctxurl = ['', '']
  521:         for me in self.metainfo:
  522:             (m_idx, m_tags, m_content, m_attributes) = me
  523:             # context tag
  524:             ctxurl = self.checkContext(m_tags, m_content, ctxurl)
  525:             # collect archimedes/something
  526:             r = re.search('/meta/archimedes/(.*)', m_tags)
  527:             if r:
  528:                 k = r.group(1)
  529:                 #print "CONTENT: ", m_content
  530:                 bitems[k] = m_content
  531:                 # remember hit tag
  532:                 if m_tags == self.hitTag:
  533:                     self.hitTag = k
  534:                 continue
  535: 
  536:         self.content = bitems
  537:         self.rank += 100
  538:         # store context
  539:         if not ctxurl[1]:
  540:             ctxurl[1] = "View"
  541:         # must have link
  542:         if ctxurl[0]:
  543:             self.urls.append(ctxurl)
  544: 
  545: 
  546:     def getContentList(self):
  547:         """returns content as list of tuples in preferred order"""
  548:         l = []
  549:         c = self.content.copy()
  550:         # preferred items first
  551:         for k in ('author', 'title', 'date', 'place'):
  552:             if c.has_key(k):
  553:                 l.append((k, c[k]))
  554:                 del c[k]
  555: 
  556:         # copy the rest
  557:         for k in c.keys():
  558:             l.append((k, c[k]))
  559: 
  560:         return l
  561: 	
  562: 
  563: 
  564: 
  565: def ranksort(res1, res2):
  566:     """sort results on rank"""
  567:     return cmp(res2.rank, res1.rank)
  568: 
  569: 
  570: def statusForFile(filename):
  571:     """heuristic... returns status for a index file name"""
  572:     status = None
  573:     if filename.startswith('/mpiwg/online/'):
  574:         status = "online"
  575:     elif filename.startswith('/mpiwg/archive/'):
  576:         status = "archive"
  577:     elif filename.startswith('http://'):
  578:         status = "database"
  579:         
  580:     return status
  581: 
  582: def storageURL(filename):
  583:     """heuristic... returns an URL for a index file name"""
  584:     url = None
  585:     name = None
  586:     if filename.startswith('/mpiwg/online/'):
  587:         #print "URLFORFILE: online ", filename
  588:         r = re.search('^(.*)/index.meta', filename)
  589:         if r:
  590:             url = "http://content.mpiwg-berlin.mpg.de/mpistorage/storage/ShowOnline/index_html?path=%s"%r.group(1)
  591:             name = "Storage System"
  592: 
  593:     if name and url:
  594:         return (url, name)
  595:     
  596:     return None
  597: 

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>