File:  [Repository] / OSAS / OSA_system / OSAS_search.py
Revision 1.5: download - view: text, annotated - select for diffs - revision graph
Fri Jul 9 17:56:14 2004 UTC (19 years, 11 months ago) by casties
Branches: MAIN
CVS tags: HEAD
small paranoia fix

    1: """Metadata search interface
    2: ROC 2004, itgroup
    3: 
    4: """
    5: 
    6: from AccessControl import ClassSecurityInfo
    7: from Globals import InitializeClass
    8: from Globals import Persistent, package_home
    9: from Products.PageTemplates.PageTemplateFile import PageTemplateFile
   10: from Products.PageTemplates.PageTemplate import PageTemplate
   11: from OFS.SimpleItem import SimpleItem
   12: #from pyPgSQL import PgSQL
   13: import psycopg as PgSQL
   14: 
   15: import re
   16: import os
   17: 
   18: MAXHITS = 1000
   19: 
class OSAS_search(SimpleItem):
    """Object for global metadata search

    Searches the 'meta' table of the storage database (PostgreSQL via
    psycopg) and renders paged, filterable results through Zope page
    templates.  Per-user state (query, result list, paging indexes and
    row caches) is kept in REQUEST.SESSION.
    """

    meta_type="OSAS_search"



    def __init__(self,id,title,dsn=None):
        """init

        id:    Zope object id
        title: title shown in the ZMI
        dsn:   PostgreSQL connection string; defaults to the read-only
               archive account on foxridge when not given
        """
        self.id=id
        self.title=title
        if dsn:
            self.dsn = dsn
        else:
            self.dsn = "host=foxridge.mpiwg-berlin.mpg.de dbname=storage user=archiveread password=archiveread"
        # volatile database connection object
        # (_v_ attributes are not persisted by the ZODB and vanish on restart)
        self._v_dbCon = None
        self._v_tryCon = 0


    def dbCursor(self):
        """returns new SQL cursor object

        Reuses the cached volatile connection when possible; if getting
        a cursor fails, the connection is dropped and re-opened via a
        recursive call.  _v_tryCon counts consecutive failures; after 3
        no reconnect is attempted and the assert below fires.
        """
        curs = None
        if hasattr(self, '_v_dbCon') and self._v_dbCon is not None:
            try:
                curs = self._v_dbCon.cursor()
                self._v_tryCon = 0
            except:
                # in case of problems reset dbCon
                # NOTE(review): bare except also hides unrelated errors
                self._v_dbCon = None
                self._v_tryCon += 1
        else:
            self._v_dbCon = None
            self._v_tryCon = 0

        if not curs and self._v_tryCon < 3:
            self._v_dbCon = PgSQL.connect(self.dsn, serialize=0)
            # call ourself with the new connection
            curs = self.dbCursor()

        assert curs, "AIIEE no db cursor!!"
        return curs

    def getDBFileMeta(self, fileid):
        """returns an array with all meta entries of fileid

        Rows are (idx, tags, content, attributes) tuples ordered by idx.
        Results are memoized per session under the 'dbMeta' key.
        """

        metacache = {}
        # try in cache
        if self.REQUEST.SESSION.has_key('dbMeta'):
            metacache = self.REQUEST.SESSION['dbMeta']
            if metacache.has_key(fileid):
                res = metacache[fileid]
                #print "meta from cache "
                return res

        curs = self.dbCursor()

        # parameterized query -- fileid is passed via the DB-API, not
        # interpolated into the SQL string
        sql = 'SELECT idx,tags,content,attributes FROM meta WHERE fileid=%(id)s ORDER BY idx'
        print sql, " -> ", fileid
        curs.execute(sql, {'id':fileid})
        print "done"

        res = curs.fetchall()
        #print "res:", res
        curs.close()
        # store info in cache
        metacache[fileid] = res
        self.REQUEST.SESSION['dbMeta'] = metacache

        return res

    def getDBFile(self, fileid):
        """returns the file information of fileid

        Returns a single (filename, mtime) row, cached per session
        under the 'dbFiles' key.
        """

        filecache = {}
        # try in cache
        if self.REQUEST.SESSION.has_key('dbFiles'):
            filecache = self.REQUEST.SESSION['dbFiles']
            if filecache.has_key(fileid):
                res = filecache[fileid]
                #print "file from cache "
                return res

        curs = self.dbCursor()

        sql = 'select filename,mtime from files where id=%(id)s'
        print 'DBFILE: ', sql, " -> ", fileid
        curs.execute(sql, {'id':fileid})
        print "DBFILE: done"

        res = curs.fetchone()
        #print "DBFILE: res:", res
        curs.close()
        # store info in cache
        filecache[fileid] = res
        self.REQUEST.SESSION['dbFiles'] = filecache

        return res


    def dbSearch(self, query, type):
        """search DB for query and return result set

        query: search string (case-insensitive match on meta.content)
        type:  'equals', 'startswith' or 'contains'
               NOTE(review): any other value leaves qs unbound and
               raises NameError below -- confirm callers only pass
               these three values
        Returns (results, restypelist): at most MAXHITS result objects
        plus the list of distinct result types seen.
        """
        results = []
        restypes = {}
        if not query:
            # empty query
            return results

        curs = self.dbCursor()
        # build the LIKE pattern; query itself is passed as a DB-API
        # parameter so SQL wildcards are the only special characters
        if type == 'equals':
            qs = query
        elif type == 'startswith':
            qs = query + "%"
        elif type == 'contains':
            qs = "%" + query + "%"

        sql = 'select fileid,idx,tags,content from meta where lower(content) like lower(%(qs)s)'
        print sql, " -> ", qs
        curs.execute(sql, {'qs':qs})
        print "done"
        res = curs.fetchone()
        rescnt = 1
        #print "res0:", res
        # fetch row by row, stop at MAXHITS
        while res and rescnt < MAXHITS:
            #print "res:", res
            result = self.getResult(res)
            if result:
                results.append(result)
                # record which result types occurred (dict used as set)
                restypes[result.type] = result.type

            res = curs.fetchone()
            rescnt += 1

        curs.close()
        #self.dbCon = None

        #print "SEARCH: ", rescnt, " results"
        restypelist = restypes.keys()
        return (results, restypelist)


    def getResult(self, db_result, rank=0):
        """factory for result objects

        Chooses the result class from the tag path of the hit:
        /meta/bib/ -> BibResult, /meta/archimedes/ -> ArchimResult,
        anything else -> AnyResult.
        """

        (fileid, tagidx, tags, content) = db_result
        res = None

        if tags.find('/meta/bib/') > -1:
            res = BibResult(self, db_result, rank)
        elif tags.find('/meta/archimedes/') > -1:
            res = ArchimResult(self, db_result, rank)
        else:
            res = AnyResult(self, db_result, rank)

        return res


    def renderResult(self, result):
        """returns HTML rendering of a search result"""

        return result.render(self)


    def filterResults(self, results, start, end, restypefilter=None):
        """returns list of results that match a filter

        start/end are 0-based slice bounds (end exclusive).
        restypefilter: optional collection of allowed result types.
        Returns (resgroup, rescnt) -- the page slice and the total
        count after type filtering.
        """
        # filter types first
        if restypefilter:
            res = []
            for r in results:
                if r.type in restypefilter:
                    res.append(r)
        else:
            res = results
        # new total count (because of filter)
        rescnt = len(res)
        # filter on count
        resgroup = res[start:end]

        return (resgroup, rescnt)


    #
    # Web page stuff
    #

    def index_html(self):
        """metadata search"""
        pt=PageTemplateFile(os.path.join(package_home(globals()), "zpt/OSAS_search.zpt")).__of__(self)
        return pt()


    def search(self, searchstring=None, searchtype='startswith', start=1, count=10, restypefilter=None):
        """search and create result

        start is 1-based; count is the page size.  A new DB search is
        only run when the query or type differs from the one stored in
        the session; otherwise the cached result list is re-paged.
        """
        # convert to 0-based slice bounds
        sres = int(start) -1
        lres = sres + count
        try:
            oldsearch = self.REQUEST.SESSION['searchstring']
            oldtype = self.REQUEST.SESSION['searchtype']
        except:
            oldsearch = ""
            oldtype = ""

        # no explicit query -> re-use the last one (paging)
        if not searchstring:
            searchstring = oldsearch
            searchtype = oldtype

        if not oldsearch or searchstring != oldsearch or searchtype != oldtype:
            # new search
            (res, restypes) = self.dbSearch(searchstring, searchtype)
            # sort the result
            res.sort(ranksort)
            # store it
            self.REQUEST.SESSION['results'] = res
            self.REQUEST.SESSION['searchstring'] = searchstring
            self.REQUEST.SESSION['searchtype'] = searchtype
            self.REQUEST.SESSION['resulttypes'] = restypes

        (resgroup, nres) = self.filterResults(self.REQUEST.SESSION['results'], sres, lres, restypefilter)
        # clamp the page bounds to the filtered total
        lres = min(lres, nres)
        sres = min(sres, nres)
        self.REQUEST.SESSION['resultgroup'] = resgroup
        # (first 1-based, last, total, page size) for the templates
        self.REQUEST.SESSION['res_indexes'] = (sres+1, lres, nres, int(count))
        self.REQUEST.SESSION['res_type_filter'] = restypefilter
        if nres > 0:
            zpt = "zpt/searchResult.zpt"
        else:
            zpt = "zpt/searchResult_none.zpt"

        pt=PageTemplateFile(os.path.join(package_home(globals()), zpt)).__of__(self)
        return pt()


    def getSearchType(self):
        """returns the last search type"""
        try:
            ret = self.REQUEST.SESSION['searchtype']
        except:
            ret = ""

        return ret

    def getSearchString(self):
        """returns the last search string"""
        try:
            ret = self.REQUEST.SESSION['searchstring']
        except:
            ret = ""

        return ret


    def hasNextResults(self):
        """returns if there are more results"""
        try:
            (first, last, total, count) = self.REQUEST.SESSION['res_indexes']
            return (first + count < total)
        except:
            # no stored indexes yet -> no paging
            return False

    def hasPrevResults(self):
        """returns if there are previous results"""
        try:
            (first, last, total, count) = self.REQUEST.SESSION['res_indexes']
            return (first > 1)
        except:
            return False


    def nextResults(self):
        """returns more results

        Advances the stored page window by one page (clamped to the
        total) and re-renders via search().
        """
        try:
            (first, last, total, count) = self.REQUEST.SESSION['res_indexes']
            first = first + count
            last = last + count
            if first > total:
                first = total
            if last > total:
                last = total
        except:
            print "OUCH: no next results!"
            return self.search()

        return self.search(start=first, count=count)


    def prevResults(self):
        """returns more results

        Moves the stored page window back by one page (clamped to 1)
        and re-renders via search().
        """
        try:
            (first, last, total, count) = self.REQUEST.SESSION['res_indexes']
            first = first - count
            last = last - count
            if first < 1:
                first = 1
            if last < 1:
                last = 1
        except:
            print "OUCH: no prev results!"
            return self.search()

        return self.search(start=first, count=count)
  320:         
  321: 
  322: 
  323: def manage_AddOSAS_searchForm(self):
  324:     """create Search form"""
  325:     pt=PageTemplateFile(os.path.join(package_home(globals()), "zpt/AddOSAS_search.zpt")).__of__(self)
  326:     return pt()
  327: 
  328: def manage_AddOSAS_search(self,id,title=None,dsn=None,RESPONSE=None):
  329:     """add the OSAS_root"""
  330:     newObj=OSAS_search(id,title,dsn)
  331:     self._setObject(id,newObj)
  332:     if RESPONSE is not None:
  333:         RESPONSE.redirect('manage_main')
  334: 
  335: 
  336: 
  337: 
  338: class SearchResult(SimpleItem):
  339:     """base search result object"""
  340: 
  341:     def __init__(self, type='unknown', file=None, url=None, content=None, rank=0):
  342:         """init"""
  343:         # result type (e.g. "bib", "archim")
  344:         self.type = type
  345:         # index file name
  346:         self.file = file
  347:         # url for result (list of pairs)
  348:         if url:
  349:             self.urls = url
  350:         else:
  351:             self.urls = []
  352:         # actual content (list of tuples)
  353:         self.content = content
  354:         # document status (e.g. "online", "archive")
  355:         self.status = None
  356:         # result rank for presentation
  357:         self.rank = rank
  358: 
  359: class AnyResult(SearchResult):
  360:     """catch-all type result object"""
  361: 
  362:     def __init__(self, zope, db_result, rank):
  363:         """returns a catch-all type result"""
  364:         SearchResult.__init__(self)
  365:         #print "NEW ANY RESULT!"
  366:         self.type='unknown'
  367:         self.zptFile = os.path.join(package_home(globals()), "zpt/searchResult_any.zpt")
  368:         
  369:         (db_fileid, db_tagidx, db_tags, db_content) = db_result
  370:         self.hitTag = db_tags
  371: 
  372:         # get full info from db
  373:         self.fileinfo = zope.getDBFile(db_fileid)
  374:         assert self.fileinfo
  375: 
  376:         items = {}
  377:         items[db_tags] = db_content
  378:         self.content = items
  379:         self.file = self.fileinfo[0]
  380:         self.status = statusForFile(self.file)
  381:         self.rank = rank
  382: 
  383:     def getContentList(self):
  384:         """returns content as list of tuples in preferred order"""
  385:         l = []
  386:         for k in self.content.keys():
  387:             l.append((k, self.content[k]))
  388: 
  389:         return l
  390: 
  391:     def render(self, zope):
  392:         """render this result object"""
  393:         zope.REQUEST.SESSION['result'] = self
  394:         pt=PageTemplateFile(self.zptFile).__of__(zope)
  395:         return pt()
  396: 
  397: 
  398: class MetaResult(AnyResult):
  399:     """result object that collects metadata"""
  400: 
  401:     def __init__(self, zope, db_result, rank):
  402:         """contructor"""
  403:         AnyResult.__init__(self, zope, db_result, rank)
  404:         #print "NEW META RESULT!"
  405: 
  406:         (fileid, tagidx, tags, content) = db_result
  407: 
  408:         # get full info from db
  409:         self.metainfo = zope.getDBFileMeta(fileid)
  410:         assert self.metainfo
  411:         
  412:     def checkContext(self, tags, content, ctxurl):
  413:         """takes meta entry and updates url from context tags"""
  414:         if tags.endswith('/context/link'):
  415:             if content:
  416:                 #print "CTXlink: ", content
  417:                 ctxurl[0] = content
  418:             
  419:         elif tags.endswith('/context/name'):
  420:             if content:
  421:                 #print "CTXname: ", content
  422:                 ctxurl[1] = content
  423: 
  424:         return ctxurl
  425: 
  426: 
  427: class BibResult(MetaResult):
  428:     """bib type result object"""
  429: 
  430:     def __init__(self, zope, db_result, rank):
  431:         """constructor"""
  432:         MetaResult.__init__(self, zope, db_result, rank)
  433:         #print "NEW BIB RESULT!", self
  434:         self.type = "bib"
  435:         self.zptFile = os.path.join(package_home(globals()), "zpt/searchResult_bib.zpt")
  436:         url = storageURL(self.file)
  437:         if url:
  438:             self.urls.append(url)
  439:         (fileid, tagidx, tags, content) = db_result
  440: 
  441:         btype = ""
  442:         bitems = {}
  443:         ctxurl = ['', '']
  444: 
  445:         for me in self.metainfo:
  446:             (m_idx, m_tags, m_content, m_attributes) = me
  447:             # context tag
  448:             ctxurl = self.checkContext(m_tags, m_content, ctxurl)
  449:             # first tag with bib type attribute
  450:             if m_tags.endswith('/meta/bib'):
  451:                 r = re.search('type="([^"]*)"', m_attributes)
  452:                 if r:
  453:                     btype = r.group(1)
  454: 
  455:                 if not btype:
  456:                     btype = "*unknown*"
  457: 
  458:                 bitems['type'] = btype
  459:                 continue
  460: 
  461:             # skip other tags
  462:             if not btype: continue
  463: 
  464:             # collect bib/something
  465:             r = re.search('/meta/bib/(.*)', m_tags)
  466:             if r:
  467:                 k = r.group(1)
  468:                 #print "CONTENT: ", m_content
  469:                 bitems[k] = m_content
  470:                 # remember hit tag
  471:                 if m_tags == self.hitTag:
  472:                     self.hitTag = k
  473:                 continue
  474: 
  475:         self.content = bitems
  476:         # store context
  477:         if not ctxurl[1]:
  478:             ctxurl[1] = "View"
  479:         # must have link
  480:         if ctxurl[0]:
  481:             self.urls.append(ctxurl)
  482:                 
  483:         self.rank += 100
  484: 
  485:     def getContentList(self):
  486:         """returns content as list of tuples in preferred order"""
  487:         l = []
  488:         c = self.content.copy()
  489:         # preferred items first
  490:         for k in ('author', 'title', 'journal', 'year'):
  491:             if c.has_key(k):
  492:                 l.append((k, c[k]))
  493:                 del c[k]
  494: 
  495:         # no type
  496:         del c['type']
  497:         # copy the rest
  498:         for k in c.keys():
  499:             l.append((k, c[k]))
  500: 
  501:         return l
  502: 
  503: 
  504: class ArchimResult(MetaResult):
  505:     """archimedes type result object"""
  506: 
  507:     def __init__(self, zope, db_result, rank):
  508:         """constructor"""
  509:         MetaResult.__init__(self, zope, db_result, rank)
  510:         #print "NEW ARCHIM RESULT!", self
  511:         self.type = "archim"
  512:         self.zptFile = os.path.join(package_home(globals()), "zpt/searchResult_archim.zpt")
  513:         url = storageURL(self.file)
  514:         if url:
  515:             self.urls.append(url)
  516:             
  517:         (fileid, tagidx, tags, content) = db_result
  518: 
  519:         # process info
  520:         bitems = {}
  521:         ctxurl = ['', '']
  522:         for me in self.metainfo:
  523:             (m_idx, m_tags, m_content, m_attributes) = me
  524:             # context tag
  525:             ctxurl = self.checkContext(m_tags, m_content, ctxurl)
  526:             # collect archimedes/something
  527:             r = re.search('/meta/archimedes/(.*)', m_tags)
  528:             if r:
  529:                 k = r.group(1)
  530:                 #print "CONTENT: ", m_content
  531:                 bitems[k] = m_content
  532:                 # remember hit tag
  533:                 if m_tags == self.hitTag:
  534:                     self.hitTag = k
  535:                 continue
  536: 
  537:         self.content = bitems
  538:         self.rank += 100
  539:         # store context
  540:         if not ctxurl[1]:
  541:             ctxurl[1] = "View"
  542:         # must have link
  543:         if ctxurl[0]:
  544:             self.urls.append(ctxurl)
  545: 
  546: 
  547:     def getContentList(self):
  548:         """returns content as list of tuples in preferred order"""
  549:         l = []
  550:         c = self.content.copy()
  551:         # preferred items first
  552:         for k in ('author', 'title', 'date', 'place'):
  553:             if c.has_key(k):
  554:                 l.append((k, c[k]))
  555:                 del c[k]
  556: 
  557:         # copy the rest
  558:         for k in c.keys():
  559:             l.append((k, c[k]))
  560: 
  561:         return l
  562: 	
  563: 
  564: 
  565: 
  566: def ranksort(res1, res2):
  567:     """sort results on rank"""
  568:     return cmp(res2.rank, res1.rank)
  569: 
  570: 
  571: def statusForFile(filename):
  572:     """heuristic... returns status for a index file name"""
  573:     status = None
  574:     if filename.startswith('/mpiwg/online/'):
  575:         status = "online"
  576:     elif filename.startswith('/mpiwg/archive/'):
  577:         status = "archive"
  578:     elif filename.startswith('http://'):
  579:         status = "database"
  580:         
  581:     return status
  582: 
  583: def storageURL(filename):
  584:     """heuristic... returns an URL for a index file name"""
  585:     url = None
  586:     name = None
  587:     if filename.startswith('/mpiwg/online/'):
  588:         #print "URLFORFILE: online ", filename
  589:         r = re.search('^(.*)/index.meta', filename)
  590:         if r:
  591:             url = "http://content.mpiwg-berlin.mpg.de/mpistorage/storage/ShowOnline/index_html?path=%s"%r.group(1)
  592:             name = "Storage System"
  593:             
  594:     elif filename.startswith('http://'):
  595:         #print "URLFORFILE: url ", filename
  596:         url = filename
  597:         name = "Online Database"
  598: 
  599:     if name and url:
  600:         return (url, name)
  601:     
  602:     return None
  603: 

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>