File:  [Repository] / OSAS / OSA_system / OSAS_search.py
Revision 1.8
Fri Jan 19 17:16:25 2007 UTC by casties
Branches: MAIN
CVS tags: HEAD
fixed PgSQL imports to work with psycopg2

    1: """Metadata search interface
    2: ROC 2004, itgroup
    3: 
    4: """
    5: 
    6: from AccessControl import ClassSecurityInfo
    7: from Globals import InitializeClass
    8: from Globals import Persistent, package_home
    9: from Products.PageTemplates.PageTemplateFile import PageTemplateFile
   10: from Products.PageTemplates.PageTemplate import PageTemplate
   11: from Products.PageTemplates.ZopePageTemplate import ZopePageTemplate
   12: from OFS.Folder import Folder
   13: from OFS.SimpleItem import SimpleItem
   14: try:
   15:     import psycopg2 as PgSQL
   16: except ImportError:
   17:     try:
   18:         import psycopg as PgSQL
   19:     except ImportError:
   20:         from pyPgSQL import PgSQL
   21: 
   22: import re
   23: import os
   24: 
   25: MAXHITS = 1000
   26: 
   27: class OSAS_search(Folder):
   28:     """Object for global metadata search"""
   29: 
   30:     meta_type="OSAS_search"
   31: 
   32:     manage_options=Folder.manage_options+(
   33:         {'label':'Main config','action':'manage_ChangeOSAS_searchForm'},
   34:        )
   35:     
   36: 
   37:     def __init__(self,id,title,dsn=None):
   38:         """init"""
   39:         self.id=id
   40:         self.title=title
   41:         if dsn:
   42:             self.dsn = dsn
   43:         else:
   44:             self.dsn = "host=foxridge.mpiwg-berlin.mpg.de dbname=storage user=archiveread password=archiveread"
   45:         # volatile database connection object
   46:         self._v_dbCon = None
   47:         self._v_tryCon = 0
   48: 
   49: 
   50:     def dbCursor(self):
   51:         """returns new SQL cursor object"""
   52:         curs = None
   53:         if hasattr(self, '_v_dbCon') and self._v_dbCon is not None:
   54:             try:
   55:                 curs = self._v_dbCon.cursor()
   56:                 self._v_tryCon = 0
   57:             except:
   58:                 # in case of problems reset dbCon
   59:                 self._v_dbCon = None
   60:                 self._v_tryCon += 1
   61:         else:
   62:             self._v_dbCon = None
   63:             self._v_tryCon = 0
   64:                 
   65:         if not curs and self._v_tryCon < 3:
   66:             self._v_dbCon = PgSQL.connect(self.dsn, serialize=0)
   67:             # call ourself with the new connection
   68:             curs = self.dbCursor()
   69: 
   70:         assert curs, "AIIEE no db cursor!!"
   71:         return curs
   72: 
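           # Usage sketch (illustrative caller, not part of the original code):
           # dbCursor() caches a volatile connection in self._v_dbCon and keeps
           # retrying the connect while self._v_tryCon < 3 before the assert fires.
           # A caller would typically do something like:
           #
           #   curs = self.dbCursor()
           #   try:
           #       curs.execute("SELECT count(*) FROM meta")
           #       print curs.fetchone()[0]
           #   finally:
           #       curs.close()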
   73:     def getDBFileMeta(self, fileid):
   74:         """returns a list of all meta entries for fileid"""
   75: 
   76:         metacache = {}
   77:         # try in cache
   78:         if self.REQUEST.SESSION.has_key('dbMeta'):
   79:             metacache = self.REQUEST.SESSION['dbMeta']
   80:             if metacache.has_key(fileid):
   81:                 res = metacache[fileid]
   82:                 #print "meta from cache "
   83:                 return res
   84: 
   85:         curs = self.dbCursor()
   86: 
   87:         sql = 'SELECT idx,tags,content,attributes FROM meta WHERE fileid=%(id)s ORDER BY idx'
   88:         print sql, " -> ", fileid
   89:         curs.execute(sql, {'id':fileid})
   90:         print "done"
   91: 
   92:         res = curs.fetchall()
   93:         #print "res:", res
   94:         curs.close()
   95:         # store info in cache
   96:         metacache[fileid] = res
   97:         self.REQUEST.SESSION['dbMeta'] = metacache
   98: 
   99:         return res
  100: 
  101:     def getDBFile(self, fileid):
  102:         """returns the file information of fileid"""
  103: 
  104:         filecache = {}
  105:         # try in cache
  106:         if self.REQUEST.SESSION.has_key('dbFiles'):
  107:             filecache = self.REQUEST.SESSION['dbFiles']
  108:             if filecache.has_key(fileid):
  109:                 res = filecache[fileid]
  110:                 #print "file from cache "
  111:                 return res
  112: 
  113:         curs = self.dbCursor()
  114: 
  115:         sql = 'select filename,mtime from files where id=%(id)s'
  116:         print 'DBFILE: ', sql, " -> ", fileid
  117:         curs.execute(sql, {'id':fileid})
  118:         print "DBFILE: done"
  119: 
  120:         res = curs.fetchone()
  121:         #print "DBFILE: res:", res
  122:         curs.close()
  123:         # store info in cache
  124:         filecache[fileid] = res
  125:         self.REQUEST.SESSION['dbFiles'] = filecache
  126: 
  127:         return res
  128: 	
  129: 	
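            # Note on caching (added commentary): getDBFileMeta() and getDBFile()
            # keep their rows in the Zope session (REQUEST.SESSION['dbMeta'] /
            # ['dbFiles']), so repeated lookups for the same fileid within one
            # browser session do not hit the database again.  Row shapes follow
            # the SELECT statements above:
            #   getDBFileMeta(fileid) -> [(idx, tags, content, attributes), ...]
            #   getDBFile(fileid)     -> (filename, mtime)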
  130:     def dbSearch(self, query, type):
   131:         """search the DB for query and return (results, list of result types)"""
  132:         results = []
  133:         restypes = {}
  134:         if not query:
  135:             # empty query
  136:             return results
  137:         
  138:         curs = self.dbCursor()
   139:         # default to an exact match ('equals' or any unknown search type)
   140:         qs = query
   141:         if type == 'startswith':
   142:             qs = query + "%"
   143:         elif type == 'contains':
   144:             qs = "%" + query + "%"
  145:             
  146:         sql = 'select fileid,idx,tags,content from meta where lower(content) like lower(%(qs)s)'
  147:         print sql, " -> ", qs
  148:         curs.execute(sql, {'qs':qs})
  149:         print "done"
  150:         res = curs.fetchone()
  151:         rescnt = 1
  152:         #print "res0:", res
  153:         while res and rescnt < MAXHITS:
  154:             #print "res:", res
  155:             result = self.getResult(res)
  156:             if result:
  157:                 results.append(result)
  158:                 restypes[result.type] = result.type
  159:                 
  160:             res = curs.fetchone()
  161:             rescnt += 1
  162: 
  163:         curs.close()
  164:         #self.dbCon = None
  165: 
  166:         #print "SEARCH: ", rescnt, " results"
  167:         restypelist = restypes.keys()
  168:         return (results, restypelist)
  169: 
  170:         
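            # Usage sketch (hypothetical query values): dbSearch() turns the search
            # type into a case-insensitive LIKE pattern and collects up to MAXHITS
            # result objects plus the list of result types that occurred, e.g.
            #
            #   (results, types) = self.dbSearch('galilei', 'contains')
            #   # -> SQL pattern '%galilei%'; types might be ['bib', 'archim']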
  171:     def getResult(self, db_result, rank=0):
  172:         """factory for result objects"""
  173: 
  174:         (fileid, tagidx, tags, content) = db_result
  175:         res = None
  176: 
  177:         if tags.find('/meta/bib/') > -1:
  178:             res = BibResult(self, db_result, rank)
  179:         elif tags.find('/meta/archimedes/') > -1:
  180:             res = ArchimResult(self, db_result, rank)
  181:         else:
  182:             res = AnyResult(self, db_result, rank)
  183: 
  184:         return res
  185: 
  186: 	
  187:     def renderResult(self, result):
  188:         """returns HTML rendering of a search result"""
  189: 
  190:         return result.render(self)
  191: 	
  192: 
  193:     def filterResults(self, results, start, end, restypefilter=None):
   194:         """returns the slice [start:end] of results matching the type filter and the filtered total count"""
  195:         # filter types first
  196:         if restypefilter:
  197:             res = []
  198:             for r in results:
  199:                 if r.type == restypefilter:
  200:                     res.append(r)
  201:         else:
  202:             res = results
   203:         # new total count (because of filter)
  204:         rescnt = len(res)
  205:         # filter on count
  206:         resgroup = res[start:end]
  207: 
  208:         return (resgroup, rescnt)
  209:     
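            # Example (illustrative arguments): self.filterResults(results, 0, 10, 'bib')
            # keeps only results whose type is 'bib' and returns the first ten of
            # them together with the total number of 'bib' hits.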
  210: 
  211:     #
  212:     # Web page stuff
  213:     #
  214: 
  215:     def index_html(self):
  216:         """metadata search"""
  217:         pt=PageTemplateFile(os.path.join(package_home(globals()), "zpt/OSAS_search.zpt")).__of__(self)
  218:         return pt()
  219: 
  220: 
  221:     def search(self, searchstring=None, searchtype='startswith', start=1, count=10, restypefilter=None):
   222:         """perform the search and render the result page"""
   223:         sres = int(start) - 1
   224:         lres = sres + int(count)
  225:         try:
  226:             oldsearch = self.REQUEST.SESSION['searchstring']
  227:             oldtype = self.REQUEST.SESSION['searchtype']
   228:         except KeyError:
  229:             oldsearch = ""
  230:             oldtype = ""
  231:             
  232:         if not searchstring:
  233:             searchstring = oldsearch
  234:             searchtype = oldtype
  235:             
  236:         if not oldsearch or searchstring != oldsearch or searchtype != oldtype:
  237:             # new search
  238:             (res, restypes) = self.dbSearch(searchstring, searchtype)
  239:             # sort the result
  240:             res.sort(ranksort)
  241:             # store it
  242:             self.REQUEST.SESSION['results'] = res
  243:             self.REQUEST.SESSION['searchstring'] = searchstring
  244:             self.REQUEST.SESSION['searchtype'] = searchtype
  245:             self.REQUEST.SESSION['resulttypes'] = restypes
  246: 
  247:         (resgroup, nres) = self.filterResults(self.REQUEST.SESSION['results'], sres, lres, restypefilter)
  248:         lres = min(lres, nres)
  249:         sres = min(sres, nres)
  250:         self.REQUEST.SESSION['resultgroup'] = resgroup
  251:         self.REQUEST.SESSION['res_indexes'] = (sres+1, lres, nres, int(count))
  252:         self.REQUEST.SESSION['res_type_filter'] = restypefilter
  253:         if nres > 0:
  254:             zpt = "zpt/searchResult.zpt"
  255:         else:
  256:             zpt = "zpt/searchResult_none.zpt"
  257:             
  258:         pt=PageTemplateFile(os.path.join(package_home(globals()), zpt)).__of__(self)
  259:         return pt()
  260: 
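            # Paging sketch (illustrative call): a first request such as
            #   self.search(searchstring='galilei', searchtype='contains')
            # runs dbSearch(), sorts the hits by rank and stores everything in the
            # session; nextResults()/prevResults() below then re-enter search()
            # with only start/count changed and reuse the cached result list
            # instead of querying the database again.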
  261: 
  262:     def getSearchType(self):
  263:         """returns the last search type"""
  264:         try:
  265:             ret = self.REQUEST.SESSION['searchtype']
   266:         except KeyError:
  267:             ret = ""
  268: 
  269:         return ret
  270:     
  271:     def getSearchString(self):
  272:         """returns the last search string"""
  273:         try:
  274:             ret = self.REQUEST.SESSION['searchstring']
   275:         except KeyError:
  276:             ret = ""
  277: 
  278:         return ret
  279:     
  280: 
  281:     def hasNextResults(self):
   282:         """returns True if there are more results after the current page"""
  283:         try:
  284:             (first, last, total, count) = self.REQUEST.SESSION['res_indexes']
   285:             return (first + count <= total)  # last page may end exactly at total
   286:         except KeyError:
  287:             return False
  288: 
  289:     def hasPrevResults(self):
   290:         """returns True if there are results before the current page"""
  291:         try:
  292:             (first, last, total, count) = self.REQUEST.SESSION['res_indexes']
  293:             return (first > 1)
   294:         except KeyError:
  295:             return False
  296: 
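            # Worked example of the res_indexes bookkeeping (illustrative numbers):
            # with (first, last, total, count) = (11, 20, 25, 10) the current page
            # shows hits 11-20, hasPrevResults() is true (11 > 1), hasNextResults()
            # is true (11 + 10 <= 25), and nextResults() will display hits 21-25.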
  297: 
  298:     def nextResults(self):
   299:         """returns the next page of results"""
  300:         try:
  301:             (first, last, total, count) = self.REQUEST.SESSION['res_indexes']
  302:             first = first + count
  303:             last = last + count
  304:             if first > total:
  305:                 first = total
  306:             if last > total:
  307:                 last = total
   308:         except KeyError:
  309:             print "OUCH: no next results!"
  310:             return self.search()
  311: 
  312:         return self.search(start=first, count=count)
  313: 
  314:         
  315:     def prevResults(self):
   316:         """returns the previous page of results"""
  317:         try:
  318:             (first, last, total, count) = self.REQUEST.SESSION['res_indexes']
  319:             first = first - count
  320:             last = last - count
  321:             if first < 1:
  322:                 first = 1
  323:             if last < 1:
  324:                 last = 1
   325:         except KeyError:
  326:             print "OUCH: no prev results!"
  327:             return self.search()           
  328: 
  329:         return self.search(start=first, count=count)
  330:         
  331: 
  332:     def manage_ChangeOSAS_searchForm(self):
   333:         """form for changing the OSAS_search object"""
  334:         pt=PageTemplateFile(os.path.join(package_home(globals()), "zpt/ChangeOSAS_search.zpt")).__of__(self)
  335:         return pt()
  336: 
  337:     def manage_ChangeOSAS_search(self,id,title=None,dsn=None,RESPONSE=None):
   338:         """change the OSAS_search object"""
  339:         self.id = id
  340:         self.title = title
  341:         self.dsn = dsn
  342:         if RESPONSE is not None:
  343:             RESPONSE.redirect('manage_main')
  344: 
  345: 
  346: def manage_AddOSAS_searchForm(self):
   347:     """form for adding an OSAS_search object"""
  348:     pt=PageTemplateFile(os.path.join(package_home(globals()), "zpt/AddOSAS_search.zpt")).__of__(self)
  349:     return pt()
  350: 
  351: def manage_AddOSAS_search(self,id,title=None,dsn=None,RESPONSE=None):
   352:     """add an OSAS_search object"""
  353:     newObj=OSAS_search(id,title,dsn)
  354:     self._setObject(id,newObj)
  355:     if RESPONSE is not None:
  356:         RESPONSE.redirect('manage_main')
  357: 
  358: 
  359: 
  360: 
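        # Overview of the result classes below (added commentary):
        # SearchResult is the plain data holder; AnyResult fills its content from
        # the single matching meta row; MetaResult additionally loads all meta rows
        # of the file via getDBFileMeta(); BibResult and ArchimResult extract the
        # /meta/bib/* resp. /meta/archimedes/* entries, pick up a context link via
        # checkContext() and raise the rank by 100.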
  361: class SearchResult(SimpleItem):
  362:     """base search result object"""
  363: 
  364:     def __init__(self, type='unknown', file=None, url=None, content=None, rank=0):
  365:         """init"""
  366:         # result type (e.g. "bib", "archim")
  367:         self.type = type
  368:         # index file name
  369:         self.file = file
  370:         # url for result (list of pairs)
  371:         if url:
  372:             self.urls = url
  373:         else:
  374:             self.urls = []
  375:         # actual content (list of tuples)
  376:         self.content = content
  377:         # document status (e.g. "online", "archive")
  378:         self.status = None
  379:         # result rank for presentation
  380:         self.rank = rank
  381: 
  382: class AnyResult(SearchResult):
  383:     """catch-all type result object"""
  384: 
  385:     def __init__(self, zope, db_result, rank):
   386:         """constructor for a catch-all type result"""
  387:         SearchResult.__init__(self)
  388:         #print "NEW ANY RESULT!"
  389:         self.type='unknown'
  390:         self.zptFile = os.path.join(package_home(globals()), "zpt/searchResult_any.zpt")
  391:         
  392:         (db_fileid, db_tagidx, db_tags, db_content) = db_result
  393:         self.hitTag = db_tags
  394: 
  395:         # get full info from db
  396:         self.fileinfo = zope.getDBFile(db_fileid)
  397:         assert self.fileinfo
  398: 
  399:         items = {}
  400:         items[db_tags] = db_content
  401:         self.content = items
  402:         self.file = self.fileinfo[0]
  403:         self.status = statusForFile(self.file)
  404:         self.rank = rank
  405: 
  406:     def getContentList(self):
  407:         """returns content as list of tuples in preferred order"""
  408:         l = []
  409:         for k in self.content.keys():
  410:             l.append((k, self.content[k]))
  411: 
  412:         return l
  413: 
  414:     def render(self, zope):
  415:         """render this result object"""
  416:         zope.REQUEST.SESSION['result'] = self
  417:         pt=PageTemplateFile(self.zptFile).__of__(zope)
  418:         return pt()
  419: 
  420: 
  421: class MetaResult(AnyResult):
  422:     """result object that collects metadata"""
  423: 
  424:     def __init__(self, zope, db_result, rank):
   425:         """constructor"""
  426:         AnyResult.__init__(self, zope, db_result, rank)
  427:         #print "NEW META RESULT!"
  428: 
  429:         (fileid, tagidx, tags, content) = db_result
  430: 
  431:         # get full info from db
  432:         self.metainfo = zope.getDBFileMeta(fileid)
  433:         assert self.metainfo
  434:         
  435:     def checkContext(self, tags, content, ctxurl):
  436:         """takes meta entry and updates url from context tags"""
  437:         if tags.endswith('/context/link'):
  438:             if content:
  439:                 #print "CTXlink: ", content
  440:                 ctxurl[0] = content
  441:             
  442:         elif tags.endswith('/context/name'):
  443:             if content:
  444:                 #print "CTXname: ", content
  445:                 ctxurl[1] = content
  446: 
  447:         return ctxurl
  448: 
  449: 
  450: class BibResult(MetaResult):
  451:     """bib type result object"""
  452: 
  453:     def __init__(self, zope, db_result, rank):
  454:         """constructor"""
  455:         MetaResult.__init__(self, zope, db_result, rank)
  456:         #print "NEW BIB RESULT!", self
  457:         self.type = "bib"
  458:         self.zptFile = os.path.join(package_home(globals()), "zpt/searchResult_bib.zpt")
  459:         url = storageURL(self.file)
  460:         if url:
  461:             self.urls.append(url)
  462:         (fileid, tagidx, tags, content) = db_result
  463: 
  464:         btype = ""
  465:         bitems = {}
  466:         ctxurl = ['', '']
  467: 
  468:         for me in self.metainfo:
  469:             (m_idx, m_tags, m_content, m_attributes) = me
  470:             # context tag
  471:             ctxurl = self.checkContext(m_tags, m_content, ctxurl)
  472:             # first tag with bib type attribute
  473:             if m_tags.endswith('/meta/bib'):
  474:                 r = re.search('type="([^"]*)"', m_attributes)
  475:                 if r:
  476:                     btype = r.group(1)
  477: 
  478:                 if not btype:
  479:                     btype = "*unknown*"
  480: 
  481:                 bitems['type'] = btype
  482:                 continue
  483: 
  484:             # skip other tags
  485:             if not btype: continue
  486: 
  487:             # collect bib/something
  488:             r = re.search('/meta/bib/(.*)', m_tags)
  489:             if r:
  490:                 k = r.group(1)
  491:                 #print "CONTENT: ", m_content
  492:                 bitems[k] = m_content
  493:                 # remember hit tag
  494:                 if m_tags == self.hitTag:
  495:                     self.hitTag = k
  496:                 continue
  497: 
  498:         self.content = bitems
  499:         # store context
  500:         if not ctxurl[1]:
  501:             ctxurl[1] = "View"
  502:         # must have link
  503:         if ctxurl[0]:
  504:             self.urls.append(ctxurl)
  505:                 
  506:         self.rank += 100
  507: 
  508:     def getContentList(self):
  509:         """returns content as list of tuples in preferred order"""
  510:         l = []
  511:         c = self.content.copy()
  512:         # preferred items first
  513:         for k in ('author', 'title', 'journal', 'year'):
  514:             if c.has_key(k):
  515:                 l.append((k, c[k]))
  516:                 del c[k]
  517: 
  518:         # no type
   519:         c.pop('type', None)
  520:         # copy the rest
  521:         for k in c.keys():
  522:             l.append((k, c[k]))
  523: 
  524:         return l
  525: 
  526: 
  527: class ArchimResult(MetaResult):
  528:     """archimedes type result object"""
  529: 
  530:     def __init__(self, zope, db_result, rank):
  531:         """constructor"""
  532:         MetaResult.__init__(self, zope, db_result, rank)
  533:         #print "NEW ARCHIM RESULT!", self
  534:         self.type = "archim"
  535:         self.zptFile = os.path.join(package_home(globals()), "zpt/searchResult_archim.zpt")
  536:         url = storageURL(self.file)
  537:         if url:
  538:             self.urls.append(url)
  539:             
  540:         (fileid, tagidx, tags, content) = db_result
  541: 
  542:         # process info
  543:         bitems = {}
  544:         ctxurl = ['', '']
  545:         for me in self.metainfo:
  546:             (m_idx, m_tags, m_content, m_attributes) = me
  547:             # context tag
  548:             ctxurl = self.checkContext(m_tags, m_content, ctxurl)
  549:             # collect archimedes/something
  550:             r = re.search('/meta/archimedes/(.*)', m_tags)
  551:             if r:
  552:                 k = r.group(1)
  553:                 #print "CONTENT: ", m_content
  554:                 bitems[k] = m_content
  555:                 # remember hit tag
  556:                 if m_tags == self.hitTag:
  557:                     self.hitTag = k
  558:                 continue
  559: 
  560:         self.content = bitems
  561:         self.rank += 100
  562:         # store context
  563:         if not ctxurl[1]:
  564:             ctxurl[1] = "View"
  565:         # must have link
  566:         if ctxurl[0]:
  567:             self.urls.append(ctxurl)
  568: 
  569: 
  570:     def getContentList(self):
  571:         """returns content as list of tuples in preferred order"""
  572:         l = []
  573:         c = self.content.copy()
  574:         # preferred items first
  575:         for k in ('author', 'title', 'date', 'place'):
  576:             if c.has_key(k):
  577:                 l.append((k, c[k]))
  578:                 del c[k]
  579: 
  580:         # copy the rest
  581:         for k in c.keys():
  582:             l.append((k, c[k]))
  583: 
  584:         return l
  585: 	
  586: 
  587: 
  588: 
  589: def ranksort(res1, res2):
  590:     """sort results on rank"""
  591:     return cmp(res2.rank, res1.rank)
  592: 
  593: 
  594: def statusForFile(filename):
   595:     """heuristic... returns the status for an index file name"""
  596:     status = None
  597:     if filename.startswith('/mpiwg/online/'):
  598:         status = "online"
  599:     elif filename.startswith('/mpiwg/archive/'):
  600:         status = "archive"
  601:     elif filename.startswith('http://'):
  602:         status = "database"
  603:         
  604:     return status
  605: 
  606: def storageURL(filename):
   607:     """heuristic... returns a URL for an index file name"""
  608:     url = None
  609:     name = None
  610:     if filename.startswith('/mpiwg/online/'):
  611:         #print "URLFORFILE: online ", filename
   612:         r = re.search(r'^(.*)/index\.meta', filename)
  613:         if r:
  614:             url = "http://content.mpiwg-berlin.mpg.de/mpistorage/storage/ShowOnline/index_html?path=%s"%r.group(1)
  615:             name = "Storage System"
  616:             
  617:     elif filename.startswith('http://'):
  618:         #print "URLFORFILE: url ", filename
  619:         url = filename
  620:         name = "Online Database"
  621: 
  622:     if name and url:
  623:         return (url, name)
  624:     
  625:     return None
  626: 
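        # Example (illustrative path; URL prefix taken from storageURL above):
        # the two helpers map an index file location to a display status and a
        # link target, e.g. for '/mpiwg/online/some/doc/index.meta'
        #   statusForFile(...) -> 'online'
        #   storageURL(...)    -> ('http://content.mpiwg-berlin.mpg.de/mpistorage/'
        #                          'storage/ShowOnline/index_html?path=/mpiwg/online/some/doc',
        #                          'Storage System')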
