OSAS/OSA_system/OSAS_search.py - view

File: [Repository] / OSAS / OSA_system / OSAS_search.py
Revision 1.8: download - view: text, annotated - select for diffs - revision graph
Fri Jan 19 17:16:25 2007 UTC (17 years, 4 months ago) by casties
Branches: MAIN
CVS tags: HEAD

fixed PgSQL imports to work with psycopg2

"""Metadata search interface ROC 2004, itgroup """ from AccessControl import ClassSecurityInfo from Globals import InitializeClass from Globals import Persistent, package_home from Products.PageTemplates.PageTemplateFile import PageTemplateFile from Products.PageTemplates.PageTemplate import PageTemplate from Products.PageTemplates.ZopePageTemplate import ZopePageTemplate from OFS.Folder import Folder from OFS.SimpleItem import SimpleItem try: import psycopg2 as PgSQL except: try: import psycopg as PgSQL except: from pyPgSQL import PgSQL import re import os MAXHITS = 1000 class OSAS_search(Folder): """Object for global metadata search""" meta_type="OSAS_search" manage_options=Folder.manage_options+( {'label':'Main config','action':'manage_ChangeOSAS_searchForm'}, ) def __init__(self,id,title,dsn=None): """init""" self.id=id self.title=title if dsn: self.dsn = dsn else: self.dsn = "host=foxridge.mpiwg-berlin.mpg.de dbname=storage user=archiveread password=archiveread" # volatile database connection object self._v_dbCon = None self._v_tryCon = 0 def dbCursor(self): """returns new SQL cursor object""" curs = None if hasattr(self, '_v_dbCon') and self._v_dbCon is not None: try: curs = self._v_dbCon.cursor() self._v_tryCon = 0 except: # in case of problems reset dbCon self._v_dbCon = None self._v_tryCon += 1 else: self._v_dbCon = None self._v_tryCon = 0 if not curs and self._v_tryCon < 3: self._v_dbCon = PgSQL.connect(self.dsn, serialize=0) # call ourself with the new connection curs = self.dbCursor() assert curs, "AIIEE no db cursor!!" return curs def getDBFileMeta(self, fileid): """returns an array with all meta entries of fileid""" metacache = {} # try in cache if self.REQUEST.SESSION.has_key('dbMeta'): metacache = self.REQUEST.SESSION['dbMeta'] if metacache.has_key(fileid): res = metacache[fileid] #print "meta from cache " return res curs = self.dbCursor() sql = 'SELECT idx,tags,content,attributes FROM meta WHERE fileid=%(id)s ORDER BY idx' print sql, " -> ", fileid curs.execute(sql, {'id':fileid}) print "done" res = curs.fetchall() #print "res:", res curs.close() # store info in cache metacache[fileid] = res self.REQUEST.SESSION['dbMeta'] = metacache return res def getDBFile(self, fileid): """returns the file information of fileid""" filecache = {} # try in cache if self.REQUEST.SESSION.has_key('dbFiles'): filecache = self.REQUEST.SESSION['dbFiles'] if filecache.has_key(fileid): res = filecache[fileid] #print "file from cache " return res curs = self.dbCursor() sql = 'select filename,mtime from files where id=%(id)s' print 'DBFILE: ', sql, " -> ", fileid curs.execute(sql, {'id':fileid}) print "DBFILE: done" res = curs.fetchone() #print "DBFILE: res:", res curs.close() # store info in cache filecache[fileid] = res self.REQUEST.SESSION['dbFiles'] = filecache return res def dbSearch(self, query, type): """search DB for query and return result set""" results = [] restypes = {} if not query: # empty query return results curs = self.dbCursor() if type == 'equals': qs = query elif type == 'startswith': qs = query + "%" elif type == 'contains': qs = "%" + query + "%" sql = 'select fileid,idx,tags,content from meta where lower(content) like lower(%(qs)s)' print sql, " -> ", qs curs.execute(sql, {'qs':qs}) print "done" res = curs.fetchone() rescnt = 1 #print "res0:", res while res and rescnt < MAXHITS: #print "res:", res result = self.getResult(res) if result: results.append(result) restypes[result.type] = result.type res = curs.fetchone() rescnt += 1 curs.close() #self.dbCon = None #print "SEARCH: ", rescnt, " results" restypelist = restypes.keys() return (results, restypelist) def getResult(self, db_result, rank=0): """factory for result objects""" (fileid, tagidx, tags, content) = db_result res = None if tags.find('/meta/bib/') > -1: res = BibResult(self, db_result, rank) elif tags.find('/meta/archimedes/') > -1: res = ArchimResult(self, db_result, rank) else: res = AnyResult(self, db_result, rank) return res def renderResult(self, result): """returns HTML rendering of a search result""" return result.render(self) def filterResults(self, results, start, end, restypefilter=None): """returns list of results that match a filter""" # filter types first if restypefilter: res = [] for r in results: if r.type == restypefilter: res.append(r) else: res = results # new total count (because of filter) rescnt = len(res) # filter on count resgroup = res[start:end] return (resgroup, rescnt) # # Web page stuff # def index_html(self): """metadata search""" pt=PageTemplateFile(os.path.join(package_home(globals()), "zpt/OSAS_search.zpt")).__of__(self) return pt() def search(self, searchstring=None, searchtype='startswith', start=1, count=10, restypefilter=None): """search and create result""" sres = int(start) -1 lres = sres + count try: oldsearch = self.REQUEST.SESSION['searchstring'] oldtype = self.REQUEST.SESSION['searchtype'] except: oldsearch = "" oldtype = "" if not searchstring: searchstring = oldsearch searchtype = oldtype if not oldsearch or searchstring != oldsearch or searchtype != oldtype: # new search (res, restypes) = self.dbSearch(searchstring, searchtype) # sort the result res.sort(ranksort) # store it self.REQUEST.SESSION['results'] = res self.REQUEST.SESSION['searchstring'] = searchstring self.REQUEST.SESSION['searchtype'] = searchtype self.REQUEST.SESSION['resulttypes'] = restypes (resgroup, nres) = self.filterResults(self.REQUEST.SESSION['results'], sres, lres, restypefilter) lres = min(lres, nres) sres = min(sres, nres) self.REQUEST.SESSION['resultgroup'] = resgroup self.REQUEST.SESSION['res_indexes'] = (sres+1, lres, nres, int(count)) self.REQUEST.SESSION['res_type_filter'] = restypefilter if nres > 0: zpt = "zpt/searchResult.zpt" else: zpt = "zpt/searchResult_none.zpt" pt=PageTemplateFile(os.path.join(package_home(globals()), zpt)).__of__(self) return pt() def getSearchType(self): """returns the last search type""" try: ret = self.REQUEST.SESSION['searchtype'] except: ret = "" return ret def getSearchString(self): """returns the last search string""" try: ret = self.REQUEST.SESSION['searchstring'] except: ret = "" return ret def hasNextResults(self): """returns if there are more results""" try: (first, last, total, count) = self.REQUEST.SESSION['res_indexes'] return (first + count < total) except: return False def hasPrevResults(self): """returns if there are previous results""" try: (first, last, total, count) = self.REQUEST.SESSION['res_indexes'] return (first > 1) except: return False def nextResults(self): """returns more results""" try: (first, last, total, count) = self.REQUEST.SESSION['res_indexes'] first = first + count last = last + count if first > total: first = total if last > total: last = total except: print "OUCH: no next results!" return self.search() return self.search(start=first, count=count) def prevResults(self): """returns more results""" try: (first, last, total, count) = self.REQUEST.SESSION['res_indexes'] first = first - count last = last - count if first < 1: first = 1 if last < 1: last = 1 except: print "OUCH: no prev results!" return self.search() return self.search(start=first, count=count) def manage_ChangeOSAS_searchForm(self): """create Search form""" pt=PageTemplateFile(os.path.join(package_home(globals()), "zpt/ChangeOSAS_search.zpt")).__of__(self) return pt() def manage_ChangeOSAS_search(self,id,title=None,dsn=None,RESPONSE=None): """add the OSAS_root""" self.id = id self.title = title self.dsn = dsn if RESPONSE is not None: RESPONSE.redirect('manage_main') def manage_AddOSAS_searchForm(self): """create Search form""" pt=PageTemplateFile(os.path.join(package_home(globals()), "zpt/AddOSAS_search.zpt")).__of__(self) return pt() def manage_AddOSAS_search(self,id,title=None,dsn=None,RESPONSE=None): """add the OSAS_root""" newObj=OSAS_search(id,title,dsn) self._setObject(id,newObj) if RESPONSE is not None: RESPONSE.redirect('manage_main') class SearchResult(SimpleItem): """base search result object""" def __init__(self, type='unknown', file=None, url=None, content=None, rank=0): """init""" # result type (e.g. "bib", "archim") self.type = type # index file name self.file = file # url for result (list of pairs) if url: self.urls = url else: self.urls = [] # actual content (list of tuples) self.content = content # document status (e.g. "online", "archive") self.status = None # result rank for presentation self.rank = rank class AnyResult(SearchResult): """catch-all type result object""" def __init__(self, zope, db_result, rank): """returns a catch-all type result""" SearchResult.__init__(self) #print "NEW ANY RESULT!" self.type='unknown' self.zptFile = os.path.join(package_home(globals()), "zpt/searchResult_any.zpt") (db_fileid, db_tagidx, db_tags, db_content) = db_result self.hitTag = db_tags # get full info from db self.fileinfo = zope.getDBFile(db_fileid) assert self.fileinfo items = {} items[db_tags] = db_content self.content = items self.file = self.fileinfo[0] self.status = statusForFile(self.file) self.rank = rank def getContentList(self): """returns content as list of tuples in preferred order""" l = [] for k in self.content.keys(): l.append((k, self.content[k])) return l def render(self, zope): """render this result object""" zope.REQUEST.SESSION['result'] = self pt=PageTemplateFile(self.zptFile).__of__(zope) return pt() class MetaResult(AnyResult): """result object that collects metadata""" def __init__(self, zope, db_result, rank): """contructor""" AnyResult.__init__(self, zope, db_result, rank) #print "NEW META RESULT!" (fileid, tagidx, tags, content) = db_result # get full info from db self.metainfo = zope.getDBFileMeta(fileid) assert self.metainfo def checkContext(self, tags, content, ctxurl): """takes meta entry and updates url from context tags""" if tags.endswith('/context/link'): if content: #print "CTXlink: ", content ctxurl[0] = content elif tags.endswith('/context/name'): if content: #print "CTXname: ", content ctxurl[1] = content return ctxurl class BibResult(MetaResult): """bib type result object""" def __init__(self, zope, db_result, rank): """constructor""" MetaResult.__init__(self, zope, db_result, rank) #print "NEW BIB RESULT!", self self.type = "bib" self.zptFile = os.path.join(package_home(globals()), "zpt/searchResult_bib.zpt") url = storageURL(self.file) if url: self.urls.append(url) (fileid, tagidx, tags, content) = db_result btype = "" bitems = {} ctxurl = ['', ''] for me in self.metainfo: (m_idx, m_tags, m_content, m_attributes) = me # context tag ctxurl = self.checkContext(m_tags, m_content, ctxurl) # first tag with bib type attribute if m_tags.endswith('/meta/bib'): r = re.search('type="([^"]*)"', m_attributes) if r: btype = r.group(1) if not btype: btype = "*unknown*" bitems['type'] = btype continue # skip other tags if not btype: continue # collect bib/something r = re.search('/meta/bib/(.*)', m_tags) if r: k = r.group(1) #print "CONTENT: ", m_content bitems[k] = m_content # remember hit tag if m_tags == self.hitTag: self.hitTag = k continue self.content = bitems # store context if not ctxurl[1]: ctxurl[1] = "View" # must have link if ctxurl[0]: self.urls.append(ctxurl) self.rank += 100 def getContentList(self): """returns content as list of tuples in preferred order""" l = [] c = self.content.copy() # preferred items first for k in ('author', 'title', 'journal', 'year'): if c.has_key(k): l.append((k, c[k])) del c[k] # no type del c['type'] # copy the rest for k in c.keys(): l.append((k, c[k])) return l class ArchimResult(MetaResult): """archimedes type result object""" def __init__(self, zope, db_result, rank): """constructor""" MetaResult.__init__(self, zope, db_result, rank) #print "NEW ARCHIM RESULT!", self self.type = "archim" self.zptFile = os.path.join(package_home(globals()), "zpt/searchResult_archim.zpt") url = storageURL(self.file) if url: self.urls.append(url) (fileid, tagidx, tags, content) = db_result # process info bitems = {} ctxurl = ['', ''] for me in self.metainfo: (m_idx, m_tags, m_content, m_attributes) = me # context tag ctxurl = self.checkContext(m_tags, m_content, ctxurl) # collect archimedes/something r = re.search('/meta/archimedes/(.*)', m_tags) if r: k = r.group(1) #print "CONTENT: ", m_content bitems[k] = m_content # remember hit tag if m_tags == self.hitTag: self.hitTag = k continue self.content = bitems self.rank += 100 # store context if not ctxurl[1]: ctxurl[1] = "View" # must have link if ctxurl[0]: self.urls.append(ctxurl) def getContentList(self): """returns content as list of tuples in preferred order""" l = [] c = self.content.copy() # preferred items first for k in ('author', 'title', 'date', 'place'): if c.has_key(k): l.append((k, c[k])) del c[k] # copy the rest for k in c.keys(): l.append((k, c[k])) return l def ranksort(res1, res2): """sort results on rank""" return cmp(res2.rank, res1.rank) def statusForFile(filename): """heuristic... returns status for a index file name""" status = None if filename.startswith('/mpiwg/online/'): status = "online" elif filename.startswith('/mpiwg/archive/'): status = "archive" elif filename.startswith('http://'): status = "database" return status def storageURL(filename): """heuristic... returns an URL for a index file name""" url = None name = None if filename.startswith('/mpiwg/online/'): #print "URLFORFILE: online ", filename r = re.search('^(.*)/index.meta', filename) if r: url = "http://content.mpiwg-berlin.mpg.de/mpistorage/storage/ShowOnline/index_html?path=%s"%r.group(1) name = "Storage System" elif filename.startswith('http://'): #print "URLFORFILE: url ", filename url = filename name = "Online Database" if name and url: return (url, name) return None