File:  [Repository] / OSAS / OSA_system / OSAS_search.py
Revision 1.8: download - view: text, annotated - select for diffs - revision graph
Fri Jan 19 17:16:25 2007 UTC (17 years, 4 months ago) by casties
Branches: MAIN
CVS tags: HEAD
fixed PgSQL imports to work with psycopg2

"""Metadata search interface
ROC 2004, itgroup

"""

import logging
import os
import re

from AccessControl import ClassSecurityInfo
from Globals import InitializeClass
from Globals import Persistent, package_home
from Products.PageTemplates.PageTemplateFile import PageTemplateFile
from Products.PageTemplates.PageTemplate import PageTemplate
from Products.PageTemplates.ZopePageTemplate import ZopePageTemplate
from OFS.Folder import Folder
from OFS.SimpleItem import SimpleItem
try:
    import psycopg2 as PgSQL
except:
    try:
        import psycopg as PgSQL
    except:
        from pyPgSQL import PgSQL

# Limit used by OSAS_search.dbSearch() to cap how many rows of a query
# result are walked through for a single search.
MAXHITS = 1000

class OSAS_search(Folder):
    """Object for global metadata search.

    Runs LIKE queries against the ``meta`` table of a PostgreSQL database,
    wraps the hits in result objects and keeps the current result list and
    paging state in the request's SESSION.

    Note: every method that is called through the web keeps a docstring on
    purpose.
    """

    meta_type="OSAS_search"

    manage_options=Folder.manage_options+(
        {'label':'Main config','action':'manage_ChangeOSAS_searchForm'},
       )


    def __init__(self,id,title,dsn=None):
        """init

        id, title -- stored on the new object
        dsn       -- connection string passed to PgSQL.connect(); when empty
                     the built-in default below is used
        """
        self.id=id
        self.title=title
        if dsn:
            self.dsn = dsn
        else:
            # NOTE(review): default DSN including credentials is hard-coded
            self.dsn = "host=foxridge.mpiwg-berlin.mpg.de dbname=storage user=archiveread password=archiveread"
        # volatile database connection object
        self._v_dbCon = None
        # number of consecutive failed attempts to get a cursor
        self._v_tryCon = 0


    def dbCursor(self):
        """returns new SQL cursor object

        Re-uses the cached connection when there is one. When creating a
        cursor fails the connection is dropped and re-opened; after three
        consecutive failures an AssertionError is raised. Errors raised by
        PgSQL.connect() itself are not caught.
        """
        if getattr(self, '_v_dbCon', None) is None:
            # no usable connection: start with a clean retry counter
            self._v_dbCon = None
            self._v_tryCon = 0

        while True:
            if self._v_dbCon is not None:
                try:
                    curs = self._v_dbCon.cursor()
                except Exception:
                    # in case of problems reset dbCon
                    self._v_dbCon = None
                    self._v_tryCon = getattr(self, '_v_tryCon', 0) + 1
                else:
                    self._v_tryCon = 0
                    return curs

            if self._v_tryCon >= 3:
                break
            # (re)connect and try again with the new connection
            self._v_dbCon = PgSQL.connect(self.dsn, serialize=0)

        # explicit raise instead of assert so the check survives "python -O"
        raise AssertionError("AIIEE no db cursor!!")


    def getDBFileMeta(self, fileid):
        """returns an array with all meta entries of fileid

        Rows are (idx, tags, content, attributes) ordered by idx. Results
        are cached per file id in SESSION['dbMeta'].
        """
        session = self.REQUEST.SESSION
        metacache = {}
        # try in cache
        if session.has_key('dbMeta'):
            metacache = session['dbMeta']
            if fileid in metacache:
                return metacache[fileid]

        sql = 'SELECT idx,tags,content,attributes FROM meta WHERE fileid=%(id)s ORDER BY idx'
        logging.getLogger("OSAS_search").debug("%s -> %s", sql, fileid)
        curs = self.dbCursor()
        try:
            curs.execute(sql, {'id':fileid})
            res = curs.fetchall()
        finally:
            # release the cursor even when the query fails
            curs.close()

        # store info in cache
        metacache[fileid] = res
        session['dbMeta'] = metacache
        return res


    def getDBFile(self, fileid):
        """returns the file information of fileid

        The result is the (filename, mtime) row of the files table as
        returned by fetchone(). Results are cached per file id in
        SESSION['dbFiles'].
        """
        session = self.REQUEST.SESSION
        filecache = {}
        # try in cache
        if session.has_key('dbFiles'):
            filecache = session['dbFiles']
            if fileid in filecache:
                return filecache[fileid]

        sql = 'select filename,mtime from files where id=%(id)s'
        logging.getLogger("OSAS_search").debug("DBFILE: %s -> %s", sql, fileid)
        curs = self.dbCursor()
        try:
            curs.execute(sql, {'id':fileid})
            res = curs.fetchone()
        finally:
            # release the cursor even when the query fails
            curs.close()

        # store info in cache
        filecache[fileid] = res
        session['dbFiles'] = filecache
        return res


    def dbSearch(self, query, type):
        """search DB for query and return result set

        query -- text to look for in meta.content (case-insensitive LIKE)
        type  -- 'equals', 'startswith' or 'contains'

        Returns a tuple (results, restypelist): the list of result objects
        and the list of distinct result types among them. An empty query
        gives ([], []). At most MAXHITS rows are processed.
        Raises ValueError for an unknown search type.
        """
        results = []
        restypes = {}
        if not query:
            # empty query: same tuple shape as a real search so that
            # callers can always unpack the return value
            return (results, [])

        # validate the type before a cursor is opened
        if type == 'equals':
            qs = query
        elif type == 'startswith':
            qs = query + "%"
        elif type == 'contains':
            qs = "%" + query + "%"
        else:
            raise ValueError("unknown search type: %r" % (type,))

        sql = 'select fileid,idx,tags,content from meta where lower(content) like lower(%(qs)s)'
        logging.getLogger("OSAS_search").debug("%s -> %s", sql, qs)
        curs = self.dbCursor()
        try:
            curs.execute(sql, {'qs':qs})
            rescnt = 0
            res = curs.fetchone()
            while res and rescnt < MAXHITS:
                result = self.getResult(res)
                if result:
                    results.append(result)
                    restypes[result.type] = result.type
                rescnt += 1
                res = curs.fetchone()
        finally:
            # release the cursor even when building a result fails
            curs.close()

        return (results, list(restypes.keys()))


    def getResult(self, db_result, rank=0):
        """factory for result objects

        db_result is a (fileid, idx, tags, content) row; the tag path
        decides which result class is created.
        """
        (fileid, tagidx, tags, content) = db_result

        if tags.find('/meta/bib/') > -1:
            return BibResult(self, db_result, rank)
        if tags.find('/meta/archimedes/') > -1:
            return ArchimResult(self, db_result, rank)
        return AnyResult(self, db_result, rank)


    def renderResult(self, result):
        """returns HTML rendering of a search result"""
        return result.render(self)


    def filterResults(self, results, start, end, restypefilter=None):
        """returns list of results that match a filter

        Returns (resgroup, rescnt): the slice [start:end] of the results
        whose type equals restypefilter (all results when no filter is
        given) and the total number of results after type filtering.
        """
        # filter types first
        if restypefilter:
            res = [r for r in results if r.type == restypefilter]
        else:
            res = results
        # new total count (because of filter), then the requested window
        return (res[start:end], len(res))


    #
    # Web page stuff
    #

    def index_html(self):
        """metadata search"""
        pt=PageTemplateFile(os.path.join(package_home(globals()), "zpt/OSAS_search.zpt")).__of__(self)
        return pt()


    def search(self, searchstring=None, searchtype='startswith', start=1, count=10, restypefilter=None):
        """search and create result

        searchstring  -- text to search; when empty the last search stored
                         in the session is repeated
        searchtype    -- 'equals', 'startswith' or 'contains'
        start         -- 1-based index of the first result to show
        count         -- number of results per page
        restypefilter -- only show results of this type

        start and count may be strings. The result list and the paging
        state are stored in the session; returns the rendered result page.
        """
        session = self.REQUEST.SESSION
        # convert before doing arithmetic
        count = int(count)
        # never let the window start before the first result
        sres = max(int(start) - 1, 0)
        lres = sres + count
        try:
            oldsearch = session['searchstring']
            oldtype = session['searchtype']
        except KeyError:
            oldsearch = ""
            oldtype = ""

        if not searchstring:
            searchstring = oldsearch
            searchtype = oldtype

        if not oldsearch or searchstring != oldsearch or searchtype != oldtype:
            # new search
            (res, restypes) = self.dbSearch(searchstring, searchtype)
            # sort the result
            res.sort(ranksort)
            # store it
            session['results'] = res
            session['searchstring'] = searchstring
            session['searchtype'] = searchtype
            session['resulttypes'] = restypes

        (resgroup, nres) = self.filterResults(session['results'], sres, lres, restypefilter)
        lres = min(lres, nres)
        sres = min(sres, nres)
        session['resultgroup'] = resgroup
        session['res_indexes'] = (sres+1, lres, nres, count)
        session['res_type_filter'] = restypefilter
        if nres > 0:
            zpt = "zpt/searchResult.zpt"
        else:
            zpt = "zpt/searchResult_none.zpt"

        pt=PageTemplateFile(os.path.join(package_home(globals()), zpt)).__of__(self)
        return pt()


    def getSearchType(self):
        """returns the last search type ("" when there is none)"""
        try:
            return self.REQUEST.SESSION['searchtype']
        except (KeyError, AttributeError):
            return ""


    def getSearchString(self):
        """returns the last search string ("" when there is none)"""
        try:
            return self.REQUEST.SESSION['searchstring']
        except (KeyError, AttributeError):
            return ""


    def hasNextResults(self):
        """returns if there are more results"""
        try:
            (first, last, total, count) = self.REQUEST.SESSION['res_indexes']
        except (KeyError, AttributeError):
            return False
        # 'last' is the index of the last result on the current page, so
        # anything beyond it is another page (also when it holds one item)
        return (last < total)


    def hasPrevResults(self):
        """returns if there are previous results"""
        try:
            (first, last, total, count) = self.REQUEST.SESSION['res_indexes']
        except (KeyError, AttributeError):
            return False
        return (first > 1)


    def nextResults(self):
        """returns more results

        Shows the page after the current one; falls back to a plain
        search() when the session holds no paging state.
        """
        try:
            (first, last, total, count) = self.REQUEST.SESSION['res_indexes']
        except (KeyError, AttributeError):
            logging.getLogger("OSAS_search").warning("OUCH: no next results!")
            return self.search()

        # NOTE(review): the type filter stored in SESSION['res_type_filter']
        # is not passed on here - confirm whether that is intended
        first = min(first + count, total)
        return self.search(start=first, count=count)


    def prevResults(self):
        """returns previous results

        Shows the page before the current one; falls back to a plain
        search() when the session holds no paging state.
        """
        try:
            (first, last, total, count) = self.REQUEST.SESSION['res_indexes']
        except (KeyError, AttributeError):
            logging.getLogger("OSAS_search").warning("OUCH: no prev results!")
            return self.search()

        # NOTE(review): the type filter stored in SESSION['res_type_filter']
        # is not passed on here - confirm whether that is intended
        first = max(first - count, 1)
        return self.search(start=first, count=count)


    def manage_ChangeOSAS_searchForm(self):
        """create Search form"""
        pt=PageTemplateFile(os.path.join(package_home(globals()), "zpt/ChangeOSAS_search.zpt")).__of__(self)
        return pt()

    def manage_ChangeOSAS_search(self,id,title=None,dsn=None,RESPONSE=None):
        """change id, title and DSN of this search object

        An empty dsn leaves the current DSN untouched. Redirects to
        manage_main when a RESPONSE is given.
        """
        self.id = id
        self.title = title
        if dsn:
            self.dsn = dsn
            # forget the cached connection so the new DSN is used by the
            # next dbCursor() call
            self._v_dbCon = None
            self._v_tryCon = 0
        if RESPONSE is not None:
            RESPONSE.redirect('manage_main')


def manage_AddOSAS_searchForm(self):
    """renders the form for adding an OSAS_search object"""
    template_path = os.path.join(package_home(globals()), "zpt/AddOSAS_search.zpt")
    # wrap the template in the context of the calling object, then render
    return PageTemplateFile(template_path).__of__(self)()

def manage_AddOSAS_search(self,id,title=None,dsn=None,RESPONSE=None):
    """creates an OSAS_search object and stores it in this container

    Redirects to manage_main when a RESPONSE is given.
    """
    self._setObject(id, OSAS_search(id, title, dsn))
    if RESPONSE is None:
        return
    RESPONSE.redirect('manage_main')




class SearchResult(SimpleItem):
    """base search result object"""

    def __init__(self, type='unknown', file=None, url=None, content=None, rank=0):
        """stores the given values; status starts out as None"""
        self.type = type          # result type (e.g. "bib", "archim")
        self.file = file          # index file name
        self.urls = url or []     # url for result (list of pairs)
        self.content = content    # actual content (list of tuples)
        self.status = None        # document status (e.g. "online", "archive")
        self.rank = rank          # result rank for presentation

class AnyResult(SearchResult):
    """catch-all type result object

    Holds the single tag/content pair of the hit together with the file
    information looked up through the search object.
    """

    def __init__(self, zope, db_result, rank):
        """returns a catch-all type result

        zope      -- object providing getDBFile() (the search object)
        db_result -- (fileid, idx, tags, content) row of the hit
        rank      -- rank used for sorting

        Raises AssertionError when no file information is found for the
        file id of the hit.
        """
        SearchResult.__init__(self)
        self.type='unknown'
        self.zptFile = os.path.join(package_home(globals()), "zpt/searchResult_any.zpt")

        (db_fileid, db_tagidx, db_tags, db_content) = db_result
        self.hitTag = db_tags

        # get full info from db
        self.fileinfo = zope.getDBFile(db_fileid)
        if not self.fileinfo:
            # explicit raise (same exception type as the former assert) so
            # the check is not stripped by "python -O" and has a message
            raise AssertionError("no file information for file id %r" % (db_fileid,))

        self.content = {db_tags: db_content}
        # first column of the file information is the file name
        self.file = self.fileinfo[0]
        self.status = statusForFile(self.file)
        self.rank = rank

    def getContentList(self):
        """returns content as list of (key, value) tuples"""
        return list(self.content.items())

    def render(self, zope):
        """render this result object

        Stores the result in SESSION['result'] and renders the page
        template named by self.zptFile in the context of zope.
        """
        zope.REQUEST.SESSION['result'] = self
        pt=PageTemplateFile(self.zptFile).__of__(zope)
        return pt()


class MetaResult(AnyResult):
    """result object that collects metadata"""

    def __init__(self, zope, db_result, rank):
        """constructor

        In addition to the AnyResult setup this loads all meta entries of
        the hit's file into self.metainfo. Raises AssertionError when
        there are none.
        """
        AnyResult.__init__(self, zope, db_result, rank)

        # first column of the hit row is the file id
        fileid = db_result[0]

        # get full info from db
        self.metainfo = zope.getDBFileMeta(fileid)
        if not self.metainfo:
            # explicit raise (same exception type as the former assert) so
            # the check is not stripped by "python -O" and has a message
            raise AssertionError("no meta entries for file id %r" % (fileid,))

    def checkContext(self, tags, content, ctxurl):
        """takes meta entry and updates url from context tags

        ctxurl is a two-element list [link, name]; it is changed in place
        when tags ends in '/context/link' or '/context/name' and content
        is not empty. Returns ctxurl.
        """
        if tags.endswith('/context/link'):
            if content:
                ctxurl[0] = content

        elif tags.endswith('/context/name'):
            if content:
                ctxurl[1] = content

        return ctxurl


class BibResult(MetaResult):
    """bib type result object

    Collects all /meta/bib/... entries of the file into self.content,
    keyed by the tag name below /meta/bib/, plus the bib type under the
    key 'type'.
    """

    def __init__(self, zope, db_result, rank):
        """constructor

        Entries are only collected after a tag ending in '/meta/bib' has
        been seen; its type attribute (or "*unknown*") becomes the bib
        type. The rank is raised by 100.
        """
        MetaResult.__init__(self, zope, db_result, rank)
        self.type = "bib"
        self.zptFile = os.path.join(package_home(globals()), "zpt/searchResult_bib.zpt")
        url = storageURL(self.file)
        if url:
            self.urls.append(url)

        btype = ""
        bitems = {}
        ctxurl = ['', '']

        for (m_idx, m_tags, m_content, m_attributes) in self.metainfo:
            # context tag
            ctxurl = self.checkContext(m_tags, m_content, ctxurl)
            # first tag with bib type attribute
            if m_tags.endswith('/meta/bib'):
                # the attributes column may be empty (None)
                r = re.search('type="([^"]*)"', m_attributes or '')
                if r:
                    btype = r.group(1)

                if not btype:
                    btype = "*unknown*"

                bitems['type'] = btype
                continue

            # skip other tags
            if not btype:
                continue

            # collect bib/something
            r = re.search('/meta/bib/(.*)', m_tags)
            if r:
                k = r.group(1)
                bitems[k] = m_content
                # remember hit tag
                if m_tags == self.hitTag:
                    self.hitTag = k

        self.content = bitems
        # store context
        if not ctxurl[1]:
            ctxurl[1] = "View"
        # must have link
        if ctxurl[0]:
            self.urls.append(ctxurl)

        self.rank += 100

    def getContentList(self):
        """returns content as list of tuples in preferred order

        author, title, journal and year come first (when present), the
        'type' entry is left out, the remaining entries follow.
        """
        l = []
        c = self.content.copy()
        # preferred items first
        for k in ('author', 'title', 'journal', 'year'):
            if k in c:
                l.append((k, c.pop(k)))

        # no type; it is missing when no /meta/bib entry was found
        c.pop('type', None)
        # copy the rest
        l.extend(c.items())

        return l


class ArchimResult(MetaResult):
    """archimedes type result object"""

    def __init__(self, zope, db_result, rank):
        """constructor

        Collects all /meta/archimedes/... entries of the file into
        self.content and raises the rank by 100.
        """
        MetaResult.__init__(self, zope, db_result, rank)
        self.type = "archim"
        self.zptFile = os.path.join(package_home(globals()), "zpt/searchResult_archim.zpt")
        storage = storageURL(self.file)
        if storage:
            self.urls.append(storage)

        # process info
        collected = {}
        context = ['', '']
        for (m_idx, m_tags, m_content, m_attributes) in self.metainfo:
            # context tag
            context = self.checkContext(m_tags, m_content, context)
            # collect archimedes/something
            match = re.search('/meta/archimedes/(.*)', m_tags)
            if not match:
                continue
            key = match.group(1)
            collected[key] = m_content
            # remember hit tag
            if m_tags == self.hitTag:
                self.hitTag = key

        self.content = collected
        self.rank += 100
        # store context, default name for the link
        if not context[1]:
            context[1] = "View"
        # must have link
        if context[0]:
            self.urls.append(context)


    def getContentList(self):
        """returns content as list of tuples in preferred order

        author, title, date and place come first (when present), the
        remaining entries follow.
        """
        remaining = self.content.copy()
        ordered = []
        # preferred items first
        for key in ('author', 'title', 'date', 'place'):
            if key in remaining:
                ordered.append((key, remaining[key]))
                del remaining[key]

        # copy the rest
        for key in remaining.keys():
            ordered.append((key, remaining[key]))

        return ordered
	



def ranksort(res1, res2):
    """sort results on rank

    Comparison function for list.sort(): returns -1, 0 or 1 so that
    results with a higher rank come first.
    """
    rank1 = res1.rank
    rank2 = res2.rank
    # same value as cmp(rank2, rank1) without needing the cmp() builtin
    return (rank2 > rank1) - (rank2 < rank1)


def statusForFile(filename):
    """heuristic... returns status for a index file name

    The status is derived from the start of the name: "online",
    "archive" or "database"; None when nothing matches.
    """
    known_prefixes = (
        ('/mpiwg/online/', "online"),
        ('/mpiwg/archive/', "archive"),
        ('http://', "database"),
    )
    for prefix, status in known_prefixes:
        if filename.startswith(prefix):
            return status

    return None

def storageURL(filename):
    """heuristic... returns an URL for a index file name

    Returns a pair (url, name) or None:
    - names below /mpiwg/online/ that contain "/index.meta" give a link
      to the storage system for the directory of the index file
    - names starting with http:// are returned as they are
    """
    if filename.startswith('/mpiwg/online/'):
        # the dot is escaped so that only a literal "index.meta" matches
        r = re.search(r'^(.*)/index\.meta', filename)
        if r:
            url = "http://content.mpiwg-berlin.mpg.de/mpistorage/storage/ShowOnline/index_html?path=%s"%r.group(1)
            return (url, "Storage System")

    elif filename.startswith('http://'):
        return (filename, "Online Database")

    return None


FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>