File:  [Repository] / OSAS / OSA_system / OSAS_search.py
Revision 1.1: download - view: text, annotated - select for diffs - revision graph
Thu Jul 1 19:31:25 2004 UTC (20 years ago) by casties
Branches: MAIN
CVS tags: HEAD
first version of metadata search interface

    1: """Metadata search interface
    2: ROC 2004, itgroup
    3: 
    4: """
    5: 
    6: from AccessControl import ClassSecurityInfo
    7: from Globals import InitializeClass
    8: from Globals import Persistent, package_home
    9: from Products.PageTemplates.PageTemplateFile import PageTemplateFile
   10: from Products.PageTemplates.PageTemplate import PageTemplate
   11: from OFS.SimpleItem import SimpleItem
   12: #from pyPgSQL import PgSQL
   13: import psycopg as PgSQL
   14: 
   15: import re
   16: import os
   17: 
# upper bound on the number of database rows a single dbSearch() call
# turns into result objects
MAXHITS = 1000
   19: 
   20: class OSAS_search(SimpleItem):
   21:     """Object for global metadata search"""
   22: 
   23:     meta_type="OSAS_search"
   24: 
   25:     
   26: 
   27:     def __init__(self,id,title,dsn=None):
   28:         """init"""
   29:         self.id=id
   30:         self.title=title
   31:         if dsn:
   32:             self.dsn = dsn
   33:         else:
   34:             self.dsn = "host=foxridge.mpiwg-berlin.mpg.de dbname=storage user=archiveread password=archiveread"
   35:         # volatile database connection object
   36:         self._v_dbCon = None
   37:         self._v_tryCon = 0
   38: 
   39: 
   40:     def dbCursor(self):
   41:         """returns new SQL cursor object"""
   42:         curs = None
   43:         if hasattr(self, '_v_dbCon') and self._v_dbCon is not None:
   44:             try:
   45:                 curs = self._v_dbCon.cursor()
   46:                 self._v_tryCon = 0
   47:             except:
   48:                 # in case of problems reset dbCon
   49:                 self._v_dbCon = None
   50:                 self._v_tryCon += 1
   51:         else:
   52:             self._v_dbCon = None
   53:             self._v_tryCon = 0
   54:                 
   55:         if not curs and self._v_tryCon < 3:
   56:             self._v_dbCon = PgSQL.connect(self.dsn, serialize=0)
   57:             # call ourself with the new connection
   58:             curs = self.dbCursor()
   59: 
   60:         assert curs, "AIIEE no db cursor!!"
   61:         return curs
   62: 
   63:     def getDBFileMeta(self, fileid):
   64:         """returns an array with all meta entries of fileid"""
   65: 
   66:         metacache = {}
   67:         # try in cache
   68:         if self.REQUEST.SESSION.has_key('dbMeta'):
   69:             metacache = self.REQUEST.SESSION['dbMeta']
   70:             if metacache.has_key(fileid):
   71:                 res = metacache[fileid]
   72:                 print "meta from cache "
   73:                 return res
   74: 
   75:         curs = self.dbCursor()
   76: 
   77:         sql = 'SELECT idx,tags,content,attributes FROM meta WHERE fileid=%(id)s ORDER BY idx'
   78:         print sql, " -> ", fileid
   79:         curs.execute(sql, {'id':fileid})
   80:         print "done"
   81: 
   82:         res = curs.fetchall()
   83:         #print "res:", res
   84:         curs.close()
   85:         # store info in cache
   86:         metacache[fileid] = res
   87:         self.REQUEST.SESSION['dbMeta'] = metacache
   88: 
   89:         return res
   90: 
   91:     def getDBFile(self, fileid):
   92:         """returns the file information of fileid"""
   93: 
   94:         filecache = {}
   95:         # try in cache
   96:         if self.REQUEST.SESSION.has_key('dbFiles'):
   97:             filecache = self.REQUEST.SESSION['dbFiles']
   98:             if filecache.has_key(fileid):
   99:                 res = filecache[fileid]
  100:                 print "file from cache "
  101:                 return res
  102: 
  103:         curs = self.dbCursor()
  104: 
  105:         sql = 'select filename,mtime from files where id=%(id)s'
  106:         print 'DBFILE: ', sql, " -> ", fileid
  107:         curs.execute(sql, {'id':fileid})
  108:         print "DBFILE: done"
  109: 
  110:         res = curs.fetchone()
  111:         #print "DBFILE: res:", res
  112:         curs.close()
  113:         # store info in cache
  114:         filecache[fileid] = res
  115:         self.REQUEST.SESSION['dbFiles'] = filecache
  116: 
  117:         return res
  118: 	
  119: 	
  120:     def dbSearch(self, query):
  121:         """search DB for query and return result set"""
  122:         curs = self.dbCursor()
  123:         qs = query + "%"
  124:         sql = 'select fileid,idx,tags,content from meta where content like %(qs)s'
  125:         print sql, " -> ", qs
  126:         curs.execute(sql, {'qs':qs})
  127:         print "done"
  128:         results = []
  129:         res = curs.fetchone()
  130:         rescnt = 1
  131:         #print "res0:", res
  132:         while res and rescnt < MAXHITS:
  133:             #print "res:", res
  134:             result = self.getResult(res)
  135:             if (result):
  136:                 results.append(result)
  137:                 
  138:             res = curs.fetchone()
  139:             rescnt += 1
  140: 
  141:         curs.close()
  142:         #self.dbCon = None
  143: 
  144:         print "SEARCH: ", rescnt, " results"
  145:         return results
  146: 
  147:         
  148:     def getResult(self, db_result, rank=0):
  149:         """factory for result objects"""
  150:         print "NEW RESULT!"
  151: 
  152:         (fileid, tagidx, tags, content) = db_result
  153:         res = None
  154: 
  155:         print "tags: ", tags
  156:         if tags.find('/meta/bib/') > -1:
  157:             res = BibResult(self, db_result, rank)
  158:         elif tags.find('/meta/archimedes/') > -1:
  159:             res = ArchimResult(self, db_result, rank)
  160:         else:
  161:             res = AnyResult(self, db_result, rank)
  162: 
  163:         return res
  164: 	
  165:     def renderResult(self, result):
  166:         """returns HTML rendering of a search result"""
  167: 
  168:         print "renderresult!", result, " -- ", result.url
  169:         return result.render(self)
  170: 	
  171: 	
  172: 	
  173: 	
  174: 
  175:     #
  176:     # Web page stuff
  177:     #
  178: 
  179:     def index_html(self):
  180:         """metadata search"""
  181:         pt=PageTemplateFile(os.path.join(package_home(globals()), "zpt/OSAS_search.zpt")).__of__(self)
  182:         return pt()
  183: 
  184: 
  185:     def search(self, searchstring=None):
  186:         """search and result"""
  187:         if searchstring:
  188:             print "SEARCH: ", searchstring
  189:             res = self.dbSearch(searchstring)
  190:             res.sort(ranksort)
  191:             self.REQUEST.SESSION['results'] = res
  192:             self.REQUEST.SESSION['searchstring'] = searchstring
  193: 
  194:         print "SEARCH res:", res
  195:         pt=PageTemplateFile(os.path.join(package_home(globals()), "zpt/searchResult.zpt")).__of__(self)
  196:         return pt()
  197: 
  198: 	
  199:         
  200: def manage_AddOSAS_searchForm(self):
  201:     """create Search form"""
  202:     pt=PageTemplateFile(os.path.join(package_home(globals()), "zpt/AddOSAS_search.zpt")).__of__(self)
  203:     return pt()
  204: 
  205: def manage_AddOSAS_search(self,id,title=None,dsn=None,RESPONSE=None):
  206:     """add the OSAS_root"""
  207:     newObj=OSAS_search(id,title,dsn)
  208:     self._setObject(id,newObj)
  209:     if RESPONSE is not None:
  210:         RESPONSE.redirect('manage_main')
  211: 
  212: 
  213: 
  214: 
  215: class SearchResult(SimpleItem):
  216:     """base search result object"""
  217: 
  218:     def __init__(self, type='unknown', file=None, url=None, content=None, rank=0):
  219:         """init"""
  220:         self.type = type
  221:         self.file = file
  222:         self.url = url
  223:         self.urlabel = url
  224:         self.content = content
  225:         self.rank = rank
  226: 
  227: class AnyResult(SearchResult):
  228:     """catch-all type result object"""
  229: 
  230:     def __init__(self, zope, db_result, rank):
  231:         """returns a catch-all type result"""
  232:         SearchResult.__init__(self, type='unknown')
  233:         print "NEW ANY RESULT!"
  234: 
  235:         self.zptFile = os.path.join(package_home(globals()), "zpt/searchResult_any.zpt")
  236:         
  237:         (fileid, tagidx, tags, content) = db_result
  238:         self.hitTag = tags
  239: 
  240:         # get full info from db
  241:         self.fileinfo = zope.getDBFile(fileid)
  242:         assert self.fileinfo
  243: 
  244:         items = {}
  245:         items[tags] = content
  246:         self.content = items
  247:         self.file = self.fileinfo[0]
  248:         self.url = ""
  249:         self.urlabel = self.file
  250:         self.rank = rank
  251: 
  252:     def render(self, zope):
  253:         """render this result object"""
  254:         zope.REQUEST.SESSION['result'] = self
  255:         print "renderender...", self
  256:         pt=PageTemplateFile(self.zptFile).__of__(zope)
  257:         return pt()
  258: 
  259: 
  260: class MetaResult(AnyResult):
  261:     """result object that collects metadata"""
  262: 
  263:     def __init__(self, zope, db_result, rank):
  264:         """contructor"""
  265:         AnyResult.__init__(self, zope, db_result, rank)
  266:         print "NEW META RESULT!"
  267: 
  268:         (fileid, tagidx, tags, content) = db_result
  269: 
  270:         # get full info from db
  271:         self.metainfo = zope.getDBFileMeta(fileid)
  272:         assert self.metainfo
  273:         
  274:     def checkContext(self, tags, content):
  275:         """takes meta entry and sets url from context tags"""
  276:         if tags.endswith('/context/link'):
  277:             if content:
  278:                 self.url = content            
  279:             
  280:         elif tags.endswith('/context/name'):
  281:             if content:
  282:                 self.urlabel = content
  283: 
  284:         else:
  285:             return False
  286: 
  287:         return True
  288: 
  289: 
  290: class BibResult(MetaResult):
  291:     """bib type result object"""
  292: 
  293:     def __init__(self, zope, db_result, rank):
  294:         """constructor"""
  295:         MetaResult.__init__(self, zope, db_result, rank)
  296:         print "NEW BIB RESULT!"
  297:         self.type = "bib"
  298:         self.zptFile = os.path.join(package_home(globals()), "zpt/searchResult_bib.zpt")
  299:         self.url = urlForFile(self.file)
  300:         self.urlabel = None
  301:         (fileid, tagidx, tags, content) = db_result
  302: 
  303:         btype = ""
  304:         bitems = {}
  305: 
  306:         for me in self.metainfo:
  307:             (m_idx, m_tags, m_content, m_attributes) = me
  308:             # context tag
  309:             if self.checkContext(m_tags, m_content):
  310:                 continue
  311:             # first tag with bib type attribute
  312:             if m_tags.endswith('/meta/bib'):
  313:                 r = re.search('type="([^"]*)"', m_attributes)
  314:                 if r:
  315:                     btype = r.group(1)
  316: 
  317:                 if not btype:
  318:                     btype = "*unknown*"
  319: 
  320:                 bitems['type'] = btype
  321:                 continue
  322: 
  323:             # skip other tags
  324:             if not btype: continue
  325: 
  326:             # collect bib/something
  327:             r = re.search('/meta/bib/(.*)', m_tags)
  328:             if r:
  329:                 k = r.group(1)
  330:                 #print "CONTENT: ", m_content
  331:                 bitems[k] = m_content
  332:                 continue
  333: 
  334:         self.content = bitems
  335:         self.rank += 100
  336:         if not self.urlabel and self.url:
  337:             self.urlabel = "view"
  338: 
  339: 
  340: class ArchimResult(MetaResult):
  341:     """archimedes type result object"""
  342: 
  343:     def __init__(self, zope, db_result, rank):
  344:         """constructor"""
  345:         MetaResult.__init__(self, zope, db_result, rank)
  346:         print "NEW ARCHIM RESULT!"
  347:         self.type = "archim"
  348:         self.zptFile = os.path.join(package_home(globals()), "zpt/searchResult_archim.zpt")
  349:         self.url = urlForFile(self.file)
  350:         self.urlabel = None
  351:         (fileid, tagidx, tags, content) = db_result
  352: 
  353:         # process info
  354:         bitems = {}
  355:         for me in self.metainfo:
  356:             (m_idx, m_tags, m_content, m_attributes) = me
  357:             # context tag
  358:             if self.checkContext(m_tags, m_content):
  359:                 continue
  360:             # collect archimedes/something
  361:             r = re.search('/meta/archimedes/(.*)', m_tags)
  362:             if r:
  363:                 k = r.group(1)
  364:                 #print "CONTENT: ", m_content
  365:                 bitems[k] = m_content
  366:                 continue
  367: 
  368:         self.content = bitems
  369:         self.rank += 100
  370:         if not self.urlabel and self.url:
  371:             self.urlabel = "view"
  372: 	
  373: 
  374: 
  375: 
  376: def ranksort(res1, res2):
  377:     """sort results on rank"""
  378:     return cmp(res2.rank, res1.rank)
  379: 
  380: 
  381: def urlForFile(filename):
  382:     """heuristic... returns an URL for a index file name"""
  383:     url = None
  384:     if filename.startswith('/mpiwg/online/'):
  385:         print "URLFORFILE: online ", filename
  386:         r = re.search('/mpiwg/online/(.*)/index.meta', filename)
  387:         if r:
  388:             url = "http://nausikaa.mpiwg-berlin.mpg.de/digitallibrary/digilib.jsp?fn=%s"%r.group(1)
  389: 
  390:     return url

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>