File:  [Repository] / OSAS / OSA_system / OSAS_search.py
Revision 1.5: download - view: text, annotated - select for diffs - revision graph
Fri Jul 9 17:56:14 2004 UTC (19 years, 11 months ago) by casties
Branches: MAIN
CVS tags: HEAD
small paranoia fix

    1: """Metadata search interface
    2: ROC 2004, itgroup
    3: 
    4: """
    5: 
    6: from AccessControl import ClassSecurityInfo
    7: from Globals import InitializeClass
    8: from Globals import Persistent, package_home
    9: from Products.PageTemplates.PageTemplateFile import PageTemplateFile
   10: from Products.PageTemplates.PageTemplate import PageTemplate
   11: from OFS.SimpleItem import SimpleItem
   12: #from pyPgSQL import PgSQL
   13: import psycopg as PgSQL
   14: 
   15: import re
   16: import os
   17: 
   18: MAXHITS = 1000
   19: 
class OSAS_search(SimpleItem):
    """Object for global metadata search

    Searches the 'meta' table of the storage database (PostgreSQL via
    psycopg) and renders paged, filterable results through Zope page
    templates.  Per-user state (query, result list, paging indexes and
    row caches) is kept in REQUEST.SESSION.
    """

    meta_type="OSAS_search"



    def __init__(self,id,title,dsn=None):
        """init

        id:    Zope object id
        title: title shown in the ZMI
        dsn:   PostgreSQL connection string; defaults to the read-only
               archive account on foxridge when not given
        """
        self.id=id
        self.title=title
        if dsn:
            self.dsn = dsn
        else:
            self.dsn = "host=foxridge.mpiwg-berlin.mpg.de dbname=storage user=archiveread password=archiveread"
        # volatile database connection object
        # (_v_ attributes are not persisted by the ZODB and vanish on restart)
        self._v_dbCon = None
        self._v_tryCon = 0


    def dbCursor(self):
        """returns new SQL cursor object

        Reuses the cached volatile connection when possible; if getting
        a cursor fails, the connection is dropped and re-opened via a
        recursive call.  _v_tryCon counts consecutive failures; after 3
        no reconnect is attempted and the assert below fires.
        """
        curs = None
        if hasattr(self, '_v_dbCon') and self._v_dbCon is not None:
            try:
                curs = self._v_dbCon.cursor()
                self._v_tryCon = 0
            except:
                # in case of problems reset dbCon
                # NOTE(review): bare except also hides unrelated errors
                self._v_dbCon = None
                self._v_tryCon += 1
        else:
            self._v_dbCon = None
            self._v_tryCon = 0

        if not curs and self._v_tryCon < 3:
            self._v_dbCon = PgSQL.connect(self.dsn, serialize=0)
            # call ourself with the new connection
            curs = self.dbCursor()

        assert curs, "AIIEE no db cursor!!"
        return curs

    def getDBFileMeta(self, fileid):
        """returns an array with all meta entries of fileid

        Rows are (idx, tags, content, attributes) tuples ordered by idx.
        Results are memoized per session under the 'dbMeta' key.
        """

        metacache = {}
        # try in cache
        if self.REQUEST.SESSION.has_key('dbMeta'):
            metacache = self.REQUEST.SESSION['dbMeta']
            if metacache.has_key(fileid):
                res = metacache[fileid]
                #print "meta from cache "
                return res

        curs = self.dbCursor()

        # parameterized query -- fileid is passed via the DB-API, not
        # interpolated into the SQL string
        sql = 'SELECT idx,tags,content,attributes FROM meta WHERE fileid=%(id)s ORDER BY idx'
        print sql, " -> ", fileid
        curs.execute(sql, {'id':fileid})
        print "done"

        res = curs.fetchall()
        #print "res:", res
        curs.close()
        # store info in cache
        metacache[fileid] = res
        self.REQUEST.SESSION['dbMeta'] = metacache

        return res

    def getDBFile(self, fileid):
        """returns the file information of fileid

        Returns a single (filename, mtime) row, cached per session
        under the 'dbFiles' key.
        """

        filecache = {}
        # try in cache
        if self.REQUEST.SESSION.has_key('dbFiles'):
            filecache = self.REQUEST.SESSION['dbFiles']
            if filecache.has_key(fileid):
                res = filecache[fileid]
                #print "file from cache "
                return res

        curs = self.dbCursor()

        sql = 'select filename,mtime from files where id=%(id)s'
        print 'DBFILE: ', sql, " -> ", fileid
        curs.execute(sql, {'id':fileid})
        print "DBFILE: done"

        res = curs.fetchone()
        #print "DBFILE: res:", res
        curs.close()
        # store info in cache
        filecache[fileid] = res
        self.REQUEST.SESSION['dbFiles'] = filecache

        return res


    def dbSearch(self, query, type):
        """search DB for query and return result set

        query: search string (case-insensitive match on meta.content)
        type:  'equals', 'startswith' or 'contains'
               NOTE(review): any other value leaves qs unbound and
               raises NameError below -- confirm callers only pass
               these three values
        Returns (results, restypelist): at most MAXHITS result objects
        plus the list of distinct result types seen.
        """
        results = []
        restypes = {}
        if not query:
            # empty query
            return results

        curs = self.dbCursor()
        # build the LIKE pattern; query itself is passed as a DB-API
        # parameter so SQL wildcards are the only special characters
        if type == 'equals':
            qs = query
        elif type == 'startswith':
            qs = query + "%"
        elif type == 'contains':
            qs = "%" + query + "%"

        sql = 'select fileid,idx,tags,content from meta where lower(content) like lower(%(qs)s)'
        print sql, " -> ", qs
        curs.execute(sql, {'qs':qs})
        print "done"
        res = curs.fetchone()
        rescnt = 1
        #print "res0:", res
        # fetch row by row, stop at MAXHITS
        while res and rescnt < MAXHITS:
            #print "res:", res
            result = self.getResult(res)
            if result:
                results.append(result)
                # record which result types occurred (dict used as set)
                restypes[result.type] = result.type

            res = curs.fetchone()
            rescnt += 1

        curs.close()
        #self.dbCon = None

        #print "SEARCH: ", rescnt, " results"
        restypelist = restypes.keys()
        return (results, restypelist)


    def getResult(self, db_result, rank=0):
        """factory for result objects

        Chooses the result class from the tag path of the hit:
        /meta/bib/ -> BibResult, /meta/archimedes/ -> ArchimResult,
        anything else -> AnyResult.
        """

        (fileid, tagidx, tags, content) = db_result
        res = None

        if tags.find('/meta/bib/') > -1:
            res = BibResult(self, db_result, rank)
        elif tags.find('/meta/archimedes/') > -1:
            res = ArchimResult(self, db_result, rank)
        else:
            res = AnyResult(self, db_result, rank)

        return res


    def renderResult(self, result):
        """returns HTML rendering of a search result"""

        return result.render(self)


    def filterResults(self, results, start, end, restypefilter=None):
        """returns list of results that match a filter

        start/end are 0-based slice bounds (end exclusive).
        restypefilter: optional collection of allowed result types.
        Returns (resgroup, rescnt) -- the page slice and the total
        count after type filtering.
        """
        # filter types first
        if restypefilter:
            res = []
            for r in results:
                if r.type in restypefilter:
                    res.append(r)
        else:
            res = results
        # new total count (because of filter)
        rescnt = len(res)
        # filter on count
        resgroup = res[start:end]

        return (resgroup, rescnt)


    #
    # Web page stuff
    #

    def index_html(self):
        """metadata search"""
        pt=PageTemplateFile(os.path.join(package_home(globals()), "zpt/OSAS_search.zpt")).__of__(self)
        return pt()


    def search(self, searchstring=None, searchtype='startswith', start=1, count=10, restypefilter=None):
        """search and create result

        start is 1-based; count is the page size.  A new DB search is
        only run when the query or type differs from the one stored in
        the session; otherwise the cached result list is re-paged.
        """
        # convert to 0-based slice bounds
        sres = int(start) -1
        lres = sres + count
        try:
            oldsearch = self.REQUEST.SESSION['searchstring']
            oldtype = self.REQUEST.SESSION['searchtype']
        except:
            oldsearch = ""
            oldtype = ""

        # no explicit query -> re-use the last one (paging)
        if not searchstring:
            searchstring = oldsearch
            searchtype = oldtype

        if not oldsearch or searchstring != oldsearch or searchtype != oldtype:
            # new search
            (res, restypes) = self.dbSearch(searchstring, searchtype)
            # sort the result
            res.sort(ranksort)
            # store it
            self.REQUEST.SESSION['results'] = res
            self.REQUEST.SESSION['searchstring'] = searchstring
            self.REQUEST.SESSION['searchtype'] = searchtype
            self.REQUEST.SESSION['resulttypes'] = restypes

        (resgroup, nres) = self.filterResults(self.REQUEST.SESSION['results'], sres, lres, restypefilter)
        # clamp the page bounds to the filtered total
        lres = min(lres, nres)
        sres = min(sres, nres)
        self.REQUEST.SESSION['resultgroup'] = resgroup
        # (first 1-based, last, total, page size) for the templates
        self.REQUEST.SESSION['res_indexes'] = (sres+1, lres, nres, int(count))
        self.REQUEST.SESSION['res_type_filter'] = restypefilter
        if nres > 0:
            zpt = "zpt/searchResult.zpt"
        else:
            zpt = "zpt/searchResult_none.zpt"

        pt=PageTemplateFile(os.path.join(package_home(globals()), zpt)).__of__(self)
        return pt()


    def getSearchType(self):
        """returns the last search type"""
        try:
            ret = self.REQUEST.SESSION['searchtype']
        except:
            ret = ""

        return ret

    def getSearchString(self):
        """returns the last search string"""
        try:
            ret = self.REQUEST.SESSION['searchstring']
        except:
            ret = ""

        return ret


    def hasNextResults(self):
        """returns if there are more results"""
        try:
            (first, last, total, count) = self.REQUEST.SESSION['res_indexes']
            return (first + count < total)
        except:
            # no stored indexes yet -> no paging
            return False

    def hasPrevResults(self):
        """returns if there are previous results"""
        try:
            (first, last, total, count) = self.REQUEST.SESSION['res_indexes']
            return (first > 1)
        except:
            return False


    def nextResults(self):
        """returns more results

        Advances the stored page window by one page (clamped to the
        total) and re-renders via search().
        """
        try:
            (first, last, total, count) = self.REQUEST.SESSION['res_indexes']
            first = first + count
            last = last + count
            if first > total:
                first = total
            if last > total:
                last = total
        except:
            print "OUCH: no next results!"
            return self.search()

        return self.search(start=first, count=count)


    def prevResults(self):
        """returns more results

        Moves the stored page window back by one page (clamped to 1)
        and re-renders via search().
        """
        try:
            (first, last, total, count) = self.REQUEST.SESSION['res_indexes']
            first = first - count
            last = last - count
            if first < 1:
                first = 1
            if last < 1:
                last = 1
        except:
            print "OUCH: no prev results!"
            return self.search()

        return self.search(start=first, count=count)
  320:         
  321: 
  322: 
  323: def manage_AddOSAS_searchForm(self):
  324:     """create Search form"""
  325:     pt=PageTemplateFile(os.path.join(package_home(globals()), "zpt/AddOSAS_search.zpt")).__of__(self)
  326:     return pt()
  327: 
  328: def manage_AddOSAS_search(self,id,title=None,dsn=None,RESPONSE=None):
  329:     """add the OSAS_root"""
  330:     newObj=OSAS_search(id,title,dsn)
  331:     self._setObject(id,newObj)
  332:     if RESPONSE is not None:
  333:         RESPONSE.redirect('manage_main')
  334: 
  335: 
  336: 
  337: 
  338: class SearchResult(SimpleItem):
  339:     """base search result object"""
  340: 
  341:     def __init__(self, type='unknown', file=None, url=None, content=None, rank=0):
  342:         """init"""
  343:         # result type (e.g. "bib", "archim")
  344:         self.type = type
  345:         # index file name
  346:         self.file = file
  347:         # url for result (list of pairs)
  348:         if url:
  349:             self.urls = url
  350:         else:
  351:             self.urls = []
  352:         # actual content (list of tuples)
  353:         self.content = content
  354:         # document status (e.g. "online", "archive")
  355:         self.status = None
  356:         # result rank for presentation
  357:         self.rank = rank
  358: 
  359: class AnyResult(SearchResult):
  360:     """catch-all type result object"""
  361: 
  362:     def __init__(self, zope, db_result, rank):
  363:         """returns a catch-all type result"""
  364:         SearchResult.__init__(self)
  365:         #print "NEW ANY RESULT!"
  366:         self.type='unknown'
  367:         self.zptFile = os.path.join(package_home(globals()), "zpt/searchResult_any.zpt")
  368:         
  369:         (db_fileid, db_tagidx, db_tags, db_content) = db_result
  370:         self.hitTag = db_tags
  371: 
  372:         # get full info from db
  373:         self.fileinfo = zope.getDBFile(db_fileid)
  374:         assert self.fileinfo
  375: 
  376:         items = {}
  377:         items[db_tags] = db_content
  378:         self.content = items
  379:         self.file = self.fileinfo[0]
  380:         self.status = statusForFile(self.file)
  381:         self.rank = rank
  382: 
  383:     def getContentList(self):
  384:         """returns content as list of tuples in preferred order"""
  385:         l = []
  386:         for k in self.content.keys():
  387:             l.append((k, self.content[k]))
  388: 
  389:         return l
  390: 
  391:     def render(self, zope):
  392:         """render this result object"""
  393:         zope.REQUEST.SESSION['result'] = self
  394:         pt=PageTemplateFile(self.zptFile).__of__(zope)
  395:         return pt()
  396: 
  397: 
  398: class MetaResult(AnyResult):
  399:     """result object that collects metadata"""
  400: 
  401:     def __init__(self, zope, db_result, rank):
  402:         """contructor"""
  403:         AnyResult.__init__(self, zope, db_result, rank)
  404:         #print "NEW META RESULT!"
  405: 
  406:         (fileid, tagidx, tags, content) = db_result
  407: 
  408:         # get full info from db
  409:         self.metainfo = zope.getDBFileMeta(fileid)
  410:         assert self.metainfo
  411:         
  412:     def checkContext(self, tags, content, ctxurl):
  413:         """takes meta entry and updates url from context tags"""
  414:         if tags.endswith('/context/link'):
  415:             if content:
  416:                 #print "CTXlink: ", content
  417:                 ctxurl[0] = content
  418:             
  419:         elif tags.endswith('/context/name'):
  420:             if content:
  421:                 #print "CTXname: ", content
  422:                 ctxurl[1] = content
  423: 
  424:         return ctxurl
  425: 
  426: 
  427: class BibResult(MetaResult):
  428:     """bib type result object"""
  429: 
  430:     def __init__(self, zope, db_result, rank):
  431:         """constructor"""
  432:         MetaResult.__init__(self, zope, db_result, rank)
  433:         #print "NEW BIB RESULT!", self
  434:         self.type = "bib"
  435:         self.zptFile = os.path.join(package_home(globals()), "zpt/searchResult_bib.zpt")
  436:         url = storageURL(self.file)
  437:         if url:
  438:             self.urls.append(url)
  439:         (fileid, tagidx, tags, content) = db_result
  440: 
  441:         btype = ""
  442:         bitems = {}
  443:         ctxurl = ['', '']
  444: 
  445:         for me in self.metainfo:
  446:             (m_idx, m_tags, m_content, m_attributes) = me
  447:             # context tag
  448:             ctxurl = self.checkContext(m_tags, m_content, ctxurl)
  449:             # first tag with bib type attribute
  450:             if m_tags.endswith('/meta/bib'):
  451:                 r = re.search('type="([^"]*)"', m_attributes)
  452:                 if r:
  453:                     btype = r.group(1)
  454: 
  455:                 if not btype:
  456:                     btype = "*unknown*"
  457: 
  458:                 bitems['type'] = btype
  459:                 continue
  460: 
  461:             # skip other tags
  462:             if not btype: continue
  463: 
  464:             # collect bib/something
  465:             r = re.search('/meta/bib/(.*)', m_tags)
  466:             if r:
  467:                 k = r.group(1)
  468:                 #print "CONTENT: ", m_content
  469:                 bitems[k] = m_content
  470:                 # remember hit tag
  471:                 if m_tags == self.hitTag:
  472:                     self.hitTag = k
  473:                 continue
  474: 
  475:         self.content = bitems
  476:         # store context
  477:         if not ctxurl[1]:
  478:             ctxurl[1] = "View"
  479:         # must have link
  480:         if ctxurl[0]:
  481:             self.urls.append(ctxurl)
  482:                 
  483:         self.rank += 100
  484: 
  485:     def getContentList(self):
  486:         """returns content as list of tuples in preferred order"""
  487:         l = []
  488:         c = self.content.copy()
  489:         # preferred items first
  490:         for k in ('author', 'title', 'journal', 'year'):
  491:             if c.has_key(k):
  492:                 l.append((k, c[k]))
  493:                 del c[k]
  494: 
  495:         # no type
  496:         del c['type']
  497:         # copy the rest
  498:         for k in c.keys():
  499:             l.append((k, c[k]))
  500: 
  501:         return l
  502: 
  503: 
  504: class ArchimResult(MetaResult):
  505:     """archimedes type result object"""
  506: 
  507:     def __init__(self, zope, db_result, rank):
  508:         """constructor"""
  509:         MetaResult.__init__(self, zope, db_result, rank)
  510:         #print "NEW ARCHIM RESULT!", self
  511:         self.type = "archim"
  512:         self.zptFile = os.path.join(package_home(globals()), "zpt/searchResult_archim.zpt")
  513:         url = storageURL(self.file)
  514:         if url:
  515:             self.urls.append(url)
  516:             
  517:         (fileid, tagidx, tags, content) = db_result
  518: 
  519:         # process info
  520:         bitems = {}
  521:         ctxurl = ['', '']
  522:         for me in self.metainfo:
  523:             (m_idx, m_tags, m_content, m_attributes) = me
  524:             # context tag
  525:             ctxurl = self.checkContext(m_tags, m_content, ctxurl)
  526:             # collect archimedes/something
  527:             r = re.search('/meta/archimedes/(.*)', m_tags)
  528:             if r:
  529:                 k = r.group(1)
  530:                 #print "CONTENT: ", m_content
  531:                 bitems[k] = m_content
  532:                 # remember hit tag
  533:                 if m_tags == self.hitTag:
  534:                     self.hitTag = k
  535:                 continue
  536: 
  537:         self.content = bitems
  538:         self.rank += 100
  539:         # store context
  540:         if not ctxurl[1]:
  541:             ctxurl[1] = "View"
  542:         # must have link
  543:         if ctxurl[0]:
  544:             self.urls.append(ctxurl)
  545: 
  546: 
  547:     def getContentList(self):
  548:         """returns content as list of tuples in preferred order"""
  549:         l = []
  550:         c = self.content.copy()
  551:         # preferred items first
  552:         for k in ('author', 'title', 'date', 'place'):
  553:             if c.has_key(k):
  554:                 l.append((k, c[k]))
  555:                 del c[k]
  556: 
  557:         # copy the rest
  558:         for k in c.keys():
  559:             l.append((k, c[k]))
  560: 
  561:         return l
  562: 	
  563: 
  564: 
  565: 
  566: def ranksort(res1, res2):
  567:     """sort results on rank"""
  568:     return cmp(res2.rank, res1.rank)
  569: 
  570: 
  571: def statusForFile(filename):
  572:     """heuristic... returns status for a index file name"""
  573:     status = None
  574:     if filename.startswith('/mpiwg/online/'):
  575:         status = "online"
  576:     elif filename.startswith('/mpiwg/archive/'):
  577:         status = "archive"
  578:     elif filename.startswith('http://'):
  579:         status = "database"
  580:         
  581:     return status
  582: 
  583: def storageURL(filename):
  584:     """heuristic... returns an URL for a index file name"""
  585:     url = None
  586:     name = None
  587:     if filename.startswith('/mpiwg/online/'):
  588:         #print "URLFORFILE: online ", filename
  589:         r = re.search('^(.*)/index.meta', filename)
  590:         if r:
  591:             url = "http://content.mpiwg-berlin.mpg.de/mpistorage/storage/ShowOnline/index_html?path=%s"%r.group(1)
  592:             name = "Storage System"
  593:             
  594:     elif filename.startswith('http://'):
  595:         #print "URLFORFILE: url ", filename
  596:         url = filename
  597:         name = "Online Database"
  598: 
  599:     if name and url:
  600:         return (url, name)
  601:     
  602:     return None
  603: 

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>