File:  [Repository] / OSAS / OSA_system / OSAS_search.py
Revision 1.3: download - view: text, annotated - select for diffs - revision graph
Fri Jul 9 16:55:19 2004 UTC (19 years, 11 months ago) by casties
Branches: MAIN
CVS tags: HEAD
more refinements

    1: """Metadata search interface
    2: ROC 2004, itgroup
    3: 
    4: """
    5: 
    6: from AccessControl import ClassSecurityInfo
    7: from Globals import InitializeClass
    8: from Globals import Persistent, package_home
    9: from Products.PageTemplates.PageTemplateFile import PageTemplateFile
   10: from Products.PageTemplates.PageTemplate import PageTemplate
   11: from OFS.SimpleItem import SimpleItem
   12: #from pyPgSQL import PgSQL
   13: import psycopg as PgSQL
   14: 
   15: import re
   16: import os
   17: 
# upper bound on the number of rows fetched for one search (see dbSearch)
MAXHITS = 1000
   19: 
class OSAS_search(SimpleItem):
    """Object for global metadata search.

    Searches the ``meta`` table of the storage database (via psycopg)
    and presents the hits as SearchResult objects with paging and
    result-type filtering.  Per-visitor state (query, results, paging
    window, DB row caches) is kept in the Zope session.
    """

    # Zope meta_type under which this product appears
    meta_type="OSAS_search"

    

    def __init__(self,id,title,dsn=None):
        """init

        id    -- Zope object id
        title -- display title
        dsn   -- PostgreSQL connection string; defaults to the
                 read-only storage database on foxridge
        """
        self.id=id
        self.title=title
        if dsn:
            self.dsn = dsn
        else:
            self.dsn = "host=foxridge.mpiwg-berlin.mpg.de dbname=storage user=archiveread password=archiveread"
        # volatile database connection object
        # (attributes with the _v_ prefix are not persisted by Zope)
        self._v_dbCon = None
        self._v_tryCon = 0


    def dbCursor(self):
        """returns new SQL cursor object

        Reuses the volatile connection when possible; on any cursor
        error the connection is dropped and reopened (at most 3
        consecutive attempts, counted in _v_tryCon) by calling this
        method recursively with the fresh connection.
        """
        curs = None
        if hasattr(self, '_v_dbCon') and self._v_dbCon is not None:
            try:
                curs = self._v_dbCon.cursor()
                self._v_tryCon = 0
            except:
                # in case of problems reset dbCon
                self._v_dbCon = None
                self._v_tryCon += 1
        else:
            self._v_dbCon = None
            self._v_tryCon = 0
                
        if not curs and self._v_tryCon < 3:
            self._v_dbCon = PgSQL.connect(self.dsn, serialize=0)
            # call ourself with the new connection
            curs = self.dbCursor()

        # NOTE(review): assert is stripped under "python -O"; raising an
        # explicit exception would be safer here
        assert curs, "AIIEE no db cursor!!"
        return curs

    def getDBFileMeta(self, fileid):
        """returns an array with all meta entries of fileid

        Rows are (idx, tags, content, attributes) tuples ordered by idx.
        Results are cached per session under the 'dbMeta' key.
        """

        metacache = {}
        # try in cache
        if self.REQUEST.SESSION.has_key('dbMeta'):
            metacache = self.REQUEST.SESSION['dbMeta']
            if metacache.has_key(fileid):
                res = metacache[fileid]
                #print "meta from cache "
                return res

        curs = self.dbCursor()

        sql = 'SELECT idx,tags,content,attributes FROM meta WHERE fileid=%(id)s ORDER BY idx'
        print sql, " -> ", fileid
        curs.execute(sql, {'id':fileid})
        print "done"

        res = curs.fetchall()
        #print "res:", res
        curs.close()
        # store info in cache
        metacache[fileid] = res
        self.REQUEST.SESSION['dbMeta'] = metacache

        return res

    def getDBFile(self, fileid):
        """returns the file information of fileid

        Returns one (filename, mtime) row from the files table.
        Results are cached per session under the 'dbFiles' key.
        """

        filecache = {}
        # try in cache
        if self.REQUEST.SESSION.has_key('dbFiles'):
            filecache = self.REQUEST.SESSION['dbFiles']
            if filecache.has_key(fileid):
                res = filecache[fileid]
                #print "file from cache "
                return res

        curs = self.dbCursor()

        sql = 'select filename,mtime from files where id=%(id)s'
        print 'DBFILE: ', sql, " -> ", fileid
        curs.execute(sql, {'id':fileid})
        print "DBFILE: done"

        res = curs.fetchone()
        #print "DBFILE: res:", res
        curs.close()
        # store info in cache
        filecache[fileid] = res
        self.REQUEST.SESSION['dbFiles'] = filecache

        return res


    def dbSearch(self, query, type):
        """search DB for query and return result set

        query -- search string (matched case-insensitively)
        type  -- match type: 'equals', 'startswith' or 'contains'

        Returns a tuple (results, restypelist): at most MAXHITS
        SearchResult objects plus the list of distinct result types.
        NOTE(review): for an empty query a bare list is returned, which
        would break the tuple unpacking in search() -- confirm callers.
        """
        results = []
        restypes = {}
        if not query:
            # empty query
            return results
        
        curs = self.dbCursor()
        # build the SQL LIKE pattern from the match type
        # NOTE(review): qs stays unbound if type is none of the three
        # values below, which would raise a NameError at execute time
        if type == 'equals':
            qs = query
        elif type == 'startswith':
            qs = query + "%"
        elif type == 'contains':
            qs = "%" + query + "%"
            
        sql = 'select fileid,idx,tags,content from meta where lower(content) like lower(%(qs)s)'
        print sql, " -> ", qs
        curs.execute(sql, {'qs':qs})
        print "done"
        res = curs.fetchone()
        rescnt = 1
        #print "res0:", res
        while res and rescnt < MAXHITS:
            #print "res:", res
            result = self.getResult(res)
            if result:
                results.append(result)
                # collect the distinct result types (dict used as a set)
                restypes[result.type] = result.type
                
            res = curs.fetchone()
            rescnt += 1

        curs.close()
        #self.dbCon = None

        #print "SEARCH: ", rescnt, " results"
        restypelist = restypes.keys()
        return (results, restypelist)

        
    def getResult(self, db_result, rank=0):
        """factory for result objects

        Chooses the result class from the tag path of the hit row
        (fileid, idx, tags, content).
        """

        (fileid, tagidx, tags, content) = db_result
        res = None

        if tags.find('/meta/bib/') > -1:
            res = BibResult(self, db_result, rank)
        elif tags.find('/meta/archimedes/') > -1:
            res = ArchimResult(self, db_result, rank)
        else:
            res = AnyResult(self, db_result, rank)

        return res


    def renderResult(self, result):
        """returns HTML rendering of a search result"""

        return result.render(self)


    def filterResults(self, results, start, end, restypefilter=None):
        """returns list of results that match a filter

        results       -- full result list
        start, end    -- slice bounds of the filtered list to return
        restypefilter -- optional list of result types to keep

        Returns (resgroup, rescnt): the requested slice and the total
        count of the filtered list.
        """
        # filter types first
        if restypefilter:
            res = []
            for r in results:
                if r.type in restypefilter:
                    res.append(r)
        else:
            res = results
        # filter on count
        resgroup = res[start:end]
        # new total count (because of filter)
        rescnt = len(res)

        return (resgroup, rescnt)
    

    #
    # Web page stuff
    #

    def index_html(self):
        """metadata search"""
        pt=PageTemplateFile(os.path.join(package_home(globals()), "zpt/OSAS_search.zpt")).__of__(self)
        return pt()


    def search(self, searchstring=None, searchtype='startswith', start=1, count=10, restypefilter=None):
        """search and create result

        searchstring  -- query; when empty the last session query is reused
        searchtype    -- 'equals', 'startswith' or 'contains'
        start         -- 1-based index of the first result to display
        count         -- page size
        restypefilter -- optional list of result types to display

        Runs a new DB search only when query or type changed, stores
        everything in the session and renders the result template.
        """
        sres = int(start) -1
        # NOTE(review): count may arrive as a string from the request;
        # it is only converted with int() further down -- confirm
        lres = sres + count
        try:
            oldsearch = self.REQUEST.SESSION['searchstring']
            oldtype = self.REQUEST.SESSION['searchtype']
        except:
            oldsearch = ""
            oldtype = ""
            
        if not searchstring:
            # no new search string -- repeat the previous search
            searchstring = oldsearch
            searchtype = oldtype
            
        if not oldsearch or searchstring != oldsearch or searchtype != oldtype:
            # new search
            (res, restypes) = self.dbSearch(searchstring, searchtype)
            # sort the result
            res.sort(ranksort)
            # store it
            self.REQUEST.SESSION['results'] = res
            self.REQUEST.SESSION['searchstring'] = searchstring
            self.REQUEST.SESSION['searchtype'] = searchtype
            self.REQUEST.SESSION['resulttypes'] = restypes

        (resgroup, nres) = self.filterResults(self.REQUEST.SESSION['results'], sres, lres, restypefilter)
        lres = min(lres, nres)
        self.REQUEST.SESSION['resultgroup'] = resgroup
        self.REQUEST.SESSION['res_indexes'] = (sres+1, lres, nres, int(count))
        self.REQUEST.SESSION['res_type_filter'] = restypefilter
        if nres > 0:
            zpt = "zpt/searchResult.zpt"
        else:
            zpt = "zpt/searchResult_none.zpt"
            
        pt=PageTemplateFile(os.path.join(package_home(globals()), zpt)).__of__(self)
        return pt()


    def getSearchType(self):
        """returns the last search type (empty string outside a session)"""
        try:
            ret = self.REQUEST.SESSION['searchtype']
        except:
            ret = ""

        return ret
    
    def getSearchString(self):
        """returns the last search string (empty string outside a session)"""
        try:
            ret = self.REQUEST.SESSION['searchstring']
        except:
            ret = ""

        return ret
    

    def hasNextResults(self):
        """returns if there are more results after the current page"""
        try:
            (first, last, total, count) = self.REQUEST.SESSION['res_indexes']
            return (first + count < total)
        except:
            return False

    def hasPrevResults(self):
        """returns if there are previous results before the current page"""
        try:
            (first, last, total, count) = self.REQUEST.SESSION['res_indexes']
            return (first > 1)
        except:
            return False


    def nextResults(self):
        """returns more results

        Advances the paging window by one page (clipped to the total)
        and re-renders via search().
        """
        try:
            (first, last, total, count) = self.REQUEST.SESSION['res_indexes']
            first = first + count
            last = last + count
            if first > total:
                first = total
            if last > total:
                last = total
        except:
            print "OUCH: no next results!"
            return self.search()

        return self.search(start=first, count=count)

        
    def prevResults(self):
        """returns the previous page of results

        Moves the paging window back by one page (clipped at 1) and
        re-renders via search().
        """
        try:
            (first, last, total, count) = self.REQUEST.SESSION['res_indexes']
            first = first - count
            last = last - count
            if first < 1:
                first = 1
            if last < 1:
                last = 1
        except:
            print "OUCH: no prev results!"
            return self.search()           

        return self.search(start=first, count=count)
  320: 
  321: 
  322: def manage_AddOSAS_searchForm(self):
  323:     """create Search form"""
  324:     pt=PageTemplateFile(os.path.join(package_home(globals()), "zpt/AddOSAS_search.zpt")).__of__(self)
  325:     return pt()
  326: 
  327: def manage_AddOSAS_search(self,id,title=None,dsn=None,RESPONSE=None):
  328:     """add the OSAS_root"""
  329:     newObj=OSAS_search(id,title,dsn)
  330:     self._setObject(id,newObj)
  331:     if RESPONSE is not None:
  332:         RESPONSE.redirect('manage_main')
  333: 
  334: 
  335: 
  336: 
  337: class SearchResult(SimpleItem):
  338:     """base search result object"""
  339: 
  340:     def __init__(self, type='unknown', file=None, url=None, content=None, rank=0):
  341:         """init"""
  342:         # result type (e.g. "bib", "archim")
  343:         self.type = type
  344:         # index file name
  345:         self.file = file
  346:         # url for result (list of pairs)
  347:         if url:
  348:             self.urls = url
  349:         else:
  350:             self.urls = []
  351:         # actual content (list of tuples)
  352:         self.content = content
  353:         # document status (e.g. "online", "archive")
  354:         self.status = None
  355:         # result rank for presentation
  356:         self.rank = rank
  357: 
  358: class AnyResult(SearchResult):
  359:     """catch-all type result object"""
  360: 
  361:     def __init__(self, zope, db_result, rank):
  362:         """returns a catch-all type result"""
  363:         SearchResult.__init__(self)
  364:         #print "NEW ANY RESULT!"
  365:         self.type='unknown'
  366:         self.zptFile = os.path.join(package_home(globals()), "zpt/searchResult_any.zpt")
  367:         
  368:         (db_fileid, db_tagidx, db_tags, db_content) = db_result
  369:         self.hitTag = db_tags
  370: 
  371:         # get full info from db
  372:         self.fileinfo = zope.getDBFile(db_fileid)
  373:         assert self.fileinfo
  374: 
  375:         items = {}
  376:         items[db_tags] = db_content
  377:         self.content = items
  378:         self.file = self.fileinfo[0]
  379:         self.status = statusForFile(self.file)
  380:         self.rank = rank
  381: 
  382:     def getContentList(self):
  383:         """returns content as list of tuples in preferred order"""
  384:         l = []
  385:         for k in self.content.keys():
  386:             l.append((k, self.content[k]))
  387: 
  388:         return l
  389: 
  390:     def render(self, zope):
  391:         """render this result object"""
  392:         zope.REQUEST.SESSION['result'] = self
  393:         pt=PageTemplateFile(self.zptFile).__of__(zope)
  394:         return pt()
  395: 
  396: 
  397: class MetaResult(AnyResult):
  398:     """result object that collects metadata"""
  399: 
  400:     def __init__(self, zope, db_result, rank):
  401:         """contructor"""
  402:         AnyResult.__init__(self, zope, db_result, rank)
  403:         #print "NEW META RESULT!"
  404: 
  405:         (fileid, tagidx, tags, content) = db_result
  406: 
  407:         # get full info from db
  408:         self.metainfo = zope.getDBFileMeta(fileid)
  409:         assert self.metainfo
  410:         
  411:     def checkContext(self, tags, content, ctxurl):
  412:         """takes meta entry and updates url from context tags"""
  413:         if tags.endswith('/context/link'):
  414:             if content:
  415:                 #print "CTXlink: ", content
  416:                 ctxurl[0] = content
  417:             
  418:         elif tags.endswith('/context/name'):
  419:             if content:
  420:                 #print "CTXname: ", content
  421:                 ctxurl[1] = content
  422: 
  423:         return ctxurl
  424: 
  425: 
  426: class BibResult(MetaResult):
  427:     """bib type result object"""
  428: 
  429:     def __init__(self, zope, db_result, rank):
  430:         """constructor"""
  431:         MetaResult.__init__(self, zope, db_result, rank)
  432:         #print "NEW BIB RESULT!", self
  433:         self.type = "bib"
  434:         self.zptFile = os.path.join(package_home(globals()), "zpt/searchResult_bib.zpt")
  435:         url = storageURL(self.file)
  436:         if url:
  437:             self.urls.append(url)
  438:         (fileid, tagidx, tags, content) = db_result
  439: 
  440:         btype = ""
  441:         bitems = {}
  442:         ctxurl = ['', '']
  443: 
  444:         for me in self.metainfo:
  445:             (m_idx, m_tags, m_content, m_attributes) = me
  446:             # context tag
  447:             ctxurl = self.checkContext(m_tags, m_content, ctxurl)
  448:             # first tag with bib type attribute
  449:             if m_tags.endswith('/meta/bib'):
  450:                 r = re.search('type="([^"]*)"', m_attributes)
  451:                 if r:
  452:                     btype = r.group(1)
  453: 
  454:                 if not btype:
  455:                     btype = "*unknown*"
  456: 
  457:                 bitems['type'] = btype
  458:                 continue
  459: 
  460:             # skip other tags
  461:             if not btype: continue
  462: 
  463:             # collect bib/something
  464:             r = re.search('/meta/bib/(.*)', m_tags)
  465:             if r:
  466:                 k = r.group(1)
  467:                 #print "CONTENT: ", m_content
  468:                 bitems[k] = m_content
  469:                 # remember hit tag
  470:                 if m_tags == self.hitTag:
  471:                     self.hitTag = k
  472:                 continue
  473: 
  474:         self.content = bitems
  475:         # store context
  476:         if not ctxurl[1]:
  477:             ctxurl[1] = "View"
  478:         # must have link
  479:         if ctxurl[0]:
  480:             self.urls.append(ctxurl)
  481:                 
  482:         self.rank += 100
  483: 
  484:     def getContentList(self):
  485:         """returns content as list of tuples in preferred order"""
  486:         l = []
  487:         c = self.content.copy()
  488:         # preferred items first
  489:         for k in ('author', 'title', 'journal', 'year'):
  490:             if c.has_key(k):
  491:                 l.append((k, c[k]))
  492:                 del c[k]
  493: 
  494:         # no type
  495:         del c['type']
  496:         # copy the rest
  497:         for k in c.keys():
  498:             l.append((k, c[k]))
  499: 
  500:         return l
  501: 
  502: 
  503: class ArchimResult(MetaResult):
  504:     """archimedes type result object"""
  505: 
  506:     def __init__(self, zope, db_result, rank):
  507:         """constructor"""
  508:         MetaResult.__init__(self, zope, db_result, rank)
  509:         #print "NEW ARCHIM RESULT!", self
  510:         self.type = "archim"
  511:         self.zptFile = os.path.join(package_home(globals()), "zpt/searchResult_archim.zpt")
  512:         url = storageURL(self.file)
  513:         if url:
  514:             self.urls.append(url)
  515:             
  516:         (fileid, tagidx, tags, content) = db_result
  517: 
  518:         # process info
  519:         bitems = {}
  520:         ctxurl = ['', '']
  521:         for me in self.metainfo:
  522:             (m_idx, m_tags, m_content, m_attributes) = me
  523:             # context tag
  524:             ctxurl = self.checkContext(m_tags, m_content, ctxurl)
  525:             # collect archimedes/something
  526:             r = re.search('/meta/archimedes/(.*)', m_tags)
  527:             if r:
  528:                 k = r.group(1)
  529:                 #print "CONTENT: ", m_content
  530:                 bitems[k] = m_content
  531:                 # remember hit tag
  532:                 if m_tags == self.hitTag:
  533:                     self.hitTag = k
  534:                 continue
  535: 
  536:         self.content = bitems
  537:         self.rank += 100
  538:         # store context
  539:         if not ctxurl[1]:
  540:             ctxurl[1] = "View"
  541:         # must have link
  542:         if ctxurl[0]:
  543:             self.urls.append(ctxurl)
  544: 
  545: 
  546:     def getContentList(self):
  547:         """returns content as list of tuples in preferred order"""
  548:         l = []
  549:         c = self.content.copy()
  550:         # preferred items first
  551:         for k in ('author', 'title', 'date', 'place'):
  552:             if c.has_key(k):
  553:                 l.append((k, c[k]))
  554:                 del c[k]
  555: 
  556:         # copy the rest
  557:         for k in c.keys():
  558:             l.append((k, c[k]))
  559: 
  560:         return l
  561: 	
  562: 
  563: 
  564: 
  565: def ranksort(res1, res2):
  566:     """sort results on rank"""
  567:     return cmp(res2.rank, res1.rank)
  568: 
  569: 
  570: def statusForFile(filename):
  571:     """heuristic... returns status for a index file name"""
  572:     status = None
  573:     if filename.startswith('/mpiwg/online/'):
  574:         status = "online"
  575:     elif filename.startswith('/mpiwg/archive/'):
  576:         status = "archive"
  577:     elif filename.startswith('http://'):
  578:         status = "database"
  579:         
  580:     return status
  581: 
  582: def storageURL(filename):
  583:     """heuristic... returns an URL for a index file name"""
  584:     url = None
  585:     name = None
  586:     if filename.startswith('/mpiwg/online/'):
  587:         #print "URLFORFILE: online ", filename
  588:         r = re.search('^(.*)/index.meta', filename)
  589:         if r:
  590:             url = "http://content.mpiwg-berlin.mpg.de/mpistorage/storage/ShowOnline/index_html?path=%s"%r.group(1)
  591:             name = "Storage System"
  592: 
  593:     if name and url:
  594:         return (url, name)
  595:     
  596:     return None
  597: 

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>