File:  [Repository] / OSAS / OSA_system / OSAS_search.py
Revision 1.8
Fri Jan 19 17:16:25 2007 UTC by casties
Branches: MAIN
CVS tags: HEAD
fixed PgSQL imports to work with psycopg2

    1: """Metadata search interface
    2: ROC 2004, itgroup
    3: 
    4: """
    5: 
    6: from AccessControl import ClassSecurityInfo
    7: from Globals import InitializeClass
    8: from Globals import Persistent, package_home
    9: from Products.PageTemplates.PageTemplateFile import PageTemplateFile
   10: from Products.PageTemplates.PageTemplate import PageTemplate
   11: from Products.PageTemplates.ZopePageTemplate import ZopePageTemplate
   12: from OFS.Folder import Folder
   13: from OFS.SimpleItem import SimpleItem
   14: try:
   15:     import psycopg2 as PgSQL
   16: except ImportError:
   17:     try:
   18:         import psycopg as PgSQL
   19:     except ImportError:
   20:         from pyPgSQL import PgSQL
   21: 
   22: import re
   23: import os
   24: 
   25: MAXHITS = 1000
   26: 
   27: class OSAS_search(Folder):
   28:     """Object for global metadata search"""
   29: 
   30:     meta_type="OSAS_search"
   31: 
   32:     manage_options=Folder.manage_options+(
   33:         {'label':'Main config','action':'manage_ChangeOSAS_searchForm'},
   34:        )
   35:     
   36: 
   37:     def __init__(self,id,title,dsn=None):
   38:         """init"""
   39:         self.id=id
   40:         self.title=title
   41:         if dsn:
   42:             self.dsn = dsn
   43:         else:
   44:             self.dsn = "host=foxridge.mpiwg-berlin.mpg.de dbname=storage user=archiveread password=archiveread"
   45:         # volatile database connection object
   46:         self._v_dbCon = None
   47:         self._v_tryCon = 0
   48: 
   49: 
   50:     def dbCursor(self):
   51:         """returns new SQL cursor object"""
   52:         curs = None
   53:         if hasattr(self, '_v_dbCon') and self._v_dbCon is not None:
   54:             try:
   55:                 curs = self._v_dbCon.cursor()
   56:                 self._v_tryCon = 0
   57:             except:
   58:                 # in case of problems reset dbCon
   59:                 self._v_dbCon = None
   60:                 self._v_tryCon += 1
   61:         else:
   62:             self._v_dbCon = None
   63:             self._v_tryCon = 0
   64:                 
   65:         if not curs and self._v_tryCon < 3:
   66:             self._v_dbCon = PgSQL.connect(self.dsn, serialize=0)
   67:             # call ourself with the new connection
   68:             curs = self.dbCursor()
   69: 
   70:         assert curs, "AIIEE no db cursor!!"
   71:         return curs
   72: 
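           # Usage sketch (illustrative caller, not part of the original code):
           # dbCursor() caches a volatile connection in self._v_dbCon and keeps
           # retrying the connect while self._v_tryCon < 3 before the assert fires.
           # A caller would typically do something like:
           #
           #   curs = self.dbCursor()
           #   try:
           #       curs.execute("SELECT count(*) FROM meta")
           #       print curs.fetchone()[0]
           #   finally:
           #       curs.close()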
   73:     def getDBFileMeta(self, fileid):
   74:         """returns a list of all meta entries for fileid"""
   75: 
   76:         metacache = {}
   77:         # try in cache
   78:         if self.REQUEST.SESSION.has_key('dbMeta'):
   79:             metacache = self.REQUEST.SESSION['dbMeta']
   80:             if metacache.has_key(fileid):
   81:                 res = metacache[fileid]
   82:                 #print "meta from cache "
   83:                 return res
   84: 
   85:         curs = self.dbCursor()
   86: 
   87:         sql = 'SELECT idx,tags,content,attributes FROM meta WHERE fileid=%(id)s ORDER BY idx'
   88:         print sql, " -> ", fileid
   89:         curs.execute(sql, {'id':fileid})
   90:         print "done"
   91: 
   92:         res = curs.fetchall()
   93:         #print "res:", res
   94:         curs.close()
   95:         # store info in cache
   96:         metacache[fileid] = res
   97:         self.REQUEST.SESSION['dbMeta'] = metacache
   98: 
   99:         return res
  100: 
  101:     def getDBFile(self, fileid):
  102:         """returns the file information of fileid"""
  103: 
  104:         filecache = {}
  105:         # try in cache
  106:         if self.REQUEST.SESSION.has_key('dbFiles'):
  107:             filecache = self.REQUEST.SESSION['dbFiles']
  108:             if filecache.has_key(fileid):
  109:                 res = filecache[fileid]
  110:                 #print "file from cache "
  111:                 return res
  112: 
  113:         curs = self.dbCursor()
  114: 
  115:         sql = 'select filename,mtime from files where id=%(id)s'
  116:         print 'DBFILE: ', sql, " -> ", fileid
  117:         curs.execute(sql, {'id':fileid})
  118:         print "DBFILE: done"
  119: 
  120:         res = curs.fetchone()
  121:         #print "DBFILE: res:", res
  122:         curs.close()
  123:         # store info in cache
  124:         filecache[fileid] = res
  125:         self.REQUEST.SESSION['dbFiles'] = filecache
  126: 
  127:         return res
  128: 	
  129: 	
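            # Note on caching (added commentary): getDBFileMeta() and getDBFile()
            # keep their rows in the Zope session (REQUEST.SESSION['dbMeta'] /
            # ['dbFiles']), so repeated lookups for the same fileid within one
            # browser session do not hit the database again.  Row shapes follow
            # the SELECT statements above:
            #   getDBFileMeta(fileid) -> [(idx, tags, content, attributes), ...]
            #   getDBFile(fileid)     -> (filename, mtime)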
  130:     def dbSearch(self, query, type):
   131:         """search the DB for query and return (results, list of result types)"""
  132:         results = []
  133:         restypes = {}
  134:         if not query:
  135:             # empty query
  136:             return results
  137:         
  138:         curs = self.dbCursor()
   139:         # default to an exact match ('equals' or any unknown search type)
   140:         qs = query
   141:         if type == 'startswith':
   142:             qs = query + "%"
   143:         elif type == 'contains':
   144:             qs = "%" + query + "%"
  145:             
  146:         sql = 'select fileid,idx,tags,content from meta where lower(content) like lower(%(qs)s)'
  147:         print sql, " -> ", qs
  148:         curs.execute(sql, {'qs':qs})
  149:         print "done"
  150:         res = curs.fetchone()
  151:         rescnt = 1
  152:         #print "res0:", res
  153:         while res and rescnt < MAXHITS:
  154:             #print "res:", res
  155:             result = self.getResult(res)
  156:             if result:
  157:                 results.append(result)
  158:                 restypes[result.type] = result.type
  159:                 
  160:             res = curs.fetchone()
  161:             rescnt += 1
  162: 
  163:         curs.close()
  164:         #self.dbCon = None
  165: 
  166:         #print "SEARCH: ", rescnt, " results"
  167:         restypelist = restypes.keys()
  168:         return (results, restypelist)
  169: 
  170:         
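            # Usage sketch (hypothetical query values): dbSearch() turns the search
            # type into a case-insensitive LIKE pattern and collects up to MAXHITS
            # result objects plus the list of result types that occurred, e.g.
            #
            #   (results, types) = self.dbSearch('galilei', 'contains')
            #   # -> SQL pattern '%galilei%'; types might be ['bib', 'archim']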
  171:     def getResult(self, db_result, rank=0):
  172:         """factory for result objects"""
  173: 
  174:         (fileid, tagidx, tags, content) = db_result
  175:         res = None
  176: 
  177:         if tags.find('/meta/bib/') > -1:
  178:             res = BibResult(self, db_result, rank)
  179:         elif tags.find('/meta/archimedes/') > -1:
  180:             res = ArchimResult(self, db_result, rank)
  181:         else:
  182:             res = AnyResult(self, db_result, rank)
  183: 
  184:         return res
  185: 
  186: 	
  187:     def renderResult(self, result):
  188:         """returns HTML rendering of a search result"""
  189: 
  190:         return result.render(self)
  191: 	
  192: 
  193:     def filterResults(self, results, start, end, restypefilter=None):
   194:         """returns the slice [start:end] of results matching the type filter and the filtered total count"""
  195:         # filter types first
  196:         if restypefilter:
  197:             res = []
  198:             for r in results:
  199:                 if r.type == restypefilter:
  200:                     res.append(r)
  201:         else:
  202:             res = results
   203:         # new total count (because of filter)
  204:         rescnt = len(res)
  205:         # filter on count
  206:         resgroup = res[start:end]
  207: 
  208:         return (resgroup, rescnt)
  209:     
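            # Example (illustrative arguments): self.filterResults(results, 0, 10, 'bib')
            # keeps only results whose type is 'bib' and returns the first ten of
            # them together with the total number of 'bib' hits.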
  210: 
  211:     #
  212:     # Web page stuff
  213:     #
  214: 
  215:     def index_html(self):
  216:         """metadata search"""
  217:         pt=PageTemplateFile(os.path.join(package_home(globals()), "zpt/OSAS_search.zpt")).__of__(self)
  218:         return pt()
  219: 
  220: 
  221:     def search(self, searchstring=None, searchtype='startswith', start=1, count=10, restypefilter=None):
   222:         """perform the search and render the result page"""
   223:         sres = int(start) - 1
   224:         lres = sres + int(count)
  225:         try:
  226:             oldsearch = self.REQUEST.SESSION['searchstring']
  227:             oldtype = self.REQUEST.SESSION['searchtype']
   228:         except KeyError:
  229:             oldsearch = ""
  230:             oldtype = ""
  231:             
  232:         if not searchstring:
  233:             searchstring = oldsearch
  234:             searchtype = oldtype
  235:             
  236:         if not oldsearch or searchstring != oldsearch or searchtype != oldtype:
  237:             # new search
  238:             (res, restypes) = self.dbSearch(searchstring, searchtype)
  239:             # sort the result
  240:             res.sort(ranksort)
  241:             # store it
  242:             self.REQUEST.SESSION['results'] = res
  243:             self.REQUEST.SESSION['searchstring'] = searchstring
  244:             self.REQUEST.SESSION['searchtype'] = searchtype
  245:             self.REQUEST.SESSION['resulttypes'] = restypes
  246: 
  247:         (resgroup, nres) = self.filterResults(self.REQUEST.SESSION['results'], sres, lres, restypefilter)
  248:         lres = min(lres, nres)
  249:         sres = min(sres, nres)
  250:         self.REQUEST.SESSION['resultgroup'] = resgroup
  251:         self.REQUEST.SESSION['res_indexes'] = (sres+1, lres, nres, int(count))
  252:         self.REQUEST.SESSION['res_type_filter'] = restypefilter
  253:         if nres > 0:
  254:             zpt = "zpt/searchResult.zpt"
  255:         else:
  256:             zpt = "zpt/searchResult_none.zpt"
  257:             
  258:         pt=PageTemplateFile(os.path.join(package_home(globals()), zpt)).__of__(self)
  259:         return pt()
  260: 
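            # Paging sketch (illustrative call): a first request such as
            #   self.search(searchstring='galilei', searchtype='contains')
            # runs dbSearch(), sorts the hits by rank and stores everything in the
            # session; nextResults()/prevResults() below then re-enter search()
            # with only start/count changed and reuse the cached result list
            # instead of querying the database again.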
  261: 
  262:     def getSearchType(self):
  263:         """returns the last search type"""
  264:         try:
  265:             ret = self.REQUEST.SESSION['searchtype']
   266:         except KeyError:
  267:             ret = ""
  268: 
  269:         return ret
  270:     
  271:     def getSearchString(self):
  272:         """returns the last search string"""
  273:         try:
  274:             ret = self.REQUEST.SESSION['searchstring']
   275:         except KeyError:
  276:             ret = ""
  277: 
  278:         return ret
  279:     
  280: 
  281:     def hasNextResults(self):
   282:         """returns True if there are more results after the current page"""
  283:         try:
  284:             (first, last, total, count) = self.REQUEST.SESSION['res_indexes']
   285:             return (first + count <= total)  # last page may end exactly at total
   286:         except KeyError:
  287:             return False
  288: 
  289:     def hasPrevResults(self):
   290:         """returns True if there are results before the current page"""
  291:         try:
  292:             (first, last, total, count) = self.REQUEST.SESSION['res_indexes']
  293:             return (first > 1)
   294:         except KeyError:
  295:             return False
  296: 
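            # Worked example of the res_indexes bookkeeping (illustrative numbers):
            # with (first, last, total, count) = (11, 20, 25, 10) the current page
            # shows hits 11-20, hasPrevResults() is true (11 > 1), hasNextResults()
            # is true (11 + 10 <= 25), and nextResults() will display hits 21-25.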
  297: 
  298:     def nextResults(self):
   299:         """returns the next page of results"""
  300:         try:
  301:             (first, last, total, count) = self.REQUEST.SESSION['res_indexes']
  302:             first = first + count
  303:             last = last + count
  304:             if first > total:
  305:                 first = total
  306:             if last > total:
  307:                 last = total
   308:         except KeyError:
  309:             print "OUCH: no next results!"
  310:             return self.search()
  311: 
  312:         return self.search(start=first, count=count)
  313: 
  314:         
  315:     def prevResults(self):
   316:         """returns the previous page of results"""
  317:         try:
  318:             (first, last, total, count) = self.REQUEST.SESSION['res_indexes']
  319:             first = first - count
  320:             last = last - count
  321:             if first < 1:
  322:                 first = 1
  323:             if last < 1:
  324:                 last = 1
   325:         except KeyError:
  326:             print "OUCH: no prev results!"
  327:             return self.search()           
  328: 
  329:         return self.search(start=first, count=count)
  330:         
  331: 
  332:     def manage_ChangeOSAS_searchForm(self):
   333:         """form for changing the OSAS_search object"""
  334:         pt=PageTemplateFile(os.path.join(package_home(globals()), "zpt/ChangeOSAS_search.zpt")).__of__(self)
  335:         return pt()
  336: 
  337:     def manage_ChangeOSAS_search(self,id,title=None,dsn=None,RESPONSE=None):
   338:         """change the OSAS_search object"""
  339:         self.id = id
  340:         self.title = title
  341:         self.dsn = dsn
  342:         if RESPONSE is not None:
  343:             RESPONSE.redirect('manage_main')
  344: 
  345: 
  346: def manage_AddOSAS_searchForm(self):
   347:     """form for adding an OSAS_search object"""
  348:     pt=PageTemplateFile(os.path.join(package_home(globals()), "zpt/AddOSAS_search.zpt")).__of__(self)
  349:     return pt()
  350: 
  351: def manage_AddOSAS_search(self,id,title=None,dsn=None,RESPONSE=None):
   352:     """add an OSAS_search object"""
  353:     newObj=OSAS_search(id,title,dsn)
  354:     self._setObject(id,newObj)
  355:     if RESPONSE is not None:
  356:         RESPONSE.redirect('manage_main')
  357: 
  358: 
  359: 
  360: 
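        # Overview of the result classes below (added commentary):
        # SearchResult is the plain data holder; AnyResult fills its content from
        # the single matching meta row; MetaResult additionally loads all meta rows
        # of the file via getDBFileMeta(); BibResult and ArchimResult extract the
        # /meta/bib/* resp. /meta/archimedes/* entries, pick up a context link via
        # checkContext() and raise the rank by 100.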
  361: class SearchResult(SimpleItem):
  362:     """base search result object"""
  363: 
  364:     def __init__(self, type='unknown', file=None, url=None, content=None, rank=0):
  365:         """init"""
  366:         # result type (e.g. "bib", "archim")
  367:         self.type = type
  368:         # index file name
  369:         self.file = file
  370:         # url for result (list of pairs)
  371:         if url:
  372:             self.urls = url
  373:         else:
  374:             self.urls = []
  375:         # actual content (list of tuples)
  376:         self.content = content
  377:         # document status (e.g. "online", "archive")
  378:         self.status = None
  379:         # result rank for presentation
  380:         self.rank = rank
  381: 
  382: class AnyResult(SearchResult):
  383:     """catch-all type result object"""
  384: 
  385:     def __init__(self, zope, db_result, rank):
   386:         """constructor for a catch-all type result"""
  387:         SearchResult.__init__(self)
  388:         #print "NEW ANY RESULT!"
  389:         self.type='unknown'
  390:         self.zptFile = os.path.join(package_home(globals()), "zpt/searchResult_any.zpt")
  391:         
  392:         (db_fileid, db_tagidx, db_tags, db_content) = db_result
  393:         self.hitTag = db_tags
  394: 
  395:         # get full info from db
  396:         self.fileinfo = zope.getDBFile(db_fileid)
  397:         assert self.fileinfo
  398: 
  399:         items = {}
  400:         items[db_tags] = db_content
  401:         self.content = items
  402:         self.file = self.fileinfo[0]
  403:         self.status = statusForFile(self.file)
  404:         self.rank = rank
  405: 
  406:     def getContentList(self):
  407:         """returns content as list of tuples in preferred order"""
  408:         l = []
  409:         for k in self.content.keys():
  410:             l.append((k, self.content[k]))
  411: 
  412:         return l
  413: 
  414:     def render(self, zope):
  415:         """render this result object"""
  416:         zope.REQUEST.SESSION['result'] = self
  417:         pt=PageTemplateFile(self.zptFile).__of__(zope)
  418:         return pt()
  419: 
  420: 
  421: class MetaResult(AnyResult):
  422:     """result object that collects metadata"""
  423: 
  424:     def __init__(self, zope, db_result, rank):
   425:         """constructor"""
  426:         AnyResult.__init__(self, zope, db_result, rank)
  427:         #print "NEW META RESULT!"
  428: 
  429:         (fileid, tagidx, tags, content) = db_result
  430: 
  431:         # get full info from db
  432:         self.metainfo = zope.getDBFileMeta(fileid)
  433:         assert self.metainfo
  434:         
  435:     def checkContext(self, tags, content, ctxurl):
  436:         """takes meta entry and updates url from context tags"""
  437:         if tags.endswith('/context/link'):
  438:             if content:
  439:                 #print "CTXlink: ", content
  440:                 ctxurl[0] = content
  441:             
  442:         elif tags.endswith('/context/name'):
  443:             if content:
  444:                 #print "CTXname: ", content
  445:                 ctxurl[1] = content
  446: 
  447:         return ctxurl
  448: 
  449: 
  450: class BibResult(MetaResult):
  451:     """bib type result object"""
  452: 
  453:     def __init__(self, zope, db_result, rank):
  454:         """constructor"""
  455:         MetaResult.__init__(self, zope, db_result, rank)
  456:         #print "NEW BIB RESULT!", self
  457:         self.type = "bib"
  458:         self.zptFile = os.path.join(package_home(globals()), "zpt/searchResult_bib.zpt")
  459:         url = storageURL(self.file)
  460:         if url:
  461:             self.urls.append(url)
  462:         (fileid, tagidx, tags, content) = db_result
  463: 
  464:         btype = ""
  465:         bitems = {}
  466:         ctxurl = ['', '']
  467: 
  468:         for me in self.metainfo:
  469:             (m_idx, m_tags, m_content, m_attributes) = me
  470:             # context tag
  471:             ctxurl = self.checkContext(m_tags, m_content, ctxurl)
  472:             # first tag with bib type attribute
  473:             if m_tags.endswith('/meta/bib'):
  474:                 r = re.search('type="([^"]*)"', m_attributes)
  475:                 if r:
  476:                     btype = r.group(1)
  477: 
  478:                 if not btype:
  479:                     btype = "*unknown*"
  480: 
  481:                 bitems['type'] = btype
  482:                 continue
  483: 
  484:             # skip other tags
  485:             if not btype: continue
  486: 
  487:             # collect bib/something
  488:             r = re.search('/meta/bib/(.*)', m_tags)
  489:             if r:
  490:                 k = r.group(1)
  491:                 #print "CONTENT: ", m_content
  492:                 bitems[k] = m_content
  493:                 # remember hit tag
  494:                 if m_tags == self.hitTag:
  495:                     self.hitTag = k
  496:                 continue
  497: 
  498:         self.content = bitems
  499:         # store context
  500:         if not ctxurl[1]:
  501:             ctxurl[1] = "View"
  502:         # must have link
  503:         if ctxurl[0]:
  504:             self.urls.append(ctxurl)
  505:                 
  506:         self.rank += 100
  507: 
  508:     def getContentList(self):
  509:         """returns content as list of tuples in preferred order"""
  510:         l = []
  511:         c = self.content.copy()
  512:         # preferred items first
  513:         for k in ('author', 'title', 'journal', 'year'):
  514:             if c.has_key(k):
  515:                 l.append((k, c[k]))
  516:                 del c[k]
  517: 
  518:         # no type
   519:         c.pop('type', None)
  520:         # copy the rest
  521:         for k in c.keys():
  522:             l.append((k, c[k]))
  523: 
  524:         return l
  525: 
  526: 
  527: class ArchimResult(MetaResult):
  528:     """archimedes type result object"""
  529: 
  530:     def __init__(self, zope, db_result, rank):
  531:         """constructor"""
  532:         MetaResult.__init__(self, zope, db_result, rank)
  533:         #print "NEW ARCHIM RESULT!", self
  534:         self.type = "archim"
  535:         self.zptFile = os.path.join(package_home(globals()), "zpt/searchResult_archim.zpt")
  536:         url = storageURL(self.file)
  537:         if url:
  538:             self.urls.append(url)
  539:             
  540:         (fileid, tagidx, tags, content) = db_result
  541: 
  542:         # process info
  543:         bitems = {}
  544:         ctxurl = ['', '']
  545:         for me in self.metainfo:
  546:             (m_idx, m_tags, m_content, m_attributes) = me
  547:             # context tag
  548:             ctxurl = self.checkContext(m_tags, m_content, ctxurl)
  549:             # collect archimedes/something
  550:             r = re.search('/meta/archimedes/(.*)', m_tags)
  551:             if r:
  552:                 k = r.group(1)
  553:                 #print "CONTENT: ", m_content
  554:                 bitems[k] = m_content
  555:                 # remember hit tag
  556:                 if m_tags == self.hitTag:
  557:                     self.hitTag = k
  558:                 continue
  559: 
  560:         self.content = bitems
  561:         self.rank += 100
  562:         # store context
  563:         if not ctxurl[1]:
  564:             ctxurl[1] = "View"
  565:         # must have link
  566:         if ctxurl[0]:
  567:             self.urls.append(ctxurl)
  568: 
  569: 
  570:     def getContentList(self):
  571:         """returns content as list of tuples in preferred order"""
  572:         l = []
  573:         c = self.content.copy()
  574:         # preferred items first
  575:         for k in ('author', 'title', 'date', 'place'):
  576:             if c.has_key(k):
  577:                 l.append((k, c[k]))
  578:                 del c[k]
  579: 
  580:         # copy the rest
  581:         for k in c.keys():
  582:             l.append((k, c[k]))
  583: 
  584:         return l
  585: 	
  586: 
  587: 
  588: 
  589: def ranksort(res1, res2):
  590:     """sort results on rank"""
  591:     return cmp(res2.rank, res1.rank)
  592: 
  593: 
  594: def statusForFile(filename):
   595:     """heuristic... returns the status for an index file name"""
  596:     status = None
  597:     if filename.startswith('/mpiwg/online/'):
  598:         status = "online"
  599:     elif filename.startswith('/mpiwg/archive/'):
  600:         status = "archive"
  601:     elif filename.startswith('http://'):
  602:         status = "database"
  603:         
  604:     return status
  605: 
  606: def storageURL(filename):
   607:     """heuristic... returns a URL for an index file name"""
  608:     url = None
  609:     name = None
  610:     if filename.startswith('/mpiwg/online/'):
  611:         #print "URLFORFILE: online ", filename
   612:         r = re.search(r'^(.*)/index\.meta', filename)
  613:         if r:
  614:             url = "http://content.mpiwg-berlin.mpg.de/mpistorage/storage/ShowOnline/index_html?path=%s"%r.group(1)
  615:             name = "Storage System"
  616:             
  617:     elif filename.startswith('http://'):
  618:         #print "URLFORFILE: url ", filename
  619:         url = filename
  620:         name = "Online Database"
  621: 
  622:     if name and url:
  623:         return (url, name)
  624:     
  625:     return None
  626: 
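        # Example (illustrative path; URL prefix taken from storageURL above):
        # the two helpers map an index file location to a display status and a
        # link target, e.g. for '/mpiwg/online/some/doc/index.meta'
        #   statusForFile(...) -> 'online'
        #   storageURL(...)    -> ('http://content.mpiwg-berlin.mpg.de/mpistorage/'
        #                          'storage/ShowOnline/index_html?path=/mpiwg/online/some/doc',
        #                          'Storage System')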
