OSAS/OSA_system/OSAS_search.py - annotate

Return to OSAS_search.py CVS log
Up to [Repository] / OSAS / OSA_system
Annotation of OSAS/OSA_system/OSAS_search.py, revision 1.8

1.1       casties     1: """Metadata search interface
                      2: ROC 2004, itgroup
                      3: 
                      4: """
                      5: 
                      6: from AccessControl import ClassSecurityInfo
                      7: from Globals import InitializeClass
                      8: from Globals import Persistent, package_home
                      9: from Products.PageTemplates.PageTemplateFile import PageTemplateFile
                     10: from Products.PageTemplates.PageTemplate import PageTemplate
1.6       casties    11: from Products.PageTemplates.ZopePageTemplate import ZopePageTemplate
                     12: from OFS.Folder import Folder
1.1       casties    13: from OFS.SimpleItem import SimpleItem
1.8     ! casties    14: try:
        !            15:     import psycopg2 as PgSQL
        !            16: except:
        !            17:     try:
        !            18:         import psycopg as PgSQL
        !            19:     except:
        !            20:         from pyPgSQL import PgSQL
1.1       casties    21: 
                     22: import re
                     23: import os
                     24: 
                     25: MAXHITS = 1000
                     26: 
class OSAS_search(Folder):
    """Object for global metadata search.

    Runs LIKE queries against the ``meta`` table of the configured
    PostgreSQL database, wraps the matching rows in result objects and
    keeps the current result list and the paging state in the session.
    """

    meta_type="OSAS_search"

    manage_options=Folder.manage_options+(
        {'label':'Main config','action':'manage_ChangeOSAS_searchForm'},
       )


    def __init__(self,id,title,dsn=None):
        """init

        id, title -- Zope id and title of the object
        dsn -- database connection string; a built-in default is used
               when it is empty
        """
        self.id=id
        self.title=title
        # NOTE(review): the fallback DSN embeds credentials in the source
        self.dsn = dsn or "host=foxridge.mpiwg-berlin.mpg.de dbname=storage user=archiveread password=archiveread"
        # volatile database connection object
        self._v_dbCon = None
        # number of consecutive failed attempts to get a cursor
        self._v_tryCon = 0


    def _renderZPT(self, zptname):
        """renders the page template zptname (path relative to the product directory) in our context"""
        pt=PageTemplateFile(os.path.join(package_home(globals()), zptname)).__of__(self)
        return pt()


    def dbCursor(self):
        """returns new SQL cursor object

        Reuses the volatile connection when there is one and reconnects
        when the connection is missing or broken.  Raises AssertionError
        when no cursor could be obtained after three attempts.
        """
        curs = None
        con = getattr(self, '_v_dbCon', None)
        if con is not None:
            try:
                curs = con.cursor()
                self._v_tryCon = 0
            except Exception:
                # in case of problems reset dbCon
                self._v_dbCon = None
                self._v_tryCon = getattr(self, '_v_tryCon', 0) + 1
        else:
            self._v_dbCon = None
            self._v_tryCon = 0

        if curs is None and self._v_tryCon < 3:
            self._v_dbCon = PgSQL.connect(self.dsn, serialize=0)
            # call ourself with the new connection
            curs = self.dbCursor()

        if curs is None:
            # explicit raise instead of assert so the check survives -O
            raise AssertionError("AIIEE no db cursor!!")
        return curs

    def getDBFileMeta(self, fileid):
        """returns an array with all meta entries of fileid

        Rows are (idx, tags, content, attributes) ordered by idx.  Results
        are cached per fileid in the session under 'dbMeta'.
        """
        session = self.REQUEST.SESSION
        metacache = {}
        # try in cache
        if session.has_key('dbMeta'):
            metacache = session['dbMeta']
            if metacache.has_key(fileid):
                return metacache[fileid]

        curs = self.dbCursor()
        try:
            sql = 'SELECT idx,tags,content,attributes FROM meta WHERE fileid=%(id)s ORDER BY idx'
            # debug trace
            print("%s  ->  %s" % (sql, fileid))
            curs.execute(sql, {'id':fileid})
            print("done")
            res = curs.fetchall()
        finally:
            # release the cursor even when the query fails
            curs.close()

        # store info in cache
        metacache[fileid] = res
        session['dbMeta'] = metacache
        return res

    def getDBFile(self, fileid):
        """returns the file information of fileid

        The result is the (filename, mtime) row of the files table or None
        when there is no such row.  Results are cached per fileid in the
        session under 'dbFiles'.
        """
        session = self.REQUEST.SESSION
        filecache = {}
        # try in cache
        if session.has_key('dbFiles'):
            filecache = session['dbFiles']
            if filecache.has_key(fileid):
                return filecache[fileid]

        curs = self.dbCursor()
        try:
            sql = 'select filename,mtime from files where id=%(id)s'
            # debug trace
            print("DBFILE:  %s  ->  %s" % (sql, fileid))
            curs.execute(sql, {'id':fileid})
            print("DBFILE: done")
            res = curs.fetchone()
        finally:
            # release the cursor even when the query fails
            curs.close()

        # store info in cache
        filecache[fileid] = res
        session['dbFiles'] = filecache
        return res


    def dbSearch(self, query, type):
        """search DB for query and return result set

        query -- search text
        type -- 'equals', 'startswith' or 'contains'

        Returns a tuple (results, restypelist) with at most MAXHITS result
        objects and the list of result types that occur in them.  An empty
        query gives ([], []).  Raises ValueError for an unknown type.
        """
        results = []
        restypes = {}
        if not query:
            # empty query: same shape as a real result so callers can unpack
            return (results, [])

        if type == 'equals':
            qs = query
        elif type == 'startswith':
            qs = query + "%"
        elif type == 'contains':
            qs = "%" + query + "%"
        else:
            raise ValueError("unknown search type: %r" % (type,))

        curs = self.dbCursor()
        try:
            sql = 'select fileid,idx,tags,content from meta where lower(content) like lower(%(qs)s)'
            # debug trace
            print("%s  ->  %s" % (sql, qs))
            curs.execute(sql, {'qs':qs})
            print("done")
            rescnt = 0
            res = curs.fetchone()
            while res and rescnt < MAXHITS:
                result = self.getResult(res)
                if result:
                    results.append(result)
                    restypes[result.type] = result.type

                rescnt += 1
                res = curs.fetchone()
        finally:
            # release the cursor even when the query or a result fails
            curs.close()

        return (results, restypes.keys())


    def getResult(self, db_result, rank=0):
        """factory for result objects

        db_result is a (fileid, idx, tags, content) row; the tag path of
        the hit decides which result class is used.
        """
        tags = db_result[2]
        if '/meta/bib/' in tags:
            return BibResult(self, db_result, rank)
        if '/meta/archimedes/' in tags:
            return ArchimResult(self, db_result, rank)
        return AnyResult(self, db_result, rank)


    def renderResult(self, result):
        """returns HTML rendering of a search result"""
        return result.render(self)


    def filterResults(self, results, start, end, restypefilter=None):
        """returns list of results that match a filter

        Returns a tuple (resgroup, rescnt): the slice start:end of the
        results whose type equals restypefilter (all results when no
        filter is given) and the total number of results after filtering.
        """
        # filter types first
        if restypefilter:
            res = [r for r in results if r.type == restypefilter]
        else:
            res = results
        # new total count (because of filter), then filter on count
        return (res[start:end], len(res))


    #
    # Web page stuff
    #

    def index_html(self):
        """metadata search"""
        return self._renderZPT("zpt/OSAS_search.zpt")


    def search(self, searchstring=None, searchtype='startswith', start=1, count=10, restypefilter=None):
        """search and create result

        Runs a new database search when the search string or type differs
        from the one stored in the session, otherwise reuses the stored
        results.  start is the 1-based index of the first result to show
        and count the page size; both may be given as strings.
        """
        # request parameters may arrive as strings
        count = int(count)
        sres = max(int(start) - 1, 0)
        lres = sres + count
        session = self.REQUEST.SESSION
        try:
            oldsearch = session['searchstring']
            oldtype = session['searchtype']
        except Exception:
            oldsearch = ""
            oldtype = ""

        if not searchstring:
            searchstring = oldsearch
            searchtype = oldtype

        if not oldsearch or searchstring != oldsearch or searchtype != oldtype:
            # new search
            (res, restypes) = self.dbSearch(searchstring, searchtype)
            # sort the result
            res.sort(ranksort)
            # store it
            session['results'] = res
            session['searchstring'] = searchstring
            session['searchtype'] = searchtype
            session['resulttypes'] = restypes

        (resgroup, nres) = self.filterResults(session['results'], sres, lres, restypefilter)
        lres = min(lres, nres)
        sres = min(sres, nres)
        session['resultgroup'] = resgroup
        # (first shown, last shown, total, page size); first and last are 1-based
        session['res_indexes'] = (sres+1, lres, nres, count)
        session['res_type_filter'] = restypefilter
        if nres > 0:
            zpt = "zpt/searchResult.zpt"
        else:
            zpt = "zpt/searchResult_none.zpt"

        return self._renderZPT(zpt)


    def getSearchType(self):
        """returns the last search type"""
        try:
            return self.REQUEST.SESSION['searchtype']
        except Exception:
            return ""

    def getSearchString(self):
        """returns the last search string"""
        try:
            return self.REQUEST.SESSION['searchstring']
        except Exception:
            return ""


    def hasNextResults(self):
        """returns if there are more results"""
        try:
            (first, last, total, count) = self.REQUEST.SESSION['res_indexes']
        except Exception:
            return False
        # more results exist exactly when the last shown one is not the final one
        return (last < total)

    def hasPrevResults(self):
        """returns if there are previous results"""
        try:
            (first, last, total, count) = self.REQUEST.SESSION['res_indexes']
        except Exception:
            return False
        return (first > 1)


    def nextResults(self):
        """returns more results"""
        # NOTE(review): the stored 'res_type_filter' is not passed on here - confirm intended
        try:
            (first, last, total, count) = self.REQUEST.SESSION['res_indexes']
        except Exception:
            print("OUCH: no next results!")
            return self.search()

        first = min(first + count, total)
        return self.search(start=first, count=count)


    def prevResults(self):
        """returns previous results"""
        try:
            (first, last, total, count) = self.REQUEST.SESSION['res_indexes']
        except Exception:
            print("OUCH: no prev results!")
            return self.search()

        first = max(first - count, 1)
        return self.search(start=first, count=count)


    def manage_ChangeOSAS_searchForm(self):
        """create Search form"""
        return self._renderZPT("zpt/ChangeOSAS_search.zpt")

    def manage_ChangeOSAS_search(self,id,title=None,dsn=None,RESPONSE=None):
        """change the configuration of this search object"""
        self.id = id
        self.title = title
        if dsn:
            # keep the current DSN when none is submitted; dbCursor needs one
            self.dsn = dsn
        if RESPONSE is not None:
            RESPONSE.redirect('manage_main')
1.2       casties   344: 
                    345: 
def manage_AddOSAS_searchForm(self):
    """render the form for adding a search object"""
    zptpath = os.path.join(package_home(globals()), "zpt/AddOSAS_search.zpt")
    template = PageTemplateFile(zptpath)
    # bind the template to the calling container before rendering it
    return template.__of__(self)()
                    350: 
                    351: def manage_AddOSAS_search(self,id,title=None,dsn=None,RESPONSE=None):
                    352:     """add the OSAS_root"""
                    353:     newObj=OSAS_search(id,title,dsn)
                    354:     self._setObject(id,newObj)
                    355:     if RESPONSE is not None:
                    356:         RESPONSE.redirect('manage_main')
                    357: 
                    358: 
                    359: 
                    360: 
                    361: class SearchResult(SimpleItem):
                    362:     """base search result object"""
                    363: 
                    364:     def __init__(self, type='unknown', file=None, url=None, content=None, rank=0):
                    365:         """init"""
1.3       casties   366:         # result type (e.g. "bib", "archim")
1.1       casties   367:         self.type = type
1.3       casties   368:         # index file name
1.1       casties   369:         self.file = file
1.3       casties   370:         # url for result (list of pairs)
                    371:         if url:
                    372:             self.urls = url
                    373:         else:
                    374:             self.urls = []
                    375:         # actual content (list of tuples)
1.1       casties   376:         self.content = content
1.3       casties   377:         # document status (e.g. "online", "archive")
                    378:         self.status = None
                    379:         # result rank for presentation
1.1       casties   380:         self.rank = rank
                    381: 
                    382: class AnyResult(SearchResult):
                    383:     """catch-all type result object"""
                    384: 
                    385:     def __init__(self, zope, db_result, rank):
                    386:         """returns a catch-all type result"""
1.3       casties   387:         SearchResult.__init__(self)
1.2       casties   388:         #print "NEW ANY RESULT!"
1.3       casties   389:         self.type='unknown'
1.1       casties   390:         self.zptFile = os.path.join(package_home(globals()), "zpt/searchResult_any.zpt")
                    391:         
1.3       casties   392:         (db_fileid, db_tagidx, db_tags, db_content) = db_result
                    393:         self.hitTag = db_tags
1.1       casties   394: 
                    395:         # get full info from db
1.3       casties   396:         self.fileinfo = zope.getDBFile(db_fileid)
1.1       casties   397:         assert self.fileinfo
                    398: 
                    399:         items = {}
1.3       casties   400:         items[db_tags] = db_content
1.1       casties   401:         self.content = items
                    402:         self.file = self.fileinfo[0]
1.3       casties   403:         self.status = statusForFile(self.file)
1.1       casties   404:         self.rank = rank
                    405: 
1.3       casties   406:     def getContentList(self):
                    407:         """returns content as list of tuples in preferred order"""
                    408:         l = []
                    409:         for k in self.content.keys():
                    410:             l.append((k, self.content[k]))
                    411: 
                    412:         return l
                    413: 
1.1       casties   414:     def render(self, zope):
                    415:         """render this result object"""
                    416:         zope.REQUEST.SESSION['result'] = self
                    417:         pt=PageTemplateFile(self.zptFile).__of__(zope)
                    418:         return pt()
                    419: 
                    420: 
                    421: class MetaResult(AnyResult):
                    422:     """result object that collects metadata"""
                    423: 
                    424:     def __init__(self, zope, db_result, rank):
                    425:         """contructor"""
                    426:         AnyResult.__init__(self, zope, db_result, rank)
1.2       casties   427:         #print "NEW META RESULT!"
1.1       casties   428: 
                    429:         (fileid, tagidx, tags, content) = db_result
                    430: 
                    431:         # get full info from db
                    432:         self.metainfo = zope.getDBFileMeta(fileid)
                    433:         assert self.metainfo
                    434:         
1.3       casties   435:     def checkContext(self, tags, content, ctxurl):
                    436:         """takes meta entry and updates url from context tags"""
1.1       casties   437:         if tags.endswith('/context/link'):
                    438:             if content:
1.3       casties   439:                 #print "CTXlink: ", content
                    440:                 ctxurl[0] = content
1.1       casties   441:             
                    442:         elif tags.endswith('/context/name'):
                    443:             if content:
1.3       casties   444:                 #print "CTXname: ", content
                    445:                 ctxurl[1] = content
1.1       casties   446: 
1.3       casties   447:         return ctxurl
1.1       casties   448: 
                    449: 
                    450: class BibResult(MetaResult):
                    451:     """bib type result object"""
                    452: 
                    453:     def __init__(self, zope, db_result, rank):
                    454:         """constructor"""
                    455:         MetaResult.__init__(self, zope, db_result, rank)
1.3       casties   456:         #print "NEW BIB RESULT!", self
1.1       casties   457:         self.type = "bib"
                    458:         self.zptFile = os.path.join(package_home(globals()), "zpt/searchResult_bib.zpt")
1.3       casties   459:         url = storageURL(self.file)
                    460:         if url:
                    461:             self.urls.append(url)
1.1       casties   462:         (fileid, tagidx, tags, content) = db_result
                    463: 
                    464:         btype = ""
                    465:         bitems = {}
1.3       casties   466:         ctxurl = ['', '']
1.1       casties   467: 
                    468:         for me in self.metainfo:
                    469:             (m_idx, m_tags, m_content, m_attributes) = me
                    470:             # context tag
1.3       casties   471:             ctxurl = self.checkContext(m_tags, m_content, ctxurl)
1.1       casties   472:             # first tag with bib type attribute
                    473:             if m_tags.endswith('/meta/bib'):
                    474:                 r = re.search('type="([^"]*)"', m_attributes)
                    475:                 if r:
                    476:                     btype = r.group(1)
                    477: 
                    478:                 if not btype:
                    479:                     btype = "*unknown*"
                    480: 
                    481:                 bitems['type'] = btype
                    482:                 continue
                    483: 
                    484:             # skip other tags
                    485:             if not btype: continue
                    486: 
                    487:             # collect bib/something
                    488:             r = re.search('/meta/bib/(.*)', m_tags)
                    489:             if r:
                    490:                 k = r.group(1)
                    491:                 #print "CONTENT: ", m_content
                    492:                 bitems[k] = m_content
1.3       casties   493:                 # remember hit tag
                    494:                 if m_tags == self.hitTag:
                    495:                     self.hitTag = k
1.1       casties   496:                 continue
                    497: 
                    498:         self.content = bitems
1.3       casties   499:         # store context
                    500:         if not ctxurl[1]:
                    501:             ctxurl[1] = "View"
                    502:         # must have link
                    503:         if ctxurl[0]:
                    504:             self.urls.append(ctxurl)
                    505:                 
1.1       casties   506:         self.rank += 100
1.3       casties   507: 
                    508:     def getContentList(self):
                    509:         """returns content as list of tuples in preferred order"""
                    510:         l = []
                    511:         c = self.content.copy()
                    512:         # preferred items first
                    513:         for k in ('author', 'title', 'journal', 'year'):
                    514:             if c.has_key(k):
                    515:                 l.append((k, c[k]))
                    516:                 del c[k]
                    517: 
                    518:         # no type
                    519:         del c['type']
                    520:         # copy the rest
                    521:         for k in c.keys():
                    522:             l.append((k, c[k]))
                    523: 
                    524:         return l
1.1       casties   525: 
                    526: 
                    527: class ArchimResult(MetaResult):
                    528:     """archimedes type result object"""
                    529: 
                    530:     def __init__(self, zope, db_result, rank):
                    531:         """constructor"""
                    532:         MetaResult.__init__(self, zope, db_result, rank)
1.3       casties   533:         #print "NEW ARCHIM RESULT!", self
1.1       casties   534:         self.type = "archim"
                    535:         self.zptFile = os.path.join(package_home(globals()), "zpt/searchResult_archim.zpt")
1.3       casties   536:         url = storageURL(self.file)
                    537:         if url:
                    538:             self.urls.append(url)
                    539:             
1.1       casties   540:         (fileid, tagidx, tags, content) = db_result
                    541: 
                    542:         # process info
                    543:         bitems = {}
1.3       casties   544:         ctxurl = ['', '']
1.1       casties   545:         for me in self.metainfo:
                    546:             (m_idx, m_tags, m_content, m_attributes) = me
                    547:             # context tag
1.3       casties   548:             ctxurl = self.checkContext(m_tags, m_content, ctxurl)
1.1       casties   549:             # collect archimedes/something
                    550:             r = re.search('/meta/archimedes/(.*)', m_tags)
                    551:             if r:
                    552:                 k = r.group(1)
                    553:                 #print "CONTENT: ", m_content
                    554:                 bitems[k] = m_content
1.3       casties   555:                 # remember hit tag
                    556:                 if m_tags == self.hitTag:
                    557:                     self.hitTag = k
1.1       casties   558:                 continue
                    559: 
                    560:         self.content = bitems
                    561:         self.rank += 100
1.3       casties   562:         # store context
                    563:         if not ctxurl[1]:
                    564:             ctxurl[1] = "View"
                    565:         # must have link
                    566:         if ctxurl[0]:
                    567:             self.urls.append(ctxurl)
                    568: 
                    569: 
                    570:     def getContentList(self):
                    571:         """returns content as list of tuples in preferred order"""
                    572:         l = []
                    573:         c = self.content.copy()
                    574:         # preferred items first
                    575:         for k in ('author', 'title', 'date', 'place'):
                    576:             if c.has_key(k):
                    577:                 l.append((k, c[k]))
                    578:                 del c[k]
                    579: 
                    580:         # copy the rest
                    581:         for k in c.keys():
                    582:             l.append((k, c[k]))
                    583: 
                    584:         return l
1.1       casties   585:    
                    586: 
                    587: 
                    588: 
                    589: def ranksort(res1, res2):
                    590:     """sort results on rank"""
                    591:     return cmp(res2.rank, res1.rank)
                    592: 
                    593: 
def statusForFile(filename):
    """heuristic... maps the prefix of an index file name to a status

    Returns "online", "archive" or "database", or None when no known
    prefix matches.
    """
    known = (
        ('/mpiwg/online/', "online"),
        ('/mpiwg/archive/', "archive"),
        ('http://', "database"),
    )
    for prefix, status in known:
        if filename.startswith(prefix):
            return status

    return None
                    605: 
                    606: def storageURL(filename):
1.1       casties   607:     """heuristic... returns an URL for a index file name"""
                    608:     url = None
1.3       casties   609:     name = None
1.1       casties   610:     if filename.startswith('/mpiwg/online/'):
1.3       casties   611:         #print "URLFORFILE: online ", filename
                    612:         r = re.search('^(.*)/index.meta', filename)
1.1       casties   613:         if r:
1.3       casties   614:             url = "http://content.mpiwg-berlin.mpg.de/mpistorage/storage/ShowOnline/index_html?path=%s"%r.group(1)
                    615:             name = "Storage System"
1.4       casties   616:             
                    617:     elif filename.startswith('http://'):
                    618:         #print "URLFORFILE: url ", filename
                    619:         url = filename
                    620:         name = "Online Database"
1.3       casties   621: 
                    622:     if name and url:
                    623:         return (url, name)
                    624:     
                    625:     return None
1.1       casties   626:
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>