OSAS/OSA_system/OSAS_search.py - annotate

Return to OSAS_search.py CVS log
Up to [Repository] / OSAS / OSA_system
Annotation of OSAS/OSA_system/OSAS_search.py, revision 1.3

1.1       casties     1: """Metadata search interface
                      2: ROC 2004, itgroup
                      3: 
                      4: """
                      5: 
                      6: from AccessControl import ClassSecurityInfo
                      7: from Globals import InitializeClass
                      8: from Globals import Persistent, package_home
                      9: from Products.PageTemplates.PageTemplateFile import PageTemplateFile
                     10: from Products.PageTemplates.PageTemplate import PageTemplate
                     11: from OFS.SimpleItem import SimpleItem
                     12: #from pyPgSQL import PgSQL
                     13: import psycopg as PgSQL
                     14: 
                     15: import re
                     16: import os
                     17: 
# Limit used by OSAS_search.dbSearch's row-fetching loop to cap how many
# database hits are processed for a single query.
MAXHITS = 1000
                     19: 
                     20: class OSAS_search(SimpleItem):
                     21:     """Object for global metadata search"""
                     22: 
                     23:     meta_type="OSAS_search"
                     24: 
                     25:     
                     26: 
                     27:     def __init__(self,id,title,dsn=None):
                     28:         """init"""
                     29:         self.id=id
                     30:         self.title=title
                     31:         if dsn:
                     32:             self.dsn = dsn
                     33:         else:
                     34:             self.dsn = "host=foxridge.mpiwg-berlin.mpg.de dbname=storage user=archiveread password=archiveread"
                     35:         # volatile database connection object
                     36:         self._v_dbCon = None
                     37:         self._v_tryCon = 0
                     38: 
                     39: 
                     40:     def dbCursor(self):
                     41:         """returns new SQL cursor object"""
                     42:         curs = None
                     43:         if hasattr(self, '_v_dbCon') and self._v_dbCon is not None:
                     44:             try:
                     45:                 curs = self._v_dbCon.cursor()
                     46:                 self._v_tryCon = 0
                     47:             except:
                     48:                 # in case of problems reset dbCon
                     49:                 self._v_dbCon = None
                     50:                 self._v_tryCon += 1
                     51:         else:
                     52:             self._v_dbCon = None
                     53:             self._v_tryCon = 0
                     54:                 
                     55:         if not curs and self._v_tryCon < 3:
                     56:             self._v_dbCon = PgSQL.connect(self.dsn, serialize=0)
                     57:             # call ourself with the new connection
                     58:             curs = self.dbCursor()
                     59: 
                     60:         assert curs, "AIIEE no db cursor!!"
                     61:         return curs
                     62: 
                     63:     def getDBFileMeta(self, fileid):
                     64:         """returns an array with all meta entries of fileid"""
                     65: 
                     66:         metacache = {}
                     67:         # try in cache
                     68:         if self.REQUEST.SESSION.has_key('dbMeta'):
                     69:             metacache = self.REQUEST.SESSION['dbMeta']
                     70:             if metacache.has_key(fileid):
                     71:                 res = metacache[fileid]
1.2       casties    72:                 #print "meta from cache "
1.1       casties    73:                 return res
                     74: 
                     75:         curs = self.dbCursor()
                     76: 
                     77:         sql = 'SELECT idx,tags,content,attributes FROM meta WHERE fileid=%(id)s ORDER BY idx'
                     78:         print sql, " -> ", fileid
                     79:         curs.execute(sql, {'id':fileid})
                     80:         print "done"
                     81: 
                     82:         res = curs.fetchall()
                     83:         #print "res:", res
                     84:         curs.close()
                     85:         # store info in cache
                     86:         metacache[fileid] = res
                     87:         self.REQUEST.SESSION['dbMeta'] = metacache
                     88: 
                     89:         return res
                     90: 
                     91:     def getDBFile(self, fileid):
                     92:         """returns the file information of fileid"""
                     93: 
                     94:         filecache = {}
                     95:         # try in cache
                     96:         if self.REQUEST.SESSION.has_key('dbFiles'):
                     97:             filecache = self.REQUEST.SESSION['dbFiles']
                     98:             if filecache.has_key(fileid):
                     99:                 res = filecache[fileid]
1.2       casties   100:                 #print "file from cache "
1.1       casties   101:                 return res
                    102: 
                    103:         curs = self.dbCursor()
                    104: 
                    105:         sql = 'select filename,mtime from files where id=%(id)s'
                    106:         print 'DBFILE: ', sql, " -> ", fileid
                    107:         curs.execute(sql, {'id':fileid})
                    108:         print "DBFILE: done"
                    109: 
                    110:         res = curs.fetchone()
                    111:         #print "DBFILE: res:", res
                    112:         curs.close()
                    113:         # store info in cache
                    114:         filecache[fileid] = res
                    115:         self.REQUEST.SESSION['dbFiles'] = filecache
                    116: 
                    117:         return res
                    118:    
                    119:    
1.2       casties   120:     def dbSearch(self, query, type):
1.1       casties   121:         """search DB for query and return result set"""
1.3     ! casties   122:         results = []
        !           123:         restypes = {}
        !           124:         if not query:
        !           125:             # empty query
        !           126:             return results
        !           127:         
1.1       casties   128:         curs = self.dbCursor()
1.2       casties   129:         if type == 'equals':
                    130:             qs = query
                    131:         elif type == 'startswith':
                    132:             qs = query + "%"
                    133:         elif type == 'contains':
                    134:             qs = "%" + query + "%"
                    135:             
1.3     ! casties   136:         sql = 'select fileid,idx,tags,content from meta where lower(content) like lower(%(qs)s)'
1.1       casties   137:         print sql, " -> ", qs
                    138:         curs.execute(sql, {'qs':qs})
                    139:         print "done"
                    140:         res = curs.fetchone()
                    141:         rescnt = 1
                    142:         #print "res0:", res
                    143:         while res and rescnt < MAXHITS:
                    144:             #print "res:", res
                    145:             result = self.getResult(res)
1.3     ! casties   146:             if result:
1.1       casties   147:                 results.append(result)
1.3     ! casties   148:                 restypes[result.type] = result.type
1.1       casties   149:                 
                    150:             res = curs.fetchone()
                    151:             rescnt += 1
                    152: 
                    153:         curs.close()
                    154:         #self.dbCon = None
                    155: 
1.2       casties   156:         #print "SEARCH: ", rescnt, " results"
1.3     ! casties   157:         restypelist = restypes.keys()
        !           158:         return (results, restypelist)
1.1       casties   159: 
                    160:         
                    161:     def getResult(self, db_result, rank=0):
                    162:         """factory for result objects"""
                    163: 
                    164:         (fileid, tagidx, tags, content) = db_result
                    165:         res = None
                    166: 
                    167:         if tags.find('/meta/bib/') > -1:
                    168:             res = BibResult(self, db_result, rank)
                    169:         elif tags.find('/meta/archimedes/') > -1:
                    170:             res = ArchimResult(self, db_result, rank)
                    171:         else:
                    172:             res = AnyResult(self, db_result, rank)
                    173: 
                    174:         return res
1.3     ! casties   175: 
1.1       casties   176:    
                    177:     def renderResult(self, result):
                    178:         """returns HTML rendering of a search result"""
                    179: 
                    180:         return result.render(self)
                    181:    
1.3     ! casties   182: 
        !           183:     def filterResults(self, results, start, end, restypefilter=None):
        !           184:         """returns list of results that match a filter"""
        !           185:         # filter types first
        !           186:         if restypefilter:
        !           187:             res = []
        !           188:             for r in results:
        !           189:                 if r.type in restypefilter:
        !           190:                     res.append(r)
        !           191:         else:
        !           192:             res = results
        !           193:         # filter on count
        !           194:         resgroup = res[start:end]
        !           195:    # new total count (because of filter)
        !           196:         rescnt = len(res)
        !           197: 
        !           198:         return (resgroup, rescnt)
        !           199:     
1.1       casties   200: 
                    201:     #
                    202:     # Web page stuff
                    203:     #
                    204: 
                    205:     def index_html(self):
                    206:         """metadata search"""
                    207:         pt=PageTemplateFile(os.path.join(package_home(globals()), "zpt/OSAS_search.zpt")).__of__(self)
                    208:         return pt()
                    209: 
                    210: 
1.3     ! casties   211:     def search(self, searchstring=None, searchtype='startswith', start=1, count=10, restypefilter=None):
1.2       casties   212:         """search and create result"""
                    213:         sres = int(start) -1
                    214:         lres = sres + count
                    215:         try:
                    216:             oldsearch = self.REQUEST.SESSION['searchstring']
                    217:             oldtype = self.REQUEST.SESSION['searchtype']
                    218:         except:
                    219:             oldsearch = ""
                    220:             oldtype = ""
                    221:             
                    222:         if not searchstring:
                    223:             searchstring = oldsearch
                    224:             searchtype = oldtype
                    225:             
                    226:         if not oldsearch or searchstring != oldsearch or searchtype != oldtype:
                    227:             # new search
1.3     ! casties   228:             (res, restypes) = self.dbSearch(searchstring, searchtype)
1.2       casties   229:             # sort the result
1.1       casties   230:             res.sort(ranksort)
1.2       casties   231:             # store it
1.1       casties   232:             self.REQUEST.SESSION['results'] = res
                    233:             self.REQUEST.SESSION['searchstring'] = searchstring
1.2       casties   234:             self.REQUEST.SESSION['searchtype'] = searchtype
1.3     ! casties   235:             self.REQUEST.SESSION['resulttypes'] = restypes
1.1       casties   236: 
1.3     ! casties   237:         (resgroup, nres) = self.filterResults(self.REQUEST.SESSION['results'], sres, lres, restypefilter)
        !           238:         lres = min(lres, nres)
        !           239:         self.REQUEST.SESSION['resultgroup'] = resgroup
        !           240:         self.REQUEST.SESSION['res_indexes'] = (sres+1, lres, nres, int(count))
        !           241:         self.REQUEST.SESSION['res_type_filter'] = restypefilter
        !           242:         if nres > 0:
        !           243:             zpt = "zpt/searchResult.zpt"
        !           244:         else:
        !           245:             zpt = "zpt/searchResult_none.zpt"
1.2       casties   246:             
1.3     ! casties   247:         pt=PageTemplateFile(os.path.join(package_home(globals()), zpt)).__of__(self)
1.1       casties   248:         return pt()
                    249: 
1.2       casties   250: 
                    251:     def getSearchType(self):
                    252:         """returns the last search type"""
                    253:         try:
                    254:             ret = self.REQUEST.SESSION['searchtype']
                    255:         except:
                    256:             ret = ""
                    257: 
                    258:         return ret
                    259:     
                    260:     def getSearchString(self):
                    261:         """returns the last search string"""
                    262:         try:
                    263:             ret = self.REQUEST.SESSION['searchstring']
                    264:         except:
                    265:             ret = ""
                    266: 
                    267:         return ret
                    268:     
                    269: 
                    270:     def hasNextResults(self):
                    271:         """returns if there are more results"""
                    272:         try:
                    273:             (first, last, total, count) = self.REQUEST.SESSION['res_indexes']
1.3     ! casties   274:             return (first + count < total)
1.2       casties   275:         except:
                    276:             return False
                    277: 
                    278:     def hasPrevResults(self):
                    279:         """returns if there are previous results"""
                    280:         try:
                    281:             (first, last, total, count) = self.REQUEST.SESSION['res_indexes']
                    282:             return (first > 1)
                    283:         except:
                    284:             return False
                    285: 
                    286: 
                    287:     def nextResults(self):
                    288:         """returns more results"""
                    289:         try:
                    290:             (first, last, total, count) = self.REQUEST.SESSION['res_indexes']
                    291:             first = first + count
                    292:             last = last + count
                    293:             if first > total:
                    294:                 first = total
                    295:             if last > total:
                    296:                 last = total
                    297:         except:
1.3     ! casties   298:             print "OUCH: no next results!"
        !           299:             return self.search()
1.2       casties   300: 
                    301:         return self.search(start=first, count=count)
                    302: 
                    303:         
                    304:     def prevResults(self):
                    305:         """returns more results"""
                    306:         try:
                    307:             (first, last, total, count) = self.REQUEST.SESSION['res_indexes']
                    308:             first = first - count
                    309:             last = last - count
                    310:             if first < 1:
                    311:                 first = 1
                    312:             if last < 1:
                    313:                 last = 1
                    314:         except:
1.3     ! casties   315:             print "OUCH: no prev results!"
        !           316:             return self.search()           
1.2       casties   317: 
                    318:         return self.search(start=first, count=count)
1.1       casties   319:         
1.2       casties   320: 
                    321: 
def manage_AddOSAS_searchForm(self):
    """Render the form for adding an OSAS_search object (zpt/AddOSAS_search.zpt)."""
    form_path = os.path.join(package_home(globals()), "zpt/AddOSAS_search.zpt")
    form = PageTemplateFile(form_path).__of__(self)
    return form()
                    326: 
                    327: def manage_AddOSAS_search(self,id,title=None,dsn=None,RESPONSE=None):
                    328:     """add the OSAS_root"""
                    329:     newObj=OSAS_search(id,title,dsn)
                    330:     self._setObject(id,newObj)
                    331:     if RESPONSE is not None:
                    332:         RESPONSE.redirect('manage_main')
                    333: 
                    334: 
                    335: 
                    336: 
                    337: class SearchResult(SimpleItem):
                    338:     """base search result object"""
                    339: 
                    340:     def __init__(self, type='unknown', file=None, url=None, content=None, rank=0):
                    341:         """init"""
1.3     ! casties   342:         # result type (e.g. "bib", "archim")
1.1       casties   343:         self.type = type
1.3     ! casties   344:         # index file name
1.1       casties   345:         self.file = file
1.3     ! casties   346:         # url for result (list of pairs)
        !           347:         if url:
        !           348:             self.urls = url
        !           349:         else:
        !           350:             self.urls = []
        !           351:         # actual content (list of tuples)
1.1       casties   352:         self.content = content
1.3     ! casties   353:         # document status (e.g. "online", "archive")
        !           354:         self.status = None
        !           355:         # result rank for presentation
1.1       casties   356:         self.rank = rank
                    357: 
                    358: class AnyResult(SearchResult):
                    359:     """catch-all type result object"""
                    360: 
                    361:     def __init__(self, zope, db_result, rank):
                    362:         """returns a catch-all type result"""
1.3     ! casties   363:         SearchResult.__init__(self)
1.2       casties   364:         #print "NEW ANY RESULT!"
1.3     ! casties   365:         self.type='unknown'
1.1       casties   366:         self.zptFile = os.path.join(package_home(globals()), "zpt/searchResult_any.zpt")
                    367:         
1.3     ! casties   368:         (db_fileid, db_tagidx, db_tags, db_content) = db_result
        !           369:         self.hitTag = db_tags
1.1       casties   370: 
                    371:         # get full info from db
1.3     ! casties   372:         self.fileinfo = zope.getDBFile(db_fileid)
1.1       casties   373:         assert self.fileinfo
                    374: 
                    375:         items = {}
1.3     ! casties   376:         items[db_tags] = db_content
1.1       casties   377:         self.content = items
                    378:         self.file = self.fileinfo[0]
1.3     ! casties   379:         self.status = statusForFile(self.file)
1.1       casties   380:         self.rank = rank
                    381: 
1.3     ! casties   382:     def getContentList(self):
        !           383:         """returns content as list of tuples in preferred order"""
        !           384:         l = []
        !           385:         for k in self.content.keys():
        !           386:             l.append((k, self.content[k]))
        !           387: 
        !           388:         return l
        !           389: 
1.1       casties   390:     def render(self, zope):
                    391:         """render this result object"""
                    392:         zope.REQUEST.SESSION['result'] = self
                    393:         pt=PageTemplateFile(self.zptFile).__of__(zope)
                    394:         return pt()
                    395: 
                    396: 
                    397: class MetaResult(AnyResult):
                    398:     """result object that collects metadata"""
                    399: 
                    400:     def __init__(self, zope, db_result, rank):
                    401:         """contructor"""
                    402:         AnyResult.__init__(self, zope, db_result, rank)
1.2       casties   403:         #print "NEW META RESULT!"
1.1       casties   404: 
                    405:         (fileid, tagidx, tags, content) = db_result
                    406: 
                    407:         # get full info from db
                    408:         self.metainfo = zope.getDBFileMeta(fileid)
                    409:         assert self.metainfo
                    410:         
1.3     ! casties   411:     def checkContext(self, tags, content, ctxurl):
        !           412:         """takes meta entry and updates url from context tags"""
1.1       casties   413:         if tags.endswith('/context/link'):
                    414:             if content:
1.3     ! casties   415:                 #print "CTXlink: ", content
        !           416:                 ctxurl[0] = content
1.1       casties   417:             
                    418:         elif tags.endswith('/context/name'):
                    419:             if content:
1.3     ! casties   420:                 #print "CTXname: ", content
        !           421:                 ctxurl[1] = content
1.1       casties   422: 
1.3     ! casties   423:         return ctxurl
1.1       casties   424: 
                    425: 
                    426: class BibResult(MetaResult):
                    427:     """bib type result object"""
                    428: 
                    429:     def __init__(self, zope, db_result, rank):
                    430:         """constructor"""
                    431:         MetaResult.__init__(self, zope, db_result, rank)
1.3     ! casties   432:         #print "NEW BIB RESULT!", self
1.1       casties   433:         self.type = "bib"
                    434:         self.zptFile = os.path.join(package_home(globals()), "zpt/searchResult_bib.zpt")
1.3     ! casties   435:         url = storageURL(self.file)
        !           436:         if url:
        !           437:             self.urls.append(url)
1.1       casties   438:         (fileid, tagidx, tags, content) = db_result
                    439: 
                    440:         btype = ""
                    441:         bitems = {}
1.3     ! casties   442:         ctxurl = ['', '']
1.1       casties   443: 
                    444:         for me in self.metainfo:
                    445:             (m_idx, m_tags, m_content, m_attributes) = me
                    446:             # context tag
1.3     ! casties   447:             ctxurl = self.checkContext(m_tags, m_content, ctxurl)
1.1       casties   448:             # first tag with bib type attribute
                    449:             if m_tags.endswith('/meta/bib'):
                    450:                 r = re.search('type="([^"]*)"', m_attributes)
                    451:                 if r:
                    452:                     btype = r.group(1)
                    453: 
                    454:                 if not btype:
                    455:                     btype = "*unknown*"
                    456: 
                    457:                 bitems['type'] = btype
                    458:                 continue
                    459: 
                    460:             # skip other tags
                    461:             if not btype: continue
                    462: 
                    463:             # collect bib/something
                    464:             r = re.search('/meta/bib/(.*)', m_tags)
                    465:             if r:
                    466:                 k = r.group(1)
                    467:                 #print "CONTENT: ", m_content
                    468:                 bitems[k] = m_content
1.3     ! casties   469:                 # remember hit tag
        !           470:                 if m_tags == self.hitTag:
        !           471:                     self.hitTag = k
1.1       casties   472:                 continue
                    473: 
                    474:         self.content = bitems
1.3     ! casties   475:         # store context
        !           476:         if not ctxurl[1]:
        !           477:             ctxurl[1] = "View"
        !           478:         # must have link
        !           479:         if ctxurl[0]:
        !           480:             self.urls.append(ctxurl)
        !           481:                 
1.1       casties   482:         self.rank += 100
1.3     ! casties   483: 
        !           484:     def getContentList(self):
        !           485:         """returns content as list of tuples in preferred order"""
        !           486:         l = []
        !           487:         c = self.content.copy()
        !           488:         # preferred items first
        !           489:         for k in ('author', 'title', 'journal', 'year'):
        !           490:             if c.has_key(k):
        !           491:                 l.append((k, c[k]))
        !           492:                 del c[k]
        !           493: 
        !           494:         # no type
        !           495:         del c['type']
        !           496:         # copy the rest
        !           497:         for k in c.keys():
        !           498:             l.append((k, c[k]))
        !           499: 
        !           500:         return l
1.1       casties   501: 
                    502: 
                    503: class ArchimResult(MetaResult):
                    504:     """archimedes type result object"""
                    505: 
                    506:     def __init__(self, zope, db_result, rank):
                    507:         """constructor"""
                    508:         MetaResult.__init__(self, zope, db_result, rank)
1.3     ! casties   509:         #print "NEW ARCHIM RESULT!", self
1.1       casties   510:         self.type = "archim"
                    511:         self.zptFile = os.path.join(package_home(globals()), "zpt/searchResult_archim.zpt")
1.3     ! casties   512:         url = storageURL(self.file)
        !           513:         if url:
        !           514:             self.urls.append(url)
        !           515:             
1.1       casties   516:         (fileid, tagidx, tags, content) = db_result
                    517: 
                    518:         # process info
                    519:         bitems = {}
1.3     ! casties   520:         ctxurl = ['', '']
1.1       casties   521:         for me in self.metainfo:
                    522:             (m_idx, m_tags, m_content, m_attributes) = me
                    523:             # context tag
1.3     ! casties   524:             ctxurl = self.checkContext(m_tags, m_content, ctxurl)
1.1       casties   525:             # collect archimedes/something
                    526:             r = re.search('/meta/archimedes/(.*)', m_tags)
                    527:             if r:
                    528:                 k = r.group(1)
                    529:                 #print "CONTENT: ", m_content
                    530:                 bitems[k] = m_content
1.3     ! casties   531:                 # remember hit tag
        !           532:                 if m_tags == self.hitTag:
        !           533:                     self.hitTag = k
1.1       casties   534:                 continue
                    535: 
                    536:         self.content = bitems
                    537:         self.rank += 100
1.3     ! casties   538:         # store context
        !           539:         if not ctxurl[1]:
        !           540:             ctxurl[1] = "View"
        !           541:         # must have link
        !           542:         if ctxurl[0]:
        !           543:             self.urls.append(ctxurl)
        !           544: 
        !           545: 
        !           546:     def getContentList(self):
        !           547:         """returns content as list of tuples in preferred order"""
        !           548:         l = []
        !           549:         c = self.content.copy()
        !           550:         # preferred items first
        !           551:         for k in ('author', 'title', 'date', 'place'):
        !           552:             if c.has_key(k):
        !           553:                 l.append((k, c[k]))
        !           554:                 del c[k]
        !           555: 
        !           556:         # copy the rest
        !           557:         for k in c.keys():
        !           558:             l.append((k, c[k]))
        !           559: 
        !           560:         return l
1.1       casties   561:    
                    562: 
                    563: 
                    564: 
                    565: def ranksort(res1, res2):
                    566:     """sort results on rank"""
                    567:     return cmp(res2.rank, res1.rank)
                    568: 
                    569: 
def statusForFile(filename):
    """Heuristic: derive the document status from an index file name.

    Returns "online", "archive" or "database" depending on the prefix of
    filename, or None if no known prefix matches.
    """
    known_prefixes = (
        ('/mpiwg/online/', "online"),
        ('/mpiwg/archive/', "archive"),
        ('http://', "database"),
    )
    for (prefix, status) in known_prefixes:
        if filename.startswith(prefix):
            return status

    return None
        !           581: 
        !           582: def storageURL(filename):
1.1       casties   583:     """heuristic... returns an URL for a index file name"""
                    584:     url = None
1.3     ! casties   585:     name = None
1.1       casties   586:     if filename.startswith('/mpiwg/online/'):
1.3     ! casties   587:         #print "URLFORFILE: online ", filename
        !           588:         r = re.search('^(.*)/index.meta', filename)
1.1       casties   589:         if r:
1.3     ! casties   590:             url = "http://content.mpiwg-berlin.mpg.de/mpistorage/storage/ShowOnline/index_html?path=%s"%r.group(1)
        !           591:             name = "Storage System"
        !           592: 
        !           593:     if name and url:
        !           594:         return (url, name)
        !           595:     
        !           596:     return None
1.1       casties   597:
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>