Diff for /cdli/cdli_files.py between versions 1.80.2.7 and 1.80.2.11

version 1.80.2.7, 2007/11/19 15:14:44 | version 1.80.2.11, 2007/12/13 19:20:45
Line 2137  class CDLIRoot(Folder): Line 2137  class CDLIRoot(Folder):
         resultset = idx.search(query_request=idxQuery,sort_index='textid')          resultset = idx.search(query_request=idxQuery,sort_index='textid')
         # put only the P-Number in the result           # put only the P-Number in the result 
         results = [res.getId[:7] for res in resultset]          results = [res.getId[:7] for res in resultset]
           logging.debug("searchtext: found %d texts"%len(results))
         return results          return results
   
   
Line 2167  class CDLIRoot(Folder): Line 2168  class CDLIRoot(Folder):
   
     def showWordInFile(self,fileId,word,indexName='graphemes',regExp=False,):      def showWordInFile(self,fileId,word,indexName='graphemes',regExp=False,):
         """get lines with word from FileId"""          """get lines with word from FileId"""
           logging.debug("showwordinfile word='%s' index=%s file=%s"%(word,indexName,fileId)) 
                   
         file = formatAtfFullLineNum(self.getFile(fileId))          file = formatAtfFullLineNum(self.getFile(fileId))
         ret=[]          ret=[]
Line 2174  class CDLIRoot(Folder): Line 2176  class CDLIRoot(Folder):
         # add whitespace before and whitespace and line-end to splitter bounds expressions          # add whitespace before and whitespace and line-end to splitter bounds expressions
         bounds = self.splitter[indexName].bounds          bounds = self.splitter[indexName].bounds
         splitexp = "(%s|\s)(%%s)(%s|\s|\Z)"%(bounds,bounds)          splitexp = "(%s|\s)(%%s)(%s|\s|\Z)"%(bounds,bounds)
         # compile into regexp objects          # clean word expression 
         wordlist = [re.compile(splitexp%w) for w in word.split(' ')]          # TODO: this should use QueryParser itself
           # take out double quotes
           word = word.replace('"','')
           # take out ignorable signs
           ignorable = self.splitter[indexName].ignorex
           word = ignorable.sub('', word)
           # compile into regexp objects and escape parens
           wordlist = [re.compile(splitexp%re.escape(w)) for w in word.split(' ')]
                           
         for line in file.split("\n"):          for line in file.split("\n"):
             for word in wordlist:              for word in wordlist:
                 #logging.debug("showwordinfile: searching for %s in %s"%(word.pattern,line))                  #logging.debug("showwordinfile: searching for %s in %s"%(word.pattern,ignoreable.sub('',line)))
                 if word.search(line):                  if word.search(ignorable.sub('',line)):
                     line = formatAtfLineHtml(line)                      line = formatAtfLineHtml(line)
                     ret.append(line)                      ret.append(line)
                     break                      break
Line 2188  class CDLIRoot(Folder): Line 2197  class CDLIRoot(Folder):
         return ret          return ret
           
   
       def showWordInFiles(self,fileIds,word,indexName='graphemes',regExp=False):
           """
           get lines with word from all ids in list FileIds.
           returns dict with id:lines pairs.
           """
           logging.debug("showwordinfiles word='%s' index=%s file=%s"%(word,indexName,fileIds))
           
           return dict([(id,self.showWordInFile(id, word, indexName, regExp)) for id in fileIds])
       
   
     def tagWordInFile(self,fileId,word,indexName='graphemes',regExp=False):      def tagWordInFile(self,fileId,word,indexName='graphemes',regExp=False):
         """get text with word highlighted from FileId"""          """get text with word highlighted from FileId"""
           logging.debug("tagwordinfile word='%s' index=%s file=%s"%(word,indexName,fileId)) 
                   
         file=self.getFile(fileId)          file=self.getFile(fileId)
         tagStart=u'<span class="found">'          tagStart=u'<span class="found">'
Line 2200  class CDLIRoot(Folder): Line 2220  class CDLIRoot(Folder):
         # add whitespace to splitter bounds expressions and compile into regexp object          # add whitespace to splitter bounds expressions and compile into regexp object
         bounds = self.splitter[indexName].bounds          bounds = self.splitter[indexName].bounds
         wordsplit = re.compile("(%s|\s)"%bounds)          wordsplit = re.compile("(%s|\s)"%bounds)
           # clean word expression 
           # TODO: this should use QueryParser itself
           word = word.replace('"','') # take out double quotes
           # take out ignorable signs
           ignorable = self.splitter[indexName].ignorex
           word = ignorable.sub('', word)
         # split search terms by blanks          # split search terms by blanks
         words = word.split(' ')          words = word.split(' ')
           # split search terms again (for grapheme search with words)
           splitwords = dict(((w,self.splitter[indexName].process([w])) for w in words))
                           
         for line in file.split("\n"):          for line in file.split("\n"):
             line = unicodify(line)              line = unicodify(line)
Line 2212  class CDLIRoot(Folder): Line 2240  class CDLIRoot(Folder):
             # first scan              # first scan
             hitwords = []              hitwords = []
             for w in words:              for w in words:
                 if line.find(w) > -1:                  if ignorable.sub('',line).find(w) > -1:
                     # word is in line                      # word is in line
                     hitwords.append(w)                      # append split word for grapheme search with words
                       hitwords.extend(splitwords[w])
                       #hitwords.extend(wordsplit.split(w))
                                         
             # examine hits closer              # examine hits closer
             if hitwords:              if hitwords:
Line 2222  class CDLIRoot(Folder): Line 2252  class CDLIRoot(Folder):
                 parts = wordsplit.split(line)                  parts = wordsplit.split(line)
                 line = ""                  line = ""
                 for p in parts:                  for p in parts:
                       #logging.debug("tagwordinfile: searching for %s in %s"%(p,hitwords))
                     # reassemble line                      # reassemble line
                     if p in hitwords:                      if ignorable.sub('', p) in hitwords:
                           #logging.debug("tagwordinfile: found %s in %s"%(p,hitwords))
                         # this part was found                          # this part was found
                         line += tagStart + formatAtfHtml(p) + tagEnd                          line += tagStart + formatAtfHtml(p) + tagEnd
                     else:                      else:
Line 2238  class CDLIRoot(Folder): Line 2270  class CDLIRoot(Folder):
         return u'<br>\n'.join(ret)          return u'<br>\n'.join(ret)
           
   
   
       def tagWordInFiles(self,fileIds,word,indexName='graphemes',regExp=False):
           """
           get texts with highlighted word from all ids in list FileIds.
           returns dict with id:text pairs.
           """
           logging.debug("tagwordinfiles word='%s' index=%s file=%s"%(word,indexName,fileIds)) 
           return dict([(id,self.tagWordInFile(id, word, indexName, regExp)) for id in fileIds])
       
   
     def URLquote(self,str):      def URLquote(self,str):
         """quote url"""          """quote url"""
         return urllib.quote(str)          return urllib.quote(str)

Removed from v.1.80.2.7  
changed lines
  Added in v.1.80.2.11


FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>