--- cdli/cdli_files.py 2007/11/27 10:12:58 1.80.2.8 +++ cdli/cdli_files.py 2007/12/03 21:30:19 1.80.2.10 @@ -2137,6 +2137,7 @@ class CDLIRoot(Folder): resultset = idx.search(query_request=idxQuery,sort_index='textid') # put only the P-Number in the result results = [res.getId[:7] for res in resultset] + logging.debug("searchtext: found %d texts"%len(results)) return results @@ -2175,8 +2176,12 @@ class CDLIRoot(Folder): # add whitespace before and whitespace and line-end to splitter bounds expressions bounds = self.splitter[indexName].bounds splitexp = "(%s|\s)(%%s)(%s|\s|\Z)"%(bounds,bounds) + # clean word expression + # TODO: this should use QueryParser itself + word = word.replace('"','') # take out double quotes + # escape parens for regexp too # compile into regexp objects - wordlist = [re.compile(splitexp%w) for w in word.split(' ')] + wordlist = [re.compile(splitexp%re.escape(w)) for w in word.split(' ')] for line in file.split("\n"): for word in wordlist: @@ -2190,9 +2195,13 @@ class CDLIRoot(Folder): def showWordInFiles(self,fileIds,word,indexName='graphemes',regExp=False): - """get lines with word from all ids in list FileIds""" - logging.debug("showwordinfiles word='%s' index=%s file=%s"%(word,indexName,fileIds)) - return [self.showWordInFile(id, word, indexName, regExp) for id in fileIds] + """ + get lines with word from all ids in list FileIds. + returns dict with id:lines pairs. + """ + logging.debug("showwordinfiles word='%s' index=%s file=%s"%(word,indexName,fileIds)) + + return dict([(id,self.showWordInFile(id, word, indexName, regExp)) for id in fileIds]) def tagWordInFile(self,fileId,word,indexName='graphemes',regExp=False): @@ -2208,6 +2217,9 @@ class CDLIRoot(Folder): # add whitespace to splitter bounds expressions and compile into regexp object bounds = self.splitter[indexName].bounds wordsplit = re.compile("(%s|\s)"%bounds) + # clean word expression + # TODO: this should use QueryParser itself + word = word.replace('"','') # take out double quotes # split search terms by blanks words = word.split(' ') @@ -2248,9 +2260,12 @@ class CDLIRoot(Folder): def tagWordInFiles(self,fileIds,word,indexName='graphemes',regExp=False): - """get texts with word from all ids in list FileIds""" + """ + get texts with highlighted word from all ids in list FileIds. + returns dict with id:text pairs. + """ logging.debug("tagwordinfiles word='%s' index=%s file=%s"%(word,indexName,fileIds)) - return [self.tagWordInFile(id, word, indexName, regExp) for id in fileIds] + return dict([(id,self.tagWordInFile(id, word, indexName, regExp)) for id in fileIds]) def URLquote(self,str):