--- cdli/cdli_files.py 2007/11/19 15:14:44 1.80.2.7 +++ cdli/cdli_files.py 2007/12/13 19:20:45 1.80.2.11 @@ -2137,6 +2137,7 @@ class CDLIRoot(Folder): resultset = idx.search(query_request=idxQuery,sort_index='textid') # put only the P-Number in the result results = [res.getId[:7] for res in resultset] + logging.debug("searchtext: found %d texts"%len(results)) return results @@ -2167,6 +2168,7 @@ class CDLIRoot(Folder): def showWordInFile(self,fileId,word,indexName='graphemes',regExp=False,): """get lines with word from FileId""" + logging.debug("showwordinfile word='%s' index=%s file=%s"%(word,indexName,fileId)) file = formatAtfFullLineNum(self.getFile(fileId)) ret=[] @@ -2174,22 +2176,40 @@ class CDLIRoot(Folder): # add whitespace before and whitespace and line-end to splitter bounds expressions bounds = self.splitter[indexName].bounds splitexp = "(%s|\s)(%%s)(%s|\s|\Z)"%(bounds,bounds) - # compile into regexp objects - wordlist = [re.compile(splitexp%w) for w in word.split(' ')] + # clean word expression + # TODO: this should use QueryParser itself + # take out double quotes + word = word.replace('"','') + # take out ignorable signs + ignorable = self.splitter[indexName].ignorex + word = ignorable.sub('', word) + # compile into regexp objects and escape parens + wordlist = [re.compile(splitexp%re.escape(w)) for w in word.split(' ')] for line in file.split("\n"): for word in wordlist: - #logging.debug("showwordinfile: searching for %s in %s"%(word.pattern,line)) - if word.search(line): + #logging.debug("showwordinfile: searching for %s in %s"%(word.pattern,ignoreable.sub('',line))) + if word.search(ignorable.sub('',line)): line = formatAtfLineHtml(line) ret.append(line) break return ret + + + def showWordInFiles(self,fileIds,word,indexName='graphemes',regExp=False): + """ + get lines with word from all ids in list FileIds. + returns dict with id:lines pairs. + """ + logging.debug("showwordinfiles word='%s' index=%s file=%s"%(word,indexName,fileIds)) + + return dict([(id,self.showWordInFile(id, word, indexName, regExp)) for id in fileIds]) def tagWordInFile(self,fileId,word,indexName='graphemes',regExp=False): """get text with word highlighted from FileId""" + logging.debug("tagwordinfile word='%s' index=%s file=%s"%(word,indexName,fileId)) file=self.getFile(fileId) tagStart=u'' @@ -2200,8 +2220,16 @@ class CDLIRoot(Folder): # add whitespace to splitter bounds expressions and compile into regexp object bounds = self.splitter[indexName].bounds wordsplit = re.compile("(%s|\s)"%bounds) + # clean word expression + # TODO: this should use QueryParser itself + word = word.replace('"','') # take out double quotes + # take out ignoreable signs + ignorable = self.splitter[indexName].ignorex + word = ignorable.sub('', word) # split search terms by blanks words = word.split(' ') + # split search terms again (for grapheme search with words) + splitwords = dict(((w,self.splitter[indexName].process([w])) for w in words)) for line in file.split("\n"): line = unicodify(line) @@ -2212,9 +2240,11 @@ class CDLIRoot(Folder): # first scan hitwords = [] for w in words: - if line.find(w) > -1: + if ignorable.sub('',line).find(w) > -1: # word is in line - hitwords.append(w) + # append split word for grapheme search with words + hitwords.extend(splitwords[w]) + #hitwords.extend(wordsplit.split(w)) # examine hits closer if hitwords: @@ -2222,8 +2252,10 @@ class CDLIRoot(Folder): parts = wordsplit.split(line) line = "" for p in parts: + #logging.debug("tagwordinfile: searching for %s in %s"%(p,hitwords)) # reassemble line - if p in hitwords: + if ignorable.sub('', p) in hitwords: + #logging.debug("tagwordinfile: found %s in %s"%(p,hitwords)) # this part was found line += tagStart + formatAtfHtml(p) + tagEnd else: @@ -2236,6 +2268,16 @@ class CDLIRoot(Folder): ret.append(line) return u'
\n'.join(ret) + + + + def tagWordInFiles(self,fileIds,word,indexName='graphemes',regExp=False): + """ + get texts with highlighted word from all ids in list FileIds. + returns dict with id:text pairs. + """ + logging.debug("tagwordinfiles word='%s' index=%s file=%s"%(word,indexName,fileIds)) + return dict([(id,self.tagWordInFile(id, word, indexName, regExp)) for id in fileIds]) def URLquote(self,str):