version 1.80.2.7, 2007/11/19 15:14:44
|
version 1.80.2.11, 2007/12/13 19:20:45
|
Line 2137 class CDLIRoot(Folder):
|
Line 2137 class CDLIRoot(Folder):
|
resultset = idx.search(query_request=idxQuery,sort_index='textid') |
resultset = idx.search(query_request=idxQuery,sort_index='textid') |
# put only the P-Number in the result |
# put only the P-Number in the result |
results = [res.getId[:7] for res in resultset] |
results = [res.getId[:7] for res in resultset] |
|
logging.debug("searchtext: found %d texts"%len(results)) |
return results |
return results |
|
|
|
|
Line 2167 class CDLIRoot(Folder):
|
Line 2168 class CDLIRoot(Folder):
|
|
|
def showWordInFile(self,fileId,word,indexName='graphemes',regExp=False,): |
def showWordInFile(self,fileId,word,indexName='graphemes',regExp=False,): |
"""get lines with word from FileId""" |
"""get lines with word from FileId""" |
|
logging.debug("showwordinfile word='%s' index=%s file=%s"%(word,indexName,fileId)) |
|
|
file = formatAtfFullLineNum(self.getFile(fileId)) |
file = formatAtfFullLineNum(self.getFile(fileId)) |
ret=[] |
ret=[] |
Line 2174 class CDLIRoot(Folder):
|
Line 2176 class CDLIRoot(Folder):
|
# add whitespace before and whitespace and line-end to splitter bounds expressions |
# add whitespace before and whitespace and line-end to splitter bounds expressions |
bounds = self.splitter[indexName].bounds |
bounds = self.splitter[indexName].bounds |
splitexp = "(%s|\s)(%%s)(%s|\s|\Z)"%(bounds,bounds) |
splitexp = "(%s|\s)(%%s)(%s|\s|\Z)"%(bounds,bounds) |
# compile into regexp objects |
# clean word expression |
wordlist = [re.compile(splitexp%w) for w in word.split(' ')] |
# TODO: this should use QueryParser itself |
|
# take out double quotes |
|
word = word.replace('"','') |
|
# take out ignorable signs |
|
ignorable = self.splitter[indexName].ignorex |
|
word = ignorable.sub('', word) |
|
# compile into regexp objects and escape parens |
|
wordlist = [re.compile(splitexp%re.escape(w)) for w in word.split(' ')] |
|
|
for line in file.split("\n"): |
for line in file.split("\n"): |
for word in wordlist: |
for word in wordlist: |
#logging.debug("showwordinfile: searching for %s in %s"%(word.pattern,line)) |
#logging.debug("showwordinfile: searching for %s in %s"%(word.pattern,ignoreable.sub('',line))) |
if word.search(line): |
if word.search(ignorable.sub('',line)): |
line = formatAtfLineHtml(line) |
line = formatAtfLineHtml(line) |
ret.append(line) |
ret.append(line) |
break |
break |
Line 2188 class CDLIRoot(Folder):
|
Line 2197 class CDLIRoot(Folder):
|
return ret |
return ret |
|
|
|
|
|
def showWordInFiles(self,fileIds,word,indexName='graphemes',regExp=False):
    """
    Get lines with word from all ids in list fileIds.

    Calls self.showWordInFile once for every id in fileIds, passing
    word, indexName and regExp through unchanged.

    returns dict with id:lines pairs (empty dict for an empty fileIds).
    """
    # let logging do the formatting, so nothing is built when DEBUG is off
    logging.debug("showwordinfiles word='%s' index=%s file=%s", word, indexName, fileIds)

    # loop variable is not called 'id' so the builtin id() stays usable
    return dict((fileId, self.showWordInFile(fileId, word, indexName, regExp))
                for fileId in fileIds)
|
|
|
|
def tagWordInFile(self,fileId,word,indexName='graphemes',regExp=False): |
def tagWordInFile(self,fileId,word,indexName='graphemes',regExp=False): |
"""get text with word highlighted from FileId""" |
"""get text with word highlighted from FileId""" |
|
logging.debug("tagwordinfile word='%s' index=%s file=%s"%(word,indexName,fileId)) |
|
|
file=self.getFile(fileId) |
file=self.getFile(fileId) |
tagStart=u'<span class="found">' |
tagStart=u'<span class="found">' |
Line 2200 class CDLIRoot(Folder):
|
Line 2220 class CDLIRoot(Folder):
|
# add whitespace to splitter bounds expressions and compile into regexp object |
# add whitespace to splitter bounds expressions and compile into regexp object |
bounds = self.splitter[indexName].bounds |
bounds = self.splitter[indexName].bounds |
wordsplit = re.compile("(%s|\s)"%bounds) |
wordsplit = re.compile("(%s|\s)"%bounds) |
|
# clean word expression |
|
# TODO: this should use QueryParser itself |
|
word = word.replace('"','') # take out double quotes |
|
# take out ignorable signs |
|
ignorable = self.splitter[indexName].ignorex |
|
word = ignorable.sub('', word) |
# split search terms by blanks |
# split search terms by blanks |
words = word.split(' ') |
words = word.split(' ') |
|
# split search terms again (for grapheme search with words) |
|
splitwords = dict(((w,self.splitter[indexName].process([w])) for w in words)) |
|
|
for line in file.split("\n"): |
for line in file.split("\n"): |
line = unicodify(line) |
line = unicodify(line) |
Line 2212 class CDLIRoot(Folder):
|
Line 2240 class CDLIRoot(Folder):
|
# first scan |
# first scan |
hitwords = [] |
hitwords = [] |
for w in words: |
for w in words: |
if line.find(w) > -1: |
if ignorable.sub('',line).find(w) > -1: |
# word is in line |
# word is in line |
hitwords.append(w) |
# append split word for grapheme search with words |
|
hitwords.extend(splitwords[w]) |
|
#hitwords.extend(wordsplit.split(w)) |
|
|
# examine hits closer |
# examine hits closer |
if hitwords: |
if hitwords: |
Line 2222 class CDLIRoot(Folder):
|
Line 2252 class CDLIRoot(Folder):
|
parts = wordsplit.split(line) |
parts = wordsplit.split(line) |
line = "" |
line = "" |
for p in parts: |
for p in parts: |
|
#logging.debug("tagwordinfile: searching for %s in %s"%(p,hitwords)) |
# reassemble line |
# reassemble line |
if p in hitwords: |
if ignorable.sub('', p) in hitwords: |
|
#logging.debug("tagwordinfile: found %s in %s"%(p,hitwords)) |
# this part was found |
# this part was found |
line += tagStart + formatAtfHtml(p) + tagEnd |
line += tagStart + formatAtfHtml(p) + tagEnd |
else: |
else: |
Line 2238 class CDLIRoot(Folder):
|
Line 2270 class CDLIRoot(Folder):
|
return u'<br>\n'.join(ret) |
return u'<br>\n'.join(ret) |
|
|
|
|
|
|
|
def tagWordInFiles(self,fileIds,word,indexName='graphemes',regExp=False):
    """
    Get texts with highlighted word from all ids in list fileIds.

    Calls self.tagWordInFile once for every id in fileIds, passing
    word, indexName and regExp through unchanged.

    returns dict with id:text pairs (empty dict for an empty fileIds).
    """
    # let logging do the formatting, so nothing is built when DEBUG is off
    logging.debug("tagwordinfiles word='%s' index=%s file=%s", word, indexName, fileIds)

    # loop variable is not called 'id' so the builtin id() stays usable
    return dict((fileId, self.tagWordInFile(fileId, word, indexName, regExp))
                for fileId in fileIds)
|
|
|
|
def URLquote(self,str):
    """
    Quote a string for use in a URL.

    str -- the text to quote (the name shadows the builtin, but it is
           kept because callers may pass it by keyword).

    returns the percent-quoted string as produced by the standard
    library's quote() with its default arguments.
    """
    # quote() lives in urllib on Python 2 and in urllib.parse on Python 3
    try:
        from urllib import quote
    except ImportError:
        from urllib.parse import quote
    return quote(str)