cdli/cdli_files.py - diff

Return to cdli_files.py CVS log

Up to [Repository] / cdli

Diff for /cdli/cdli_files.py between versions 1.80.2.7 and 1.80.2.11

version 1.80.2.7, 2007/11/19 15:14:44	version 1.80.2.11, 2007/12/13 19:20:45
Line 2137 class CDLIRoot(Folder):	Line 2137 class CDLIRoot(Folder):
resultset = idx.search(query_request=idxQuery,sort_index='textid')	resultset = idx.search(query_request=idxQuery,sort_index='textid')
# put only the P-Number in the result	# put only the P-Number in the result
results = [res.getId[:7] for res in resultset]	results = [res.getId[:7] for res in resultset]
	logging.debug("searchtext: found %d texts"%len(results))
return results	return results


Line 2167 class CDLIRoot(Folder):	Line 2168 class CDLIRoot(Folder):

def showWordInFile(self,fileId,word,indexName='graphemes',regExp=False,):	def showWordInFile(self,fileId,word,indexName='graphemes',regExp=False,):
"""get lines with word from FileId"""	"""get lines with word from FileId"""
	logging.debug("showwordinfile word='%s' index=%s file=%s"%(word,indexName,fileId))

file = formatAtfFullLineNum(self.getFile(fileId))	file = formatAtfFullLineNum(self.getFile(fileId))
ret=[]	ret=[]
Line 2174 class CDLIRoot(Folder):	Line 2176 class CDLIRoot(Folder):
# add whitespace before and whitespace and line-end to splitter bounds expressions	# add whitespace before and whitespace and line-end to splitter bounds expressions
bounds = self.splitter[indexName].bounds	bounds = self.splitter[indexName].bounds
splitexp = "(%s\|\s)(%%s)(%s\|\s\|\Z)"%(bounds,bounds)	splitexp = "(%s\|\s)(%%s)(%s\|\s\|\Z)"%(bounds,bounds)
# compile into regexp objects	# clean word expression
wordlist = [re.compile(splitexp%w) for w in word.split(' ')]	# TODO: this should use QueryParser itself
	# take out double quotes
	word = word.replace('"','')
	# take out ignorable signs
	ignorable = self.splitter[indexName].ignorex
	word = ignorable.sub('', word)
	# compile into regexp objects and escape parens
	wordlist = [re.compile(splitexp%re.escape(w)) for w in word.split(' ')]

for line in file.split("\n"):	for line in file.split("\n"):
for word in wordlist:	for word in wordlist:
#logging.debug("showwordinfile: searching for %s in %s"%(word.pattern,line))	#logging.debug("showwordinfile: searching for %s in %s"%(word.pattern,ignorable.sub('',line)))
if word.search(line):	if word.search(ignorable.sub('',line)):
line = formatAtfLineHtml(line)	line = formatAtfLineHtml(line)
ret.append(line)	ret.append(line)
break	break
Line 2188 class CDLIRoot(Folder):	Line 2197 class CDLIRoot(Folder):
return ret	return ret


	def showWordInFiles(self,fileIds,word,indexName='graphemes',regExp=False):
	"""
	get lines with word from all ids in list FileIds.
	returns dict with id:lines pairs.
	"""
	logging.debug("showwordinfiles word='%s' index=%s file=%s"%(word,indexName,fileIds))

	return dict([(id,self.showWordInFile(id, word, indexName, regExp)) for id in fileIds])


def tagWordInFile(self,fileId,word,indexName='graphemes',regExp=False):	def tagWordInFile(self,fileId,word,indexName='graphemes',regExp=False):
"""get text with word highlighted from FileId"""	"""get text with word highlighted from FileId"""
	logging.debug("tagwordinfile word='%s' index=%s file=%s"%(word,indexName,fileId))

file=self.getFile(fileId)	file=self.getFile(fileId)
tagStart=u'<span class="found">'	tagStart=u'<span class="found">'
Line 2200 class CDLIRoot(Folder):	Line 2220 class CDLIRoot(Folder):
# add whitespace to splitter bounds expressions and compile into regexp object	# add whitespace to splitter bounds expressions and compile into regexp object
bounds = self.splitter[indexName].bounds	bounds = self.splitter[indexName].bounds
wordsplit = re.compile("(%s\|\s)"%bounds)	wordsplit = re.compile("(%s\|\s)"%bounds)
	# clean word expression
	# TODO: this should use QueryParser itself
	word = word.replace('"','') # take out double quotes
	# take out ignorable signs
	ignorable = self.splitter[indexName].ignorex
	word = ignorable.sub('', word)
# split search terms by blanks	# split search terms by blanks
words = word.split(' ')	words = word.split(' ')
	# split search terms again (for grapheme search with words)
	splitwords = dict(((w,self.splitter[indexName].process([w])) for w in words))

for line in file.split("\n"):	for line in file.split("\n"):
line = unicodify(line)	line = unicodify(line)
Line 2212 class CDLIRoot(Folder):	Line 2240 class CDLIRoot(Folder):
# first scan	# first scan
hitwords = []	hitwords = []
for w in words:	for w in words:
if line.find(w) > -1:	if ignorable.sub('',line).find(w) > -1:
# word is in line	# word is in line
hitwords.append(w)	# append split word for grapheme search with words
	hitwords.extend(splitwords[w])
	#hitwords.extend(wordsplit.split(w))

# examine hits closer	# examine hits closer
if hitwords:	if hitwords:
Line 2222 class CDLIRoot(Folder):	Line 2252 class CDLIRoot(Folder):
parts = wordsplit.split(line)	parts = wordsplit.split(line)
line = ""	line = ""
for p in parts:	for p in parts:
	#logging.debug("tagwordinfile: searching for %s in %s"%(p,hitwords))
# reassemble line	# reassemble line
if p in hitwords:	if ignorable.sub('', p) in hitwords:
	#logging.debug("tagwordinfile: found %s in %s"%(p,hitwords))
# this part was found	# this part was found
line += tagStart + formatAtfHtml(p) + tagEnd	line += tagStart + formatAtfHtml(p) + tagEnd
else:	else:
Line 2238 class CDLIRoot(Folder):	Line 2270 class CDLIRoot(Folder):
return u'<br>\n'.join(ret)	return u'<br>\n'.join(ret)



	def tagWordInFiles(self,fileIds,word,indexName='graphemes',regExp=False):
	"""
	get texts with highlighted word from all ids in list FileIds.
	returns dict with id:text pairs.
	"""
	logging.debug("tagwordinfiles word='%s' index=%s file=%s"%(word,indexName,fileIds))
	return dict([(id,self.tagWordInFile(id, word, indexName, regExp)) for id in fileIds])


def URLquote(self,str):	def URLquote(self,str):
"""quote url"""	"""quote url"""
return urllib.quote(str)	return urllib.quote(str)

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>

Removed from v.1.80.2.7
changed lines
	Added in v.1.80.2.11