cdli/cdliSplitter.py - diff

Return to cdliSplitter.py CVS log

Up to [Repository] / cdli

Diff for /cdli/cdliSplitter.py between versions 1.1 and 1.2

version 1.1, 2006/11/14 17:02:59	version 1.2, 2006/12/22 11:56:08
Line 2	Line 2
Author splitter	Author splitter
"""	"""

	import Zope

from Products.ZCTextIndex.ISplitter import ISplitter	from Products.ZCTextIndex.ISplitter import ISplitter
from Products.ZCTextIndex.PipelineFactory import element_factory	from Products.ZCTextIndex.PipelineFactory import element_factory

Line 24 def getSupportedEncoding(encodings):	Line 26 def getSupportedEncoding(encodings):
"""	"""
ignoreLines=['$','@','#','&']	ignoreLines=['$','@','#','&']
separators=['']	separators=['']
delete="{\|}\|<\|>\|$\|$\|-\|_\|\#\|,\|\~\|\\|\|\]\|\["	komma_exception="([^sStThH])," # commas relevant for graphemes will not be deleted
	delete="\{\|\}\|<\|>\|$\|$\|-\|_\|\#\|,\|\\|\|\]\|\[\|\!\|\?" # for graphemes
	#delete="<\|>\|$\|$\|_\|\#\|,\|\\|\|\]\|\[\|!\|?" for words

class graphemeSplitter:	class graphemeSplitter:

Line 32 class graphemeSplitter:	Line 36 class graphemeSplitter:

def process(self, lst):	def process(self, lst):
result = []	result = []
	pNum=None
	lineNum=None


	#print "LLLL",lst


for t in lst:	for t in lst:

Line 43 class graphemeSplitter:	Line 53 class graphemeSplitter:

#ignore lines	#ignore lines

if (s!="") and (not (s[0] in ignoreLines)):	if (s!="") and (s[0]=="&"): # store pNum
	pNum=s[1:8]

	elif (s!="") and (not (s[0] in ignoreLines)):


#ignore everything before "."	#ignore everything before "."
splitted=s.split(".")	splitted=s.split(".")
Line 52 class graphemeSplitter:	Line 66 class graphemeSplitter:
txt=splitted[0]	txt=splitted[0]
else:	else:
txt=splitted[1]	txt=splitted[1]
	lineNum=splitted[0] #store line number

analyse=txt	analyse=txt

	analyse=re.sub(komma_exception,r"\1",analyse) # delete commas except commas relevant in graphemes

analyse=re.sub(delete,' ',analyse) # deletions	analyse=re.sub(delete,' ',analyse) # deletions

splitted = analyse.split(" ")	splitted = analyse.split(" ")

for w in splitted:	for w in splitted:
w=w.lstrip().rstrip()	w=w.lstrip().rstrip()

if not (w==''):	if not (w==''):
print repr(w)	if pNum: #only when pNum is found (first call of the splitter; it is always called twice in the pipeline)
	Zope.app().cdliRoot.storeInLineIndex(w.lstrip().strip(),(pNum,lineNum))
	get_transaction().commit()

result.append(w.lstrip().rstrip())	result.append(w.lstrip().rstrip())
return result	return result

element_factory.registerFactory('Word Splitter',
'CDLI grapheme splitter', graphemeSplitter)

try:	try:
element_factory.registerFactory('graphemeSplitter',	element_factory.registerFactory('Word Splitter',
'CDLI grapheme splitter', graphemeSplitter)	'CDLI grapheme splitter', graphemeSplitter)
except:	except:
# in case the splitter is already registered, ValueError is raised	# in case the splitter is already registered, ValueError is raised

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>

Removed from v.1.1
changed lines
	Added in v.1.2