cdli/cdliSplitter.py - diff

Return to cdliSplitter.py CVS log

Up to [Repository] / cdli

Diff for /cdli/cdliSplitter.py between versions 1.7.2.6 and 1.9

version 1.7.2.6, 2007/12/03 21:30:19	version 1.9, 2008/09/25 12:37:55
Line 25 ignoreLines=['$','@','#','&','>']	Line 25 ignoreLines=['$','@','#','&','>']
separators=['']	separators=['']
# kommas relevant for graphemes will not be deleted	# kommas relevant for graphemes will not be deleted
komma_exception="([^sStThH]),"	komma_exception="([^sStThH]),"
	komma_exceptionex=re.compile(komma_exception)
# grapheme boundaries	# grapheme boundaries
#graphemeBounds="\{\|\}\|<\|>\|$\|$\|-\|_\|\#\|,\|\\|\|\]\|\[\|\!\|\?"	#graphemeBounds="\{\|\}\|<\|>\|$\|$\|-\|_\|\#\|,\|\\|\|\]\|\[\|\!\|\?"
graphemeBounds="\{\|\}\|<\|>\|-\|_\|\#\|,\|\]\|\[\|\!\|\?\|\""	graphemeBounds="\{\|\}\|<\|>\|-\|_\|\#\|,\|\]\|\[\|\!\|\?\|\""
	graphemeIgnore="<\|>\|\#\|\\|\|\]\|\[\|\!\|\?\*\|;"
# for words	# for words
#wordBounds="<\|>\|$\|$\|_\|\#\|,\|\\|\|\]\|\[\|\!\|\?"	#wordBounds="<\|>\|$\|$\|_\|\#\|,\|\\|\|\]\|\[\|\!\|\?"
wordBounds="<\|>\|_\|\#\|,\|\]\|\[\|\!\|\?\|\""	wordBounds="_\|,\|\""
	wordIgnore="<\|>\|\#\|\\|\|\]\|\[\|\!\|\?\*\|;"

class cdliSplitter:	class cdliSplitter:

"""base class for splitter.	"""base class for splitter.
the difference between word and grapheme splitter	the difference between word and grapheme splitter
is the word boundary list."""	is the word boundary list."""


default_encoding = "utf-8"	default_encoding = "utf-8"
bounds=graphemeBounds	bounds=graphemeBounds
	boundsex=re.compile(graphemeBounds)
	ignore=graphemeIgnore
	ignorex=re.compile(graphemeIgnore)
indexName="cdliSplitter"	indexName="cdliSplitter"


Line 68 class cdliSplitter:	Line 75 class cdliSplitter:

elif not (s[0] in ignoreLines):	elif not (s[0] in ignoreLines):
# regular line	# regular line
lineparts=s.split(".")	lineparts=s.split(". ",1)
if len(lineparts)==1:	if len(lineparts)==1:
# no line number	# no line number
txt=s	txt=s
Line 78 class cdliSplitter:	Line 85 class cdliSplitter:
lineNum=lineparts[0]	lineNum=lineparts[0]

# delete kommata except kommata relevant for graphemes	# delete kommata except kommata relevant for graphemes
txt = re.sub(komma_exception,r"\1",txt)	txt = komma_exceptionex.sub(r"\1",txt)
# replace word boundaries by spaces	# replace word boundaries by spaces
txt = re.sub(self.bounds,' ',txt)	txt = self.boundsex.sub(' ',txt)
	# replace letters to be ignored
	txt = self.ignorex.sub('',txt)
# split words	# split words
words = txt.split(" ")	words = txt.split(" ")
for w in words:	for w in words:
Line 88 class cdliSplitter:	Line 97 class cdliSplitter:
if not (w==''):	if not (w==''):
result.append(w)	result.append(w)

logging.debug("split '%s' into %s"%(lst,repr(result)))	#logging.debug("split '%s' into %s"%(lst,repr(result)))
return result	return result


class graphemeSplitter(cdliSplitter):	class graphemeSplitter(cdliSplitter):
bounds=graphemeBounds	bounds=graphemeBounds
	boundsex=re.compile(graphemeBounds)
	ignore=graphemeIgnore
	ignorex=re.compile(graphemeIgnore)
indexName="graphemeSplitter"	indexName="graphemeSplitter"

class wordSplitter(cdliSplitter):	class wordSplitter(cdliSplitter):
bounds=wordBounds	bounds=wordBounds
	boundsex=re.compile(wordBounds)
	ignore=wordIgnore
	ignorex=re.compile(wordIgnore)
indexName="wordSplitter"	indexName="wordSplitter"

try:	try:

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>

Removed from v.1.7.2.6
changed lines
	Added in v.1.9