cdli/cdliSplitter.py - diff

Return to cdliSplitter.py CVS log

Up to [Repository] / cdli

Diff for /cdli/cdliSplitter.py between versions 1.7.2.7 and 1.8

version 1.7.2.7, 2007/12/11 17:27:36	version 1.8, 2008/01/21 17:19:01
Line 25 ignoreLines=['$','@','#','&','>']	Line 25 ignoreLines=['$','@','#','&','>']
separators=['']	separators=['']
# kommas relevant for graphemes will not be deleted	# kommas relevant for graphemes will not be deleted
komma_exception="([^sStThH]),"	komma_exception="([^sStThH]),"
	komma_exceptionex=re.compile(komma_exception)
# grapheme boundaries	# grapheme boundaries
#graphemeBounds="\{\|\}\|<\|>\|$\|$\|-\|_\|\#\|,\|\\|\|\]\|\[\|\!\|\?"	#graphemeBounds="\{\|\}\|<\|>\|$\|$\|-\|_\|\#\|,\|\\|\|\]\|\[\|\!\|\?"
graphemeBounds="\{\|\}\|<\|>\|-\|_\|\#\|,\|\]\|\[\|\!\|\?\|\""	graphemeBounds="\{\|\}\|<\|>\|-\|_\|\#\|,\|\]\|\[\|\!\|\?\|\""
graphemeIgnore=""	graphemeIgnore="<\|>\|\#\|\\|\|\]\|\[\|\!\|\?\*\|;"
# for words	# for words
#wordBounds="<\|>\|$\|$\|_\|\#\|,\|\\|\|\]\|\[\|\!\|\?"	#wordBounds="<\|>\|$\|$\|_\|\#\|,\|\\|\|\]\|\[\|\!\|\?"
wordBounds="_\|,\|\""	wordBounds="_\|,\|\""
wordIgnore="<\|>\|\#\|\\|\|\]\|\[\|\!\|\?"	wordIgnore="<\|>\|\#\|\\|\|\]\|\[\|\!\|\?\*\|;"

class cdliSplitter:	class cdliSplitter:
"""base class for splitter.	"""base class for splitter.
Line 41 class cdliSplitter:	Line 42 class cdliSplitter:

default_encoding = "utf-8"	default_encoding = "utf-8"
bounds=graphemeBounds	bounds=graphemeBounds
	boundsex=re.compile(graphemeBounds)
ignore=graphemeIgnore	ignore=graphemeIgnore
	ignorex=re.compile(graphemeIgnore)
indexName="cdliSplitter"	indexName="cdliSplitter"


Line 70 class cdliSplitter:	Line 73 class cdliSplitter:

elif not (s[0] in ignoreLines):	elif not (s[0] in ignoreLines):
# regular line	# regular line
lineparts=s.split(".")	lineparts=s.split(". ",1)
if len(lineparts)==1:	if len(lineparts)==1:
# no line number	# no line number
txt=s	txt=s
Line 80 class cdliSplitter:	Line 83 class cdliSplitter:
lineNum=lineparts[0]	lineNum=lineparts[0]

# delete kommata except kommata relevant for graphemes	# delete kommata except kommata relevant for graphemes
txt = re.sub(komma_exception,r"\1",txt)	txt = komma_exceptionex.sub(r"\1",txt)
# replace letters to be ignored
txt = re.sub(self.ignore,'',txt)
# replace word boundaries by spaces	# replace word boundaries by spaces
txt = re.sub(self.bounds,' ',txt)	txt = self.boundsex.sub(' ',txt)
	# replace letters to be ignored
	txt = self.ignorex.sub('',txt)
# split words	# split words
words = txt.split(" ")	words = txt.split(" ")
for w in words:	for w in words:
Line 92 class cdliSplitter:	Line 95 class cdliSplitter:
if not (w==''):	if not (w==''):
result.append(w)	result.append(w)

logging.debug("split '%s' into %s"%(lst,repr(result)))	#logging.debug("split '%s' into %s"%(lst,repr(result)))
return result	return result


class graphemeSplitter(cdliSplitter):	class graphemeSplitter(cdliSplitter):
bounds=graphemeBounds	bounds=graphemeBounds
	boundsex=re.compile(graphemeBounds)
ignore=graphemeIgnore	ignore=graphemeIgnore
	ignorex=re.compile(graphemeIgnore)
indexName="graphemeSplitter"	indexName="graphemeSplitter"

class wordSplitter(cdliSplitter):	class wordSplitter(cdliSplitter):
bounds=wordBounds	bounds=wordBounds
	boundsex=re.compile(wordBounds)
ignore=wordIgnore	ignore=wordIgnore
	ignorex=re.compile(wordIgnore)
indexName="wordSplitter"	indexName="wordSplitter"

try:	try:

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>

Removed from v.1.7.2.7
Changed lines
	Added in v.1.8