cdli/cdliSplitter.py - diff

Return to cdliSplitter.py CVS log

Up to [Repository] / cdli

Diff for /cdli/cdliSplitter.py between versions 1.7.2.4 and 1.7.2.9

version 1.7.2.4, 2007/10/24 20:36:06	version 1.7.2.9, 2007/12/13 19:20:45
Line 25 ignoreLines=['$','@','#','&','>']	Line 25 ignoreLines=['$','@','#','&','>']
separators=['']	separators=['']
# commas relevant for graphemes will not be deleted	# commas relevant for graphemes will not be deleted
komma_exception="([^sStThH]),"	komma_exception="([^sStThH]),"
	komma_exceptionex=re.compile(komma_exception)
# grapheme boundaries	# grapheme boundaries
graphemeBounds="\{\|\}\|<\|>\|$\|$\|-\|_\|\#\|,\|\\|\|\]\|\[\|\!\|\?"	#graphemeBounds="\{\|\}\|<\|>\|$\|$\|-\|_\|\#\|,\|\\|\|\]\|\[\|\!\|\?"
	graphemeBounds="\{\|\}\|<\|>\|-\|_\|\#\|,\|\]\|\[\|\!\|\?\|\""
	graphemeIgnore="<\|>\|\#\|\\|\|\]\|\[\|\!\|\?"
# for words	# for words
wordBounds="<\|>\|$\|$\|_\|\#\|,\|\\|\|\]\|\[\|\!\|\?"	#wordBounds="<\|>\|$\|$\|_\|\#\|,\|\\|\|\]\|\[\|\!\|\?"
	wordBounds="_\|,\|\""
	wordIgnore="<\|>\|\#\|\\|\|\]\|\[\|\!\|\?"

class cdliSplitter:	class cdliSplitter:
"""base class for splitter.	"""base class for splitter.
Line 38 class cdliSplitter:	Line 42 class cdliSplitter:

default_encoding = "utf-8"	default_encoding = "utf-8"
bounds=graphemeBounds	bounds=graphemeBounds
	boundsex=re.compile(graphemeBounds)
	ignore=graphemeIgnore
	ignorex=re.compile(graphemeIgnore)
indexName="cdliSplitter"	indexName="cdliSplitter"


Line 76 class cdliSplitter:	Line 83 class cdliSplitter:
lineNum=lineparts[0]	lineNum=lineparts[0]

# delete commas except those relevant for graphemes	# delete commas except those relevant for graphemes
txt = re.sub(komma_exception,r"\1",txt)	txt = komma_exceptionex.sub(r"\1",txt)
# replace word boundaries by spaces	# replace word boundaries by spaces
txt = re.sub(self.bounds,' ',txt)	txt = self.boundsex.sub(' ',txt)
	# replace letters to be ignored
	txt = self.ignorex.sub('',txt)
# split words	# split words
words = txt.split(" ")	words = txt.split(" ")
for w in words:	for w in words:
Line 86 class cdliSplitter:	Line 95 class cdliSplitter:
if not (w==''):	if not (w==''):
result.append(w)	result.append(w)

logging.debug("split '%s' into %s"%(lst,repr(result)))	#logging.debug("split '%s' into %s"%(lst,repr(result)))
return result	return result


class graphemeSplitter(cdliSplitter):	class graphemeSplitter(cdliSplitter):
bounds=graphemeBounds	bounds=graphemeBounds
	boundsex=re.compile(graphemeBounds)
	ignore=graphemeIgnore
	ignorex=re.compile(graphemeIgnore)
indexName="graphemeSplitter"	indexName="graphemeSplitter"

class wordSplitter(cdliSplitter):	class wordSplitter(cdliSplitter):
bounds=wordBounds	bounds=wordBounds
	boundsex=re.compile(wordBounds)
	ignore=wordIgnore
	ignorex=re.compile(wordIgnore)
indexName="wordSplitter"	indexName="wordSplitter"

try:	try:

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>

Removed from v.1.7.2.4
changed lines
	Added in v.1.7.2.9