cdli/cdliSplitter.py - diff

Return to cdliSplitter.py CVS log

Up to [Repository] / cdli

Diff for /cdli/cdliSplitter.py between versions 1.7.2.8 and 1.7.2.9

version 1.7.2.8, 2007/12/11 17:33:07	version 1.7.2.9, 2007/12/13 19:20:45
Line 25 ignoreLines=['$','@','#','&','>']	Line 25 ignoreLines=['$','@','#','&','>']
separators=['']	separators=['']
# kommas relevant for graphemes will not be deleted	# kommas relevant for graphemes will not be deleted
komma_exception="([^sStThH]),"	komma_exception="([^sStThH]),"
	komma_exceptionex=re.compile(komma_exception)
# grapheme boundaries	# grapheme boundaries
#graphemeBounds="\{\|\}\|<\|>\|$\|$\|-\|_\|\#\|,\|\\|\|\]\|\[\|\!\|\?"	#graphemeBounds="\{\|\}\|<\|>\|$\|$\|-\|_\|\#\|,\|\\|\|\]\|\[\|\!\|\?"
graphemeBounds="\{\|\}\|<\|>\|-\|_\|\#\|,\|\]\|\[\|\!\|\?\|\""	graphemeBounds="\{\|\}\|<\|>\|-\|_\|\#\|,\|\]\|\[\|\!\|\?\|\""
graphemeIgnore=""	graphemeIgnore="<\|>\|\#\|\\|\|\]\|\[\|\!\|\?"
# for words	# for words
#wordBounds="<\|>\|$\|$\|_\|\#\|,\|\\|\|\]\|\[\|\!\|\?"	#wordBounds="<\|>\|$\|$\|_\|\#\|,\|\\|\|\]\|\[\|\!\|\?"
wordBounds="_\|,\|\""	wordBounds="_\|,\|\""
Line 41 class cdliSplitter:	Line 42 class cdliSplitter:

default_encoding = "utf-8"	default_encoding = "utf-8"
bounds=graphemeBounds	bounds=graphemeBounds
	boundsex=re.compile(graphemeBounds)
ignore=graphemeIgnore	ignore=graphemeIgnore
	ignorex=re.compile(graphemeIgnore)
indexName="cdliSplitter"	indexName="cdliSplitter"


Line 80 class cdliSplitter:	Line 83 class cdliSplitter:
lineNum=lineparts[0]	lineNum=lineparts[0]

# delete kommata except kommata relevant for graphemes	# delete kommata except kommata relevant for graphemes
txt = re.sub(komma_exception,r"\1",txt)	txt = komma_exceptionex.sub(r"\1",txt)
# replace letters to be ignored
txt = re.sub(self.ignore,'',txt)
# replace word boundaries by spaces	# replace word boundaries by spaces
txt = re.sub(self.bounds,' ',txt)	txt = self.boundsex.sub(' ',txt)
	# replace letters to be ignored
	txt = self.ignorex.sub('',txt)
# split words	# split words
words = txt.split(" ")	words = txt.split(" ")
for w in words:	for w in words:
Line 98 class cdliSplitter:	Line 101 class cdliSplitter:

class graphemeSplitter(cdliSplitter):	class graphemeSplitter(cdliSplitter):
bounds=graphemeBounds	bounds=graphemeBounds
	boundsex=re.compile(graphemeBounds)
ignore=graphemeIgnore	ignore=graphemeIgnore
	ignorex=re.compile(graphemeIgnore)
indexName="graphemeSplitter"	indexName="graphemeSplitter"

class wordSplitter(cdliSplitter):	class wordSplitter(cdliSplitter):
bounds=wordBounds	bounds=wordBounds
	boundsex=re.compile(wordBounds)
ignore=wordIgnore	ignore=wordIgnore
	ignorex=re.compile(wordIgnore)
indexName="wordSplitter"	indexName="wordSplitter"

try:	try:

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>

Removed from v.1.7.2.8
changed lines
	Added in v.1.7.2.9