cdli/cdliSplitter.py - diff

Return to cdliSplitter.py CVS log

Up to [Repository] / cdli

Diff for /cdli/cdliSplitter.py between versions 1.7.2.2 and 1.9

version 1.7.2.2, 2007/10/19 16:25:07	version 1.9, 2008/09/25 12:37:55
Line 25 ignoreLines=['$','@','#','&','>']	Line 25 ignoreLines=['$','@','#','&','>']
separators=['']	separators=['']
# commas relevant for graphemes will not be deleted	# commas relevant for graphemes will not be deleted
komma_exception="([^sStThH]),"	komma_exception="([^sStThH]),"
	komma_exceptionex=re.compile(komma_exception)
# grapheme boundaries	# grapheme boundaries
graphemeBounds="\{\|\}\|<\|>\|$\|$\|-\|_\|\#\|,\|\\|\|\]\|\[\|\!\|\?"	#graphemeBounds="\{\|\}\|<\|>\|$\|$\|-\|_\|\#\|,\|\\|\|\]\|\[\|\!\|\?"
	graphemeBounds="\{\|\}\|<\|>\|-\|_\|\#\|,\|\]\|\[\|\!\|\?\|\""
	graphemeIgnore="<\|>\|\#\|\\|\|\]\|\[\|\!\|\?\*\|;"
# for words	# for words
wordBounds="<\|>\|$\|$\|_\|\#\|,\|\\|\|\]\|\[\|\!\|\?"	#wordBounds="<\|>\|$\|$\|_\|\#\|,\|\\|\|\]\|\[\|\!\|\?"
	wordBounds="_\|,\|\""
	wordIgnore="<\|>\|\#\|\\|\|\]\|\[\|\!\|\?\*\|;"

class cdliSplitter:	class cdliSplitter:

"""base class for splitter.	"""base class for splitter.
the difference between word and grapheme splitter	the difference between word and grapheme splitter
is the word boundary list."""	is the word boundary list."""


default_encoding = "utf-8"	default_encoding = "utf-8"
bounds=graphemeBounds	bounds=graphemeBounds
	boundsex=re.compile(graphemeBounds)
	ignore=graphemeIgnore
	ignorex=re.compile(graphemeIgnore)
indexName="cdliSplitter"	indexName="cdliSplitter"


def process(self, lst):	def process(self, lst):
"""gets a list of strings and returns a list of words"""	"""gets a list of strings and returns a list of words"""

logging.debug("cdliSplitter")	logging.debug("cdliSplitter: %s"%self.indexName)
result = []	result = []
pNum=None	pNum=None
lineNum=None	lineNum=None
Line 62 class cdliSplitter:	Line 71 class cdliSplitter:
if s[0]=='&':	if s[0]=='&':
# store pNum	# store pNum
pNum=s[1:8]	pNum=s[1:8]
logging.debug("%s processing: %s"%(indexName,pNum))	logging.debug("%s processing: %s"%(self.indexName,pNum))

elif not (s[0] in ignoreLines):	elif not (s[0] in ignoreLines):
# regular line	# regular line
lineparts=s.split(".")	lineparts=s.split(". ",1)
if len(lineparts)==1:	if len(lineparts)==1:
# no line number	# no line number
txt=s	txt=s
Line 76 class cdliSplitter:	Line 85 class cdliSplitter:
lineNum=lineparts[0]	lineNum=lineparts[0]

# delete commas except commas relevant for graphemes	# delete commas except commas relevant for graphemes
txt = re.sub(komma_exception,r"\1",txt)	txt = komma_exceptionex.sub(r"\1",txt)
# replace word boundaries by spaces	# replace word boundaries by spaces
txt = re.sub(self.bounds,' ',txt)	txt = self.boundsex.sub(' ',txt)
	# replace letters to be ignored
	txt = self.ignorex.sub('',txt)
# split words	# split words
words = txt.split(" ")	words = txt.split(" ")
for w in words:	for w in words:
Line 86 class cdliSplitter:	Line 97 class cdliSplitter:
if not (w==''):	if not (w==''):
result.append(w)	result.append(w)

	#logging.debug("split '%s' into %s"%(lst,repr(result)))
return result	return result


class graphemeSplitter(cdliSplitter):	class graphemeSplitter(cdliSplitter):
bounds=graphemeBounds	bounds=graphemeBounds
	boundsex=re.compile(graphemeBounds)
	ignore=graphemeIgnore
	ignorex=re.compile(graphemeIgnore)
indexName="graphemeSplitter"	indexName="graphemeSplitter"

class wordSplitter(cdliSplitter):	class wordSplitter(cdliSplitter):
bounds=wordBounds	bounds=wordBounds
	boundsex=re.compile(wordBounds)
	ignore=wordIgnore
	ignorex=re.compile(wordIgnore)
indexName="wordSplitter"	indexName="wordSplitter"

try:	try:

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>

Removed from v.1.7.2.2
changed lines
	Added in v.1.9