cdli/cdliSplitter.py - diff

Return to cdliSplitter.py CVS log

Up to [Repository] / cdli

Diff for /cdli/cdliSplitter.py between versions 1.7.2.2 and 1.7.2.11

version 1.7.2.2, 2007/10/19 16:25:07	version 1.7.2.11, 2008/01/09 18:49:07
Line 25 ignoreLines=['$','@','#','&','>']	Line 25 ignoreLines=['$','@','#','&','>']
separators=['']	separators=['']
# commas relevant for graphemes will not be deleted	# commas relevant for graphemes will not be deleted
komma_exception="([^sStThH]),"	komma_exception="([^sStThH]),"
	komma_exceptionex=re.compile(komma_exception)
# grapheme boundaries	# grapheme boundaries
graphemeBounds="\{\|\}\|<\|>\|$\|$\|-\|_\|\#\|,\|\\|\|\]\|\[\|\!\|\?"	#graphemeBounds="\{\|\}\|<\|>\|$\|$\|-\|_\|\#\|,\|\\|\|\]\|\[\|\!\|\?"
	graphemeBounds="\{\|\}\|<\|>\|-\|_\|\#\|,\|\]\|\[\|\!\|\?\|\""
	graphemeIgnore="<\|>\|\#\|\\|\|\]\|\[\|\!\|\?\*\|;"
# for words	# for words
wordBounds="<\|>\|$\|$\|_\|\#\|,\|\\|\|\]\|\[\|\!\|\?"	#wordBounds="<\|>\|$\|$\|_\|\#\|,\|\\|\|\]\|\[\|\!\|\?"
	wordBounds="_\|,\|\""
	wordIgnore="<\|>\|\#\|\\|\|\]\|\[\|\!\|\?\*\|;"

class cdliSplitter:	class cdliSplitter:
"""base class for splitter.	"""base class for splitter.
Line 38 class cdliSplitter:	Line 42 class cdliSplitter:

default_encoding = "utf-8"	default_encoding = "utf-8"
bounds=graphemeBounds	bounds=graphemeBounds
	boundsex=re.compile(graphemeBounds)
	ignore=graphemeIgnore
	ignorex=re.compile(graphemeIgnore)
indexName="cdliSplitter"	indexName="cdliSplitter"


def process(self, lst):	def process(self, lst):
"""gets a list of strings and returns a list of words"""	"""gets a list of strings and returns a list of words"""

logging.debug("cdliSplitter")	logging.debug("cdliSplitter: %s"%self.indexName)
result = []	result = []
pNum=None	pNum=None
lineNum=None	lineNum=None
Line 62 class cdliSplitter:	Line 69 class cdliSplitter:
if s[0]=='&':	if s[0]=='&':
# store pNum	# store pNum
pNum=s[1:8]	pNum=s[1:8]
logging.debug("%s processing: %s"%(indexName,pNum))	logging.debug("%s processing: %s"%(self.indexName,pNum))

elif not (s[0] in ignoreLines):	elif not (s[0] in ignoreLines):
# regular line	# regular line
Line 76 class cdliSplitter:	Line 83 class cdliSplitter:
lineNum=lineparts[0]	lineNum=lineparts[0]

# delete commas except those relevant for graphemes	# delete commas except those relevant for graphemes
txt = re.sub(komma_exception,r"\1",txt)	txt = komma_exceptionex.sub(r"\1",txt)
# replace word boundaries by spaces	# replace word boundaries by spaces
txt = re.sub(self.bounds,' ',txt)	txt = self.boundsex.sub(' ',txt)
	# remove characters to be ignored
	txt = self.ignorex.sub('',txt)
# split words	# split words
words = txt.split(" ")	words = txt.split(" ")
for w in words:	for w in words:
Line 86 class cdliSplitter:	Line 95 class cdliSplitter:
if not (w==''):	if not (w==''):
result.append(w)	result.append(w)

	#logging.debug("split '%s' into %s"%(lst,repr(result)))
return result	return result


class graphemeSplitter(cdliSplitter):	class graphemeSplitter(cdliSplitter):
bounds=graphemeBounds	bounds=graphemeBounds
	boundsex=re.compile(graphemeBounds)
	ignore=graphemeIgnore
	ignorex=re.compile(graphemeIgnore)
indexName="graphemeSplitter"	indexName="graphemeSplitter"

class wordSplitter(cdliSplitter):	class wordSplitter(cdliSplitter):
bounds=wordBounds	bounds=wordBounds
	boundsex=re.compile(wordBounds)
	ignore=wordIgnore
	ignorex=re.compile(wordIgnore)
indexName="wordSplitter"	indexName="wordSplitter"

try:	try:

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>

Removed from v.1.7.2.2
changed lines
	Added in v.1.7.2.11