--- cdli/cdliSplitter.py 2008/01/02 15:52:01 1.7.2.10 +++ cdli/cdliSplitter.py 2008/09/25 12:37:55 1.9 @@ -29,16 +29,18 @@ komma_exceptionex=re.compile(komma_excep # grapheme boundaries #graphemeBounds="\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?" graphemeBounds="\{|\}|<|>|-|_|\#|,|\]|\[|\!|\?|\"" -graphemeIgnore="<|>|\#|\||\]|\[|\!|\?\*" +graphemeIgnore="<|>|\#|\||\]|\[|\!|\?\*|;" # for words #wordBounds="<|>|\(|\)|_|\#|,|\||\]|\[|\!|\?" wordBounds="_|,|\"" -wordIgnore="<|>|\#|\||\]|\[|\!|\?\*" +wordIgnore="<|>|\#|\||\]|\[|\!|\?\*|;" class cdliSplitter: + """base class for splitter. the difference between word and grapheme splitter is the word boundary list.""" + default_encoding = "utf-8" bounds=graphemeBounds @@ -73,7 +75,7 @@ class cdliSplitter: elif not (s[0] in ignoreLines): # regular line - lineparts=s.split(".") + lineparts=s.split(". ",1) if len(lineparts)==1: # no line number txt=s