version 1.7.2.10, 2008/01/02 15:52:01
|
version 1.9, 2008/09/25 12:37:55
|
Line 29 komma_exceptionex=re.compile(komma_excep
|
Line 29 komma_exceptionex=re.compile(komma_excep
|
# grapheme boundaries |
# grapheme boundaries |
#graphemeBounds="\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?" |
#graphemeBounds="\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?" |
graphemeBounds="\{|\}|<|>|-|_|\#|,|\]|\[|\!|\?|\"" |
graphemeBounds="\{|\}|<|>|-|_|\#|,|\]|\[|\!|\?|\"" |
graphemeIgnore="<|>|\#|\||\]|\[|\!|\?\*" |
graphemeIgnore="<|>|\#|\||\]|\[|\!|\?\*|;" |
# for words |
# for words |
#wordBounds="<|>|\(|\)|_|\#|,|\||\]|\[|\!|\?" |
#wordBounds="<|>|\(|\)|_|\#|,|\||\]|\[|\!|\?" |
wordBounds="_|,|\"" |
wordBounds="_|,|\"" |
wordIgnore="<|>|\#|\||\]|\[|\!|\?\*" |
wordIgnore="<|>|\#|\||\]|\[|\!|\?\*|;" |
|
|
class cdliSplitter: |
class cdliSplitter: |
|
|
"""base class for splitter. |
"""base class for splitter. |
the difference between word and grapheme splitter |
the difference between word and grapheme splitter |
is the word boundary list.""" |
is the word boundary list.""" |
|
|
|
|
default_encoding = "utf-8" |
default_encoding = "utf-8" |
bounds=graphemeBounds |
bounds=graphemeBounds |
boundsex=re.compile(graphemeBounds) |
boundsex=re.compile(graphemeBounds) |
Line 73 class cdliSplitter:
|
Line 75 class cdliSplitter:
|
|
|
elif not (s[0] in ignoreLines): |
elif not (s[0] in ignoreLines): |
# regular line |
# regular line |
lineparts=s.split(".") |
lineparts=s.split(". ",1) |
if len(lineparts)==1: |
if len(lineparts)==1: |
# no line number |
# no line number |
txt=s |
txt=s |