version 1.7.2.12, 2008/01/14 17:35:26
|
version 1.9, 2008/09/25 12:37:55
|
Line 36 wordBounds="_|,|\""
|
Line 36 wordBounds="_|,|\""
|
wordIgnore="<|>|\#|\||\]|\[|\!|\?\*|;" |
wordIgnore="<|>|\#|\||\]|\[|\!|\?\*|;" |
|
|
class cdliSplitter: |
class cdliSplitter: |
|
|
"""base class for splitter. |
"""base class for splitter. |
the difference between word and grapheme splitter |
the difference between word and grapheme splitter |
is the word boundary list.""" |
is the word boundary list.""" |
|
|
|
|
default_encoding = "utf-8" |
default_encoding = "utf-8" |
bounds=graphemeBounds |
bounds=graphemeBounds |
boundsex=re.compile(graphemeBounds) |
boundsex=re.compile(graphemeBounds) |
Line 95 class cdliSplitter:
|
Line 97 class cdliSplitter:
|
if not (w==''): |
if not (w==''): |
result.append(w) |
result.append(w) |
|
|
logging.debug("split '%s' into %s"%(lst,repr(result))) |
#logging.debug("split '%s' into %s"%(lst,repr(result))) |
return result |
return result |
|
|
|
|