version 1.7.2.7, 2007/12/11 17:27:36
|
version 1.8, 2008/01/21 17:19:01
|
Line 25 ignoreLines=['$','@','#','&','>']
|
Line 25 ignoreLines=['$','@','#','&','>']
|
separators=[''] |
separators=[''] |
# kommas relevant for graphemes will not be deleted |
# kommas relevant for graphemes will not be deleted |
komma_exception="([^sStThH])," |
komma_exception="([^sStThH])," |
|
komma_exceptionex=re.compile(komma_exception) |
# grapheme boundaries |
# grapheme boundaries |
#graphemeBounds="\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?" |
#graphemeBounds="\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?" |
graphemeBounds="\{|\}|<|>|-|_|\#|,|\]|\[|\!|\?|\"" |
graphemeBounds="\{|\}|<|>|-|_|\#|,|\]|\[|\!|\?|\"" |
graphemeIgnore="" |
graphemeIgnore="<|>|\#|\||\]|\[|\!|\?\*|;" |
# for words |
# for words |
#wordBounds="<|>|\(|\)|_|\#|,|\||\]|\[|\!|\?" |
#wordBounds="<|>|\(|\)|_|\#|,|\||\]|\[|\!|\?" |
wordBounds="_|,|\"" |
wordBounds="_|,|\"" |
wordIgnore="<|>|\#|\||\]|\[|\!|\?" |
wordIgnore="<|>|\#|\||\]|\[|\!|\?\*|;" |
|
|
class cdliSplitter: |
class cdliSplitter: |
"""base class for splitter. |
"""base class for splitter. |
Line 41 class cdliSplitter:
|
Line 42 class cdliSplitter:
|
|
|
default_encoding = "utf-8" |
default_encoding = "utf-8" |
bounds=graphemeBounds |
bounds=graphemeBounds |
|
boundsex=re.compile(graphemeBounds) |
ignore=graphemeIgnore |
ignore=graphemeIgnore |
|
ignorex=re.compile(graphemeIgnore) |
indexName="cdliSplitter" |
indexName="cdliSplitter" |
|
|
|
|
Line 70 class cdliSplitter:
|
Line 73 class cdliSplitter:
|
|
|
elif not (s[0] in ignoreLines): |
elif not (s[0] in ignoreLines): |
# regular line |
# regular line |
lineparts=s.split(".") |
lineparts=s.split(". ",1) |
if len(lineparts)==1: |
if len(lineparts)==1: |
# no line number |
# no line number |
txt=s |
txt=s |
Line 80 class cdliSplitter:
|
Line 83 class cdliSplitter:
|
lineNum=lineparts[0] |
lineNum=lineparts[0] |
|
|
# delete kommata except kommata relevant for graphemes |
# delete kommata except kommata relevant for graphemes |
txt = re.sub(komma_exception,r"\1",txt) |
txt = komma_exceptionex.sub(r"\1",txt) |
# replace letters to be ignored |
|
txt = re.sub(self.ignore,'',txt) |
|
# replace word boundaries by spaces |
# replace word boundaries by spaces |
txt = re.sub(self.bounds,' ',txt) |
txt = self.boundsex.sub(' ',txt) |
|
# replace letters to be ignored |
|
txt = self.ignorex.sub('',txt) |
# split words |
# split words |
words = txt.split(" ") |
words = txt.split(" ") |
for w in words: |
for w in words: |
Line 92 class cdliSplitter:
|
Line 95 class cdliSplitter:
|
if not (w==''): |
if not (w==''): |
result.append(w) |
result.append(w) |
|
|
logging.debug("split '%s' into %s"%(lst,repr(result))) |
#logging.debug("split '%s' into %s"%(lst,repr(result))) |
return result |
return result |
|
|
|
|
class graphemeSplitter(cdliSplitter): |
class graphemeSplitter(cdliSplitter): |
bounds=graphemeBounds |
bounds=graphemeBounds |
|
boundsex=re.compile(graphemeBounds) |
ignore=graphemeIgnore |
ignore=graphemeIgnore |
|
ignorex=re.compile(graphemeIgnore) |
indexName="graphemeSplitter" |
indexName="graphemeSplitter" |
|
|
class wordSplitter(cdliSplitter): |
class wordSplitter(cdliSplitter): |
bounds=wordBounds |
bounds=wordBounds |
|
boundsex=re.compile(wordBounds) |
ignore=wordIgnore |
ignore=wordIgnore |
|
ignorex=re.compile(wordIgnore) |
indexName="wordSplitter" |
indexName="wordSplitter" |
|
|
try: |
try: |