version 1.7.2.2, 2007/10/19 16:25:07
|
version 1.7.2.8, 2007/12/11 17:33:07
|
Line 26 separators=['']
|
Line 26 separators=['']
|
# kommas relevant for graphemes will not be deleted |
# kommas relevant for graphemes will not be deleted |
komma_exception="([^sStThH])," |
komma_exception="([^sStThH])," |
# grapheme boundaries |
# grapheme boundaries |
graphemeBounds="\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?" |
#graphemeBounds="\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?" |
|
graphemeBounds="\{|\}|<|>|-|_|\#|,|\]|\[|\!|\?|\"" |
|
graphemeIgnore="" |
# for words |
# for words |
wordBounds="<|>|\(|\)|_|\#|,|\||\]|\[|\!|\?" |
#wordBounds="<|>|\(|\)|_|\#|,|\||\]|\[|\!|\?" |
|
wordBounds="_|,|\"" |
|
wordIgnore="<|>|\#|\||\]|\[|\!|\?" |
|
|
class cdliSplitter: |
class cdliSplitter: |
"""base class for splitter. |
"""base class for splitter. |
Line 38 class cdliSplitter:
|
Line 41 class cdliSplitter:
|
|
|
default_encoding = "utf-8" |
default_encoding = "utf-8" |
bounds=graphemeBounds |
bounds=graphemeBounds |
|
ignore=graphemeIgnore |
indexName="cdliSplitter" |
indexName="cdliSplitter" |
|
|
|
|
def process(self, lst): |
def process(self, lst): |
"""gets a list of strings and returns a list of words""" |
"""gets a list of strings and returns a list of words""" |
|
|
logging.debug("cdliSplitter") |
logging.debug("cdliSplitter: %s"%self.indexName) |
result = [] |
result = [] |
pNum=None |
pNum=None |
lineNum=None |
lineNum=None |
Line 62 class cdliSplitter:
|
Line 66 class cdliSplitter:
|
if s[0]=='&': |
if s[0]=='&': |
# store pNum |
# store pNum |
pNum=s[1:8] |
pNum=s[1:8] |
logging.debug("%s processing: %s"%(indexName,pNum)) |
logging.debug("%s processing: %s"%(self.indexName,pNum)) |
|
|
elif not (s[0] in ignoreLines): |
elif not (s[0] in ignoreLines): |
# regular line |
# regular line |
Line 77 class cdliSplitter:
|
Line 81 class cdliSplitter:
|
|
|
# delete kommata except kommata relevant for graphemes |
# delete kommata except kommata relevant for graphemes |
txt = re.sub(komma_exception,r"\1",txt) |
txt = re.sub(komma_exception,r"\1",txt) |
|
# replace letters to be ignored |
|
txt = re.sub(self.ignore,'',txt) |
# replace word boundaries by spaces |
# replace word boundaries by spaces |
txt = re.sub(self.bounds,' ',txt) |
txt = re.sub(self.bounds,' ',txt) |
# split words |
# split words |
Line 86 class cdliSplitter:
|
Line 92 class cdliSplitter:
|
if not (w==''): |
if not (w==''): |
result.append(w) |
result.append(w) |
|
|
|
#logging.debug("split '%s' into %s"%(lst,repr(result))) |
return result |
return result |
|
|
|
|
class graphemeSplitter(cdliSplitter): |
class graphemeSplitter(cdliSplitter): |
bounds=graphemeBounds |
bounds=graphemeBounds |
|
ignore=graphemeIgnore |
indexName="graphemeSplitter" |
indexName="graphemeSplitter" |
|
|
class wordSplitter(cdliSplitter): |
class wordSplitter(cdliSplitter): |
bounds=wordBounds |
bounds=wordBounds |
|
ignore=wordIgnore |
indexName="wordSplitter" |
indexName="wordSplitter" |
|
|
try: |
try: |