--- cdli/cdliSplitter.py 2007/12/03 21:30:19 1.7.2.6 +++ cdli/cdliSplitter.py 2007/12/11 17:33:07 1.7.2.8 @@ -28,10 +28,11 @@ komma_exception="([^sStThH])," # grapheme boundaries #graphemeBounds="\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?" graphemeBounds="\{|\}|<|>|-|_|\#|,|\]|\[|\!|\?|\"" +graphemeIgnore="" # for words #wordBounds="<|>|\(|\)|_|\#|,|\||\]|\[|\!|\?" -wordBounds="<|>|_|\#|,|\]|\[|\!|\?|\"" - +wordBounds="_|,|\"" +wordIgnore="<|>|\#|\||\]|\[|\!|\?" class cdliSplitter: """base class for splitter. @@ -40,6 +41,7 @@ class cdliSplitter: default_encoding = "utf-8" bounds=graphemeBounds + ignore=graphemeIgnore indexName="cdliSplitter" @@ -79,6 +81,8 @@ class cdliSplitter: # delete kommata except kommata relevant for graphemes txt = re.sub(komma_exception,r"\1",txt) + # replace letters to be ignored + txt = re.sub(self.ignore,'',txt) # replace word boundaries by spaces txt = re.sub(self.bounds,' ',txt) # split words @@ -88,16 +92,18 @@ class cdliSplitter: if not (w==''): result.append(w) - logging.debug("split '%s' into %s"%(lst,repr(result))) + #logging.debug("split '%s' into %s"%(lst,repr(result))) return result class graphemeSplitter(cdliSplitter): bounds=graphemeBounds + ignore=graphemeIgnore indexName="graphemeSplitter" class wordSplitter(cdliSplitter): bounds=wordBounds + ignore=wordIgnore indexName="wordSplitter" try: