--- cdli/cdliSplitter.py 2007/12/11 17:27:36 1.7.2.7 +++ cdli/cdliSplitter.py 2008/01/09 18:49:07 1.7.2.11 @@ -24,15 +24,16 @@ def getSupportedEncoding(encodings): ignoreLines=['$','@','#','&','>'] separators=[''] # kommas relevant for graphemes will not be deleted -komma_exception="([^sStThH])," +komma_exception="([^sStThH])," +komma_exceptionex=re.compile(komma_exception) # grapheme boundaries #graphemeBounds="\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?" graphemeBounds="\{|\}|<|>|-|_|\#|,|\]|\[|\!|\?|\"" -graphemeIgnore="" +graphemeIgnore="<|>|\#|\||\]|\[|\!|\?|\*|;" # for words #wordBounds="<|>|\(|\)|_|\#|,|\||\]|\[|\!|\?" wordBounds="_|,|\"" -wordIgnore="<|>|\#|\||\]|\[|\!|\?" +wordIgnore="<|>|\#|\||\]|\[|\!|\?|\*|;" class cdliSplitter: """base class for splitter. @@ -41,7 +42,9 @@ class cdliSplitter: default_encoding = "utf-8" bounds=graphemeBounds + boundsex=re.compile(graphemeBounds) ignore=graphemeIgnore + ignorex=re.compile(graphemeIgnore) indexName="cdliSplitter" @@ -80,11 +83,11 @@ class cdliSplitter: lineNum=lineparts[0] # delete kommata except kommata relevant for graphemes - txt = re.sub(komma_exception,r"\1",txt) - # replace letters to be ignored - txt = re.sub(self.ignore,'',txt) + txt = komma_exceptionex.sub(r"\1",txt) # replace word boundaries by spaces - txt = re.sub(self.bounds,' ',txt) + txt = self.boundsex.sub(' ',txt) + # replace letters to be ignored + txt = self.ignorex.sub('',txt) # split words words = txt.split(" ") for w in words: @@ -92,18 +95,22 @@ class cdliSplitter: if not (w==''): result.append(w) - logging.debug("split '%s' into %s"%(lst,repr(result))) + #logging.debug("split '%s' into %s"%(lst,repr(result))) return result class graphemeSplitter(cdliSplitter): bounds=graphemeBounds + boundsex=re.compile(graphemeBounds) ignore=graphemeIgnore + ignorex=re.compile(graphemeIgnore) indexName="graphemeSplitter" class wordSplitter(cdliSplitter): bounds=wordBounds + boundsex=re.compile(wordBounds) ignore=wordIgnore + 
ignorex=re.compile(wordIgnore) indexName="wordSplitter" try: