--- cdli/cdliSplitter.py 2007/10/19 16:25:07 1.7.2.2 +++ cdli/cdliSplitter.py 2008/09/25 12:37:55 1.9 @@ -24,27 +24,36 @@ def getSupportedEncoding(encodings): ignoreLines=['$','@','#','&','>'] separators=[''] # kommas relevant for graphemes will not be deleted -komma_exception="([^sStThH])," +komma_exception="([^sStThH])," +komma_exceptionex=re.compile(komma_exception) # grapheme boundaries -graphemeBounds="\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?" +#graphemeBounds="\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?" +graphemeBounds="\{|\}|<|>|-|_|\#|,|\]|\[|\!|\?|\"" +graphemeIgnore="<|>|\#|\||\]|\[|\!|\?\*|;" # for words -wordBounds="<|>|\(|\)|_|\#|,|\||\]|\[|\!|\?" - +#wordBounds="<|>|\(|\)|_|\#|,|\||\]|\[|\!|\?" +wordBounds="_|,|\"" +wordIgnore="<|>|\#|\||\]|\[|\!|\?\*|;" class cdliSplitter: + """base class for splitter. the difference between word and grapheme splitter is the word boundary list.""" + default_encoding = "utf-8" bounds=graphemeBounds + boundsex=re.compile(graphemeBounds) + ignore=graphemeIgnore + ignorex=re.compile(graphemeIgnore) indexName="cdliSplitter" def process(self, lst): """gets a list of strings and returns a list of words""" - logging.debug("cdliSplitter") + logging.debug("cdliSplitter: %s"%self.indexName) result = [] pNum=None lineNum=None @@ -62,11 +71,11 @@ class cdliSplitter: if s[0]=='&': # store pNum pNum=s[1:8] - logging.debug("%s processing: %s"%(indexName,pNum)) + logging.debug("%s processing: %s"%(self.indexName,pNum)) elif not (s[0] in ignoreLines): # regular line - lineparts=s.split(".") + lineparts=s.split(". ",1) if len(lineparts)==1: # no line number txt=s @@ -76,9 +85,11 @@ class cdliSplitter: lineNum=lineparts[0] # delete kommata except kommata relevant for graphemes - txt = re.sub(komma_exception,r"\1",txt) + txt = komma_exceptionex.sub(r"\1",txt) # replace word boundaries by spaces - txt = re.sub(self.bounds,' ',txt) + txt = self.boundsex.sub(' ',txt) + # replace letters to be ignored + txt = self.ignorex.sub('',txt) # split words words = txt.split(" ") for w in words: @@ -86,15 +97,22 @@ class cdliSplitter: if not (w==''): result.append(w) + #logging.debug("split '%s' into %s"%(lst,repr(result))) return result class graphemeSplitter(cdliSplitter): bounds=graphemeBounds + boundsex=re.compile(graphemeBounds) + ignore=graphemeIgnore + ignorex=re.compile(graphemeIgnore) indexName="graphemeSplitter" class wordSplitter(cdliSplitter): bounds=wordBounds + boundsex=re.compile(wordBounds) + ignore=wordIgnore + ignorex=re.compile(wordIgnore) indexName="wordSplitter" try: