--- cdli/cdliSplitter.py 2007/10/19 16:25:07 1.7.2.2 +++ cdli/cdliSplitter.py 2007/12/11 17:27:36 1.7.2.7 @@ -26,10 +26,13 @@ separators=[''] # kommas relevant for graphemes will not be deleted komma_exception="([^sStThH])," # grapheme boundaries -graphemeBounds="\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?" +#graphemeBounds="\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?" +graphemeBounds="\{|\}|<|>|-|_|\#|,|\]|\[|\!|\?|\"" +graphemeIgnore="" # for words -wordBounds="<|>|\(|\)|_|\#|,|\||\]|\[|\!|\?" - +#wordBounds="<|>|\(|\)|_|\#|,|\||\]|\[|\!|\?" +wordBounds="_|,|\"" +wordIgnore="<|>|\#|\||\]|\[|\!|\?" class cdliSplitter: """base class for splitter. @@ -38,13 +41,14 @@ class cdliSplitter: default_encoding = "utf-8" bounds=graphemeBounds + ignore=graphemeIgnore indexName="cdliSplitter" def process(self, lst): """gets a list of strings and returns a list of words""" - logging.debug("cdliSplitter") + logging.debug("cdliSplitter: %s"%self.indexName) result = [] pNum=None lineNum=None @@ -62,7 +66,7 @@ class cdliSplitter: if s[0]=='&': # store pNum pNum=s[1:8] - logging.debug("%s processing: %s"%(indexName,pNum)) + logging.debug("%s processing: %s"%(self.indexName,pNum)) elif not (s[0] in ignoreLines): # regular line @@ -77,6 +81,8 @@ class cdliSplitter: # delete kommata except kommata relevant for graphemes txt = re.sub(komma_exception,r"\1",txt) + # replace letters to be ignored + txt = re.sub(self.ignore,'',txt) # replace word boundaries by spaces txt = re.sub(self.bounds,' ',txt) # split words @@ -86,15 +92,18 @@ class cdliSplitter: if not (w==''): result.append(w) + logging.debug("split '%s' into %s"%(lst,repr(result))) return result class graphemeSplitter(cdliSplitter): bounds=graphemeBounds + ignore=graphemeIgnore indexName="graphemeSplitter" class wordSplitter(cdliSplitter): bounds=wordBounds + ignore=wordIgnore indexName="wordSplitter" try: