--- cdli/cdliSplitter.py	2007/10/26 22:45:12	1.7.2.5
+++ cdli/cdliSplitter.py	2008/09/25 12:37:55	1.9
@@ -24,20 +24,29 @@ def getSupportedEncoding(encodings):
 ignoreLines=['$','@','#','&','>']
 separators=['']
 # kommas relevant for graphemes will not be deleted
-komma_exception="([^sStThH])," 
+komma_exception="([^sStThH]),"
+komma_exceptionex=re.compile(komma_exception)
 # grapheme boundaries
-graphemeBounds="\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?"
+#graphemeBounds="\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?"
+graphemeBounds="\{|\}|<|>|-|_|\#|,|\]|\[|\!|\?|\""  # every match is replaced by a space before the line is split
+graphemeIgnore="<|>|\#|\||\]|\[|\!|\?|\*|;"  # every match is deleted; '?' and '*' are separate alternatives (was '\?\*', which only matched the pair "?*")
 # for words 
-wordBounds="<|>|\(|\)|_|\#|,|\||\]|\[|\!|\?"
-
+#wordBounds="<|>|\(|\)|_|\#|,|\||\]|\[|\!|\?"
+wordBounds="_|,|\""  # every match is replaced by a space before the line is split
+wordIgnore="<|>|\#|\||\]|\[|\!|\?|\*|;"  # every match is deleted; '?' and '*' are separate alternatives (was '\?\*', which only matched the pair "?*")
            
 class cdliSplitter:
+
     """base class for splitter. 
     the difference between word and grapheme splitter 
     is the word boundary list."""
+
     
     default_encoding = "utf-8"
     bounds=graphemeBounds
+    boundsex=re.compile(graphemeBounds)
+    ignore=graphemeIgnore
+    ignorex=re.compile(graphemeIgnore)
     indexName="cdliSplitter"
     
     
@@ -66,7 +75,7 @@ class cdliSplitter:
                         
                     elif not (s[0] in ignoreLines):
                         # regular line
-                        lineparts=s.split(".")
+                        lineparts=s.split(". ",1)
                         if len(lineparts)==1: 
                             # no line number
                             txt=s
@@ -76,9 +85,11 @@ class cdliSplitter:
                             lineNum=lineparts[0] 
                             
                         # delete kommata except kommata relevant for graphemes
-                        txt = re.sub(komma_exception,r"\1",txt)
+                        txt = komma_exceptionex.sub(r"\1",txt)
                         # replace word boundaries by spaces
-                        txt = re.sub(self.bounds,' ',txt)
+                        txt = self.boundsex.sub(' ',txt)
+                        # replace letters to be ignored
+                        txt = self.ignorex.sub('',txt)
                         # split words
                         words = txt.split(" ")
                         for w in words:
@@ -92,10 +103,16 @@ class cdliSplitter:
 
 class graphemeSplitter(cdliSplitter):
     bounds=graphemeBounds
+    boundsex=re.compile(bounds)  # compiled form of this class's bounds string
+    ignore=graphemeIgnore
+    ignorex=re.compile(ignore)  # compiled form of this class's ignore string
     indexName="graphemeSplitter"
     
 class wordSplitter(cdliSplitter):
     bounds=wordBounds
+    boundsex=re.compile(bounds)  # compiled form of this class's bounds string
+    ignore=wordIgnore
+    ignorex=re.compile(ignore)  # compiled form of this class's ignore string
     indexName="wordSplitter"
       
 try: