--- cdli/cdliSplitter.py	2007/12/03 21:30:19	1.7.2.6
+++ cdli/cdliSplitter.py	2007/12/11 17:27:36	1.7.2.7
@@ -28,10 +28,11 @@ komma_exception="([^sStThH]),"
 # grapheme boundaries
 #graphemeBounds="\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?"
 graphemeBounds="\{|\}|<|>|-|_|\#|,|\]|\[|\!|\?|\""
+graphemeIgnore=""
 # for words 
 #wordBounds="<|>|\(|\)|_|\#|,|\||\]|\[|\!|\?"
-wordBounds="<|>|_|\#|,|\]|\[|\!|\?|\""
-
+wordBounds="_|,|\""
+wordIgnore="<|>|\#|\||\]|\[|\!|\?"
            
 class cdliSplitter:
     """base class for splitter. 
@@ -40,6 +41,7 @@ class cdliSplitter:
     
     default_encoding = "utf-8"
     bounds=graphemeBounds
+    ignore=graphemeIgnore
     indexName="cdliSplitter"
     
     
@@ -79,6 +81,8 @@ class cdliSplitter:
                             
                         # delete kommata except kommata relevant for graphemes
                         txt = re.sub(komma_exception,r"\1",txt)
+                        # remove characters that are to be ignored
+                        txt = re.sub(self.ignore,'',txt)
                         # replace word boundaries by spaces
                         txt = re.sub(self.bounds,' ',txt)
                         # split words
@@ -94,10 +98,12 @@ class cdliSplitter:
 
 class graphemeSplitter(cdliSplitter):
     bounds=graphemeBounds
+    ignore=graphemeIgnore
     indexName="graphemeSplitter"
     
 class wordSplitter(cdliSplitter):
     bounds=wordBounds
+    ignore=wordIgnore
     indexName="wordSplitter"
       
 try: