--- cdli/cdliSplitter.py	2006/11/14 17:02:59	1.1
+++ cdli/cdliSplitter.py	2007/10/26 22:45:12	1.7.2.5
@@ -1,12 +1,11 @@
 """
-Author splitter
+CDLI word and grapheme splitter
 """
 
-from Products.ZCTextIndex.ISplitter import ISplitter
 from Products.ZCTextIndex.PipelineFactory import element_factory
 
 import re
-from types import StringType
+import logging
 
 def getSupportedEncoding(encodings):
     for encoding in encodings:
@@ -22,63 +21,94 @@ def getSupportedEncoding(encodings):
 """beta of a fulltext splitter for cdli
 
 """
-ignoreLines=['$','@','#','&']
+ignoreLines=['$','@','#','&','>']
 separators=['']
-delete="{|}|<|>|\(|\)|-|_|\#|,|\~|\||\]|\["
-
-class graphemeSplitter:
-
+# matches a comma NOT preceded by s/S/t/T/h/H; process() deletes those, the remaining commas belong to a grapheme
+komma_exception=r"([^sStThH]),"
+# grapheme boundaries (NOTE(review): ',' is listed here too, so the commas kept above still become spaces -- confirm intent)
+graphemeBounds=r"\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?"
+# word boundaries: same as graphemeBounds but without '{', '}' and '-'
+wordBounds=r"<|>|\(|\)|_|\#|,|\||\]|\[|\!|\?"
+
+           
+class cdliSplitter:
+    """Base class for the CDLI splitters.
+    Subclasses differ only in the boundary pattern (bounds) used to cut a
+    line into tokens and in the indexName that appears in the debug log."""
+    
     default_encoding = "utf-8"
-
+    bounds=graphemeBounds
+    indexName="cdliSplitter"
+    
+    
     def process(self, lst):
+        """gets a list of strings and returns a list of words"""
+        
+        logging.debug("cdliSplitter: %s"%self.indexName) 
         result = []
-       
+        pNum=None
+        lineNum=None
+    
         for t in lst:
+            # normalise line breaks; str.replace returns a new string, so keep the result
+            t = t.replace("\r","\n")
+            # split lines
+            for s in t.split("\n"):
+                if isinstance(s, str):
+                    # byte string, not unicode: decode, replacing undecodable bytes
+                    s = unicode(s, self.default_encoding, 'replace')
+
+                if (s!=''):
+                    if s[0]=='&':
+                        # '&' line: remember the 7 characters after '&' as pNum
+                        pNum=s[1:8]
+                        logging.debug("%s processing: %s"%(self.indexName,pNum))
+
+                    elif not (s[0] in ignoreLines):
+                        # regular line; split at the first "." only, so later periods stay in the text
+                        lineparts=s.split(".",1)
+                        if len(lineparts)==1:
+                            # no line number
+                            txt=s
+                        else:
+                            # store line number; everything after the first "." is the text
+                            txt=lineparts[1]
+                            lineNum=lineparts[0]
+
+                        # delete commas except commas relevant for graphemes
+                        txt = re.sub(komma_exception,r"\1",txt)
+                        # replace word boundaries by spaces
+                        txt = re.sub(self.bounds,' ',txt)
+                        # split words and keep the non-empty ones
+                        words = txt.split(" ")
+                        for w in words:
+                            w=w.strip()
+                            if not (w==''):
+                                result.append(w)
 
-         t.replace("\r","\n")
-         for s in t.split("\n"):
-       
-            if type(s) is StringType: # not unicode
-                s = unicode(s, self.default_encoding, 'replace')
-            
-            #ignore lines
-            
-            if (s!="") and (not (s[0] in ignoreLines)):
-              
-                #ignore everthing bevor "."
-                splitted=s.split(".")
-                
-                if len(splitted)==1: #kein punkt
-                    txt=splitted[0]
-                else:
-                    txt=splitted[1]
-                
-                analyse=txt
-    
-                analyse=re.sub(delete,' ',analyse) # deletions
-                
-                splitted = analyse.split(" ")
-               
-                for w in splitted:
-                    w=w.lstrip().rstrip()
-                    if not (w==''):
-                        print repr(w)
-                        result.append(w.lstrip().rstrip())
+        #logging.debug("split '%s' into %s"%(lst,repr(result)))
         return result
 
-element_factory.registerFactory('Word Splitter',
-          'CDLI grapheme splitter', graphemeSplitter)
- 
+# splitter that cuts lines at the grapheme boundaries (graphemeBounds)
+class graphemeSplitter(cdliSplitter):
+    bounds=graphemeBounds
+    indexName="graphemeSplitter"
+# splitter that cuts lines at the word boundaries (wordBounds)
+class wordSplitter(cdliSplitter):
+    bounds=wordBounds
+    indexName="wordSplitter"
+
 try:
-    element_factory.registerFactory('graphemeSplitter',
+    element_factory.registerFactory('Word Splitter',
           'CDLI grapheme splitter', graphemeSplitter)
 except:
     # in case the splitter is already registered, ValueError is raised
     pass
 
-if __name__ == '__main__':
-   a = 'abc def我们的很 好。'
-   u = unicode(a, 'gbk')
-   s = authorSplitter()
-   print s.process([u])
-   print s.process([u], 1)
+try:
+    element_factory.registerFactory('Word Splitter',
+          'CDLI word splitter', wordSplitter)
+except ValueError:
+    # registerFactory raises ValueError when this splitter name is already registered
+    pass
+