--- cdli/cdliSplitter.py	2007/02/08 12:00:23	1.4
+++ cdli/cdliSplitter.py	2008/01/09 18:49:07	1.7.2.11
@@ -1,15 +1,11 @@
 """
-Author splitter
+CDLI word and grapheme splitter
 """
 
-import Zope2
-import transaction
-
-from Products.ZCTextIndex.ISplitter import ISplitter
 from Products.ZCTextIndex.PipelineFactory import element_factory
 
 import re
-from types import StringType
+import logging
 
 def getSupportedEncoding(encodings):
     for encoding in encodings:
@@ -25,70 +21,98 @@ def getSupportedEncoding(encodings):
 """beta of a fulltext splitter for cdli
 
 """
-ignoreLines=['$','@','#','&']
+ignoreLines=['$','@','#','&','>']
 separators=['']
-komma_exception="([^sStThH])," # komma relevant for graphemes will not be deleted
-delete="\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?" # for graphems
-#delete="<|>|\(|\)|_|\#|,|\||\]|\[|!|?" for words
-
-class graphemeSplitter:
-
+# commas relevant for graphemes (those following s, S, t, T, h or H) will not be deleted
+komma_exception="([^sStThH]),"
+komma_exceptionex=re.compile(komma_exception)
+# grapheme boundaries
+#graphemeBounds="\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?"
+graphemeBounds="\{|\}|<|>|-|_|\#|,|\]|\[|\!|\?|\""
+graphemeIgnore="<|>|\#|\||\]|\[|\!|\?\*|;"
+# for words 
+#wordBounds="<|>|\(|\)|_|\#|,|\||\]|\[|\!|\?"
+wordBounds="_|,|\""
+wordIgnore="<|>|\#|\||\]|\[|\!|\?\*|;"
+           
+class cdliSplitter:
+    """Base class for the CDLI splitters; process() turns text lines into tokens.
+    Subclasses differ only in the patterns they set: matches of boundsex become
+    spaces (token boundaries), matches of ignorex are removed."""
+    
     default_encoding = "utf-8"
+    bounds=graphemeBounds  # pattern source for boundsex
+    boundsex=re.compile(graphemeBounds)  # process() replaces each match with a space
+    ignore=graphemeIgnore  # pattern source for ignorex
+    ignorex=re.compile(graphemeIgnore)  # process() deletes each match
+    indexName="cdliSplitter"  # label used in the debug log messages of process()
+    
     
     def process(self, lst):
+        """gets a list of strings and returns a list of words"""
+        
+        logging.debug("cdliSplitter: %s"%self.indexName) 
         result = []
         pNum=None
         lineNum=None
-       
-  
-        #print "LLLL",lst
-        
-      
+    
         for t in lst:
-      
-         t.replace("\r","\n")
-         for s in t.split("\n"):
-      
-            if type(s) is StringType: # not unicode
-                s = unicode(s, self.default_encoding, 'replace')
-            
-            #ignore lines
-
-            if (s!="") and (s[0]=="&"): # store pNum
-                pNum=s[1:8]
-
-            elif (s!="") and (not (s[0] in ignoreLines)):
-
-              
-                #ignore everthing bevor "."
-                splitted=s.split(".")
-               
-                if len(splitted)==1: #kein punkt
-                    txt=splitted[0]
-                else:
-                    txt=splitted[1]
-                    lineNum=splitted[0] #store line number
-                
-                analyse=txt
-                
-                analyse=re.sub(komma_exception,r"\1",analyse) # delete kommata except kommata relevant in graphems
-
-                analyse=re.sub(delete,' ',analyse) # deletions
-
-                splitted = analyse.split(" ")
-               
-                for w in splitted:
-                    w=w.lstrip().rstrip()
-
-                    if not (w==''):
-                        if pNum: #only whe pnum is found (first call of the splitter, is always called twice in the pipeline
-                            Zope.app().cdliRoot.storeInLineIndex(w.lstrip().strip(),(pNum,lineNum))
-                            transaction.get().commit()
+            # normalise line breaks; str.replace returns a new string, so rebind t
+            t = t.replace("\r","\n")
+            # split lines
+            for s in t.split("\n"):
+                if isinstance(s, str):
+                    # byte string, not unicode: decode it with the default encoding
+                    s = unicode(s, self.default_encoding, 'replace')
+
+                if (s!=''):
+                    if s[0]=='&':
+                        # header line: remember the 7 characters after '&' as pNum
+                        pNum=s[1:8]
+                        logging.debug("%s processing: %s"%(self.indexName,pNum))
+
+                    elif not (s[0] in ignoreLines):
+                        # regular line: the part before the first '.' is the line number
+                        lineparts=s.split(".")
+                        if len(lineparts)==1:
+                            # no line number
+                            txt=s
+                        else:
+                            # NOTE(review): only the text up to a second '.' is kept - confirm
+                            txt=lineparts[1]
+                            lineNum=lineparts[0]
+
+                        # delete commas except those following s, S, t, T, h or H
+                        txt = komma_exceptionex.sub(r"\1",txt)
+                        # replace word boundaries by spaces
+                        txt = self.boundsex.sub(' ',txt)
+                        # remove characters that are to be ignored
+                        txt = self.ignorex.sub('',txt)
+                        # split words
+                        words = txt.split(" ")
+                        for w in words:
+                            w=w.strip()
+                            if not (w==''):
+                                result.append(w)
 
-                        result.append(w.lstrip().rstrip())
+        #logging.debug("split '%s' into %s"%(lst,repr(result)))
         return result
 
- 
+
+class graphemeSplitter(cdliSplitter):  # splitter using the grapheme-level patterns
+    bounds=graphemeBounds  # pattern source for boundsex
+    boundsex=re.compile(graphemeBounds)  # matches are replaced by a space in process()
+    ignore=graphemeIgnore  # pattern source for ignorex
+    ignorex=re.compile(graphemeIgnore)  # matches are removed in process()
+    indexName="graphemeSplitter"  # label used in debug log messages
+    
+class wordSplitter(cdliSplitter):  # splitter using the word-level patterns
+    bounds=wordBounds  # pattern source for boundsex
+    boundsex=re.compile(wordBounds)  # matches are replaced by a space in process()
+    ignore=wordIgnore  # pattern source for ignorex
+    ignorex=re.compile(wordIgnore)  # matches are removed in process()
+    indexName="wordSplitter"  # label used in debug log messages
+      
 try:
     element_factory.registerFactory('Word Splitter',
           'CDLI grapheme splitter', graphemeSplitter)
@@ -96,9 +120,10 @@ except:
     # in case the splitter is already registered, ValueError is raised
     pass
 
-if __name__ == '__main__':
-   a = 'abc def我们的很 好。'
-   u = unicode(a, 'gbk')
-   s = authorSplitter()
-   print s.process([u])
-   print s.process([u], 1)
+try:
+    element_factory.registerFactory('Word Splitter',
+          'CDLI word splitter', wordSplitter)
+except ValueError:
+    # raised when the splitter is already registered; any other error should surface
+    pass
+