--- cdli/cdliSplitter.py	2007/08/31 14:22:52	1.7
+++ cdli/cdliSplitter.py	2007/12/03 21:30:19	1.7.2.6
@@ -1,20 +1,11 @@
 """
-Author splitter
+CDLI word and grapheme splitter
 """
 
-import Zope2
-import transaction
-
-from Products.ZCTextIndex.ISplitter import ISplitter
 from Products.ZCTextIndex.PipelineFactory import element_factory
 
 import re
-from types import StringType
 import logging
-try:
-	import PyLucene
-except:
-	print "no Lucene support"
 
 def getSupportedEncoding(encodings):
     for encoding in encodings:
@@ -32,116 +23,82 @@ def getSupportedEncoding(encodings):
 """
 ignoreLines=['$','@','#','&','>']
 separators=['']
-komma_exception="([^sStThH])," # komma relevant for graphemes will not be deleted
-deleteGraphems="\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?" # for graphems
-deleteWords="<|>|\(|\)|_|\#|,|\||\]|\[|\!|\?"# for words
-
-class IndexLine(object):
-    """index a line with lucene"""
-
-    def __init__(self, storeDir, analyzer,name,line,content):
-        logging.error("i am here %s %s %s %s %s"%((storeDir, analyzer,name,line,content)))
-        if not os.path.exists(storeDir):
-            os.mkdir(storeDir)
-        store = PyLucene.FSDirectory.getDirectory(storeDir, True)
-        writer = PyLucene.IndexWriter(store, analyzer, True)
-        writer.setMaxFieldLength(1048576)
-        self.indexDocs(writer,name,line,content)  
-        writer.optimize()
-        writer.close()
-      
-    def indexDocs(self, writer,name,line,content):
-       
-        doc = PyLucene.Document()
-        doc.add(PyLucene.Field("name", pn,
-                               PyLucene.Field.Store.YES,
-                               PyLucene.Field.Index.UN_TOKENIZED))
-      
-        doc.add(PyLucene.Field("line", str(i),
-                               PyLucene.Field.Store.YES,
-                               PyLucene.Field.Index.UN_TOKENIZED))
-      
-                
-        doc.add(PyLucene.Field("contents", line,
-                               PyLucene.Field.Store.YES,
-                               PyLucene.Field.Index.TOKENIZED))
-        
-        writer.addDocument(doc)
+# commas directly after s, S, t, T, h or H are relevant for graphemes and are not deleted
+komma_exception="([^sStThH])," 
+# grapheme boundaries
+#graphemeBounds="\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?"
+graphemeBounds="\{|\}|<|>|-|_|\#|,|\]|\[|\!|\?|\""
+# for words 
+#wordBounds="<|>|\(|\)|_|\#|,|\||\]|\[|\!|\?"
+wordBounds="<|>|_|\#|,|\]|\[|\!|\?|\""
+
            
 class cdliSplitter:
-    """basis class for splitter, 
-    der Unterschied zwischen Word und Graphemesplitter 
-    ist lediglich die unterschiedliche Auschlie吟ngsliste"""
+    """base class for splitter. 
+    the difference between word and grapheme splitter 
+    is the word boundary list."""
     
     default_encoding = "utf-8"
-    delete=deleteGraphems
+    bounds=graphemeBounds
     indexName="cdliSplitter"
     
     
     def process(self, lst):
+        """gets a list of strings and returns a list of words"""
+        
+        logging.debug("cdliSplitter: %s"%self.indexName) 
         result = []
         pNum=None
         lineNum=None
     
         for t in lst:
-      
-         t.replace("\r","\n")
-         for s in t.split("\n"):
-      
-            if type(s) is StringType: # not unicode
-                s = unicode(s, self.default_encoding, 'replace')
-     
-            if (s!="") and (s[0]=="&"): # store pNum
-                pNum=s[1:8]
-                logging.debug("storing: %s"%pNum)    
-            elif (s!="") and (not (s[0] in ignoreLines)):
-                splitted=s.split(".")
-               
-                if len(splitted)==1: #kein punkt
-                    txt=splitted[0]
-                else:
-                    txt=splitted[1]
-                    lineNum=splitted[0] #store line number
-                
-                analyse=txt      
-                analyse=re.sub(komma_exception,r"\1",analyse) # delete kommata except kommata relevant in graphems
-                analyse=re.sub(self.delete,' ',analyse) # deletions
-                
-                if self.indexName=="luceneSplitter":
-                    if pNum:
-                        analyser=PyLucene.StandardAnalyzer()
-                        logging.error("calling lucene")
+            # normalise line breaks (str.replace returns a new string, so rebind t)
+            t = t.replace("\r","\n")
+            # split lines
+            for s in t.split("\n"):
+                if isinstance(s, str): 
+                    # not unicode
+                    s = unicode(s, self.default_encoding, 'replace')
+         
+                if (s!=''):
+                    if s[0]=='&': 
+                        # store pNum
+                        pNum=s[1:8]
+                        logging.debug("%s processing: %s"%(self.indexName,pNum))
                         
-                        IndexLine("/tmp/index",analyser,pNum,lineNum,analyse)
-                else:
-                    splitted = analyse.split(" ")
-                   
-                   
-                    for w in splitted:
-                        w=w.lstrip().rstrip()
-    
-                        if not (w==''):
-                            if pNum: #only whe pnum is found (first call of the splitter, is always called twice in the pipeline
-    
-                                Zope2.app().cdliRoot.storeInLineIndex(self.indexName,w.lstrip().strip(),(pNum,lineNum))
-                                transaction.get().commit()
-    
-                            result.append(w.lstrip().rstrip())
+                    elif not (s[0] in ignoreLines):
+                        # regular line: split at the first dot only, so dots inside the text are kept
+                        lineparts=s.split(".",1)
+                        if len(lineparts)==1: 
+                            # no line number
+                            txt=s
+                        else:
+                            #store line number
+                            txt=lineparts[1]
+                            lineNum=lineparts[0] 
+                            
+                        # delete commas except those relevant for graphemes
+                        txt = re.sub(komma_exception,r"\1",txt)
+                        # replace word boundaries by spaces
+                        txt = re.sub(self.bounds,' ',txt)
+                        # split words
+                        words = txt.split(" ")
+                        for w in words:
+                            w=w.strip()
+                            if not (w==''):
+                                result.append(w)
+
+        logging.debug("split '%s' into %s"%(lst,repr(result)))
         return result
 
 
 class graphemeSplitter(cdliSplitter):
-    delete=deleteGraphems
+    bounds=graphemeBounds
     indexName="graphemeSplitter"
     
 class wordSplitter(cdliSplitter):
-    delete=deleteWords
+    bounds=wordBounds
     indexName="wordSplitter"
-
-class luceneSplitter(cdliSplitter):
-    delete=deleteWords
-    indexName="luceneSplitter"
-    
       
 try:
     element_factory.registerFactory('Word Splitter',
@@ -157,15 +114,3 @@ except:
     # in case the splitter is already registered, ValueError is raised
     pass
 
-try:
-    element_factory.registerFactory('Word Splitter',
-          'CDLI lucene splitter', luceneSplitter)
-except:
-    # in case the splitter is already registered, ValueError is raised
-    pass
-if __name__ == '__main__':
-   a = 'abc def扂蠅腔竭 疑﹝'
-   u = unicode(a, 'gbk')
-   s = authorSplitter()
-   print s.process([u])
-   print s.process([u], 1)