--- cdli/cdliSplitter.py	2007/02/08 12:00:23	1.4
+++ cdli/cdliSplitter.py	2007/03/21 19:29:23	1.5
@@ -10,6 +10,9 @@ from Products.ZCTextIndex.PipelineFactor
 
 import re
 from types import StringType
+import logging
+
+import PyLucene
 
 def getSupportedEncoding(encodings):
     for encoding in encodings:
@@ -28,22 +31,56 @@ def getSupportedEncoding(encodings):
 ignoreLines=['$','@','#','&']
 separators=['']
 komma_exception="([^sStThH])," # komma relevant for graphemes will not be deleted
-delete="\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?" # for graphems
-#delete="<|>|\(|\)|_|\#|,|\||\]|\[|!|?" for words
+deleteGraphems=r"\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?" # characters blanked out when splitting into graphemes
+deleteWords=r"<|>|\(|\)|_|\#|,|\||\]|\[|\!|\?" # same list for words, but keeps "{", "}" and "-"
 
-class graphemeSplitter:
+class IndexLine(object):
+    """Index a single text line with Lucene; all work happens in __init__.
+
+    storeDir -- index directory; it and a fresh index are created when missing
+    analyzer -- PyLucene analyzer passed to the IndexWriter
+    name, line, content -- values stored in the "name", "line" and "contents" fields
+    """
+    def __init__(self, storeDir, analyzer,name,line,content):
+        import os # local import; the module is not known to import os at top level
+        logging.debug("i am here %s %s %s %s %s", storeDir, analyzer, name, line, content)
+        if not os.path.exists(storeDir):
+            os.mkdir(storeDir)
+        # create a new index only if none exists yet; passing True every time wiped it per call
+        create = not PyLucene.IndexReader.indexExists(storeDir)
+        store = PyLucene.FSDirectory.getDirectory(storeDir, create)
+        writer = PyLucene.IndexWriter(store, analyzer, create)
+        try:
+            writer.setMaxFieldLength(1048576)
+            self.indexDocs(writer,name,line,content)
+            writer.optimize()
+        finally:
+            writer.close() # always close the writer, even when indexing fails
+
+    def indexDocs(self, writer,name,line,content):
+        """Add one document with the fields "name", "line" and "contents" to writer."""
+        doc = PyLucene.Document()
+        for field, value, mode in (("name", name, PyLucene.Field.Index.UN_TOKENIZED),
+                                   ("line", str(line), PyLucene.Field.Index.UN_TOKENIZED),
+                                   ("contents", content, PyLucene.Field.Index.TOKENIZED)):
+            doc.add(PyLucene.Field(field, value, PyLucene.Field.Store.YES, mode))
+        writer.addDocument(doc)
+           
+class cdliSplitter:
+    """Base class for the splitters.
+    The word splitter and the grapheme splitter differ only
+    in their exclusion list (the characters that get deleted)."""
+    
     default_encoding = "utf-8"
+    delete=deleteGraphems
+    indexName="cdliSplitter"
+    
     
     def process(self, lst):
         result = []
         pNum=None
         lineNum=None
-       
-  
-        #print "LLLL",lst
-        
-      
+    
         for t in lst:
       
          t.replace("\r","\n")
@@ -51,16 +88,11 @@ class graphemeSplitter:
       
             if type(s) is StringType: # not unicode
                 s = unicode(s, self.default_encoding, 'replace')
-            
-            #ignore lines
-
+     
             if (s!="") and (s[0]=="&"): # store pNum
                 pNum=s[1:8]
-
+                logging.debug("storing: %s"%pNum)    
             elif (s!="") and (not (s[0] in ignoreLines)):
-
-              
-                #ignore everthing bevor "."
                 splitted=s.split(".")
                
                 if len(splitted)==1: #kein punkt
@@ -69,26 +101,46 @@ class graphemeSplitter:
                     txt=splitted[1]
                     lineNum=splitted[0] #store line number
                 
-                analyse=txt
-                
+                analyse=txt      
                 analyse=re.sub(komma_exception,r"\1",analyse) # delete kommata except kommata relevant in graphems
-
-                analyse=re.sub(delete,' ',analyse) # deletions
-
-                splitted = analyse.split(" ")
-               
-                for w in splitted:
-                    w=w.lstrip().rstrip()
-
-                    if not (w==''):
-                        if pNum: #only whe pnum is found (first call of the splitter, is always called twice in the pipeline
-                            Zope.app().cdliRoot.storeInLineIndex(w.lstrip().strip(),(pNum,lineNum))
-                            transaction.get().commit()
-
-                        result.append(w.lstrip().rstrip())
+                analyse=re.sub(self.delete,' ',analyse) # deletions
+                
+                if self.indexName=="luceneSplitter":
+                    if pNum:
+                        analyser=PyLucene.StandardAnalyzer()
+                        logging.error("calling lucene")
+                        
+                        IndexLine("/tmp/index",analyser,pNum,lineNum,analyse)
+                else:
+                    splitted = analyse.split(" ")
+                   
+                   
+                    for w in splitted:
+                        w=w.lstrip().rstrip()
+    
+                        if not (w==''):
+                            if pNum: #only whe pnum is found (first call of the splitter, is always called twice in the pipeline
+    
+                                Zope2.app().cdliRoot.storeInLineIndex(self.indexName,w.lstrip().strip(),(pNum,lineNum))
+                                transaction.get().commit()
+    
+                            result.append(w.lstrip().rstrip())
         return result
 
- 
+
+class graphemeSplitter(cdliSplitter):
+    indexName="graphemeSplitter" # name that process() passes to storeInLineIndex
+    delete=deleteGraphems # regex whose matches process() replaces with blanks
+    
+class wordSplitter(cdliSplitter):
+    indexName="wordSplitter" # name that process() passes to storeInLineIndex
+    delete=deleteWords # regex whose matches process() replaces with blanks
+
+class luceneSplitter(cdliSplitter):
+    indexName="luceneSplitter" # this exact name makes process() hand lines to IndexLine
+    delete=deleteWords # regex whose matches process() replaces with blanks
+    
+      
 try:
     element_factory.registerFactory('Word Splitter',
           'CDLI grapheme splitter', graphemeSplitter)
@@ -96,6 +148,19 @@ except:
     # in case the splitter is already registered, ValueError is raised
     pass
 
+try:
+    element_factory.registerFactory('Word Splitter',
+          'CDLI word splitter', wordSplitter)
+except ValueError:
+    # ValueError means the splitter is already registered; any other error should surface
+    pass
+
+try:
+    element_factory.registerFactory('Word Splitter',
+          'CDLI lucene splitter', luceneSplitter)
+except ValueError:
+    # ValueError means the splitter is already registered; any other error should surface
+    pass
 if __name__ == '__main__':
    a = 'abc def扂蠅腔竭 疑﹝'
    u = unicode(a, 'gbk')