--- cdli/cdliSplitter.py 2007/03/21 19:29:23 1.5 +++ cdli/cdliSplitter.py 2007/10/06 13:44:46 1.7.2.1 @@ -11,8 +11,10 @@ from Products.ZCTextIndex.PipelineFactor import re from types import StringType import logging - -import PyLucene +try: + import PyLucene +except: + print "no Lucene support" def getSupportedEncoding(encodings): for encoding in encodings: @@ -28,7 +30,7 @@ def getSupportedEncoding(encodings): """beta of a fulltext splitter for cdli """ -ignoreLines=['$','@','#','&'] +ignoreLines=['$','@','#','&','>'] separators=[''] komma_exception="([^sStThH])," # komma relevant for graphemes will not be deleted deleteGraphems="\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?" # for graphems @@ -77,6 +79,7 @@ class cdliSplitter: def process(self, lst): + logging.debug("cdliSplitter") result = [] pNum=None lineNum=None @@ -91,7 +94,8 @@ class cdliSplitter: if (s!="") and (s[0]=="&"): # store pNum pNum=s[1:8] - logging.debug("storing: %s"%pNum) + logging.debug("cdliSplitter processing: %s"%pNum) + elif (s!="") and (not (s[0] in ignoreLines)): splitted=s.split(".") @@ -113,18 +117,15 @@ class cdliSplitter: IndexLine("/tmp/index",analyser,pNum,lineNum,analyse) else: splitted = analyse.split(" ") - - for w in splitted: w=w.lstrip().rstrip() if not (w==''): - if pNum: #only whe pnum is found (first call of the splitter, is always called twice in the pipeline - - Zope2.app().cdliRoot.storeInLineIndex(self.indexName,w.lstrip().strip(),(pNum,lineNum)) - transaction.get().commit() + #if pNum: #only whe pnum is found (first call of the splitter, is always called twice in the pipeline + # Zope2.app().cdliRoot.storeInLineIndex(self.indexName,w.lstrip().strip(),(pNum,lineNum)) + # transaction.get().commit() - result.append(w.lstrip().rstrip()) + result.append(w) return result