--- cdli/cdliSplitter.py 2006/11/14 17:02:59 1.1 +++ cdli/cdliSplitter.py 2006/12/22 11:56:08 1.2 @@ -2,6 +2,8 @@ Author splitter """ +import Zope + from Products.ZCTextIndex.ISplitter import ISplitter from Products.ZCTextIndex.PipelineFactory import element_factory @@ -24,53 +26,70 @@ def getSupportedEncoding(encodings): """ ignoreLines=['$','@','#','&'] separators=[''] -delete="{|}|<|>|\(|\)|-|_|\#|,|\~|\||\]|\[" +komma_exception="([^sStThH])," # komma relevant for graphemes will not be deleted +delete="\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?" # for graphems +#delete="<|>|\(|\)|_|\#|,|\||\]|\[|!|?" for words class graphemeSplitter: default_encoding = "utf-8" - + def process(self, lst): result = [] + pNum=None + lineNum=None + + #print "LLLL",lst + + for t in lst: - + t.replace("\r","\n") for s in t.split("\n"): - + if type(s) is StringType: # not unicode s = unicode(s, self.default_encoding, 'replace') #ignore lines - - if (s!="") and (not (s[0] in ignoreLines)): + + if (s!="") and (s[0]=="&"): # store pNum + pNum=s[1:8] + + elif (s!="") and (not (s[0] in ignoreLines)): + #ignore everthing bevor "." splitted=s.split(".") - + if len(splitted)==1: #kein punkt txt=splitted[0] else: txt=splitted[1] + lineNum=splitted[0] #store line number analyse=txt - - analyse=re.sub(delete,' ',analyse) # deletions + analyse=re.sub(komma_exception,r"\1",analyse) # delete kommata except kommata relevant in graphems + + analyse=re.sub(delete,' ',analyse) # deletions + splitted = analyse.split(" ") for w in splitted: w=w.lstrip().rstrip() + if not (w==''): - print repr(w) + if pNum: #only whe pnum is found (first call of the splitter, is always called twice in the pipeline + Zope.app().cdliRoot.storeInLineIndex(w.lstrip().strip(),(pNum,lineNum)) + get_transaction().commit() + result.append(w.lstrip().rstrip()) return result -element_factory.registerFactory('Word Splitter', - 'CDLI grapheme splitter', graphemeSplitter) try: - element_factory.registerFactory('graphemeSplitter', + element_factory.registerFactory('Word Splitter', 'CDLI grapheme splitter', graphemeSplitter) except: # in case the splitter is already registered, ValueError is raised