""" Author splitter """ import Zope import transaction from Products.ZCTextIndex.ISplitter import ISplitter from Products.ZCTextIndex.PipelineFactory import element_factory import re from types import StringType def getSupportedEncoding(encodings): for encoding in encodings: try: unicode('A', encoding) return encoding except: pass return 'utf-8' """beta of a fulltext splitter for cdli """ ignoreLines=['$','@','#','&'] separators=[''] komma_exception="([^sStThH])," # komma relevant for graphemes will not be deleted delete="\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?" # for graphems #delete="<|>|\(|\)|_|\#|,|\||\]|\[|!|?" for words class graphemeSplitter: default_encoding = "utf-8" def process(self, lst): result = [] pNum=None lineNum=None #print "LLLL",lst for t in lst: t.replace("\r","\n") for s in t.split("\n"): if type(s) is StringType: # not unicode s = unicode(s, self.default_encoding, 'replace') #ignore lines if (s!="") and (s[0]=="&"): # store pNum pNum=s[1:8] elif (s!="") and (not (s[0] in ignoreLines)): #ignore everthing bevor "." splitted=s.split(".") if len(splitted)==1: #kein punkt txt=splitted[0] else: txt=splitted[1] lineNum=splitted[0] #store line number analyse=txt analyse=re.sub(komma_exception,r"\1",analyse) # delete kommata except kommata relevant in graphems analyse=re.sub(delete,' ',analyse) # deletions splitted = analyse.split(" ") for w in splitted: w=w.lstrip().rstrip() if not (w==''): if pNum: #only whe pnum is found (first call of the splitter, is always called twice in the pipeline Zope.app().cdliRoot.storeInLineIndex(w.lstrip().strip(),(pNum,lineNum)) transaction.get().commit() result.append(w.lstrip().rstrip()) return result try: element_factory.registerFactory('Word Splitter', 'CDLI grapheme splitter', graphemeSplitter) except: # in case the splitter is already registered, ValueError is raised pass if __name__ == '__main__': a = 'abc def我们的很 好。' u = unicode(a, 'gbk') s = authorSplitter() print s.process([u]) print s.process([u], 1)