# -*- coding: utf-8 -*-
"""CDLI grapheme splitter (beta of a full-text splitter for CDLI).

Defines ``graphemeSplitter``, a ZCTextIndex pipeline element that turns
text into a flat list of unicode tokens, and registers it with the
ZCTextIndex ``element_factory`` when the module is imported.
"""

import re
from types import StringType

from Products.ZCTextIndex.ISplitter import ISplitter
from Products.ZCTextIndex.PipelineFactory import element_factory


def getSupportedEncoding(encodings):
    """Return the first usable encoding from *encodings*.

    An encoding counts as usable when ``unicode('A', encoding)`` succeeds.
    If none of the candidates works, ``'utf-8'`` is returned.
    """
    for encoding in encodings:
        try:
            unicode('A', encoding)
        except Exception:
            # Unknown or unusable encoding name: try the next candidate.
            continue
        return encoding
    return 'utf-8'


# Lines whose first character is one of these are skipped entirely.
ignoreLines = ['$', '@', '#', '&']
# Not used anywhere in this module; kept for backward compatibility.
separators = ['']
# Regular expression (alternation of single characters); every match is
# replaced by a blank before the text is split into tokens.
delete = r"{|}|<|>|\(|\)|-|_|\#|,|\~|\||\]|\["


class graphemeSplitter:
    """Splitter that breaks text into whitespace-separated tokens."""

    # Encoding used to decode byte strings handed to process().
    default_encoding = "utf-8"

    def process(self, lst):
        """Split every text in *lst* into tokens.

        For each line of each text:

        * byte strings are decoded with ``default_encoding`` (undecodable
          bytes are replaced);
        * empty lines and lines starting with a character listed in
          ``ignoreLines`` are skipped;
        * if the line contains a ".", everything up to and including the
          first "." is dropped (presumably a line label -- confirm against
          the indexed data);
        * every character matched by ``delete`` is replaced by a blank;
        * the remainder is split on blanks and empty tokens are dropped.

        Returns the tokens of all texts as one flat list, in input order.
        """
        result = []
        for t in lst:
            # Normalise carriage returns so "\r" and "\r\n" line endings
            # are split too (the result of replace() must be kept).
            t = t.replace("\r", "\n")
            for s in t.split("\n"):
                if isinstance(s, StringType):  # byte string, not unicode
                    s = unicode(s, self.default_encoding, 'replace')

                if not s or s[0] in ignoreLines:
                    continue

                # Ignore everything before the first "." but keep the whole
                # rest of the line, including any further dots.
                head, dot, tail = s.partition(".")
                if dot:
                    txt = tail
                else:  # no dot in this line
                    txt = head

                txt = re.sub(delete, ' ', txt)  # deletions

                for w in txt.split(" "):
                    w = w.strip()
                    if w:
                        result.append(w)
        return result


for _group in ('Word Splitter', 'graphemeSplitter'):
    try:
        element_factory.registerFactory(_group,
                                        'CDLI grapheme splitter',
                                        graphemeSplitter)
    except ValueError:
        # in case the splitter is already registered, ValueError is raised
        pass


if __name__ == '__main__':
    a = 'abc def我们的很 好。'
    u = unicode(a, 'utf-8')
    s = graphemeSplitter()
    print(s.process([u]))  # unicode input
    print(s.process([a]))  # byte-string input, decoded by process()