""" Author splitter """ from Products.ZCTextIndex.ISplitter import ISplitter from Products.ZCTextIndex.PipelineFactory import element_factory import re from types import StringType def getSupportedEncoding(encodings): for encoding in encodings: try: unicode('A', encoding) return encoding except: pass return 'utf-8' # CJK charsets ranges, see this following pages: # # http://jrgraphix.net/research/unicode_blocks.php?block=87 # http://jrgraphix.net/research/unicode_blocks.php?block=85 # http://jrgraphix.net/research/unicode_blocks.php?block=95 # http://jrgraphix.net/research/unicode_blocks.php?block=76 # http://jrgraphix.net/research/unicode_blocks.php?block=90 """beta of a author splitter for echo RULE is NAME1;NAME2 generates a single Lexicon entry for every name """ class authorSplitter: default_encoding = "utf-8" def process(self, lst): result = [] for s in lst: if type(s) is StringType: # not unicode s = unicode(s, self.default_encoding, 'replace') splitted = s.split(";") for w in splitted: if not (w[0:2]=="!!"): #don't index !!NOT USED.... result.append(w.lstrip().rstrip()) return result try: element_factory.registerFactory('Word Splitter', 'ECHO author splitter', authorSplitter) except: # in case the splitter is already registered, ValueError is raised pass if __name__ == '__main__': a = 'abc def\U00CE\U00D2\U00D3\U00C7\U00B5\U00C4\U00DC\U00C3\U00A1\U00A3' u = unicode(a, 'gbk') s = authorSplitter() print s.process([u]) print s.process([u], 1)