"""
Author splitter
"""
from Products.ZCTextIndex.ISplitter import ISplitter
from Products.ZCTextIndex.PipelineFactory import element_factory
import re
from types import StringType
def getSupportedEncoding(encodings):
    """Return the first encoding in *encodings* that can decode a probe string.

    Each candidate is tried in order by decoding the single byte string
    ``'A'`` with it.  The first candidate for which that succeeds is
    returned; any candidate for which the decode raises is skipped.

    encodings -- iterable of encoding names to try, in order of preference.

    Returns the first usable encoding name, or ``'utf-8'`` when none of the
    candidates works (including when *encodings* is empty).
    """
    for encoding in encodings:
        try:
            unicode('A', encoding)
        except Exception:
            # Best-effort probe: an unknown or unusable codec just means
            # "try the next one".  Catch Exception rather than everything
            # so KeyboardInterrupt / SystemExit still propagate.
            continue
        return encoding
    return 'utf-8'
# CJK charset ranges; see the following pages:
#
# http://jrgraphix.net/research/unicode_blocks.php?block=87
# http://jrgraphix.net/research/unicode_blocks.php?block=85
# http://jrgraphix.net/research/unicode_blocks.php?block=95
# http://jrgraphix.net/research/unicode_blocks.php?block=76
# http://jrgraphix.net/research/unicode_blocks.php?block=90
"""beta of a author splitter for echo
RULE: input of the form NAME1;NAME2 generates one Lexicon entry for every name
"""
class authorSplitter:
    """Splitter that turns ';'-separated author lists into single names.

    Every input string is split on ';' and each resulting name becomes one
    entry of the returned list.  Names that start with '!!' are markers for
    unused entries and are not returned.
    """

    # Encoding used to decode byte strings before they are split.
    default_encoding = "utf-8"

    def process(self, lst):
        """Split every string in *lst* into its individual author names.

        lst -- iterable of strings (byte strings or unicode), each holding
               zero or more names separated by ';'.

        Byte strings are decoded with ``default_encoding``; undecodable
        bytes are replaced rather than raising.  Surrounding whitespace is
        removed from every name.  Empty names and names starting with '!!'
        are dropped.

        Returns a flat list of names in input order.
        """
        result = []
        for s in lst:
            # ``bytes`` is the byte-string type (an alias of ``str`` on
            # Python 2.6+); unicode input is split as it is.
            if isinstance(s, bytes):
                s = s.decode(self.default_encoding, 'replace')
            for w in s.split(";"):
                # Strip first so that the '!!' marker is also recognised
                # when the entry follows the separator after a blank,
                # e.g. "NAME1; !!NOT USED".
                name = w.strip()
                if not name:
                    # "A;;B" or a trailing ';' must not yield an empty word
                    continue
                if name.startswith("!!"):
                    # don't index !!NOT USED....
                    continue
                result.append(name)
        return result
# Register the splitter with the ZCTextIndex pipeline factory under the
# 'Word Splitter' group so it can be selected by name.
try:
    element_factory.registerFactory('Word Splitter',
                                    'ECHO author splitter', authorSplitter)
except ValueError:
    # in case the splitter is already registered, ValueError is raised;
    # any other error is a real problem and must not be hidden
    pass
if __name__ == '__main__':
    # Manual smoke test.  process() takes exactly one argument (the list of
    # strings), so it is called that way for both samples.
    s = authorSplitter()
    # unicode input: two names plus an entry marked with '!!', which is
    # not indexed
    u = u'Einstein, Albert; Bohr, Niels;!!NOT USED'
    print(s.process([u]))
    # byte string input: decoded with default_encoding before splitting
    a = 'Curie, Marie;Meitner, Lise'
    print(s.process([a]))
# FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>