1: """
2: Author splitter
3: """
4:
5: from Products.ZCTextIndex.ISplitter import ISplitter
6: from Products.ZCTextIndex.PipelineFactory import element_factory
7:
8: import re
9: from types import StringType
10:
def getSupportedEncoding(encodings):
    """Return the first usable encoding from *encodings*.

    Each candidate is probed by decoding the single ASCII byte ``A`` with
    it; the first candidate for which that succeeds is returned.

    Args:
        encodings: Iterable of codec names, in order of preference.

    Returns:
        The first candidate that passes the probe, or ``'utf-8'`` when
        none does (including when *encodings* is empty).
    """
    for encoding in encodings:
        try:
            b'A'.decode(encoding)
        except Exception:
            # Deliberate best-effort probe: an unknown codec name raises
            # LookupError, a non-string name raises TypeError, and a codec
            # that cannot decode the probe byte raises its own error type.
            # Catching Exception (instead of a bare ``except``) still lets
            # KeyboardInterrupt and SystemExit propagate.
            continue
        return encoding
    return 'utf-8'
19:
# CJK charset ranges; see the following pages:
21: #
22: # http://jrgraphix.net/research/unicode_blocks.php?block=87
23: # http://jrgraphix.net/research/unicode_blocks.php?block=85
24: # http://jrgraphix.net/research/unicode_blocks.php?block=95
25: # http://jrgraphix.net/research/unicode_blocks.php?block=76
26: # http://jrgraphix.net/research/unicode_blocks.php?block=90
27:
"""Beta version of an author splitter for ECHO.

Rule: an input of the form NAME1;NAME2 yields one Lexicon entry per name.
"""
class authorSplitter:
    """Split ';'-separated author strings into individual names.

    Every input string is cut at each ';'.  Each piece is stripped of
    surrounding whitespace and becomes one entry of the result, except
    that empty pieces and pieces starting with ``!!`` are dropped.
    """

    # Encoding used to decode byte strings handed to process().
    default_encoding = "utf-8"

    def process(self, lst):
        """Return the list of author names contained in *lst*.

        Args:
            lst: Iterable of strings.  Byte strings are decoded with
                ``default_encoding``; undecodable bytes are replaced
                rather than raising.

        Returns:
            A flat list of text strings, one per name, in input order.
        """
        result = []

        for s in lst:
            # ``bytes`` is an alias of ``str`` on Python 2.6+, so this
            # branch decodes exactly the non-unicode inputs.
            if isinstance(s, bytes):
                s = s.decode(self.default_encoding, 'replace')

            for name in s.split(";"):
                # Strip before testing the marker so that entries written
                # as "A; !!NOT USED" are recognised and skipped as well.
                name = name.strip()
                if not name:
                    # "a;;b" or a trailing ';' would otherwise produce an
                    # empty name.
                    continue
                if name.startswith("!!"):  # don't index !!NOT USED....
                    continue
                result.append(name)
        return result
49:
50:
# Make the splitter selectable as a 'Word Splitter' pipeline element.
try:
    element_factory.registerFactory('Word Splitter',
                                    'ECHO author splitter', authorSplitter)
except ValueError:
    # In case the splitter is already registered, ValueError is raised;
    # that is harmless.  Any other exception is a real problem and is
    # allowed to propagate instead of being silently swallowed.
    pass
57:
if __name__ == '__main__':
    # Manual smoke test: run the splitter on a couple of sample inputs.
    # NOTE(review): the sample was written as '\U00CE\U00D2...', which is
    # not a valid escape in a byte string; it is presumably meant to be
    # these GBK-encoded bytes -- confirm against the original source.
    a = b'abc def\xce\xd2\xd3\xc7\xb5\xc4\xdc\xc3\xa1\xa3'
    u = a.decode('gbk')
    s = authorSplitter()
    print(s.process([u]))
    # process() takes only the list of strings; the former second call
    # passed an extra positional argument and raised TypeError.  Show the
    # ';' splitting and the '!!' filter instead.
    print(s.process(['Smith, John; !!NOT USED; Doe, Jane']))
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>