1: """
CDLI grapheme splitter (word-splitter element for ZCTextIndex)
3: """
4:
5: from Products.ZCTextIndex.ISplitter import ISplitter
6: from Products.ZCTextIndex.PipelineFactory import element_factory
7:
8: import re
9: from types import StringType
10:
def getSupportedEncoding(encodings):
    """Return the first encoding in *encodings* that can decode ASCII 'A'.

    Each candidate is probed by decoding the single byte ``'A'`` with it.
    Candidates that are unknown to the codec registry, that are not valid
    encoding names, or that cannot decode that byte are skipped.

    Returns ``'utf-8'`` when no candidate works (including an empty input).
    """
    for encoding in encodings:
        try:
            # b'A'.decode(enc) is the portable spelling of unicode('A', enc)
            # and works on both Python 2.6+ and Python 3.
            b'A'.decode(encoding)
        except (LookupError, TypeError, ValueError):
            # LookupError: unknown codec; TypeError: not an encoding name;
            # ValueError (incl. UnicodeError): codec cannot decode the probe.
            # Anything else (KeyboardInterrupt, SystemExit, ...) propagates.
            continue
        return encoding
    return 'utf-8'
19:
20:
21:
22: """beta of a fulltext splitter for cdli
23:
24: """
# A line whose first character is one of these is skipped entirely.
ignoreLines = ['$', '@', '#', '&']
# Not referenced by the code visible in this file; kept for importers.
separators = ['']
# Alternation of single characters that are replaced by a blank before the
# text is cut into tokens.  Raw string: same pattern text as before, without
# relying on invalid string escapes such as "\(".
delete = r"{|}|<|>|\(|\)|-|_|\#|,|\~|\||\]|\["
_DELETE_RE = re.compile(delete)


class graphemeSplitter:
    """Split text into tokens, line by line.

    For every input string:

    * byte strings are decoded with ``default_encoding`` (undecodable
      bytes are replaced),
    * carriage returns are treated as line breaks,
    * empty lines and lines starting with a character from
      ``ignoreLines`` are skipped,
    * everything up to and including the first "." of a line is dropped
      (lines without a "." are used whole),
    * the characters matched by ``delete`` are replaced by blanks,
    * the remainder is split on blanks; empty tokens are discarded.
    """

    default_encoding = "utf-8"

    def process(self, lst):
        """Return the list of tokens found in the strings of *lst*.

        *lst* is an iterable of text or byte strings.  The result is a flat
        list of non-empty text tokens in input order.
        """
        result = []

        for t in lst:
            # Decode first so the rest works on text only.  On Python 2
            # ``bytes`` is ``str``, i.e. the non-unicode string type.
            if isinstance(t, bytes):
                t = t.decode(self.default_encoding, 'replace')

            # str.replace returns a new string; the result has to be used,
            # otherwise CR-only line breaks are never split.
            for line in t.replace("\r", "\n").split("\n"):
                if not line or line[0] in ignoreLines:
                    continue

                # Ignore everything before the first "."; keep the whole
                # remainder even if it contains further dots.  A line
                # without a dot is taken as is.
                txt = line.split(".", 1)[-1]

                # Split on single blanks only (not on arbitrary whitespace).
                for w in _DELETE_RE.sub(' ', txt).split(" "):
                    w = w.strip()
                    if w:
                        result.append(w)
        return result
68:
# Make the splitter selectable both in the standard 'Word Splitter' group
# and in its own 'graphemeSplitter' group.
for _group in ('Word Splitter', 'graphemeSplitter'):
    try:
        element_factory.registerFactory(_group,
                                        'CDLI grapheme splitter',
                                        graphemeSplitter)
    except ValueError:
        # in case the splitter is already registered, ValueError is raised
        # (e.g. when this module is imported a second time)
        pass
78:
if __name__ == '__main__':
    # Small manual smoke test: run the splitter over a few sample lines and
    # show the resulting tokens.  The sample is plain ASCII so the file needs
    # no source-encoding declaration.
    sample = "&P000001 = Example\n@tablet\n1. a-na {d}utu\n2. be-li2-ia\n"
    splitter = graphemeSplitter()
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(repr(splitter.process([sample])))
# FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>