"""
CDLI grapheme splitter (a ZCTextIndex 'Word Splitter' pipeline element)
"""
4:
5: import Zope
6: import transaction
7:
8: from Products.ZCTextIndex.ISplitter import ISplitter
9: from Products.ZCTextIndex.PipelineFactory import element_factory
10:
11: import re
12: from types import StringType
13:
def getSupportedEncoding(encodings):
    """Return the first usable encoding name from *encodings*.

    Each candidate is probed by decoding the byte string 'A' with it via
    ``unicode('A', encoding)``.  A candidate whose probe raises is skipped.

    encodings -- iterable of encoding names, tried in order.

    Returns the first name whose probe succeeds, or 'utf-8' when none does
    (this includes an empty *encodings*).
    """
    for encoding in encodings:
        try:
            unicode('A', encoding)
        except Exception:
            # Deliberately best-effort: any failure of the probe (unknown
            # codec name, codec that cannot decode a lone 'A', a name of the
            # wrong type, ...) just means "try the next one".  Catching
            # Exception rather than using a bare ``except:`` lets
            # KeyboardInterrupt and SystemExit propagate on Python >= 2.5.
            continue
        return encoding
    return 'utf-8'
22:
23:
24:
# Beta of a full-text splitter for CDLI.

# A line whose first character is listed here is never tokenised.  Lines
# starting with '&' are handled separately by graphemeSplitter.process,
# which takes the P-number from them.
ignoreLines = ['$', '@', '#', '&']
separators = ['']
# Matches a comma that is NOT preceded by one of s/S/t/T/h/H.  Substituting
# r"\1" keeps the preceding character and drops the comma, i.e. commas that
# are relevant for graphemes are not deleted by this pattern.
komma_exception = r"([^sStThH]),"
# Characters replaced by a blank before splitting into graphemes.
# NOTE(review): this pattern also matches ",", so the commas kept by
# komma_exception are turned into blanks by the very next substitution in
# graphemeSplitter.process -- confirm whether "," belongs in this list.
delete = r"\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?"  # for graphemes
# delete = r"<|>|\(|\)|_|\#|,|\||\]|\[|!|?"  for words


class graphemeSplitter:
    """Splitter that turns transliteration lines into a flat list of graphemes.

    Lines are classified by their first character:

    * ``&``            -- characters 1..7 of the line are remembered as pNum;
    * ``$``, ``@``, ``#`` -- the line is skipped;
    * anything else    -- the part before the first "." is remembered as the
      line number, the rest is cleaned with ``komma_exception`` / ``delete``
      and split on blanks.
    """

    # Encoding used to decode byte-string input lines.
    default_encoding = "utf-8"

    def process(self, lst):
        """Split every string in *lst* and return the list of graphemes.

        lst -- iterable of strings (byte strings or unicode); each may hold
               several lines separated by "\\n", "\\r" or "\\r\\n".

        Returns a list of unicode tokens in input order.

        Side effect: once a '&' line has been seen during this call, every
        token is also passed to
        ``Zope.app().cdliRoot.storeInLineIndex(token, (pNum, lineNum))`` and
        the current transaction is committed.  pNum and lineNum carry over
        from line to line (and across the elements of *lst*) until they are
        replaced; a line without "." therefore reuses the previous lineNum.
        """
        text_type = type(u"")
        result = []
        pNum = None
        lineNum = None

        for t in lst:
            # str.replace returns a new string.  The result used to be thrown
            # away, so "\r"-separated input was never split into lines.
            t = t.replace("\r", "\n")

            for s in t.split("\n"):
                if not isinstance(s, text_type):  # byte string, not unicode
                    s = s.decode(self.default_encoding, 'replace')

                if s == "":
                    continue

                if s[0] == "&":  # store pNum
                    pNum = s[1:8]
                    continue

                if s[0] in ignoreLines:
                    continue

                # Ignore everything before the first ".".  Splitting only once
                # keeps text that itself contains dots; previously such a line
                # was cut off at its second ".".
                splitted = s.split(".", 1)
                if len(splitted) == 1:  # no dot: the whole line is text
                    txt = splitted[0]
                else:
                    lineNum, txt = splitted  # store line number

                # Delete commas except those relevant in graphemes ...
                analyse = re.sub(komma_exception, r"\1", txt)
                # ... then blank out every character matched by `delete`.
                analyse = re.sub(delete, ' ', analyse)

                for w in analyse.split(" "):
                    w = w.strip()
                    if w == '':
                        continue

                    if pNum:
                        # Only when a pNum was found.  (Original author's
                        # note: the splitter is always called twice in the
                        # pipeline; this is the first call.)
                        # NOTE(review): Zope.app() and a commit are executed
                        # once per token -- confirm this is intended.
                        Zope.app().cdliRoot.storeInLineIndex(w, (pNum, lineNum))
                        transaction.get().commit()

                    result.append(w)
        return result
90:
91:
# Register the splitter with the ZCTextIndex pipeline under the
# 'Word Splitter' group at import time.
try:
    element_factory.registerFactory('Word Splitter',
                                    'CDLI grapheme splitter', graphemeSplitter)
except ValueError:
    # in case the splitter is already registered, ValueError is raised;
    # any other error is a real problem and is no longer hidden
    pass
98:
if __name__ == '__main__':
    # Manual smoke test.  The sample contains no '&' line, so process() does
    # not touch the Zope line index.
    a = 'abc def我们的很 好。'
    # NOTE(review): decoding as GBK assumes this source file is stored in
    # GBK -- confirm against the file's real encoding.
    u = unicode(a, 'gbk')
    # Was authorSplitter(), which is not defined in the visible module.
    s = graphemeSplitter()
    print(s.process([u]))
    # The former second call, s.process([u], 1), was removed: process()
    # accepts a single list argument, so it could only raise TypeError.
# FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>