1: """
CDLI grapheme splitter for ZCTextIndex
3: """
4:
5: import Zope
6:
7: from Products.ZCTextIndex.ISplitter import ISplitter
8: from Products.ZCTextIndex.PipelineFactory import element_factory
9:
10: import re
11: from types import StringType
12:
def getSupportedEncoding(encodings):
    """Return the first usable encoding name from *encodings*.

    A candidate counts as usable when its decoder can be looked up and
    successfully decodes the single ASCII byte ``A``.  Candidates are tried
    in the order given.

    encodings -- iterable of codec names to try.

    Returns the first usable name, or ``'utf-8'`` when *encodings* is empty
    or none of its entries is usable.
    """
    import codecs  # function-scope import: this is the only user in the module

    # Byte-string probe; spelled this way so it is a byte string regardless
    # of how the interpreter types plain string literals.
    probe = 'A'.encode('ascii')

    for encoding in encodings:
        try:
            codecs.getdecoder(encoding)(probe)
        except Exception:
            # Unknown codec name, a non-string name, or a codec that cannot
            # decode the probe: all simply mean "not usable", so move on.
            # Unlike a bare ``except:``, this does not trap interpreter-exit
            # signals on interpreters where those are not Exception subclasses.
            continue
        return encoding
    return 'utf-8'
21:
22:
23:
24: """beta of a fulltext splitter for cdli
25:
26: """
# A non-empty line whose first character is listed here is not tokenized by
# graphemeSplitter.process(); a leading '&' additionally makes process()
# remember characters 1..7 of that line as the current P-number.
ignoreLines = ['$', '@', '#', '&']

# Not referenced in the visible part of this module.
separators = ['']

# Matches a comma together with the character before it, unless that
# character is one of s, S, t, T, h, H.  process() replaces the match with
# the captured character, i.e. it drops the comma without inserting a space.
komma_exception = r"([^sStThH]),"

# Characters that process() replaces with a single space (grapheme variant).
# Raw string: the backslashes are regex escapes, not string escapes.
# NOTE(review): this alternation also contains ",", so the commas that
# komma_exception leaves behind after s/t/h are still turned into a space
# by the later substitution - confirm whether that is intended.
delete = r"\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?"  # for graphemes
# Inactive alternative kept from the original, labelled "for words":
#delete="<|>|\(|\)|_|\#|,|\||\]|\[|!|?" for words
32:
class graphemeSplitter:
    """Splitter that breaks transliteration text into grapheme tokens.

    The class is registered with the ZCTextIndex ``element_factory`` under
    the name 'CDLI grapheme splitter' further down in this module.
    """

    # Encoding used to decode byte strings handed to process().
    default_encoding = "utf-8"

    def process(self, lst):
        """Split every text in *lst* into grapheme tokens.

        lst -- iterable of strings; byte strings are decoded with
               ``default_encoding`` (undecodable bytes are replaced).

        Each text is cut into lines and handled line by line:

        * empty lines are skipped;
        * a line starting with "&" sets the current P-number to
          characters 1..7 of that line;
        * a line starting with any other character from ``ignoreLines``
          is skipped;
        * for any other line the part before the first "." is remembered as
          the line number and the part after it is tokenized; a line without
          "." is tokenized as a whole and leaves the line number unchanged.

        Side effect: once a P-number has been seen, every token is passed to
        ``Zope.app().cdliRoot.storeInLineIndex(token, (pNum, lineNum))`` and
        the current transaction is committed.

        Returns the list of tokens in input order.
        """
        result = []
        pNum = None
        lineNum = None

        for t in lst:
            # str.replace() returns a new string.  The original discarded the
            # result, so texts using bare "\r" line endings were never split.
            t = t.replace("\r", "\n")

            for s in t.split("\n"):
                if isinstance(s, StringType):  # byte string, not unicode
                    s = unicode(s, self.default_encoding, 'replace')

                if not s:
                    continue

                if s[0] == "&":  # store pNum
                    pNum = s[1:8]
                    continue

                if s[0] in ignoreLines:
                    continue

                # Ignore everything before the first ".".
                # NOTE(review): only the text between the first and second
                # "." is kept; anything after a second "." on the same line
                # is dropped - confirm whether that is intended.
                splitted = s.split(".")
                if len(splitted) == 1:  # no period on this line
                    txt = splitted[0]
                else:
                    txt = splitted[1]
                    lineNum = splitted[0]  # store line number

                # Drop the commas matched by komma_exception, then turn
                # every character matched by delete into a space.
                analyse = re.sub(komma_exception, r"\1", txt)
                analyse = re.sub(delete, ' ', analyse)

                for w in analyse.split(" "):
                    w = w.strip()
                    if not w:
                        continue

                    # Only when a P-number has been found.  Original author's
                    # note: that is the first call of the splitter, which is
                    # always called twice in the pipeline.
                    if pNum:
                        Zope.app().cdliRoot.storeInLineIndex(w, (pNum, lineNum))
                        get_transaction().commit()

                    result.append(w)

        return result
89:
90:
# Register the splitter in the 'Word Splitter' group when the module is
# imported.
try:
    element_factory.registerFactory('Word Splitter',
                                    'CDLI grapheme splitter', graphemeSplitter)
except ValueError:
    # In case the splitter is already registered, ValueError is raised.
    # Only that case is ignored; any other failure is allowed to surface.
    pass
97:
if __name__ == '__main__':
    # Manual smoke test for the splitter defined in this module.
    #
    # The original demo instantiated ``authorSplitter``, which this module
    # does not define, and called process() with an extra positional
    # argument that process() does not accept.  Its sample was a non-ASCII
    # literal in a file without an encoding declaration.
    #
    # The sample below has no "&" header line, so no P-number is set and
    # process() takes no index-writing branch.
    sample = '1. a-na {d}utu-szi#\n2. [x] lugal-e'
    splitter = graphemeSplitter()
    print(splitter.process([sample]))
# FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>