Annotation of cdli/cdliSplitter.py, revision 1.3
1.1 dwinter 1: """
2: CDLI grapheme splitter for ZCTextIndex
3: """
4:
1.2 dwinter 5: import Zope
1.3 ! dwinter 6: import transaction
1.2 dwinter 7:
1.1 dwinter 8: from Products.ZCTextIndex.ISplitter import ISplitter
9: from Products.ZCTextIndex.PipelineFactory import element_factory
10:
11: import re
12: from types import StringType
13:
def getSupportedEncoding(encodings):
    """Return the first usable text encoding from *encodings*.

    Each candidate is probed by decoding the single byte ``A`` with it.
    The first candidate for which decoding succeeds is returned.  When no
    candidate works (or *encodings* is empty) ``'utf-8'`` is returned.
    """
    # ``unicode`` only exists on Python 2; ``str(bytes, encoding)`` is the
    # same conversion on Python 3.  Resolving it here keeps the Python 2
    # behaviour unchanged while letting the helper run elsewhere too.
    try:
        decode = unicode
    except NameError:
        decode = str
    probe = 'A'.encode('ascii')  # a byte string on either version

    for encoding in encodings:
        try:
            decode(probe, encoding)
        except (LookupError, ValueError, TypeError):
            # LookupError: unknown codec name
            # ValueError:  includes UnicodeError (codec cannot decode probe)
            # TypeError:   candidate is not a usable encoding name
            # Anything else (e.g. KeyboardInterrupt) is no longer swallowed.
            continue
        return encoding
    return 'utf-8'
22:
23:
24:
# Beta of a fulltext splitter for cdli.

# A line whose first character is one of these is not split into graphemes.
ignoreLines = ['$', '@', '#', '&']

# Not referenced in the visible part of this module.
separators = ['']

# Matches a comma together with the character in front of it, unless that
# character is one of s, S, t, T, h, H.  Substituting the match with group 1
# removes the comma and keeps the character, so commas following those six
# letters (the ones relevant for graphemes) survive this step.
komma_exception = r"([^sStThH]),"

# Characters that are replaced by a blank before splitting into graphemes.
# NOTE(review): this alternation also contains ",", so a comma kept by
# komma_exception is still turned into a blank when both patterns are applied
# one after the other -- confirm whether that is intended.
delete = r"\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?"

# Variant of the pattern above meant for words instead of graphemes:
# delete="<|>|\(|\)|_|\#|,|\||\]|\[|!|?"
1.1 dwinter 33:
class graphemeSplitter:
    """Splitter that breaks transliteration text into grapheme tokens.

    The class is registered below as a 'Word Splitter' pipeline element.
    Besides returning the tokens it records, for every token, the text id
    and line number it was found in via ``cdliRoot.storeInLineIndex``.
    """

    # Encoding used to decode byte strings handed to process().
    default_encoding = "utf-8"

    def process(self, lst):
        """Split every string in *lst* into graphemes.

        Each string is processed line by line:

        * a line starting with ``&`` sets the current text id (characters
          1 to 7 of that line) and is not tokenized;
        * a line starting with any other character of ``ignoreLines`` is
          skipped, as are empty lines;
        * for any other line the part before the first ``.`` is remembered
          as line number and the remainder is tokenized.  A line without a
          dot is tokenized as a whole and keeps the previous line number.

        While a text id is known, every token is stored in the line index
        together with ``(pNum, lineNum)`` and the transaction is committed.

        Returns the list of all tokens in order of appearance.
        """
        result = []
        pNum = None
        lineNum = None
        app = None  # opened lazily, only when something has to be stored

        for t in lst:
            # str.replace returns a new string; the result must be kept,
            # otherwise carriage returns never act as line breaks.
            t = t.replace("\r", "\n")

            for s in t.split("\n"):
                if isinstance(s, str):  # byte string, not unicode
                    s = unicode(s, self.default_encoding, 'replace')

                if s == "":
                    continue

                if s[0] == "&":  # store pNum
                    pNum = s[1:8]
                    continue

                if s[0] in ignoreLines:
                    continue

                # Ignore everything before the first "."; split only once so
                # that dots inside the text itself do not cut it short.
                parts = s.split(".", 1)
                if len(parts) == 1:  # no dot in this line
                    txt = parts[0]
                else:
                    lineNum, txt = parts  # store line number

                for w in self._graphemes(txt):
                    # Only store when a pNum was found.  (Original author's
                    # note: that is the first call of the splitter, which is
                    # always called twice in the pipeline.)
                    if pNum:
                        if app is None:
                            # One connection per call instead of a new,
                            # never closed one for every single token.
                            app = Zope.app()
                        app.cdliRoot.storeInLineIndex(w, (pNum, lineNum))
                        transaction.get().commit()

                    result.append(w)

        if app is not None:
            # Every store above has been committed, so the connection can be
            # handed back.
            # NOTE(review): when an exception escapes the loop the connection
            # is left open, as before; it may then hold uncommitted changes --
            # confirm how the caller is expected to clean up in that case.
            app._p_jar.close()

        return result

    def _graphemes(self, txt):
        """Return the non-empty, whitespace-stripped tokens of *txt*."""
        # Remove the commas matched by komma_exception (see its definition).
        cleaned = re.sub(komma_exception, r"\1", txt)
        # Replace every character listed in ``delete`` by a blank.
        cleaned = re.sub(delete, ' ', cleaned)

        tokens = []
        for w in cleaned.split(" "):
            w = w.strip()
            if w:
                tokens.append(w)
        return tokens
90:
91:
# Make the splitter available as a 'Word Splitter' pipeline element.
try:
    element_factory.registerFactory('Word Splitter',
                                    'CDLI grapheme splitter',
                                    graphemeSplitter)
except ValueError:
    # in case the splitter is already registered, ValueError is raised;
    # any other failure is no longer hidden
    pass
98:
if __name__ == '__main__':
    # Small manual smoke test: split one sample string and print the tokens.
    a = 'abc def我们的很 好。'
    u = unicode(a, 'gbk')
    # This module defines graphemeSplitter; there is no authorSplitter here.
    s = graphemeSplitter()
    # process() takes exactly one argument (the list of strings), so the
    # former second call with an extra positional argument was dropped.
    print(s.process([u]))
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>