Annotation of cdli/cdliSplitter.py, revision 1.2
1.1 dwinter 1: """
2: CDLI grapheme splitter (fulltext splitter for the ZCTextIndex pipeline)
3: """
4:
1.2 ! dwinter 5: import Zope
! 6:
1.1 dwinter 7: from Products.ZCTextIndex.ISplitter import ISplitter
8: from Products.ZCTextIndex.PipelineFactory import element_factory
9:
10: import re
11: from types import StringType
12:
def getSupportedEncoding(encodings):
    """Return the first entry of *encodings* that can decode the byte 'A'.

    The candidates are tried in the order given.  A candidate is skipped
    when decoding the probe with it fails for any reason (unknown codec,
    codec that cannot decode a single byte, entry that is not an encoding
    name, ...).  If no candidate qualifies, or *encodings* is empty,
    'utf-8' is returned.
    """
    # 'A'.encode('ascii') yields a byte string on every Python version, so
    # the probe does not depend on the Python-2-only ``unicode`` builtin.
    probe = 'A'.encode('ascii')
    for encoding in encodings:
        try:
            probe.decode(encoding)
        except Exception:
            # Deliberately broad: this is a best-effort capability probe and
            # any failure simply means "not usable".  Unlike a bare
            # ``except:`` it does not trap KeyboardInterrupt / SystemExit.
            continue
        return encoding
    return 'utf-8'
21:
22:
23:
# Beta of a fulltext splitter for CDLI.

# Lines whose first character is one of these are not split into graphemes.
ignoreLines = ['$', '@', '#', '&']
separators = ['']

# A comma that follows any character other than s, S, t, T, h or H.  The
# character is captured so that a substitution with r"\1" removes only the
# comma; commas after s/S/t/T/h/H are relevant for graphemes and are left
# alone by this pattern.
komma_exception = r"([^sStThH]),"

# Characters that are replaced by a blank when splitting into graphemes.
# Raw string: the backslashes belong to the regular expression, not to
# Python string escapes.
# NOTE(review): this pattern also contains ",", so commas spared by
# komma_exception are still turned into blanks when both substitutions are
# applied one after the other -- confirm that this is intended.
delete = r"\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?"  # for graphemes
# Alternative set for splitting into words (currently unused):
# delete = r"<|>|\(|\)|_|\#|,|\||\]|\[|!|?"
1.1 dwinter 32:
class graphemeSplitter:
    """ZCTextIndex pipeline element that splits CDLI text into graphemes.

    While splitting, every grapheme found after a P-number line is also
    passed to ``cdliRoot.storeInLineIndex`` together with the current
    P-number and line number.
    """

    default_encoding = "utf-8"

    def process(self, lst):
        """Split every text in *lst* into graphemes.

        Each text is processed line by line:

        * a line starting with "&" sets the current P-number (characters
          1 to 7 of that line) and is not split;
        * empty lines and lines starting with one of ``ignoreLines`` are
          skipped;
        * for any other line, the part before the first "." is remembered
          as the line number and the part after it is cleaned with the
          ``komma_exception`` and ``delete`` patterns and split on blanks.

        Byte strings are decoded with ``default_encoding`` (undecodable
        bytes are replaced).

        Returns the list of graphemes in the order they were found.
        """
        result = []
        pNum = None
        lineNum = None

        for t in lst:
            # str.replace returns a new string; the result has to be kept,
            # otherwise carriage returns never become line breaks.
            t = t.replace("\r", "\n")

            for s in t.split("\n"):
                if type(s) is StringType:  # not unicode
                    s = unicode(s, self.default_encoding, 'replace')

                if s == "":
                    continue

                if s[0] == "&":  # store pNum
                    pNum = s[1:8]
                    continue

                if s[0] in ignoreLines:
                    continue

                # Ignore everything before the first ".".
                # NOTE(review): only the segment between the first and the
                # second "." is analysed; anything after a second "." is
                # dropped -- confirm that this is intended.
                parts = s.split(".")
                if len(parts) == 1:  # no dot in this line
                    txt = parts[0]
                else:
                    txt = parts[1]
                    lineNum = parts[0]  # store line number

                # Delete commas except those relevant in graphemes, then
                # blank out every character matched by ``delete``.
                analyse = re.sub(komma_exception, r"\1", txt)
                analyse = re.sub(delete, ' ', analyse)

                for w in analyse.split(" "):
                    w = w.strip()
                    if w == '':
                        continue

                    if pNum:
                        # Only index once a P-number has been seen.  Original
                        # author's note: the splitter is always called twice
                        # in the pipeline, and this applies to the first call.
                        # NOTE(review): one Zope.app() call and one commit per
                        # grapheme -- presumably expensive; verify before
                        # changing, the transaction boundaries may matter.
                        Zope.app().cdliRoot.storeInLineIndex(w, (pNum, lineNum))
                        get_transaction().commit()

                    result.append(w)

        return result
89:
90:
# Register the splitter under the 'Word Splitter' group of the ZCTextIndex
# pipeline so that it can be chosen when a lexicon is created.
try:
    element_factory.registerFactory('Word Splitter',
                                    'CDLI grapheme splitter', graphemeSplitter)
except ValueError:
    # Raised when the splitter is already registered (e.g. the module is
    # loaded a second time); any other error is left to propagate.
    pass
97:
if __name__ == '__main__':
    # Manual smoke test (Python 2): run one sample text through the splitter
    # and print the resulting graphemes.
    a = 'abc def我们的很 好。'
    u = unicode(a, 'gbk')
    # Use the class this module actually defines; process() takes exactly one
    # argument (the list of texts), so it is called once with that list.
    s = graphemeSplitter()
    print(s.process([u]))
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>