"""
CDLI splitters for ZCTextIndex: grapheme, word and Lucene variants
"""
4:
import logging
import os
import re
from types import StringType

import Zope2
import transaction

from Products.ZCTextIndex.ISplitter import ISplitter
from Products.ZCTextIndex.PipelineFactory import element_factory

try:
    import PyLucene
except:
    print "no Lucene support"
18:
def getSupportedEncoding(encodings):
    """Return the first usable encoding name from ``encodings``.

    Each candidate is probed by decoding the byte string 'A' with it; the
    first candidate for which that succeeds is returned.

    encodings -- iterable of encoding names, tried in order

    Returns the chosen encoding name, or 'utf-8' when ``encodings`` is empty
    or none of the candidates can be used.
    """
    for encoding in encodings:
        try:
            unicode('A', encoding)
        except Exception:
            # Unknown or unusable codec: keep probing. Unlike a bare
            # ``except:`` this does not trap KeyboardInterrupt/SystemExit
            # (Python 2.5+), so the interpreter can still be stopped.
            continue
        return encoding
    return 'utf-8'
27:
28:
29:
# Beta of a full-text splitter for CDLI.

# A line whose first character is one of these is not split into words.
ignoreLines = ['$', '@', '#', '&']

# Not referenced by the code visible in this file.
separators = ['']

# Matches a comma together with the character before it, unless that
# character is one of s, S, t, T, h, H (commas relevant for graphemes).
komma_exception = r"([^sStThH]),"

# Alternations of single characters that are replaced by a blank.
# The grapheme variant additionally removes '{', '}' and '-'.
deleteGraphems = r"\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?"  # for graphemes
deleteWords = r"<|>|\(|\)|_|\#|,|\||\]|\[|\!|\?"  # for words
38:
class IndexLine(object):
    """Index a single line with Lucene.

    Instantiating the class does all the work: the index directory is created
    if it is missing, an IndexWriter is opened on it, one document is added,
    and the writer is optimized and closed again.
    """

    def __init__(self, storeDir, analyzer, name, line, content):
        """Add ``content`` as one document to the index in ``storeDir``.

        storeDir -- path of the index directory; created when it does not exist
        analyzer -- PyLucene analyzer handed to the IndexWriter
        name     -- value of the stored, untokenized "name" field
        line     -- value of the stored, untokenized "line" field
        content  -- value of the stored, tokenized "contents" field
        """
        # NOTE(review): trace output logged at error level, kept as found.
        logging.error("i am here %s %s %s %s %s",
                      storeDir, analyzer, name, line, content)

        if not os.path.exists(storeDir):
            os.mkdir(storeDir)

        # NOTE(review): True is passed as the last argument on every
        # instantiation; presumably this is Lucene's "create" flag and
        # starts a fresh index each time -- confirm against the PyLucene API.
        store = PyLucene.FSDirectory.getDirectory(storeDir, True)
        writer = PyLucene.IndexWriter(store, analyzer, True)
        try:
            writer.setMaxFieldLength(1048576)
            self.indexDocs(writer, name, line, content)
            writer.optimize()
        finally:
            # Close the writer even if indexing fails.
            writer.close()

    def indexDocs(self, writer, name, line, content):
        """Build one document from ``name``, ``line`` and ``content`` and add it.

        The previous body referenced the undefined names ``pn`` and ``i`` and
        stored ``line`` in the "contents" field; the parameters are now used
        for the fields they are named after.
        """
        # Strings pass through untouched; anything else (e.g. None or an int)
        # is rendered as text so the field always receives a string.
        if not isinstance(line, basestring):
            line = str(line)

        doc = PyLucene.Document()
        doc.add(PyLucene.Field("name", name,
                               PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.UN_TOKENIZED))

        doc.add(PyLucene.Field("line", line,
                               PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.UN_TOKENIZED))

        doc.add(PyLucene.Field("contents", content,
                               PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.TOKENIZED))

        writer.addDocument(doc)
70:
class cdliSplitter:
    """Base class of the CDLI splitters.

    The word splitter and the grapheme splitter differ only in the set of
    characters that is removed from a line (``delete``).
    """

    default_encoding = "utf-8"
    delete = deleteGraphems      # regex; every match is replaced by a blank
    indexName = "cdliSplitter"   # passed to storeInLineIndex; selects the Lucene path

    def process(self, lst):
        """Split the texts in ``lst`` into words and return them as a list.

        Every text is processed line by line:

        * a line starting with "&" sets pNum to characters 1-7 of that line;
        * a line starting with another character of ``ignoreLines`` is skipped;
        * any other non-empty line is split at its first period into a line
          number and the text; a line without a period is text only and the
          previous line number stays in effect.

        The text is cleaned with ``komma_exception`` and ``self.delete`` and
        split at blanks. While a pNum is known, every word is also stored via
        ``cdliRoot.storeInLineIndex`` and the transaction is committed. When
        ``indexName`` is "luceneSplitter" the cleaned line is handed to
        ``IndexLine`` instead and no words are returned for it.

        lst -- iterable of byte or unicode strings; byte strings are decoded
               with ``default_encoding`` (undecodable bytes are replaced)
        """
        result = []
        pNum = None
        lineNum = None

        for t in lst:
            if isinstance(t, str):  # byte string, not unicode
                t = unicode(t, self.default_encoding, 'replace')

            # str.replace returns a new string; the result used to be thrown
            # away, so "\r" line ends were never normalised.
            t = t.replace("\r", "\n")

            for s in t.split("\n"):
                if s == "":
                    continue

                if s[0] == "&":  # store pNum
                    pNum = s[1:8]
                    logging.debug("storing: %s" % pNum)
                    continue

                if s[0] in ignoreLines:
                    continue

                # Split at the first period only, so that periods inside the
                # text no longer cut the line short.
                splitted = s.split(".", 1)
                if len(splitted) == 1:  # no period
                    txt = splitted[0]
                else:
                    lineNum, txt = splitted  # store line number

                # Delete commas except those relevant in graphemes.
                # NOTE(review): self.delete also matches ",", so the commas
                # kept here are replaced by a blank in the next step --
                # confirm that this is intended.
                analyse = re.sub(komma_exception, r"\1", txt)
                analyse = re.sub(self.delete, ' ', analyse)  # deletions

                if self.indexName == "luceneSplitter":
                    if pNum:
                        analyser = PyLucene.StandardAnalyzer()
                        logging.error("calling lucene")
                        IndexLine("/tmp/index", analyser, pNum, lineNum, analyse)
                    continue

                for w in analyse.split(" "):
                    w = w.strip()
                    if w == '':
                        continue

                    # Only when a pNum was found (first call of the splitter;
                    # it is always called twice in the pipeline).
                    if pNum:
                        Zope2.app().cdliRoot.storeInLineIndex(
                            self.indexName, w, (pNum, lineNum))
                        transaction.get().commit()

                    result.append(w)
        return result
131:
132:
class graphemeSplitter(cdliSplitter):
    """cdliSplitter that removes the ``deleteGraphems`` characters."""

    indexName = "graphemeSplitter"
    delete = deleteGraphems
136:
class wordSplitter(cdliSplitter):
    """cdliSplitter that removes the ``deleteWords`` characters."""

    indexName = "wordSplitter"
    delete = deleteWords
140:
class luceneSplitter(cdliSplitter):
    """cdliSplitter that removes the ``deleteWords`` characters.

    Its ``indexName`` is the value ``cdliSplitter.process`` compares against
    to hand lines to ``IndexLine`` rather than to the Zope line index.
    """

    indexName = "luceneSplitter"
    delete = deleteWords
144:
145:
# Register the three splitters in the 'Word Splitter' group of the
# ZCTextIndex pipeline factory. If a splitter is already registered,
# ValueError is raised; only that case is ignored, any other error
# is allowed to surface.
for _splitterName, _splitterClass in (
        ('CDLI grapheme splitter', graphemeSplitter),
        ('CDLI word splitter', wordSplitter),
        ('CDLI lucene splitter', luceneSplitter)):
    try:
        element_factory.registerFactory('Word Splitter',
                                        _splitterName, _splitterClass)
    except ValueError:
        pass
if __name__ == '__main__':
    # Smoke test. The sample contains no "&" line, so no pNum is set and
    # process() only splits the text without storing anything.
    # The former code instantiated the undefined name authorSplitter and
    # passed a second argument that process() does not accept.
    # Unicode escapes keep this source ASCII-only, so no coding declaration
    # is required for the sample text.
    u = u'abc def\u6211\u4eec\u7684\u5f88 \u597d\u3002'
    print(wordSplitter().process([u]))
    print(graphemeSplitter().process([u]))
# FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>