"""
CDLI splitters for ZCTextIndex: grapheme, word and Lucene line splitters.
"""
4:
5: import Zope2
6: import transaction
7:
8: from Products.ZCTextIndex.ISplitter import ISplitter
9: from Products.ZCTextIndex.PipelineFactory import element_factory
10:
11: import re
12: from types import StringType
13: import logging
14:
15: import PyLucene
16:
def getSupportedEncoding(encodings):
    """Return the first codec name in *encodings* that passes a decode probe.

    Candidates are tried in the order given; each one is probed by decoding
    the single ASCII byte ``A`` with it.

    :param encodings: iterable of codec names to try.
    :returns: the first candidate whose probe succeeds, or ``'utf-8'`` when
        none does (including when *encodings* is empty).
    """
    for encoding in encodings:
        try:
            # b'A' is a byte string on Python 2.6+ and Python 3 alike, so this
            # is the portable spelling of the old ``unicode('A', encoding)``.
            b'A'.decode(encoding)
        except Exception:
            # Deliberately broad: an unusable candidate may fail with
            # LookupError, UnicodeError, TypeError or a codec-specific error.
            # Unlike a bare ``except:`` this still lets KeyboardInterrupt and
            # SystemExit propagate.
            continue
        return encoding
    return 'utf-8'
25:
26:
27:
# Beta of a full-text splitter for CDLI.

# Lines whose first character is one of these are not tokenised by
# cdliSplitter.process.
ignoreLines = ['$', '@', '#', '&']
separators = ['']

# The regular expressions below are raw strings so that the backslashes reach
# the ``re`` module unchanged without relying on Python passing unknown escape
# sequences through (newer interpreters warn about that).

# A comma preceded by any character other than s, S, t, T, h or H; commas
# relevant for graphemes are not matched.
komma_exception = r"([^sStThH]),"
# Characters blanked out before splitting into graphemes.
deleteGraphems = r"\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?"
# Characters blanked out before splitting into words.
deleteWords = r"<|>|\(|\)|_|\#|,|\||\]|\[|\!|\?"
36:
class IndexLine(object):
    """Write one text line into a PyLucene index.

    Instantiating the class does all the work: it makes sure the index
    directory exists, opens an index writer on it, adds one document for the
    given line, optimizes the index and closes the writer.
    """

    def __init__(self, storeDir, analyzer, name, line, content):
        """Index *content* under *name* / *line* in the index at *storeDir*.

        :param storeDir: path of the index directory; created when missing.
        :param analyzer: PyLucene analyzer handed to the index writer.
        :param name: value stored in the document's ``name`` field.
        :param line: value stored (as text) in the ``line`` field.
        :param content: text stored and tokenized in the ``contents`` field.
        """
        # Function-scope import: the module-level imports do not include
        # ``os``, so the original raised NameError on the first call.
        import os

        logging.error("i am here %s %s %s %s %s",
                      storeDir, analyzer, name, line, content)
        if not os.path.exists(storeDir):
            os.mkdir(storeDir)
        # NOTE(review): both calls pass True as their last argument, which
        # looks like Lucene's "create" flag -- if so every instantiation
        # starts a fresh index and discards earlier lines. Confirm intent.
        store = PyLucene.FSDirectory.getDirectory(storeDir, True)
        writer = PyLucene.IndexWriter(store, analyzer, True)
        try:
            writer.setMaxFieldLength(1048576)
            self.indexDocs(writer, name, line, content)
            writer.optimize()
        finally:
            # Close the writer even when indexing fails.
            writer.close()

    def indexDocs(self, writer, name, line, content):
        """Add a single document describing one text line to *writer*.

        The document has three stored fields: ``name`` and ``line`` are
        indexed untokenized, ``contents`` is indexed tokenized.
        """
        doc = PyLucene.Document()
        # The original referenced the undefined names ``pn`` and ``i`` here
        # and stored ``line`` as the contents; use the actual parameters.
        doc.add(PyLucene.Field("name", name,
                               PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.UN_TOKENIZED))

        # u"%s" formatting turns any value (including None) into text without
        # the ASCII-only restriction str() has on Python 2.
        doc.add(PyLucene.Field("line", u"%s" % (line,),
                               PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.UN_TOKENIZED))

        doc.add(PyLucene.Field("contents", content,
                               PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.TOKENIZED))

        writer.addDocument(doc)
68:
class cdliSplitter:
    """Base class of the CDLI splitters.

    The word and grapheme splitters differ only in their deletion list
    (``delete``) and in the name of the index they feed (``indexName``).
    """

    default_encoding = "utf-8"
    delete = deleteGraphems
    indexName = "cdliSplitter"

    def process(self, lst):
        """Split the texts in *lst* into tokens, line by line.

        For every line of every text:

        * a line starting with ``&`` sets the current P-number (the seven
          characters following the ``&``) and is otherwise skipped;
        * empty lines and lines starting with a character listed in
          ``ignoreLines`` are skipped;
        * any other line is split at its first period into line number and
          text; a line without a period is all text and keeps the previous
          line number. The text is cleaned with ``komma_exception`` and
          ``self.delete`` and then split on blanks.

        While a P-number is known, each token is also stored through
        ``cdliRoot.storeInLineIndex`` and the transaction is committed; for
        ``indexName == "luceneSplitter"`` the cleaned line is handed to
        ``IndexLine`` instead and contributes no tokens to the result.

        :param lst: iterable of byte or unicode strings.
        :returns: list of the non-empty tokens found.
        """
        result = []
        pNum = None
        lineNum = None

        for t in lst:
            if isinstance(t, bytes):  # byte string, not unicode yet
                t = t.decode(self.default_encoding, 'replace')

            # Strings are immutable: the original dropped the value returned
            # by replace(), so bare "\r" line ends were never split.
            t = t.replace("\r", "\n")

            for s in t.split("\n"):
                if s == "":
                    continue
                if s[0] == "&":  # store pNum
                    pNum = s[1:8]
                    logging.debug("storing: %s" % pNum)
                    continue
                if s[0] in ignoreLines:
                    continue

                # Split at the FIRST period only; an unlimited split dropped
                # everything after a second period in the text.
                parts = s.split(".", 1)
                if len(parts) == 1:  # no period
                    txt = parts[0]
                else:
                    lineNum, txt = parts  # store line number

                # Delete commas except those relevant in graphemes.
                # NOTE(review): deleteGraphems and deleteWords both match ","
                # too, so the commas kept here are blanked by the next
                # substitution -- confirm that this is intended.
                analyse = re.sub(komma_exception, r"\1", txt)
                analyse = re.sub(self.delete, ' ', analyse)  # deletions

                if self.indexName == "luceneSplitter":
                    if pNum:
                        analyser = PyLucene.StandardAnalyzer()
                        logging.error("calling lucene")
                        IndexLine("/tmp/index", analyser, pNum, lineNum, analyse)
                    continue

                for w in analyse.split(" "):
                    w = w.strip()
                    if w == '':
                        continue
                    # Store only once a P-number has been found; per the
                    # original author's comment the splitter is called twice
                    # in the pipeline and this applies to the first call.
                    if pNum:
                        # NOTE(review): Zope2.app() and a commit per token
                        # are kept exactly as in the original; this looks
                        # costly -- verify before restructuring.
                        Zope2.app().cdliRoot.storeInLineIndex(
                            self.indexName, w, (pNum, lineNum))
                        transaction.get().commit()

                    result.append(w)
        return result
129:
130:
class graphemeSplitter(cdliSplitter):
    """Splitter that applies the deleteGraphems pattern; index name "graphemeSplitter"."""

    indexName = "graphemeSplitter"
    delete = deleteGraphems
134:
class wordSplitter(cdliSplitter):
    """Splitter that applies the deleteWords pattern; index name "wordSplitter"."""

    indexName = "wordSplitter"
    delete = deleteWords
138:
class luceneSplitter(cdliSplitter):
    """Splitter that applies the deleteWords pattern; index name "luceneSplitter".

    cdliSplitter.process checks for this index name to take its PyLucene
    branch.
    """

    indexName = "luceneSplitter"
    delete = deleteWords
142:
143:
def _registerSplitter(name, factory):
    """Register *factory* as a 'Word Splitter' pipeline element named *name*.

    registerFactory raises ValueError when the splitter is already
    registered; only that case is ignored, any other error propagates
    instead of being silently swallowed.
    """
    try:
        element_factory.registerFactory('Word Splitter', name, factory)
    except ValueError:
        # in case the splitter is already registered, ValueError is raised
        pass


_registerSplitter('CDLI grapheme splitter', graphemeSplitter)
_registerSplitter('CDLI word splitter', wordSplitter)
_registerSplitter('CDLI lucene splitter', luceneSplitter)
if __name__ == '__main__':
    # Manual smoke test: split a short mixed ASCII / Chinese sample.
    # The sample is written with \u escapes so the source stays pure ASCII;
    # without a coding declaration a non-ASCII literal is a SyntaxError
    # (PEP 263). The escapes spell the characters of the original literal.
    u = u'abc def\u6211\u4eec\u7684\u5f88 \u597d\u3002'
    # The original instantiated an undefined ``authorSplitter`` and then
    # called process() with a second argument it does not accept.
    s = wordSplitter()
    print(s.process([u]))
# FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>