Annotation of cdli/cdliSplitter.py, revision 1.7
"""
Splitters for the CDLI full-text index (grapheme, word and Lucene variants).
"""
4:
import logging
import os
import re
from types import StringType

import Zope2
import transaction

from Products.ZCTextIndex.ISplitter import ISplitter
from Products.ZCTextIndex.PipelineFactory import element_factory
# PyLucene is optional: it is only used by IndexLine and by the
# "luceneSplitter" branch of cdliSplitter.process.
try:
    import PyLucene
except ImportError:
    # Only a missing module is tolerated; any other failure should surface.
    print("no Lucene support")
1.1 dwinter 18:
def getSupportedEncoding(encodings):
    """Return the first encoding in *encodings* that can decode ASCII text.

    Each candidate is probed by decoding the single byte "A" with it.
    Candidates that are unknown to the codec registry, that cannot decode
    the probe, or that are not valid encoding names are skipped.

    Parameters:
        encodings -- iterable of encoding names, tried in order.

    Returns:
        The first usable encoding name, or 'utf-8' when none is usable
        (including when *encodings* is empty).
    """
    # A byte string on both Python 2 and Python 3.
    probe = 'A'.encode('ascii')
    for encoding in encodings:
        try:
            probe.decode(encoding)
        except (LookupError, TypeError, ValueError):
            # LookupError: unknown codec; ValueError covers UnicodeError;
            # TypeError: the candidate is not a string at all.
            continue
        return encoding
    return 'utf-8'
27:
28:
29:
# Beta of a fulltext splitter for cdli.

# A line whose first character is in this list is not split into terms.
ignoreLines = ['$', '@', '#', '&', '>']
separators = ['']

# The patterns below are raw strings so that the regex escapes (\{, \(, \#,
# ...) are not interpreted as (invalid) string escape sequences.

# Matches a comma together with the character before it, unless that
# character is one of s, S, t, T, h, H (commas relevant for graphemes).
komma_exception = r"([^sStThH]),"
# Characters replaced by a blank when splitting into graphemes.
deleteGraphems = r"\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?"
# Characters replaced by a blank when splitting into words.
deleteWords = r"<|>|\(|\)|_|\#|,|\||\]|\[|\!|\?"
1.1 dwinter 38:
class IndexLine(object):
    """Index a single line with Lucene.

    Instantiating the class does the work: the constructor opens an index
    writer on *storeDir*, adds one document for the line, optimizes the
    index and closes the writer.
    """

    def __init__(self, storeDir, analyzer, name, line, content):
        """Write one document to the Lucene index in *storeDir*.

        Parameters:
            storeDir -- directory of the index; created if it does not exist.
            analyzer -- PyLucene analyzer handed to the IndexWriter.
            name     -- value stored in the "name" field.
            line     -- value stored (as text) in the "line" field.
            content  -- text stored and tokenized in the "contents" field.
        """
        logging.error("i am here %s %s %s %s %s"%((storeDir, analyzer,name,line,content)))
        if not os.path.exists(storeDir):
            os.mkdir(storeDir)
        # NOTE(review): both calls pass True as their last argument, which
        # presumably means "create" and would start a fresh index on every
        # instantiation -- confirm against the PyLucene API.
        store = PyLucene.FSDirectory.getDirectory(storeDir, True)
        writer = PyLucene.IndexWriter(store, analyzer, True)
        try:
            writer.setMaxFieldLength(1048576)
            self.indexDocs(writer, name, line, content)
            writer.optimize()
        finally:
            # Close the writer even when indexing fails.
            writer.close()

    def indexDocs(self, writer, name, line, content):
        """Add one document with "name", "line" and "contents" fields.

        The original body referenced the undefined names ``pn`` and ``i``
        and stored ``line`` as the contents; the fields are now filled from
        the matching parameters.
        """
        doc = PyLucene.Document()
        doc.add(PyLucene.Field("name", name,
                               PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.UN_TOKENIZED))

        # "%s" formatting keeps unicode line numbers intact and also copes
        # with a line number that is not a string.
        doc.add(PyLucene.Field("line", "%s" % (line,),
                               PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.UN_TOKENIZED))

        doc.add(PyLucene.Field("contents", content,
                               PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.TOKENIZED))

        writer.addDocument(doc)
70:
class cdliSplitter:
    """Base class for the splitters.

    The word and grapheme splitters differ only in their exclusion list
    (the ``delete`` pattern); ``indexName`` names the line index that the
    terms are stored in.
    """

    default_encoding = "utf-8"
    delete = deleteGraphems
    indexName = "cdliSplitter"

    def process(self, lst):
        """Split the strings in *lst* into terms.

        Every string is split into lines. Empty lines are skipped. A line
        starting with "&" sets the current text id (characters 1-7 of the
        line); lines starting with any other character from ``ignoreLines``
        are skipped. Every remaining line is read as
        ``<line number>.<text>``; a line without a period is all text and
        the previously seen line number stays in effect.

        While a text id is known, each term is also stored in the line
        index ``self.indexName`` together with ``(text id, line number)``.
        For ``indexName == "luceneSplitter"`` the cleaned line is handed to
        ``IndexLine`` instead and no terms are returned for it.

        Returns:
            list of the terms found (whitespace-stripped, never empty).
        """
        result = []
        pNum = None
        lineNum = None

        for t in lst:
            # str.replace returns a new string; the result used to be thrown
            # away, so carriage returns were never normalised.
            t = t.replace("\r", "\n")
            for s in t.split("\n"):
                if isinstance(s, StringType):  # not unicode
                    s = unicode(s, self.default_encoding, 'replace')

                if s == "":
                    continue
                if s[0] == "&":  # store pNum
                    pNum = s[1:8]
                    logging.debug("storing: %s"%pNum)
                    continue
                if s[0] in ignoreLines:
                    continue

                # Split at the first period only, so that periods inside the
                # text itself do not cut the text short.
                splitted = s.split(".", 1)
                if len(splitted) == 1:  # no period
                    txt = splitted[0]
                else:
                    lineNum, txt = splitted  # store line number

                # Delete commas except those following s, S, t, T, h or H.
                # NOTE(review): both delete patterns also list ",", so the
                # commas kept here are turned into blanks by the next
                # substitution -- confirm this is intended.
                analyse = re.sub(komma_exception, r"\1", txt)
                analyse = re.sub(self.delete, ' ', analyse)  # deletions

                if self.indexName == "luceneSplitter":
                    if pNum:
                        analyser = PyLucene.StandardAnalyzer()
                        logging.error("calling lucene")
                        IndexLine("/tmp/index", analyser, pNum, lineNum, analyse)
                    continue

                for w in analyse.split(" "):
                    w = w.strip()
                    if w == '':
                        continue
                    # Only store when a pNum has been found; the splitter is
                    # called more than once in the pipeline.
                    if pNum:
                        # NOTE(review): Zope2.app() is called and a
                        # transaction committed for every single term;
                        # presumably each call opens a new connection --
                        # confirm and consider doing this once per call.
                        Zope2.app().cdliRoot.storeInLineIndex(self.indexName, w, (pNum, lineNum))
                        transaction.get().commit()

                    result.append(w)
        return result
1.2 dwinter 131:
132:
class graphemeSplitter(cdliSplitter):
    """cdliSplitter using the grapheme exclusion list and its own line index."""

    indexName = "graphemeSplitter"
    delete = deleteGraphems
136:
class wordSplitter(cdliSplitter):
    """cdliSplitter using the word exclusion list and its own line index."""

    indexName = "wordSplitter"
    delete = deleteWords
140:
class luceneSplitter(cdliSplitter):
    """cdliSplitter using the word exclusion list; process() hands its lines to Lucene."""

    indexName = "luceneSplitter"
    delete = deleteWords
144:
145:
# Register the three splitters in the 'Word Splitter' group of the
# ZCTextIndex pipeline factory.
for _splitterLabel, _splitterClass in (
        ('CDLI grapheme splitter', graphemeSplitter),
        ('CDLI word splitter', wordSplitter),
        ('CDLI lucene splitter', luceneSplitter)):
    try:
        element_factory.registerFactory('Word Splitter',
                                        _splitterLabel, _splitterClass)
    except ValueError:
        # in case the splitter is already registered, ValueError is raised
        pass
if __name__ == '__main__':
    # Small manual check. The sample has no line starting with "&", so no
    # text id is set and process() only returns the terms without storing
    # anything in the line index.
    sample = "1. a-na {d}en-lil2\n2. lugal kur-kur-ra"
    for splitterClass in (graphemeSplitter, wordSplitter):
        print(splitterClass().process([sample]))
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>