Annotation of cdli/cdliSplitter.py, revision 1.7.2.1
1.1 dwinter 1: """
2: Author splitter
3: """
4:
1.4 dwinter 5: import Zope2
1.3 dwinter 6: import transaction
1.2 dwinter 7:
1.1 dwinter 8: from Products.ZCTextIndex.ISplitter import ISplitter
9: from Products.ZCTextIndex.PipelineFactory import element_factory
10:
11: import re
12: from types import StringType
1.5 dwinter 13: import logging
1.6 dwinter 14: try:
15: import PyLucene
16: except:
17: print "no Lucene support"
1.1 dwinter 18:
def getSupportedEncoding(encodings):
    """Return the first entry of *encodings* that names an installed codec.

    The candidates are tried in order.  A candidate is accepted as soon as
    the codec registry knows it; unknown names and non-string entries are
    skipped.  When no candidate is usable (or *encodings* is empty) the
    function falls back to ``'utf-8'``.

    encodings -- iterable of encoding names to try, in order of preference
    """
    # Local import: the module itself does not import ``codecs``.
    import codecs

    for encoding in encodings:
        try:
            # Ask the codec registry directly instead of test-decoding a
            # sample byte: decoding 'A' fails for perfectly valid multi-byte
            # encodings such as utf-16.
            codecs.lookup(encoding)
        except (LookupError, TypeError, ValueError):
            # unknown codec name, or an entry that is not a usable string
            continue
        return encoding
    return 'utf-8'
27:
28:
29:
30: """beta of a fulltext splitter for cdli
31:
32: """
# A line whose first character is listed here is not tokenised by
# cdliSplitter.process ('&' lines are additionally used to pick up the
# text id before being skipped).
ignoreLines = ['$', '@', '#', '&', '>']

# Not referenced anywhere in the visible part of this module.
separators = ['']

# Regular expressions are written as raw strings so that the backslash
# escapes reach the ``re`` module unchanged and do not rely on Python
# passing unknown string escapes through.

# A comma preceded by anything but s/S/t/T/h/H is dropped (the preceding
# character is kept via group 1); commas after those letters are relevant
# for graphemes and survive this first pass.
komma_exception = r"([^sStThH]),"

# Characters replaced by a blank before splitting into graphemes.
deleteGraphems = r"\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?"

# Characters replaced by a blank before splitting into words; unlike the
# grapheme list this keeps '{', '}' and '-'.
deleteWords = r"<|>|\(|\)|_|\#|,|\||\]|\[|\!|\?"
1.1 dwinter 38:
class IndexLine(object):
    """Index a single text line with Lucene.

    All the work happens in the constructor: creating an instance opens an
    index writer on *storeDir*, adds one document for the given line,
    optimizes the index and closes the writer again.
    """

    def __init__(self, storeDir, analyzer, name, line, content):
        """Write one document describing a text line into the index.

        storeDir -- directory holding the Lucene index (created if missing)
        analyzer -- PyLucene analyzer handed to the IndexWriter
        name     -- value stored in the "name" field
        line     -- value stored (as text) in the "line" field
        content  -- text stored and tokenised in the "contents" field
        """
        # Local import: ``os`` is needed below but is not imported at
        # module level, so the original raised NameError here.
        import os

        logging.error("i am here %s %s %s %s %s"%((storeDir, analyzer,name,line,content)))
        if not os.path.exists(storeDir):
            os.mkdir(storeDir)

        # NOTE(review): both calls pass True as their last argument, which
        # presumably means "create a new index".  If so, every IndexLine()
        # call starts from an empty index and only the most recently indexed
        # line survives -- confirm whether that is intended.
        store = PyLucene.FSDirectory.getDirectory(storeDir, True)
        writer = PyLucene.IndexWriter(store, analyzer, True)
        try:
            writer.setMaxFieldLength(1048576)
            self.indexDocs(writer, name, line, content)
            writer.optimize()
        finally:
            # always close the writer, even when indexing fails
            writer.close()

    def indexDocs(self, writer, name, line, content):
        """Build the document for one line and add it to *writer*.

        The document has three stored fields: "name" and "line" are indexed
        untokenised, "contents" is indexed tokenised.
        """
        doc = PyLucene.Document()

        # The original referenced the undefined names ``pn`` and ``i`` and
        # stored ``line`` as the contents; use the actual parameters.
        doc.add(PyLucene.Field("name", name,
                               PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.UN_TOKENIZED))

        # u"%s" (rather than str()) also copes with non-ASCII unicode values
        doc.add(PyLucene.Field("line", u"%s" % (line,),
                               PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.UN_TOKENIZED))

        doc.add(PyLucene.Field("contents", content,
                               PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.TOKENIZED))

        writer.addDocument(doc)
70:
class cdliSplitter:
    """Base class for the CDLI splitters.

    The only difference between the word splitter and the grapheme splitter
    is the list of characters that gets deleted (the ``delete`` pattern).
    """

    # encoding used to decode byte strings handed to process()
    default_encoding = "utf-8"
    # regular expression; every match is replaced by a blank before splitting
    delete = deleteGraphems
    # identifies the splitter; "luceneSplitter" switches process() to Lucene
    indexName = "cdliSplitter"

    def process(self, lst):
        """Split the texts in *lst* into a flat list of tokens.

        Every text is processed line by line:

        * a line starting with '&' sets the current text id (the seven
          characters following the '&');
        * empty lines and lines starting with a character from
          ``ignoreLines`` are skipped;
        * any other line is split at '.': without a period the whole line
          is the text, otherwise the part before the first period is the
          line number and the part between the first and second period is
          the text.

        The text is cleaned with ``komma_exception`` and ``self.delete`` and
        split at blanks.  For the "luceneSplitter" the cleaned line is
        handed to IndexLine (only once a text id is known) and nothing is
        added to the result; for all other splitters the non-empty tokens
        are appended to the returned list.

        lst -- sequence of byte or unicode strings
        """
        logging.debug("cdliSplitter")
        result = []
        pNum = None
        lineNum = None

        for t in lst:
            # Strings are immutable: the original called t.replace() without
            # using the result, so "\r" line ends were never normalised.
            t = t.replace("\r", "\n")

            for s in t.split("\n"):
                if isinstance(s, StringType):  # not unicode
                    s = unicode(s, self.default_encoding, 'replace')

                if s == "":
                    continue

                if s[0] == "&":  # store pNum
                    pNum = s[1:8]
                    logging.debug("cdliSplitter processing: %s"%pNum)
                    continue

                if s[0] in ignoreLines:
                    continue

                splitted = s.split(".")
                if len(splitted) == 1:  # no period in the line
                    txt = splitted[0]
                else:
                    # NOTE(review): anything after a second period is
                    # dropped here -- confirm that this is intended.
                    txt = splitted[1]
                    lineNum = splitted[0]  # store line number

                # delete commas, except those relevant in graphemes
                analyse = re.sub(komma_exception, r"\1", txt)
                # deletions
                analyse = re.sub(self.delete, ' ', analyse)

                if self.indexName == "luceneSplitter":
                    if pNum:
                        analyser = PyLucene.StandardAnalyzer()
                        logging.error("calling lucene")
                        IndexLine("/tmp/index", analyser, pNum, lineNum, analyse)
                else:
                    # Storing every word in the Zope line index
                    # (storeInLineIndex) is currently disabled; the tokens
                    # are only returned.
                    for w in analyse.split(" "):
                        w = w.strip()
                        if w:
                            result.append(w)
        return result
1.2 dwinter 130:
131:
class graphemeSplitter(cdliSplitter):
    """cdliSplitter variant that deletes the ``deleteGraphems`` characters."""

    indexName = "graphemeSplitter"
    delete = deleteGraphems
135:
class wordSplitter(cdliSplitter):
    """cdliSplitter variant that deletes the ``deleteWords`` characters."""

    indexName = "wordSplitter"
    delete = deleteWords
139:
class luceneSplitter(cdliSplitter):
    """cdliSplitter variant named "luceneSplitter".

    It deletes the ``deleteWords`` characters; its ``indexName`` is the
    value cdliSplitter.process checks for to take its Lucene branch.
    """

    indexName = "luceneSplitter"
    delete = deleteWords
143:
144:
# Register the three splitters in the 'Word Splitter' group of the
# ZCTextIndex pipeline.
for _splitterName, _splitterClass in (
        ('CDLI grapheme splitter', graphemeSplitter),
        ('CDLI word splitter', wordSplitter),
        ('CDLI lucene splitter', luceneSplitter)):
    try:
        element_factory.registerFactory('Word Splitter',
                                        _splitterName, _splitterClass)
    except ValueError:
        # in case the splitter is already registered, ValueError is raised;
        # any other error is a real problem and must not be hidden
        pass
del _splitterName, _splitterClass
if __name__ == '__main__':
    # Small manual smoke test.  The original instantiated ``authorSplitter``,
    # which this module does not define, and called process() with an extra
    # argument that the method does not accept.
    sample = u"&P000001 = demo\n1. a-na {d}utu\n2. szu ba-ti"
    splitter = wordSplitter()
    # single-argument print(...) behaves the same as the print statement
    print(splitter.process([sample]))
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>