Annotation of cdli/cdliSplitter.py, revision 1.5
1.1 dwinter 1: """
2: CDLI splitters (grapheme, word and Lucene variants) for the ZCTextIndex pipeline
3: """
4:
1.4 dwinter 5: import Zope2
1.3 dwinter 6: import transaction
1.2 dwinter 7:
1.1 dwinter 8: from Products.ZCTextIndex.ISplitter import ISplitter
9: from Products.ZCTextIndex.PipelineFactory import element_factory
10:
11: import re
12: from types import StringType
1.5 ! dwinter 13: import logging
! 14:
! 15: import PyLucene
1.1 dwinter 16:
def getSupportedEncoding(encodings):
    """Return the first entry of *encodings* that can decode the string 'A'.

    The candidates are probed in the given order by decoding 'A' with each
    of them; a candidate whose probe raises is skipped.

    Returns 'utf-8' when *encodings* is empty or no candidate passes.
    """
    for encoding in encodings:
        try:
            unicode('A', encoding)
        except Exception:
            # This used to be a bare ``except:``, which also swallows
            # KeyboardInterrupt/SystemExit on interpreters where those do
            # not derive from Exception.
            continue
        return encoding
    return 'utf-8'
25:
26:
27:
28: """beta of a fulltext splitter for cdli
29:
30: """
# Lines whose first character is one of these are not tokenized
# (cdliSplitter.process tests s[0] against this list).
ignoreLines = ['$', '@', '#', '&']
separators = ['']  # not referenced in the visible part of this file

# The regex literals below are raw strings: sequences such as "\{" or "\("
# are invalid string escapes in a normal literal. The pattern values are
# unchanged.

# Matches a comma together with the character before it, unless that
# character is one of s, S, t, T, h, H (comma relevant for graphemes).
# Substituting group 1 deletes the comma and keeps the character.
komma_exception = r"([^sStThH]),"
# Characters replaced by a blank when splitting into graphemes.
deleteGraphems = r"\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?"
# Characters replaced by a blank when splitting into words; unlike
# deleteGraphems this keeps '{', '}' and '-'.
deleteWords = r"<|>|\(|\)|_|\#|,|\||\]|\[|\!|\?"
1.1 dwinter 36:
class IndexLine(object):
    """Index a single text line with Lucene.

    Creating an instance does all the work: it opens an index writer on
    *storeDir*, adds one document built from *name*, *line* and *content*,
    optimizes the index and closes the writer.
    """

    def __init__(self, storeDir, analyzer, name, line, content):
        """Write one document to the Lucene index in *storeDir*.

        storeDir -- directory of the index; created when it does not exist
        analyzer -- analyzer object handed to PyLucene.IndexWriter
        name     -- value stored in the "name" field
        line     -- value stored (via str()) in the "line" field
        content  -- text stored and tokenized in the "contents" field
        """
        # ``os`` is not imported at the top of this module; the original
        # code raised NameError on the os.path.exists() call below.
        import os

        logging.debug("i am here %s %s %s %s %s",
                      storeDir, analyzer, name, line, content)
        if not os.path.exists(storeDir):
            os.mkdir(storeDir)
        # NOTE(review): both ``True`` arguments are Lucene "create" flags, so
        # the index looks like it is rebuilt from scratch on every call and
        # only ever holds the last line -- confirm whether that is intended.
        store = PyLucene.FSDirectory.getDirectory(storeDir, True)
        writer = PyLucene.IndexWriter(store, analyzer, True)
        try:
            writer.setMaxFieldLength(1048576)
            self.indexDocs(writer, name, line, content)
            writer.optimize()
        finally:
            # Close the writer even when indexing fails.
            writer.close()

    def indexDocs(self, writer, name, line, content):
        """Add one document with the fields name, line and contents to *writer*.

        The original body used the undefined names ``pn`` and ``i`` and put
        ``line`` into the "contents" field; the parameters are used instead.
        """
        doc = PyLucene.Document()
        doc.add(PyLucene.Field("name", name,
                               PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.UN_TOKENIZED))

        doc.add(PyLucene.Field("line", str(line),
                               PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.UN_TOKENIZED))

        doc.add(PyLucene.Field("contents", content,
                               PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.TOKENIZED))

        writer.addDocument(doc)
! 68:
class cdliSplitter:
    """Base class for the CDLI splitters.

    The word and the grapheme splitter differ only in the list of characters
    they delete (``delete``); ``indexName`` selects the index the tokens are
    stored under.
    """

    default_encoding = "utf-8"  # used to decode byte strings in process()
    delete = deleteGraphems     # regex; every match is replaced by a blank
    indexName = "cdliSplitter"  # first argument of storeInLineIndex

    def process(self, lst):
        """Split the texts in *lst* into tokens and return them as a list.

        Every text is cut into lines. A line starting with "&" sets the
        current P-number (characters 1 to 7 of the line). Lines starting
        with one of ``ignoreLines`` are skipped. Any other non-empty line is
        split at its first "." into line number and text; commas matched by
        ``komma_exception`` are removed from the text and every match of
        ``self.delete`` is replaced by a blank.

        For the "luceneSplitter" index the cleaned line is handed to
        IndexLine (only when a P-number is known) and contributes no tokens
        to the result. For all other indexes the cleaned line is split at
        blanks; each non-empty token is appended to the result and, when a
        P-number is known, stored via cdliRoot.storeInLineIndex followed by
        a transaction commit.
        """
        result = []
        pNum = None
        lineNum = None

        for t in lst:
            # str.replace returns a new string; the original discarded the
            # result, so "\r" was never turned into a line break.
            t = t.replace("\r", "\n")
            for s in t.split("\n"):
                if type(s) is StringType:  # not unicode
                    s = unicode(s, self.default_encoding, 'replace')

                if s == "":
                    continue

                if s[0] == "&":  # store pNum
                    pNum = s[1:8]
                    logging.debug("storing: %s", pNum)
                    continue

                if s[0] in ignoreLines:
                    continue

                # Split at the first dot only: the text itself may contain
                # dots (e.g. "[...]"), and splitting at every dot kept only
                # the part up to the second one.
                splitted = s.split(".", 1)
                if len(splitted) == 1:  # no dot, hence no line number
                    txt = splitted[0]
                else:
                    lineNum = splitted[0]  # store line number
                    txt = splitted[1]

                # delete commas except those relevant in graphemes
                analyse = re.sub(komma_exception, r"\1", txt)
                analyse = re.sub(self.delete, ' ', analyse)  # deletions

                if self.indexName == "luceneSplitter":
                    if pNum:
                        analyser = PyLucene.StandardAnalyzer()
                        logging.debug("calling lucene")
                        IndexLine("/tmp/index", analyser, pNum, lineNum,
                                  analyse)
                    continue

                for w in analyse.split(" "):
                    w = w.strip()
                    if w == '':
                        continue
                    # Only when a pNum was found (first call of the splitter;
                    # it is always called twice in the pipeline).
                    if pNum:
                        # NOTE(review): Zope2.app() is called once per token
                        # and the returned application object is never
                        # closed -- confirm this is acceptable.
                        Zope2.app().cdliRoot.storeInLineIndex(
                            self.indexName, w, (pNum, lineNum))
                        transaction.get().commit()
                    result.append(w)
        return result
1.2 dwinter 129:
130:
class graphemeSplitter(cdliSplitter):
    """cdliSplitter variant using the ``deleteGraphems`` pattern and the
    index name "graphemeSplitter"."""

    indexName = "graphemeSplitter"
    delete = deleteGraphems
! 134:
class wordSplitter(cdliSplitter):
    """cdliSplitter variant using the ``deleteWords`` pattern and the
    index name "wordSplitter"."""

    indexName = "wordSplitter"
    delete = deleteWords
! 138:
class luceneSplitter(cdliSplitter):
    """cdliSplitter variant using the ``deleteWords`` pattern; its index
    name "luceneSplitter" is the value process() compares against to pick
    the Lucene branch."""

    indexName = "luceneSplitter"
    delete = deleteWords
! 142:
! 143:
# Register the three splitters in the 'Word Splitter' group of the
# ZCTextIndex element factory.
for _label, _factory in (('CDLI grapheme splitter', graphemeSplitter),
                         ('CDLI word splitter', wordSplitter),
                         ('CDLI lucene splitter', luceneSplitter)):
    try:
        element_factory.registerFactory('Word Splitter', _label, _factory)
    except ValueError:
        # in case the splitter is already registered, ValueError is raised;
        # any other error is no longer silently swallowed
        pass
del _label, _factory
if __name__ == '__main__':
    # Small manual check. The original demo instantiated an undefined
    # ``authorSplitter``, passed a second argument that process() does not
    # accept, and used a non-ASCII byte literal without a coding declaration.
    # The sample has no "&..." header line, so process() never sees a
    # P-number and does not touch the Zope line index.
    sample = u"1. a-na (d)utu\n2. [...] lugal"
    s = wordSplitter()
    print(s.process([sample]))
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>