--- cdli/cdliSplitter.py 2007/02/08 12:00:23 1.4 +++ cdli/cdliSplitter.py 2007/03/21 19:29:23 1.5 @@ -10,6 +10,9 @@ from Products.ZCTextIndex.PipelineFactor import re from types import StringType +import logging + +import PyLucene def getSupportedEncoding(encodings): for encoding in encodings: @@ -28,22 +31,56 @@ def getSupportedEncoding(encodings): ignoreLines=['$','@','#','&'] separators=[''] komma_exception="([^sStThH])," # komma relevant for graphemes will not be deleted -delete="\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?" # for graphems -#delete="<|>|\(|\)|_|\#|,|\||\]|\[|!|?" for words +deleteGraphems="\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?" # for graphems +deleteWords="<|>|\(|\)|_|\#|,|\||\]|\[|\!|\?"# for words -class graphemeSplitter: +class IndexLine(object): + """index a line with lucene""" + def __init__(self, storeDir, analyzer,name,line,content): + logging.error("i am here %s %s %s %s %s"%((storeDir, analyzer,name,line,content))) + if not os.path.exists(storeDir): + os.mkdir(storeDir) + store = PyLucene.FSDirectory.getDirectory(storeDir, True) + writer = PyLucene.IndexWriter(store, analyzer, True) + writer.setMaxFieldLength(1048576) + self.indexDocs(writer,name,line,content) + writer.optimize() + writer.close() + + def indexDocs(self, writer,name,line,content): + + doc = PyLucene.Document() + doc.add(PyLucene.Field("name", pn, + PyLucene.Field.Store.YES, + PyLucene.Field.Index.UN_TOKENIZED)) + + doc.add(PyLucene.Field("line", str(i), + PyLucene.Field.Store.YES, + PyLucene.Field.Index.UN_TOKENIZED)) + + + doc.add(PyLucene.Field("contents", line, + PyLucene.Field.Store.YES, + PyLucene.Field.Index.TOKENIZED)) + + writer.addDocument(doc) + +class cdliSplitter: + """basis class for splitter, + der Unterschied zwischen Word und Graphemesplitter + ist lediglich die unterschiedliche Auschliengsliste""" + default_encoding = "utf-8" + delete=deleteGraphems + indexName="cdliSplitter" + def process(self, lst): result = [] pNum=None lineNum=None - - - #print "LLLL",lst - - + for t in lst: t.replace("\r","\n") @@ -51,16 +88,11 @@ class graphemeSplitter: if type(s) is StringType: # not unicode s = unicode(s, self.default_encoding, 'replace') - - #ignore lines - + if (s!="") and (s[0]=="&"): # store pNum pNum=s[1:8] - + logging.debug("storing: %s"%pNum) elif (s!="") and (not (s[0] in ignoreLines)): - - - #ignore everthing bevor "." splitted=s.split(".") if len(splitted)==1: #kein punkt @@ -69,26 +101,46 @@ class graphemeSplitter: txt=splitted[1] lineNum=splitted[0] #store line number - analyse=txt - + analyse=txt analyse=re.sub(komma_exception,r"\1",analyse) # delete kommata except kommata relevant in graphems - - analyse=re.sub(delete,' ',analyse) # deletions - - splitted = analyse.split(" ") - - for w in splitted: - w=w.lstrip().rstrip() - - if not (w==''): - if pNum: #only whe pnum is found (first call of the splitter, is always called twice in the pipeline - Zope.app().cdliRoot.storeInLineIndex(w.lstrip().strip(),(pNum,lineNum)) - transaction.get().commit() - - result.append(w.lstrip().rstrip()) + analyse=re.sub(self.delete,' ',analyse) # deletions + + if self.indexName=="luceneSplitter": + if pNum: + analyser=PyLucene.StandardAnalyzer() + logging.error("calling lucene") + + IndexLine("/tmp/index",analyser,pNum,lineNum,analyse) + else: + splitted = analyse.split(" ") + + + for w in splitted: + w=w.lstrip().rstrip() + + if not (w==''): + if pNum: #only whe pnum is found (first call of the splitter, is always called twice in the pipeline + + Zope2.app().cdliRoot.storeInLineIndex(self.indexName,w.lstrip().strip(),(pNum,lineNum)) + transaction.get().commit() + + result.append(w.lstrip().rstrip()) return result - + +class graphemeSplitter(cdliSplitter): + delete=deleteGraphems + indexName="graphemeSplitter" + +class wordSplitter(cdliSplitter): + delete=deleteWords + indexName="wordSplitter" + +class luceneSplitter(cdliSplitter): + delete=deleteWords + indexName="luceneSplitter" + + try: element_factory.registerFactory('Word Splitter', 'CDLI grapheme splitter', graphemeSplitter) @@ -96,6 +148,19 @@ except: # in case the splitter is already registered, ValueError is raised pass +try: + element_factory.registerFactory('Word Splitter', + 'CDLI word splitter', wordSplitter) +except: + # in case the splitter is already registered, ValueError is raised + pass + +try: + element_factory.registerFactory('Word Splitter', + 'CDLI lucene splitter', luceneSplitter) +except: + # in case the splitter is already registered, ValueError is raised + pass if __name__ == '__main__': a = 'abc def我们的很 好。' u = unicode(a, 'gbk')