"""
Author splitter
"""
import codecs
import logging
import os
import re
from types import StringType

import Zope2
import transaction
from Products.ZCTextIndex.ISplitter import ISplitter
from Products.ZCTextIndex.PipelineFactory import element_factory
# PyLucene is optional: only the luceneSplitter needs it.  Catch only
# ImportError so real errors inside PyLucene are not silently hidden.
try:
    import PyLucene
except ImportError:
    logging.warning("no Lucene support")
def getSupportedEncoding(encodings):
    """Return the first codec name in *encodings* this Python supports.

    Falls back to ``'utf-8'`` when none of the candidates is a known
    codec (or the sequence is empty).

    :param encodings: iterable of candidate encoding names
    :return: the first supported encoding name, or ``'utf-8'``
    """
    for encoding in encodings:
        try:
            # codecs.lookup raises LookupError for unknown codecs; the
            # original probed with unicode('A', enc) under a bare except,
            # which also swallowed unrelated errors.
            codecs.lookup(encoding)
            return encoding
        except LookupError:
            pass
    return 'utf-8'
"""beta of a fulltext splitter for cdli
"""
ignoreLines=['$','@','#','&']
separators=['']
komma_exception="([^sStThH])," # komma relevant for graphemes will not be deleted
deleteGraphems="\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?" # for graphems
deleteWords="<|>|\(|\)|_|\#|,|\||\]|\[|\!|\?"# for words
class IndexLine(object):
    """Index one line of a CDLI text into a Lucene index on disk.

    Creates (overwriting) the index at *storeDir*, adds a single
    document with the fields ``name`` (text identifier), ``line``
    (line number) and ``contents`` (analysed line text), then
    optimizes and closes the writer.
    """

    def __init__(self, storeDir, analyzer, name, line, content):
        logging.debug("IndexLine %s %s %s %s %s" % (storeDir, analyzer, name, line, content))
        if not os.path.exists(storeDir):
            os.mkdir(storeDir)
        # second argument True: create/overwrite the index directory
        store = PyLucene.FSDirectory.getDirectory(storeDir, True)
        writer = PyLucene.IndexWriter(store, analyzer, True)
        writer.setMaxFieldLength(1048576)
        self.indexDocs(writer, name, line, content)
        writer.optimize()
        writer.close()

    def indexDocs(self, writer, name, line, content):
        """Add one document to *writer*.

        Bug fix: the original referenced undefined names ``pn`` and
        ``i`` (NameError, copied from the PyLucene sample) and indexed
        the line *number* as "contents" while ignoring *content*.
        """
        doc = PyLucene.Document()
        # text identifier (P-number): stored, searchable as a whole
        doc.add(PyLucene.Field("name", name,
                               PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.UN_TOKENIZED))
        # line number: stored, not tokenized
        doc.add(PyLucene.Field("line", str(line),
                               PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.UN_TOKENIZED))
        # the analysed line text is what gets full-text indexed
        doc.add(PyLucene.Field("contents", content,
                               PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.TOKENIZED))
        writer.addDocument(doc)
class cdliSplitter:
    """Base class for the CDLI splitters.

    The word and the grapheme splitter differ only in the regex of
    characters they delete (``delete``); the lucene variant hands each
    analysed line to :class:`IndexLine` instead of the line index.
    """

    default_encoding = "utf-8"
    delete = deleteGraphems
    indexName = "cdliSplitter"

    def process(self, lst):
        """Split ATF-like transliteration text into index tokens.

        For every ``&Pnnnnnn`` header line the text number is
        remembered; every following content line (not starting with a
        character in ``ignoreLines``) is stripped of its line-number
        prefix, cleaned via ``komma_exception`` and ``self.delete``,
        and its words are stored in the line index (or Lucene) and
        collected into the returned token list.

        :param lst: list of (byte or unicode) strings
        :return: list of unicode tokens
        """
        result = []
        pNum = None
        lineNum = None
        for t in lst:
            # bug fix: str.replace returns a new string; the original
            # discarded the result, so "\r" was never normalized.
            t = t.replace("\r", "\n")
            for s in t.split("\n"):
                if type(s) is StringType:  # byte string, not unicode
                    s = unicode(s, self.default_encoding, 'replace')
                if (s != "") and (s[0] == "&"):  # header: store text number
                    pNum = s[1:8]
                    logging.debug("storing: %s" % pNum)
                elif (s != "") and (not (s[0] in ignoreLines)):
                    splitted = s.split(".")
                    if len(splitted) == 1:  # no line-number prefix
                        txt = splitted[0]
                    else:
                        txt = splitted[1]
                        lineNum = splitted[0]  # store line number
                    # delete kommata except those relevant in graphemes
                    analyse = re.sub(komma_exception, r"\1", txt)
                    # delete splitter-specific characters
                    analyse = re.sub(self.delete, ' ', analyse)
                    if self.indexName == "luceneSplitter":
                        if pNum:
                            analyser = PyLucene.StandardAnalyzer()
                            logging.debug("calling lucene")
                            IndexLine("/tmp/index", analyser, pNum, lineNum, analyse)
                    else:
                        for w in analyse.split(" "):
                            w = w.strip()
                            if not (w == ''):
                                # only when pNum is found (first call of the
                                # splitter; it is always called twice in the
                                # pipeline)
                                if pNum:
                                    Zope2.app().cdliRoot.storeInLineIndex(
                                        self.indexName, w, (pNum, lineNum))
                                    transaction.get().commit()
                                result.append(w)
        return result
class graphemeSplitter(cdliSplitter):
    """Splitter at grapheme granularity: deletes the grapheme
    delimiter characters (deleteGraphems) before tokenizing."""
    delete=deleteGraphems
    indexName="graphemeSplitter"
class wordSplitter(cdliSplitter):
    """Splitter at word granularity: uses the smaller deleteWords
    pattern, so grapheme joiners like '-' are kept inside tokens."""
    delete=deleteWords
    indexName="wordSplitter"
class luceneSplitter(cdliSplitter):
    """Word-granularity splitter that routes each analysed line to a
    Lucene index via IndexLine (see cdliSplitter.process)."""
    delete=deleteWords
    indexName="luceneSplitter"
# Register the splitters with the ZCTextIndex pipeline.  registerFactory
# raises ValueError when a name is already registered (e.g. on product
# refresh), which is the only error we want to ignore.
for _title, _splitter in (
        ('CDLI grapheme splitter', graphemeSplitter),
        ('CDLI word splitter', wordSplitter),
        ('CDLI lucene splitter', luceneSplitter)):
    try:
        element_factory.registerFactory('Word Splitter', _title, _splitter)
    except ValueError:
        # already registered
        pass
if __name__ == '__main__':
a = 'abc def我们的很 好。'
u = unicode(a, 'gbk')
s = authorSplitter()
print s.process([u])
print s.process([u], 1)
# FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>  (CVSweb footer left over from a web checkout; not code)