--- cdli/cdliSplitter.py 2007/08/31 14:22:52 1.7 +++ cdli/cdliSplitter.py 2007/12/03 21:30:19 1.7.2.6 @@ -1,20 +1,11 @@ """ -Author splitter +CDLI word and grapheme splitter """ -import Zope2 -import transaction - -from Products.ZCTextIndex.ISplitter import ISplitter from Products.ZCTextIndex.PipelineFactory import element_factory import re -from types import StringType import logging -try: - import PyLucene -except: - print "no Lucene support" def getSupportedEncoding(encodings): for encoding in encodings: @@ -32,116 +23,82 @@ def getSupportedEncoding(encodings): """ ignoreLines=['$','@','#','&','>'] separators=[''] -komma_exception="([^sStThH])," # komma relevant for graphemes will not be deleted -deleteGraphems="\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?" # for graphems -deleteWords="<|>|\(|\)|_|\#|,|\||\]|\[|\!|\?"# for words - -class IndexLine(object): - """index a line with lucene""" - - def __init__(self, storeDir, analyzer,name,line,content): - logging.error("i am here %s %s %s %s %s"%((storeDir, analyzer,name,line,content))) - if not os.path.exists(storeDir): - os.mkdir(storeDir) - store = PyLucene.FSDirectory.getDirectory(storeDir, True) - writer = PyLucene.IndexWriter(store, analyzer, True) - writer.setMaxFieldLength(1048576) - self.indexDocs(writer,name,line,content) - writer.optimize() - writer.close() - - def indexDocs(self, writer,name,line,content): - - doc = PyLucene.Document() - doc.add(PyLucene.Field("name", pn, - PyLucene.Field.Store.YES, - PyLucene.Field.Index.UN_TOKENIZED)) - - doc.add(PyLucene.Field("line", str(i), - PyLucene.Field.Store.YES, - PyLucene.Field.Index.UN_TOKENIZED)) - - - doc.add(PyLucene.Field("contents", line, - PyLucene.Field.Store.YES, - PyLucene.Field.Index.TOKENIZED)) - - writer.addDocument(doc) +# kommas relevant for graphemes will not be deleted +komma_exception="([^sStThH])," +# grapheme boundaries +#graphemeBounds="\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?" 
# Regular expression matching the characters that separate graphemes.
graphemeBounds = r'\{|\}|<|>|-|_|\#|,|\]|\[|\!|\?|"'
# Regular expression matching the characters that separate words; unlike
# the grapheme pattern it leaves '{', '}' and '-' inside a token.
wordBounds = r'<|>|_|\#|,|\]|\[|\!|\?|"'

# Text type used for decoding: ``unicode`` on Python 2, ``str`` on Python 3.
try:
    _text_type = unicode
except NameError:
    _text_type = str


class cdliSplitter:
    """Base class for the CDLI splitters.

    Subclasses differ only in ``bounds``, the regular expression whose
    matches are treated as token boundaries, and in ``indexName``, which
    is used to label log messages.
    """

    # encoding used to decode input that is not already unicode text
    default_encoding = "utf-8"
    # regular expression of boundary characters (replaced by spaces)
    bounds = graphemeBounds
    # name reported in debug log messages
    indexName = "cdliSplitter"

    def process(self, lst):
        """Split a list of text chunks into a flat list of tokens.

        Each chunk is decoded with ``default_encoding`` if it is not
        unicode text, then cut into lines at CR and/or LF.  For each line:

        * empty lines are skipped;
        * lines starting with '&' are only logged (characters 1-7 are
          reported as the pNum) and produce no tokens;
        * lines starting with any other prefix in ``ignoreLines`` are
          skipped;
        * otherwise everything up to and including the first '.' is
          dropped as the line number, commas matched by
          ``komma_exception`` are deleted, every match of ``bounds`` is
          replaced by a space, and the rest is split at spaces.

        Returns the list of non-empty, whitespace-stripped tokens in
        input order.
        """
        logging.debug("cdliSplitter: %s", self.indexName)
        # compile once per call instead of looking the pattern up per line
        boundary = re.compile(self.bounds)
        result = []
        for t in lst:
            if not isinstance(t, _text_type):
                # not unicode
                t = _text_type(t, self.default_encoding, 'replace')
            # Normalise line breaks.  The result must be used: str.replace
            # returns a new string and leaves ``t`` untouched.
            for s in t.replace("\r", "\n").split("\n"):
                if s == '':
                    continue
                if s[0] == '&':
                    # header line carrying the pNum
                    logging.debug("%s processing: %s", self.indexName, s[1:8])
                    continue
                if s[0] in ignoreLines:
                    continue
                # Split at the first period only, so that text containing
                # further periods is kept whole instead of being cut off.
                # A line without a period has no line number and is used
                # as it is.
                txt = s.split(".", 1)[-1]
                # delete commas except those relevant for graphemes
                txt = re.sub(komma_exception, r"\1", txt)
                # replace boundaries by spaces
                txt = boundary.sub(' ', txt)
                for w in txt.split(" "):
                    w = w.strip()
                    if w != '':
                        result.append(w)

        logging.debug("split '%s' into %s", lst, repr(result))
        return result


class graphemeSplitter(cdliSplitter):
    """Splitter that breaks text at grapheme boundaries."""
    bounds = graphemeBounds
    indexName = "graphemeSplitter"


class wordSplitter(cdliSplitter):
    """Splitter that breaks text at word boundaries."""
    bounds = wordBounds
    indexName = "wordSplitter"