"""
CDLI word and grapheme splitter

Defines two splitter classes, graphemeSplitter and wordSplitter, and
registers both with the ZCTextIndex element_factory in the
'Word Splitter' group.
"""

from Products.ZCTextIndex.PipelineFactory import element_factory

import re
import logging

# NOTE(review): this listing was rebuilt from the post-patch ("+") side of the
# CVS diff cdli/cdliSplitter.py 1.4 -> 1.7.2.11.  The diff elides the region
# between its first two hunks: of `def getSupportedEncoding(encodings):` only
# the header and its opening `for encoding in encodings:` line were visible,
# so that function is NOT reproduced here -- restore it from the repository.
# None of the code below calls it.

# beta of a fulltext splitter for cdli

# lines starting with one of these characters are not tokenised
# ('&' lines are still inspected by process(): they carry the pNum)
ignoreLines = ['$', '@', '#', '&', '>']
separators = ['']

# kommas relevant for graphemes will not be deleted:
# a komma is removed unless it directly follows one of s, S, t, T, h, H
# NOTE(review): both boundary patterns below also contain ',', so the kommas
# kept by this step are afterwards replaced by spaces -- confirm intended.
komma_exception = "([^sStThH]),"
komma_exceptionex = re.compile(komma_exception)

# grapheme boundaries (each match is replaced by a space)
graphemeBounds = r'\{|\}|<|>|-|_|\#|,|\]|\[|\!|\?|"'
# characters dropped from graphemes (each match is replaced by '')
# NOTE(review): `\?\*` matches only the two-character sequence "?*", not a
# lone '?' or '*' -- confirm whether `\?|\*` was meant.
graphemeIgnore = r"<|>|\#|\||\]|\[|\!|\?\*|;"

# for words
wordBounds = r'_|,|"'
# NOTE(review): same `\?\*` caveat as for graphemeIgnore.
wordIgnore = r"<|>|\#|\||\]|\[|\!|\?\*|;"


class cdliSplitter:
    """base class for splitter.

    the difference between word and grapheme splitter
    is the word boundary list."""

    default_encoding = "utf-8"
    bounds = graphemeBounds
    boundsex = re.compile(graphemeBounds)
    ignore = graphemeIgnore
    ignorex = re.compile(graphemeIgnore)
    indexName = "cdliSplitter"

    def process(self, lst):
        """gets a list of strings and returns a list of words

        Every string in lst is cut into lines.  Empty lines and lines whose
        first character is in ignoreLines are skipped.  On every other line
        the text up to and including the first "." is dropped, boundary
        characters (self.boundsex) become spaces, ignored characters
        (self.ignorex) are removed and the rest is split at spaces.
        Byte strings are decoded with default_encoding first.
        """
        logging.debug("cdliSplitter: %s", self.indexName)
        result = []
        pNum = None

        for t in lst:
            # normalise line breaks; str.replace returns a new string, so the
            # result has to be assigned
            t = t.replace("\r", "\n")
            # split lines
            for s in t.split("\n"):
                if isinstance(s, str):
                    # not unicode
                    s = unicode(s, self.default_encoding, 'replace')

                if s == '':
                    continue

                if s[0] == '&':
                    # remember pNum (only used for the debug log)
                    pNum = s[1:8]
                    logging.debug("%s processing: %s", self.indexName, pNum)
                    continue

                if s[0] in ignoreLines:
                    continue

                # regular line: everything before the first "." is taken to
                # be the line number and is dropped.  Split only once so that
                # later periods stay part of the text.
                lineparts = s.split(".", 1)
                if len(lineparts) == 1:
                    # no line number
                    txt = s
                else:
                    txt = lineparts[1]

                # delete kommata except kommata relevant for graphemes
                txt = komma_exceptionex.sub(r"\1", txt)
                # replace word boundaries by spaces
                txt = self.boundsex.sub(' ', txt)
                # remove letters to be ignored
                txt = self.ignorex.sub('', txt)
                # split words
                for w in txt.split(" "):
                    w = w.strip()
                    if w != '':
                        result.append(w)

        return result


class graphemeSplitter(cdliSplitter):
    """splitter using the grapheme boundary and ignore lists."""
    bounds = graphemeBounds
    boundsex = re.compile(graphemeBounds)
    ignore = graphemeIgnore
    ignorex = re.compile(graphemeIgnore)
    indexName = "graphemeSplitter"


class wordSplitter(cdliSplitter):
    """splitter using the word boundary and ignore lists."""
    bounds = wordBounds
    boundsex = re.compile(wordBounds)
    ignore = wordIgnore
    ignorex = re.compile(wordIgnore)
    indexName = "wordSplitter"


try:
    element_factory.registerFactory('Word Splitter',
                                    'CDLI grapheme splitter', graphemeSplitter)
except ValueError:
    # in case the splitter is already registered, ValueError is raised
    pass

try:
    element_factory.registerFactory('Word Splitter',
                                    'CDLI word splitter', wordSplitter)
except ValueError:
    # in case the splitter is already registered, ValueError is raised
    pass