--- cdli/cdliSplitter.py 2006/11/14 17:02:59 1.1 +++ cdli/cdliSplitter.py 2007/10/26 22:45:12 1.7.2.5 @@ -1,12 +1,11 @@ """ -Author splitter +CDLI word and grapheme splitter """ -from Products.ZCTextIndex.ISplitter import ISplitter from Products.ZCTextIndex.PipelineFactory import element_factory import re -from types import StringType +import logging def getSupportedEncoding(encodings): for encoding in encodings: @@ -22,63 +21,94 @@ def getSupportedEncoding(encodings): """beta of a fulltext splitter for cdli """ -ignoreLines=['$','@','#','&'] +ignoreLines=['$','@','#','&','>'] separators=[''] -delete="{|}|<|>|\(|\)|-|_|\#|,|\~|\||\]|\[" - -class graphemeSplitter: - +# kommas relevant for graphemes will not be deleted +komma_exception="([^sStThH])," +# grapheme boundaries +graphemeBounds="\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?" +# for words +wordBounds="<|>|\(|\)|_|\#|,|\||\]|\[|\!|\?" + + +class cdliSplitter: + """base class for splitter. + the difference between word and grapheme splitter + is the word boundary list.""" + default_encoding = "utf-8" - + bounds=graphemeBounds + indexName="cdliSplitter" + + def process(self, lst): + """gets a list of strings and returns a list of words""" + + logging.debug("cdliSplitter: %s"%self.indexName) result = [] - + pNum=None + lineNum=None + for t in lst: + # normalise line breaks + t.replace("\r","\n") + # split lines + for s in t.split("\n"): + if isinstance(s, str): + # not unicode + s = unicode(s, self.default_encoding, 'replace') + + if (s!=''): + if s[0]=='&': + # store pNum + pNum=s[1:8] + logging.debug("%s processing: %s"%(self.indexName,pNum)) + + elif not (s[0] in ignoreLines): + # regular line + lineparts=s.split(".") + if len(lineparts)==1: + # no line number + txt=s + else: + #store line number + txt=lineparts[1] + lineNum=lineparts[0] + + # delete kommata except kommata relevant for graphemes + txt = re.sub(komma_exception,r"\1",txt) + # replace word boundaries by spaces + txt = re.sub(self.bounds,' ',txt) + # split words + words = txt.split(" ") + for w in words: + w=w.strip() + if not (w==''): + result.append(w) - t.replace("\r","\n") - for s in t.split("\n"): - - if type(s) is StringType: # not unicode - s = unicode(s, self.default_encoding, 'replace') - - #ignore lines - - if (s!="") and (not (s[0] in ignoreLines)): - - #ignore everthing bevor "." - splitted=s.split(".") - - if len(splitted)==1: #kein punkt - txt=splitted[0] - else: - txt=splitted[1] - - analyse=txt - - analyse=re.sub(delete,' ',analyse) # deletions - - splitted = analyse.split(" ") - - for w in splitted: - w=w.lstrip().rstrip() - if not (w==''): - print repr(w) - result.append(w.lstrip().rstrip()) + #logging.debug("split '%s' into %s"%(lst,repr(result))) return result -element_factory.registerFactory('Word Splitter', - 'CDLI grapheme splitter', graphemeSplitter) - + +class graphemeSplitter(cdliSplitter): + bounds=graphemeBounds + indexName="graphemeSplitter" + +class wordSplitter(cdliSplitter): + bounds=wordBounds + indexName="wordSplitter" + try: - element_factory.registerFactory('graphemeSplitter', + element_factory.registerFactory('Word Splitter', 'CDLI grapheme splitter', graphemeSplitter) except: # in case the splitter is already registered, ValueError is raised pass -if __name__ == '__main__': - a = 'abc def我们的很 好。' - u = unicode(a, 'gbk') - s = authorSplitter() - print s.process([u]) - print s.process([u], 1) +try: + element_factory.registerFactory('Word Splitter', + 'CDLI word splitter', wordSplitter) +except: + # in case the splitter is already registered, ValueError is raised + pass +