""" CDLI word and grapheme splitter """ from Products.ZCTextIndex.PipelineFactory import element_factory import re import logging def getSupportedEncoding(encodings): for encoding in encodings: try: unicode('A', encoding) return encoding except: pass return 'utf-8' """beta of a fulltext splitter for cdli """ ignoreLines=['$','@','#','&','>'] separators=[''] # kommas relevant for graphemes will not be deleted komma_exception="([^sStThH])," komma_exceptionex=re.compile(komma_exception) # grapheme boundaries #graphemeBounds="\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?" graphemeBounds="\{|\}|<|>|-|_|\#|,|\]|\[|\!|\?|\"" graphemeIgnore="<|>|\#|\||\]|\[|\!|\?\*|;" # for words #wordBounds="<|>|\(|\)|_|\#|,|\||\]|\[|\!|\?" wordBounds="_|,|\"" wordIgnore="<|>|\#|\||\]|\[|\!|\?\*|;" class cdliSplitter: """base class for splitter. the difference between word and grapheme splitter is the word boundary list.""" default_encoding = "utf-8" bounds=graphemeBounds boundsex=re.compile(graphemeBounds) ignore=graphemeIgnore ignorex=re.compile(graphemeIgnore) indexName="cdliSplitter" def process(self, lst): """gets a list of strings and returns a list of words""" logging.debug("cdliSplitter: %s"%self.indexName) result = [] pNum=None lineNum=None for t in lst: # normalise line breaks t.replace("\r","\n") # split lines for s in t.split("\n"): if isinstance(s, str): # not unicode s = unicode(s, self.default_encoding, 'replace') if (s!=''): if s[0]=='&': # store pNum pNum=s[1:8] logging.debug("%s processing: %s"%(self.indexName,pNum)) elif not (s[0] in ignoreLines): # regular line lineparts=s.split(".") if len(lineparts)==1: # no line number txt=s else: #store line number txt=lineparts[1] lineNum=lineparts[0] # delete kommata except kommata relevant for graphemes txt = komma_exceptionex.sub(r"\1",txt) # replace word boundaries by spaces txt = self.boundsex.sub(' ',txt) # replace letters to be ignored txt = self.ignorex.sub('',txt) # split words words = txt.split(" ") for w in words: w=w.strip() if not (w==''): result.append(w) #logging.debug("split '%s' into %s"%(lst,repr(result))) return result class graphemeSplitter(cdliSplitter): bounds=graphemeBounds boundsex=re.compile(graphemeBounds) ignore=graphemeIgnore ignorex=re.compile(graphemeIgnore) indexName="graphemeSplitter" class wordSplitter(cdliSplitter): bounds=wordBounds boundsex=re.compile(wordBounds) ignore=wordIgnore ignorex=re.compile(wordIgnore) indexName="wordSplitter" try: element_factory.registerFactory('Word Splitter', 'CDLI grapheme splitter', graphemeSplitter) except: # in case the splitter is already registered, ValueError is raised pass try: element_factory.registerFactory('Word Splitter', 'CDLI word splitter', wordSplitter) except: # in case the splitter is already registered, ValueError is raised pass