File:  [Repository] / cdli / cdliSplitter.py
Revision 1.9: download - view: text, annotated - select for diffs - revision graph
Thu Sep 25 12:37:55 2008 UTC (15 years, 9 months ago) by dwinter
Branches: MAIN
CVS tags: HEAD
erste version für neues basket management

    1: """
    2: CDLI word and grapheme splitter
    3: """
    4: 
    5: from Products.ZCTextIndex.PipelineFactory import element_factory
    6: 
    7: import re
    8: import logging
    9: 
   10: def getSupportedEncoding(encodings):
   11:     for encoding in encodings:
   12:         try:
   13:             unicode('A', encoding)
   14:             return encoding
   15:         except:
   16:             pass
   17:     return 'utf-8'
   18: 
   19: 
   20: 
   21: """beta of a fulltext splitter for cdli
   22: 
   23: """
   24: ignoreLines=['$','@','#','&','>']
   25: separators=['']
   26: # kommas relevant for graphemes will not be deleted
   27: komma_exception="([^sStThH]),"
   28: komma_exceptionex=re.compile(komma_exception)
   29: # grapheme boundaries
   30: #graphemeBounds="\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?"
   31: graphemeBounds="\{|\}|<|>|-|_|\#|,|\]|\[|\!|\?|\""
   32: graphemeIgnore="<|>|\#|\||\]|\[|\!|\?\*|;"
   33: # for words 
   34: #wordBounds="<|>|\(|\)|_|\#|,|\||\]|\[|\!|\?"
   35: wordBounds="_|,|\""
   36: wordIgnore="<|>|\#|\||\]|\[|\!|\?\*|;"
   37:            
   38: class cdliSplitter:
   39: 
   40:     """base class for splitter. 
   41:     the difference between word and grapheme splitter 
   42:     is the word boundary list."""
   43: 
   44:     
   45:     default_encoding = "utf-8"
   46:     bounds=graphemeBounds
   47:     boundsex=re.compile(graphemeBounds)
   48:     ignore=graphemeIgnore
   49:     ignorex=re.compile(graphemeIgnore)
   50:     indexName="cdliSplitter"
   51:     
   52:     
   53:     def process(self, lst):
   54:         """gets a list of strings and returns a list of words"""
   55:         
   56:         logging.debug("cdliSplitter: %s"%self.indexName) 
   57:         result = []
   58:         pNum=None
   59:         lineNum=None
   60:     
   61:         for t in lst:
   62:             # normalise line breaks
   63:             t.replace("\r","\n")
   64:             # split lines
   65:             for s in t.split("\n"):
   66:                 if isinstance(s, str): 
   67:                     # not unicode
   68:                     s = unicode(s, self.default_encoding, 'replace')
   69:          
   70:                 if (s!=''):
   71:                     if s[0]=='&': 
   72:                         # store pNum
   73:                         pNum=s[1:8]
   74:                         logging.debug("%s processing: %s"%(self.indexName,pNum))
   75:                         
   76:                     elif not (s[0] in ignoreLines):
   77:                         # regular line
   78:                         lineparts=s.split(". ",1)
   79:                         if len(lineparts)==1: 
   80:                             # no line number
   81:                             txt=s
   82:                         else:
   83:                             #store line number
   84:                             txt=lineparts[1]
   85:                             lineNum=lineparts[0] 
   86:                             
   87:                         # delete kommata except kommata relevant for graphemes
   88:                         txt = komma_exceptionex.sub(r"\1",txt)
   89:                         # replace word boundaries by spaces
   90:                         txt = self.boundsex.sub(' ',txt)
   91:                         # replace letters to be ignored
   92:                         txt = self.ignorex.sub('',txt)
   93:                         # split words
   94:                         words = txt.split(" ")
   95:                         for w in words:
   96:                             w=w.strip()
   97:                             if not (w==''):
   98:                                 result.append(w)
   99: 
  100:         #logging.debug("split '%s' into %s"%(lst,repr(result)))
  101:         return result
  102: 
  103: 
  104: class graphemeSplitter(cdliSplitter):
  105:     bounds=graphemeBounds
  106:     boundsex=re.compile(graphemeBounds)
  107:     ignore=graphemeIgnore
  108:     ignorex=re.compile(graphemeIgnore)
  109:     indexName="graphemeSplitter"
  110:     
  111: class wordSplitter(cdliSplitter):
  112:     bounds=wordBounds
  113:     boundsex=re.compile(wordBounds)
  114:     ignore=wordIgnore
  115:     ignorex=re.compile(wordIgnore)
  116:     indexName="wordSplitter"
  117:       
  118: try:
  119:     element_factory.registerFactory('Word Splitter',
  120:           'CDLI grapheme splitter', graphemeSplitter)
  121: except:
  122:     # in case the splitter is already registered, ValueError is raised
  123:     pass
  124: 
  125: try:
  126:     element_factory.registerFactory('Word Splitter',
  127:           'CDLI word splitter', wordSplitter)
  128: except:
  129:     # in case the splitter is already registered, ValueError is raised
  130:     pass
  131: 

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>