File:  [Repository] / cdli / cdliSplitter.py
Revision 1.7.2.2: download - view: text, annotated - select for diffs - revision graph
Fri Oct 19 16:25:07 2007 UTC (16 years, 8 months ago) by casties
Branches: zcat_only_1
Diff to: branchpoint 1.7: preferred, unified
removed more remains of lineIndex
changed delete to really delete


ASSIGNED - # 36: change fulltext index and search to pure ZCatalog
https://itgroup.mpiwg-berlin.mpg.de:8080/tracs/cdli/ticket/36

    1: """
    2: CDLI word and grapheme splitter
    3: """
    4: 
    5: from Products.ZCTextIndex.PipelineFactory import element_factory
    6: 
    7: import re
    8: import logging
    9: 
   10: def getSupportedEncoding(encodings):
   11:     for encoding in encodings:
   12:         try:
   13:             unicode('A', encoding)
   14:             return encoding
   15:         except:
   16:             pass
   17:     return 'utf-8'
   18: 
   19: 
   20: 
   21: """beta of a fulltext splitter for cdli
   22: 
   23: """
   24: ignoreLines=['$','@','#','&','>']
   25: separators=['']
   26: # kommas relevant for graphemes will not be deleted
   27: komma_exception="([^sStThH])," 
   28: # grapheme boundaries
   29: graphemeBounds="\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?"
   30: # for words 
   31: wordBounds="<|>|\(|\)|_|\#|,|\||\]|\[|\!|\?"
   32: 
   33:            
   34: class cdliSplitter:
   35:     """base class for splitter. 
   36:     the difference between word and grapheme splitter 
   37:     is the word boundary list."""
   38:     
   39:     default_encoding = "utf-8"
   40:     bounds=graphemeBounds
   41:     indexName="cdliSplitter"
   42:     
   43:     
   44:     def process(self, lst):
   45:         """gets a list of strings and returns a list of words"""
   46:         
   47:         logging.debug("cdliSplitter") 
   48:         result = []
   49:         pNum=None
   50:         lineNum=None
   51:     
   52:         for t in lst:
   53:             # normalise line breaks
   54:             t.replace("\r","\n")
   55:             # split lines
   56:             for s in t.split("\n"):
   57:                 if isinstance(s, str): 
   58:                     # not unicode
   59:                     s = unicode(s, self.default_encoding, 'replace')
   60:          
   61:                 if (s!=''):
   62:                     if s[0]=='&': 
   63:                         # store pNum
   64:                         pNum=s[1:8]
   65:                         logging.debug("%s processing: %s"%(indexName,pNum))
   66:                         
   67:                     elif not (s[0] in ignoreLines):
   68:                         # regular line
   69:                         lineparts=s.split(".")
   70:                         if len(lineparts)==1: 
   71:                             # no line number
   72:                             txt=s
   73:                         else:
   74:                             #store line number
   75:                             txt=lineparts[1]
   76:                             lineNum=lineparts[0] 
   77:                             
   78:                         # delete kommata except kommata relevant for graphemes
   79:                         txt = re.sub(komma_exception,r"\1",txt)
   80:                         # replace word boundaries by spaces
   81:                         txt = re.sub(self.bounds,' ',txt)
   82:                         # split words
   83:                         words = txt.split(" ")
   84:                         for w in words:
   85:                             w=w.strip()
   86:                             if not (w==''):
   87:                                 result.append(w)
   88: 
   89:         return result
   90: 
   91: 
   92: class graphemeSplitter(cdliSplitter):
   93:     bounds=graphemeBounds
   94:     indexName="graphemeSplitter"
   95:     
   96: class wordSplitter(cdliSplitter):
   97:     bounds=wordBounds
   98:     indexName="wordSplitter"
   99:       
  100: try:
  101:     element_factory.registerFactory('Word Splitter',
  102:           'CDLI grapheme splitter', graphemeSplitter)
  103: except:
  104:     # in case the splitter is already registered, ValueError is raised
  105:     pass
  106: 
  107: try:
  108:     element_factory.registerFactory('Word Splitter',
  109:           'CDLI word splitter', wordSplitter)
  110: except:
  111:     # in case the splitter is already registered, ValueError is raised
  112:     pass
  113: 

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>