File:  [Repository] / cdli / cdliSplitter.py
Revision 1.1: download - view: text, annotated - select for diffs - revision graph
Tue Nov 14 17:02:59 2006 UTC (17 years, 7 months ago) by dwinter
Branches: MAIN
CVS tags: HEAD
showview and splitter added

    1: """
    2: CDLI grapheme splitter for ZCTextIndex
    3: """
    4: 
    5: from Products.ZCTextIndex.ISplitter import ISplitter
    6: from Products.ZCTextIndex.PipelineFactory import element_factory
    7: 
    8: import re
    9: from types import StringType
   10: 
   11: def getSupportedEncoding(encodings):
   12:     for encoding in encodings:
   13:         try:
   14:             unicode('A', encoding)
   15:             return encoding
   16:         except:
   17:             pass
   18:     return 'utf-8'
   19: 
   20: 
   21: 
   22: """beta of a fulltext splitter for cdli
   23: 
   24: """
   25: ignoreLines=['$','@','#','&']
   26: separators=['']
   27: delete="{|}|<|>|\(|\)|-|_|\#|,|\~|\||\]|\["
   28: 
   29: class graphemeSplitter:
   30: 
   31:     default_encoding = "utf-8"
   32: 
   33:     def process(self, lst):
   34:         result = []
   35:        
   36:         for t in lst:
   37: 
   38:          t.replace("\r","\n")
   39:          for s in t.split("\n"):
   40:        
   41:             if type(s) is StringType: # not unicode
   42:                 s = unicode(s, self.default_encoding, 'replace')
   43:             
   44:             #ignore lines
   45:             
   46:             if (s!="") and (not (s[0] in ignoreLines)):
   47:               
   48:                 #ignore everthing bevor "."
   49:                 splitted=s.split(".")
   50:                 
   51:                 if len(splitted)==1: #kein punkt
   52:                     txt=splitted[0]
   53:                 else:
   54:                     txt=splitted[1]
   55:                 
   56:                 analyse=txt
   57:     
   58:                 analyse=re.sub(delete,' ',analyse) # deletions
   59:                 
   60:                 splitted = analyse.split(" ")
   61:                
   62:                 for w in splitted:
   63:                     w=w.lstrip().rstrip()
   64:                     if not (w==''):
   65:                         print repr(w)
   66:                         result.append(w.lstrip().rstrip())
   67:         return result
   68: 
   69: element_factory.registerFactory('Word Splitter',
   70:           'CDLI grapheme splitter', graphemeSplitter)
   71:  
   72: try:
   73:     element_factory.registerFactory('graphemeSplitter',
   74:           'CDLI grapheme splitter', graphemeSplitter)
   75: except:
   76:     # in case the splitter is already registered, ValueError is raised
   77:     pass
   78: 
   79: if __name__ == '__main__':
   80:    a = 'abc def我们的很 好。'
   81:    u = unicode(a, 'gbk')
   82:    s = authorSplitter()
   83:    print s.process([u])
   84:    print s.process([u], 1)

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>