File:  [Repository] / ECHO_content / authorSplitter.py
Revision 1.3: download - view: text, annotated - select for diffs - revision graph
Mon Feb 15 19:03:28 2010 UTC (14 years, 3 months ago) by casties
Branches: MAIN
CVS tags: cleanup, Root_cleanup, HEAD
fixing small errors for zope 2.12

    1: """
    2: Author splitter
    3: """
    4: 
    5: from Products.ZCTextIndex.ISplitter import ISplitter
    6: from Products.ZCTextIndex.PipelineFactory import element_factory
    7: 
    8: import re
    9: from types import StringType
   10: 
   11: def getSupportedEncoding(encodings):
   12:     for encoding in encodings:
   13:         try:
   14:             unicode('A', encoding)
   15:             return encoding
   16:         except:
   17:             pass
   18:     return 'utf-8'
   19: 
   20: # CJK charsets ranges, see this following pages:
   21: #
   22: # http://jrgraphix.net/research/unicode_blocks.php?block=87
   23: # http://jrgraphix.net/research/unicode_blocks.php?block=85
   24: # http://jrgraphix.net/research/unicode_blocks.php?block=95
   25: # http://jrgraphix.net/research/unicode_blocks.php?block=76
   26: # http://jrgraphix.net/research/unicode_blocks.php?block=90
   27: 
   28: """beta of a author splitter for echo
   29: RULE is NAME1;NAME2 generates a single Lexicon entry for every name
   30: 
   31: """
   32: class authorSplitter:
   33: 
   34:     default_encoding = "utf-8"
   35: 
   36:     def process(self, lst):
   37:         result = []
   38:        
   39:         for s in lst:
   40:             if type(s) is StringType: # not unicode
   41:                 s = unicode(s, self.default_encoding, 'replace')
   42:                 
   43:             splitted = s.split(";")
   44:            
   45:             for w in splitted:
   46:                     if not (w[0:2]=="!!"): #don't index !!NOT USED....
   47:                         result.append(w.lstrip().rstrip())
   48:         return result
   49: 
   50:  
   51: try:
   52:     element_factory.registerFactory('Word Splitter',
   53:           'ECHO author splitter', authorSplitter)
   54: except:
   55:     # in case the splitter is already registered, ValueError is raised
   56:     pass
   57: 
   58: if __name__ == '__main__':
   59:    a = 'abc def\U00CE\U00D2\U00D3\U00C7\U00B5\U00C4\U00DC\U00C3\U00A1\U00A3'
   60:    u = unicode(a, 'gbk')
   61:    s = authorSplitter()
   62:    print s.process([u])
   63:    print s.process([u], 1)

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>