Annotation of ECHO_content/authorSplitter.py, revision 1.2

1.1       dwinter     1: """
                      2: Author splitter
                      3: """
                      4: 
                      5: from Products.ZCTextIndex.ISplitter import ISplitter
                      6: from Products.ZCTextIndex.PipelineFactory import element_factory
                      7: 
                      8: import re
                      9: from types import StringType
                     10: 
                     11: def getSupportedEncoding(encodings):
                     12:     for encoding in encodings:
                     13:         try:
                     14:             unicode('A', encoding)
                     15:             return encoding
                     16:         except:
                     17:             pass
                     18:     return 'utf-8'
                     19: 
                     20: # CJK character set ranges; see the following pages:
                     21: #
                     22: # http://jrgraphix.net/research/unicode_blocks.php?block=87
                     23: # http://jrgraphix.net/research/unicode_blocks.php?block=85
                     24: # http://jrgraphix.net/research/unicode_blocks.php?block=95
                     25: # http://jrgraphix.net/research/unicode_blocks.php?block=76
                     26: # http://jrgraphix.net/research/unicode_blocks.php?block=90
                     27: 
                     28: """Beta version of an author splitter for ECHO.
                     29: Rule: the input NAME1;NAME2 generates a separate Lexicon entry for each name.
                     30: 
                     31: """
                     32: class authorSplitter:
                     33: 
                     34:     default_encoding = "utf-8"
                     35: 
                     36:     def process(self, lst):
                     37:         result = []
                     38:        
                     39:         for s in lst:
                     40:             if type(s) is StringType: # not unicode
                     41:                 s = unicode(s, self.default_encoding, 'replace')
                     42:                 
                     43:             splitted = s.split(";")
                     44:            
                     45:             for w in splitted:
1.2     ! dwinter    46:                     if not (w[0:2]=="!!"): #don't index !!NOT USED....
        !            47:                         result.append(w.lstrip().rstrip())
1.1       dwinter    48:         return result
                     49: 
                     50:  
                     51: try:
                     52:     element_factory.registerFactory('Word Splitter',
                     53:           'ECHO author splitter', authorSplitter)
                     54: except:
                     55:     # in case the splitter is already registered, ValueError is raised
                     56:     pass
                     57: 
                     58: if __name__ == '__main__':
                     59:    a = 'abc def我们的很 好。'
                     60:    u = unicode(a, 'gbk')
                     61:    s = authorSplitter()
                     62:    print s.process([u])
                     63:    print s.process([u], 1)

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>