ECHO_content/authorSplitter.py - annotate

Return to authorSplitter.py CVS log
Up to [Repository] / ECHO_content
Annotation of ECHO_content/authorSplitter.py, revision 1.1

1.1     ! dwinter     1: """
        !             2: Author splitter
        !             3: """
        !             4: 
        !             5: from Products.ZCTextIndex.ISplitter import ISplitter
        !             6: from Products.ZCTextIndex.PipelineFactory import element_factory
        !             7: 
        !             8: import re
        !             9: from types import StringType
        !            10: 
        !            11: def getSupportedEncoding(encodings):
        !            12:     for encoding in encodings:
        !            13:         try:
        !            14:             unicode('A', encoding)
        !            15:             return encoding
        !            16:         except:
        !            17:             pass
        !            18:     return 'utf-8'
        !            19: 
        !            20: # CJK charset ranges; see the following pages:
        !            21: #
        !            22: # http://jrgraphix.net/research/unicode_blocks.php?block=87
        !            23: # http://jrgraphix.net/research/unicode_blocks.php?block=85
        !            24: # http://jrgraphix.net/research/unicode_blocks.php?block=95
        !            25: # http://jrgraphix.net/research/unicode_blocks.php?block=76
        !            26: # http://jrgraphix.net/research/unicode_blocks.php?block=90
        !            27: 
        !            28: """Beta of an author splitter for ECHO.
        !            29: Rule: NAME1;NAME2 generates one Lexicon entry for each name.
        !            30: 
        !            31: """
        !            32: class authorSplitter:
        !            33: 
        !            34:     default_encoding = "utf-8"
        !            35: 
        !            36:     def process(self, lst):
        !            37:         result = []
        !            38:        
        !            39:         for s in lst:
        !            40:             if type(s) is StringType: # not unicode
        !            41:                 s = unicode(s, self.default_encoding, 'replace')
        !            42:                 
        !            43:             splitted = s.split(";")
        !            44:            
        !            45:             for w in splitted:
        !            46:                     result.append(w.lstrip().rstrip())
        !            47:         return result
        !            48: 
        !            49:  
        !            50: try:
        !            51:     element_factory.registerFactory('Word Splitter',
        !            52:           'ECHO author splitter', authorSplitter)
        !            53: except:
        !            54:     # in case the splitter is already registered, ValueError is raised
        !            55:     pass
        !            56: 
        !            57: if __name__ == '__main__':
        !            58:    a = 'abc def我们的很 好。'
        !            59:    u = unicode(a, 'gbk')
        !            60:    s = authorSplitter()
        !            61:    print s.process([u])
        !            62:    print s.process([u], 1)
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>