Annotation of ECHO_content/authorSplitter.py, revision 1.1
1.1 ! dwinter 1: """
! 2: Author splitter
! 3: """
! 4:
! 5: from Products.ZCTextIndex.ISplitter import ISplitter
! 6: from Products.ZCTextIndex.PipelineFactory import element_factory
! 7:
! 8: import re
! 9: from types import StringType
! 10:
def getSupportedEncoding(encodings):
    """Return the first encoding in *encodings* that this interpreter can use.

    Each candidate is probed by decoding the single ASCII byte ``A`` with
    it.  The first candidate for which that succeeds is returned.

    encodings -- iterable of codec names, tried in order.

    Returns the first working codec name, or ``'utf-8'`` when the iterable
    is empty or none of the candidates can be used.
    """
    # Byte string on Python 2 (str) as well as on Python 3 (bytes).
    probe = 'A'.encode('ascii')
    for encoding in encodings:
        try:
            probe.decode(encoding)
        except Exception:
            # Deliberately best-effort: an unknown codec, a codec that
            # cannot decode the probe, or a bad codec name just means
            # "try the next one".  Unlike a bare ``except:`` this no longer
            # hides KeyboardInterrupt or SystemExit.
            continue
        return encoding
    return 'utf-8'
! 19:
! 20: # CJK charset ranges; see the following pages:
! 21: #
! 22: # http://jrgraphix.net/research/unicode_blocks.php?block=87
! 23: # http://jrgraphix.net/research/unicode_blocks.php?block=85
! 24: # http://jrgraphix.net/research/unicode_blocks.php?block=95
! 25: # http://jrgraphix.net/research/unicode_blocks.php?block=76
! 26: # http://jrgraphix.net/research/unicode_blocks.php?block=90
! 27:
! 28: """Beta version of an author splitter for ECHO.
! 29: RULE: NAME1;NAME2 generates a separate Lexicon entry for every name.
! 30:
! 31: """
class authorSplitter:
    """Splitter that turns ';'-separated author fields into single names.

    Rule: ``NAME1;NAME2`` yields one entry per name.  Names are not split
    any further, so whitespace and commas inside a name are preserved.
    """

    # Encoding used to decode byte strings handed to process().
    default_encoding = "utf-8"

    def process(self, lst):
        """Split every string in *lst* on ';' and return the names.

        lst -- iterable of strings; byte strings are decoded with
               ``default_encoding`` (undecodable bytes are replaced),
               text strings are used unchanged.

        Returns a flat list of text strings in input order, each stripped
        of leading and trailing whitespace.  Segments that are empty after
        stripping (e.g. from ``"A;;B"`` or a trailing ``";"``) are skipped
        so that no empty word is produced.
        """
        # ``unicode`` on Python 2, ``str`` on Python 3.
        text_type = type(u'')
        result = []

        for s in lst:
            if not isinstance(s, text_type):
                # Byte string (including subclasses): decode before
                # splitting so the result is always text.
                s = s.decode(self.default_encoding, 'replace')

            for name in s.split(";"):
                name = name.strip()
                if name:
                    result.append(name)
        return result
! 48:
! 49:
# Register the splitter with ZCTextIndex under the 'Word Splitter' group so
# it can be selected as 'ECHO author splitter'.
try:
    element_factory.registerFactory('Word Splitter',
                                    'ECHO author splitter', authorSplitter)
except ValueError:
    # In case the splitter is already registered, ValueError is raised;
    # that is harmless.  Any other error is a real problem and is no
    # longer silently swallowed.
    pass
! 56:
if __name__ == '__main__':
    # Manual smoke test.  The sample text is written with \u escapes so the
    # source file stays pure ASCII and the result does not depend on the
    # encoding the file happens to be saved in.
    sample = u'abc def\u6211\u4eec\u7684\u5f88 \u597d\u3002'
    splitter = authorSplitter()
    # No ';' in the sample, so it comes back as one single entry.
    print(splitter.process([sample]))
    # process() accepts only the list argument (the former extra ``1``
    # raised TypeError); show the actual ';' splitting rule instead.
    print(splitter.process([u'Doe, John; Smith, Jane']))
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>