ECHO_content/authorSplitter.py - view

Return to authorSplitter.py CVS log

Up to [Repository] / ECHO_content

File: [Repository] / ECHO_content / authorSplitter.py
Revision 1.3: download - view: text, annotated - select for diffs - revision graph
Mon Feb 15 19:03:28 2010 UTC (14 years, 4 months ago) by casties
Branches: MAIN
CVS tags: cleanup, Root_cleanup, HEAD

fixing small errors for zope 2.12

"""
Author splitter
"""

from Products.ZCTextIndex.ISplitter import ISplitter
from Products.ZCTextIndex.PipelineFactory import element_factory

import re
from types import StringType

def getSupportedEncoding(encodings):
    """Return the first encoding in *encodings* that can decode text.

    Each candidate is probed by decoding the single byte ``A`` with it;
    the first candidate for which that succeeds is returned.

    :param encodings: iterable of encoding names, in order of preference.
    :returns: the first usable encoding name, or ``'utf-8'`` when none of
        the candidates works (or *encodings* is empty).
    """
    for encoding in encodings:
        try:
            # bytes.decode() is equivalent to unicode(str, encoding) on
            # Python 2.6+ and also works on Python 3.
            b'A'.decode(encoding)
        except Exception:
            # Codecs signal failure with several unrelated exception types
            # (LookupError, TypeError, UnicodeError, ...), so the probe stays
            # best-effort; KeyboardInterrupt/SystemExit are no longer
            # swallowed as they were with a bare ``except:``.
            continue
        return encoding
    return 'utf-8'

# CJK charset ranges; see the following pages:
#
# http://jrgraphix.net/research/unicode_blocks.php?block=87
# http://jrgraphix.net/research/unicode_blocks.php?block=85
# http://jrgraphix.net/research/unicode_blocks.php?block=95
# http://jrgraphix.net/research/unicode_blocks.php?block=76
# http://jrgraphix.net/research/unicode_blocks.php?block=90

"""beta of a author splitter for echo
RULE: names are separated by semicolons (NAME1;NAME2); a single Lexicon
entry is generated for every name

"""
class authorSplitter:
    """Splitter that breaks semicolon-separated author lists into names.

    Rule: ``NAME1;NAME2`` produces one entry per name.  Surrounding
    whitespace is removed from every name.  Names starting with ``!!``
    (e.g. ``!!NOT USED``) and empty names are not emitted.
    """

    # Encoding used to decode byte strings passed to process().
    default_encoding = "utf-8"

    def process(self, lst):
        """Split every string in *lst* at ``;`` and return the names.

        :param lst: iterable of byte strings or unicode strings.  Byte
            strings are decoded with ``default_encoding``; undecodable
            bytes are replaced rather than raising.
        :returns: flat list of unicode names, in input order.
        """
        result = []

        for s in lst:
            # isinstance() also covers str subclasses; ``bytes`` is ``str``
            # on Python 2.6+, so this matches the old StringType check.
            if isinstance(s, bytes):
                s = s.decode(self.default_encoding, 'replace')

            for w in s.split(u";"):
                # Strip before testing the marker so that "a; !!NOT USED"
                # is recognised as well as "a;!!NOT USED".
                name = w.strip()
                # Skip empty names produced by ";;" or a trailing ";".
                if name and not name.startswith(u"!!"):
                    result.append(name)
        return result

 
# Register the splitter with ZCTextIndex under the 'Word Splitter' group so
# it can be selected by the name 'ECHO author splitter'.
try:
    element_factory.registerFactory('Word Splitter',
          'ECHO author splitter', authorSplitter)
except ValueError:
    # ValueError is raised when the splitter is already registered; that
    # case is harmless.  Any other error is allowed to propagate instead
    # of being hidden by a bare ``except:``.
    pass

if __name__ == '__main__':
    # Small manual smoke test for the splitter.
    # NOTE(review): the original literal used '\U00CE'-style escapes, which
    # are not byte escapes; they look like mangled '\xCE' GBK bytes, which is
    # what is written here -- confirm against the intended sample text.
    a = b'abc def\xCE\xD2\xD3\xC7\xB5\xC4\xDC\xC3\xA1\xA3'
    u = a.decode('gbk')
    s = authorSplitter()
    # print(...) with one argument behaves the same on Python 2 and 3.
    print(s.process([u]))
    # process() accepts exactly one argument; the former second call passed
    # an extra positional argument and raised TypeError.
    print(s.process([u]))

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>