Annotation of MPIWGWeb/nameSplitter.py, revision 1.1.2.1

1.1.2.1 ! dwinter     1: """
        !             2: name splitter
        !             3: """
        !             4: 
        !             5: from Products.ZCTextIndex.ISplitter import ISplitter
        !             6: from Products.ZCTextIndex.PipelineFactory import element_factory
        !             7: 
        !             8: import psycopg
        !             9: 
        !            10: import re
        !            11: from types import StringType
        !            12: 
        !            13: def getSupportedEncoding(encodings):
        !            14:     for encoding in encodings:
        !            15:         try:
        !            16:             unicode('A', encoding)
        !            17:             return encoding
        !            18:         except:
        !            19:             pass
        !            20:     return 'utf-8'
        !            21: 
        !            22: # CJK charset ranges, see the following pages:
        !            23: #
        !            24: # http://jrgraphix.net/research/unicode_blocks.php?block=87
        !            25: # http://jrgraphix.net/research/unicode_blocks.php?block=85
        !            26: # http://jrgraphix.net/research/unicode_blocks.php?block=95
        !            27: # http://jrgraphix.net/research/unicode_blocks.php?block=76
        !            28: # http://jrgraphix.net/research/unicode_blocks.php?block=90
        !            29: 
        !            30: """
        !            31: splitter for lastnames in database
        !            32: 
        !            33: """
        !            34: 
        !            35: def quote(str):
        !            36:     str=str.replace("'","\\\'")
        !            37:     return str
        !            38: class nameSplitter:
        !            39: 
        !            40:     default_encoding = "utf-8"
        !            41: 
        !            42:     def process(self, lsttmp):
        !            43:         result = []
        !            44:         o = psycopg.connect('dbname=authorities user=dwinter password=3333',serialize=0) 
        !            45:         c = o.cursor() 
        !            46:         replaceStr="<>;.:()"
        !            47:         lst=" ".join(lsttmp)
        !            48:         for x in replaceStr:
        !            49:             lst=lst.replace(x," ")
        !            50:         
        !            51:         for s in lst.split():
        !            52:             if type(s) is not StringType: # not unicode
        !            53:                 s = s.decode(self.default_encoding)
        !            54:             if s not in result: # check for database entry
        !            55:                 c.execute("select lastname from persons where lower(lastname) = '%s'"%quote(s))
        !            56:                 if c.fetchone():
        !            57:                     print "found",s
        !            58:                     result.append(lastname)
        !            59:         return result
        !            60: 
        !            61:  
        !            62: try:
        !            63:     element_factory.registerFactory('Word Splitter',
        !            64:           'MPIWG Name Splitter', nameSplitter)
        !            65: except:
        !            66:     # in case the splitter is already registered, ValueError is raised
        !            67:     pass
        !            68: 
        !            69: if __name__ == '__main__':
        !            70:    a = 'abc def我们的很 好。'
        !            71:    u = unicode(a, 'gbk')
        !            72:    s = authorSplitter()
        !            73:    print s.process([u])
        !            74:    print s.process([u], 1)

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>