Annotation of MPIWGWeb/nameSplitter.py, revision 1.1.2.4

1.1.2.1   dwinter     1: """
                      2: name splitter
                      3: """
                      4: 
                      5: from Products.ZCTextIndex.ISplitter import ISplitter
                      6: from Products.ZCTextIndex.PipelineFactory import element_factory
                      7: 
1.1.2.3   dwinter     8: 
1.1.2.1   dwinter     9: 
                     10: import re
                     11: from types import StringType
                     12: 
                     13: def getSupportedEncoding(encodings):
                     14:     for encoding in encodings:
                     15:         try:
                     16:             unicode('A', encoding)
                     17:             return encoding
                     18:         except:
                     19:             pass
                     20:     return 'utf-8'
                     21: 
                     22: # CJK charset ranges; see the following pages:
                     23: #
                     24: # http://jrgraphix.net/research/unicode_blocks.php?block=87
                     25: # http://jrgraphix.net/research/unicode_blocks.php?block=85
                     26: # http://jrgraphix.net/research/unicode_blocks.php?block=95
                     27: # http://jrgraphix.net/research/unicode_blocks.php?block=76
                     28: # http://jrgraphix.net/research/unicode_blocks.php?block=90
                     29: 
                     30: """
                     31: splitter for lastnames in database
                     32: 
                     33: """
1.1.2.2   dwinter    34: import re
1.1.2.1   dwinter    35: 
                     36: def quote(str):
                     37:     str=str.replace("'","\\\'")
                     38:     return str
                     39: class nameSplitter:
                     40: 
                     41:     default_encoding = "utf-8"
                     42: 
                     43:     def process(self, lsttmp):
1.1.2.3   dwinter    44:         import psycopg
1.1.2.1   dwinter    45:         result = []
                     46:         o = psycopg.connect('dbname=authorities user=dwinter password=3333',serialize=0) 
                     47:         c = o.cursor() 
1.1.2.2   dwinter    48: #        replaceStr="<>;.:(),"
1.1.2.1   dwinter    49:         lst=" ".join(lsttmp)
1.1.2.2   dwinter    50: #        for x in replaceStr:
                     51: #            lst=lst.replace(x," ")
                     52:         lst=re.sub("[<|>|;|.|:|\(|\|)|,]", " ", lst)
1.1.2.1   dwinter    53:         for s in lst.split():
1.1.2.2   dwinter    54: 
1.1.2.1   dwinter    55:             if type(s) is not StringType: # not unicode
                     56:                 s = s.decode(self.default_encoding)
1.1.2.2   dwinter    57: 
1.1.2.1   dwinter    58:             if s not in result: # check for database entry
1.1.2.2   dwinter    59: 
                     60:                 #c.execute("select lastname from persons where lower(lastname) = '%s'"%quote(s.lower()))
                     61:                 c.execute("select lastname from persons where lastname = '%s'"%quote(s))
1.1.2.1   dwinter    62:                 if c.fetchone():
                     63:                     print "found",s
1.1.2.2   dwinter    64:                     result.append(s)
1.1.2.1   dwinter    65:         return result
                     66: 
                     67:  
                     68: try:
                     69:     element_factory.registerFactory('Word Splitter',
                     70:           'MPIWG Name Splitter', nameSplitter)
                     71: except:
                     72:     # in case the splitter is already registered, ValueError is raised
                     73:     pass
                     74: 
                     75: if __name__ == '__main__':
1.1.2.4 ! casties    76:    a = 'abc def\U00CE\U00D2\U00D3\U00C7\U00B5\U00C4 \U00DC\U00C3\U00A1\U00A3'
1.1.2.1   dwinter    77:    u = unicode(a, 'gbk')
                     78:    s = authorSplitter()
                     79:    print s.process([u])
                     80:    print s.process([u], 1)

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>