File:  [Repository] / MPIWGWeb / Attic / nameSplitter.py
Revision 1.1.2.4: download - view: text, annotated - select for diffs - revision graph
Mon Feb 15 19:04:24 2010 UTC (14 years, 4 months ago) by casties
Branches: r2
fixing small errors for zope 2.12

    1: """
    2: name splitter
    3: """
    4: 
    5: from Products.ZCTextIndex.ISplitter import ISplitter
    6: from Products.ZCTextIndex.PipelineFactory import element_factory
    7: 
    8: 
    9: 
   10: import re
   11: from types import StringType
   12: 
   13: def getSupportedEncoding(encodings):
   14:     for encoding in encodings:
   15:         try:
   16:             unicode('A', encoding)
   17:             return encoding
   18:         except:
   19:             pass
   20:     return 'utf-8'
   21: 
   22: # CJK charset ranges, see the following pages:
   23: #
   24: # http://jrgraphix.net/research/unicode_blocks.php?block=87
   25: # http://jrgraphix.net/research/unicode_blocks.php?block=85
   26: # http://jrgraphix.net/research/unicode_blocks.php?block=95
   27: # http://jrgraphix.net/research/unicode_blocks.php?block=76
   28: # http://jrgraphix.net/research/unicode_blocks.php?block=90
   29: 
   30: """
   31: splitter for lastnames in database
   32: 
   33: """
   34: import re
   35: 
   36: def quote(str):
   37:     str=str.replace("'","\\\'")
   38:     return str
   39: class nameSplitter:
   40: 
   41:     default_encoding = "utf-8"
   42: 
   43:     def process(self, lsttmp):
   44:         import psycopg
   45:         result = []
   46:         o = psycopg.connect('dbname=authorities user=dwinter password=3333',serialize=0) 
   47:         c = o.cursor() 
   48: #        replaceStr="<>;.:(),"
   49:         lst=" ".join(lsttmp)
   50: #        for x in replaceStr:
   51: #            lst=lst.replace(x," ")
   52:         lst=re.sub("[<|>|;|.|:|\(|\|)|,]", " ", lst)
   53:         for s in lst.split():
   54: 
   55:             if type(s) is not StringType: # not unicode
   56:                 s = s.decode(self.default_encoding)
   57: 
   58:             if s not in result: # check for database entry
   59: 
   60:                 #c.execute("select lastname from persons where lower(lastname) = '%s'"%quote(s.lower()))
   61:                 c.execute("select lastname from persons where lastname = '%s'"%quote(s))
   62:                 if c.fetchone():
   63:                     print "found",s
   64:                     result.append(s)
   65:         return result
   66: 
   67:  
   68: try:
   69:     element_factory.registerFactory('Word Splitter',
   70:           'MPIWG Name Splitter', nameSplitter)
   71: except:
   72:     # in case the splitter is already registered, ValueError is raised
   73:     pass
   74: 
   75: if __name__ == '__main__':
   76:    a = 'abc def\U00CE\U00D2\U00D3\U00C7\U00B5\U00C4 \U00DC\U00C3\U00A1\U00A3'
   77:    u = unicode(a, 'gbk')
   78:    s = authorSplitter()
   79:    print s.process([u])
   80:    print s.process([u], 1)

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>