Annotation of MPIWGWeb/nameSplitter.py, revision 1.1.2.1
1.1.2.1 ! dwinter 1: """
! 2: name splitter
! 3: """
! 4:
! 5: from Products.ZCTextIndex.ISplitter import ISplitter
! 6: from Products.ZCTextIndex.PipelineFactory import element_factory
! 7:
! 8: import psycopg
! 9:
! 10: import re
! 11: from types import StringType
! 12:
def getSupportedEncoding(encodings):
    """Return the first encoding in *encodings* that can decode a byte string.

    Each candidate is probed by decoding the single ASCII byte ``A`` with it.
    Candidates that are unknown to the codec registry, that fail to decode
    the probe, or that are not valid encoding names are skipped.

    :param encodings: iterable of encoding names, tried in order.
    :returns: the first usable encoding name, or ``'utf-8'`` when none of the
        candidates works (including when *encodings* is empty).
    """
    # A one-byte ASCII probe; this spelling yields a byte string on both
    # Python 2 and Python 3.
    probe = 'A'.encode('ascii')
    for encoding in encodings:
        try:
            decoded = probe.decode(encoding)
        except (LookupError, UnicodeError, TypeError):
            # LookupError: unknown codec; UnicodeError: probe not decodable;
            # TypeError: the candidate is not a usable encoding name.
            # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
            continue
        if isinstance(decoded, type(probe)):
            # The codec returned bytes again, so it is not a text encoding.
            continue
        return encoding
    return 'utf-8'
! 21:
! 22: # CJK charset ranges; see the following pages:
! 23: #
! 24: # http://jrgraphix.net/research/unicode_blocks.php?block=87
! 25: # http://jrgraphix.net/research/unicode_blocks.php?block=85
! 26: # http://jrgraphix.net/research/unicode_blocks.php?block=95
! 27: # http://jrgraphix.net/research/unicode_blocks.php?block=76
! 28: # http://jrgraphix.net/research/unicode_blocks.php?block=90
! 29:
! 30: """
! 31: splitter for lastnames in database
! 32:
! 33: """
! 34:
def quote(str):
    """Backslash-escape *str* for embedding in a single-quoted SQL literal.

    Backslashes are escaped before single quotes; otherwise a value ending
    in a backslash would neutralise the closing quote of the literal.

    NOTE(review): bound query parameters are safer than manual escaping and
    should be preferred wherever the database driver supports them.

    :param str: the text to escape (the name shadows the builtin but is kept
        so existing keyword callers continue to work).
    :returns: the escaped text.
    """
    # Order matters: escape backslashes first so the backslashes inserted
    # for quotes below are not doubled again.
    escaped = str.replace("\\", "\\\\")
    return escaped.replace("'", "\\'")
class nameSplitter:
    """Splitter that keeps only the tokens found as last names in the database.

    The input strings are joined, split on whitespace and on the characters
    in ``separators``, and every distinct token is looked up in the
    ``persons`` table. Only tokens with a matching row are returned.
    """

    # Encoding used to turn unicode tokens into byte strings for the query.
    default_encoding = "utf-8"

    # Connection string handed to psycopg.connect().
    # NOTE(review): credentials are hard-coded in source; they should be
    # moved to configuration.
    connect_string = 'dbname=authorities user=dwinter password=3333'

    # Characters treated as token separators in addition to whitespace.
    separators = "<>;.:()"

    def process(self, lsttmp):
        """Return the tokens of *lsttmp* that match a last name in the database.

        :param lsttmp: sequence of strings to tokenise.
        :returns: list of distinct matching tokens, in order of first
            appearance. Empty when the input contains no tokens; in that
            case no database connection is opened.

        NOTE(review): the query compares ``lower(lastname)`` with the token
        as given, so only tokens that are already lower case can match -
        confirm whether the input is expected to be lower-cased beforehand.
        """
        import logging

        text = " ".join(lsttmp)
        for separator in self.separators:
            text = text.replace(separator, " ")
        tokens = text.split()
        if not tokens:
            return []

        log = logging.getLogger(__name__)
        result = []
        # Dict keys used as a set, so the code also runs on old Python 2
        # interpreters; every distinct token is queried only once.
        seen = {}
        connection = psycopg.connect(self.connect_string, serialize=0)
        try:
            cursor = connection.cursor()
            for s in tokens:
                if not isinstance(s, str):
                    # On Python 2 this is a unicode token: encode it so the
                    # driver receives a plain byte string.
                    s = s.encode(self.default_encoding)
                if s in seen:
                    continue
                seen[s] = True
                # Bound parameter: the driver does the quoting, so the
                # token cannot alter the SQL statement.
                cursor.execute(
                    "select lastname from persons"
                    " where lower(lastname) = %s", (s,))
                if cursor.fetchone():
                    log.debug("found %s", s)
                    result.append(s)
        finally:
            # Always release the connection, even when a query fails.
            connection.close()
        return result
! 60:
! 61:
# Register the splitter with the ZCTextIndex pipeline factory under the
# 'Word Splitter' group so it can be selected as 'MPIWG Name Splitter'.
try:
    element_factory.registerFactory('Word Splitter',
                                    'MPIWG Name Splitter', nameSplitter)
except ValueError:
    # ValueError is raised when the splitter is already registered; that is
    # harmless. Any other error is a real problem and must not be hidden.
    pass
! 68:
if __name__ == '__main__':
    # Manual smoke test: split a mixed Latin/CJK sample and print the tokens
    # that were found in the database (requires database access).
    a = 'abc def我们的很 好。'
    # NOTE(review): decoding as GBK assumes the bytes of the literal above
    # are GBK-encoded in the source file - confirm.
    u = unicode(a, 'gbk')
    s = nameSplitter()
    # Single-argument parenthesised form prints the same value under both
    # the print statement and the print function.
    print(s.process([u]))
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>