1: """
2: name splitter
3: """
4:
5: from Products.ZCTextIndex.ISplitter import ISplitter
6: from Products.ZCTextIndex.PipelineFactory import element_factory
7:
8: import psycopg
9:
10: import re
11: from types import StringType
12:
def getSupportedEncoding(encodings):
    """Return the first encoding in *encodings* that can decode 'A'.

    Each candidate is probed by decoding the single ASCII byte 'A' with it.
    Candidates that are unknown to the codec registry, that cannot decode
    the probe, or that are not valid encoding names are skipped.

    :param encodings: iterable of candidate encoding names, in order of
        preference.
    :returns: the first candidate that passes the probe, or ``'utf-8'``
        when none does (including when *encodings* is empty).
    """
    # A byte string on every Python version; str.decode() on it performs
    # the same check as the Python-2-only unicode('A', encoding).
    probe = 'A'.encode('ascii')
    for encoding in encodings:
        try:
            probe.decode(encoding)
        except (LookupError, TypeError, ValueError):
            # LookupError: unknown codec; ValueError covers UnicodeError
            # (probe not decodable); TypeError: name is not a string.
            # Anything else (e.g. KeyboardInterrupt) must propagate.
            continue
        return encoding
    return 'utf-8'
21:
# CJK charset ranges; see the following pages:
23: #
24: # http://jrgraphix.net/research/unicode_blocks.php?block=87
25: # http://jrgraphix.net/research/unicode_blocks.php?block=85
26: # http://jrgraphix.net/research/unicode_blocks.php?block=95
27: # http://jrgraphix.net/research/unicode_blocks.php?block=76
28: # http://jrgraphix.net/research/unicode_blocks.php?block=90
29:
"""
Splitter for last names stored in the database.
"""
34:
def quote(str):
    """Return *str* with a backslash inserted before every single quote.

    No other character is touched; in particular, backslashes already
    present in *str* are passed through unchanged.

    NOTE(review): this alone is presumably not sufficient to make a value
    safe for interpolation into SQL text -- prefer bound query parameters.
    """
    return str.replace("'", "\\\'")
class nameSplitter:
    """Word splitter that keeps only tokens found as last names in a database.

    The input strings are joined, a fixed set of punctuation characters is
    replaced by blanks, and the text is split on whitespace.  Each distinct
    token is then looked up (case-insensitively) in the ``lastname`` column
    of the ``persons`` table; only tokens with a matching row are returned.
    """

    # Encoding used to decode byte-string input before it is tokenized.
    default_encoding = "utf-8"

    # Connection string passed to psycopg.connect(); override per deployment.
    # NOTE(review): credentials are hard-coded in source -- consider moving
    # them to configuration.
    dsn = 'dbname=authorities user=dwinter password=3333'

    # Characters that are replaced by a blank before splitting.
    separators = "<>;.:()"

    def _connect(self):
        """Open and return a new database connection for one lookup run."""
        return psycopg.connect(self.dsn, serialize=0)

    def _tokens(self, lsttmp):
        """Return the distinct tokens of *lsttmp* as text, in first-seen order."""
        text_type = type(u"")
        parts = []
        for part in lsttmp:
            # Decode byte strings (not text) so that joining, replacing and
            # the database comparison all operate on one string type.
            if not isinstance(part, text_type):
                part = part.decode(self.default_encoding)
            parts.append(part)
        text = u" ".join(parts)
        for char in self.separators:
            text = text.replace(char, u" ")

        tokens = []
        seen = set()
        for token in text.split():
            if token not in seen:
                seen.add(token)
                tokens.append(token)
        return tokens

    def process(self, lsttmp):
        """Return the tokens of *lsttmp* that match a last name in the database.

        :param lsttmp: sequence of strings (text, or byte strings encoded in
            ``default_encoding``) to be split.
        :returns: list of distinct matching tokens, spelled as they appear in
            the input and in order of first appearance; ``[]`` when there is
            nothing to look up.
        :raises UnicodeDecodeError: if a byte string is not valid in
            ``default_encoding``.

        Errors raised by the database driver propagate; the cursor and the
        connection are closed in either case.
        """
        import logging

        tokens = self._tokens(lsttmp)
        if not tokens:
            # Nothing to look up: do not open a connection at all.
            return []

        log = logging.getLogger(__name__)
        result = []
        connection = self._connect()
        try:
            cursor = connection.cursor()
            try:
                for token in tokens:
                    # Bound parameter instead of string interpolation; the
                    # token is lower-cased because the query compares it
                    # against lower(lastname).
                    # NOTE(review): confirm the installed psycopg adapts
                    # unicode parameters as expected.
                    cursor.execute(
                        "select lastname from persons"
                        " where lower(lastname) = %s",
                        (token.lower(),))
                    if cursor.fetchone():
                        log.debug("found %s", token)
                        result.append(token)
            finally:
                cursor.close()
        finally:
            connection.close()
        return result
60:
61:
# Register the splitter with the ZCTextIndex pipeline factory under the
# 'Word Splitter' group so it can be selected as 'MPIWG Name Splitter'.
try:
    element_factory.registerFactory('Word Splitter',
                                    'MPIWG Name Splitter', nameSplitter)
except ValueError:
    # ValueError is raised when the splitter is already registered; that
    # case is harmless.  Any other error is a real problem and propagates.
    pass
68:
if __name__ == '__main__':
    # Manual smoke test: split a sample string and print the tokens that
    # were found in the database.
    # The sample is 'abc def' followed by CJK text, written with unicode
    # escapes so that it does not depend on the encoding of this file.
    u = u'abc def\u6211\u4eec\u7684\u5f88 \u597d\u3002'
    s = nameSplitter()
    # process() takes the list of strings as its only argument.
    print(s.process([u]))
# FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>