Annotation of MPIWGWeb/nameSplitter.py, revision 1.1.2.4
1.1.2.1 dwinter 1: """
2: name splitter
3: """
4:
5: from Products.ZCTextIndex.ISplitter import ISplitter
6: from Products.ZCTextIndex.PipelineFactory import element_factory
7:
1.1.2.3 dwinter 8:
1.1.2.1 dwinter 9:
10: import re
11: from types import StringType
12:
def getSupportedEncoding(encodings):
    """Return the first entry of *encodings* that can decode a probe byte.

    The candidates are tried in the order given by decoding the single
    byte ``A`` with each of them; the first candidate that succeeds is
    returned.  When no candidate works, or *encodings* is empty,
    ``'utf-8'`` is returned.
    """
    # Encoding an ASCII literal gives a byte string on Python 2 and 3 alike.
    probe = 'A'.encode('ascii')
    for encoding in encodings:
        try:
            probe.decode(encoding)
        except (LookupError, ValueError, TypeError):
            # LookupError: unknown codec name.
            # ValueError (includes UnicodeError): codec cannot decode the probe.
            # TypeError: the candidate is not usable as a codec name.
            continue
        return encoding
    return 'utf-8'
21:
22: # CJK charset ranges; see the following pages:
23: #
24: # http://jrgraphix.net/research/unicode_blocks.php?block=87
25: # http://jrgraphix.net/research/unicode_blocks.php?block=85
26: # http://jrgraphix.net/research/unicode_blocks.php?block=95
27: # http://jrgraphix.net/research/unicode_blocks.php?block=76
28: # http://jrgraphix.net/research/unicode_blocks.php?block=90
29:
30: """
31: splitter for lastnames in database
32:
33: """
1.1.2.2 dwinter 34: import re
1.1.2.1 dwinter 35:
def quote(str):
    """Backslash-escape *str* for use inside a single-quoted SQL literal.

    Every backslash is doubled and every single quote is prefixed with a
    backslash.  Backslashes are handled first so that the ones inserted in
    front of quotes are not doubled again, and so that an input such as
    ``\\'`` cannot end the surrounding literal early.

    NOTE(review): this assumes the database treats a backslash as an escape
    character inside string literals -- confirm for the target server.
    Passing values as query parameters is preferable to calling this.
    """
    return str.replace("\\", "\\\\").replace("'", "\\'")
class nameSplitter:
    """Splitter that keeps only the words found as a last name in the database.

    ``process`` breaks the incoming text into words and returns those for
    which the ``persons`` table has a row with a matching ``lastname``.
    """

    # Encoding used to decode byte-string words into unicode text.
    default_encoding = "utf-8"

    # Connection string handed to psycopg.connect(); override on a subclass
    # or instance to point at another database.
    # NOTE(review): credentials are embedded in source -- move them to
    # configuration.
    dsn = 'dbname=authorities user=dwinter password=3333'

    # Punctuation that is turned into whitespace before splitting.
    _separators = re.compile(r"[<>;.:(),|]")

    def process(self, lsttmp):
        """Return the distinct words of *lsttmp* that are known last names.

        *lsttmp* is a sequence of strings.  The strings are joined with
        spaces, punctuation is replaced by spaces, and the text is split on
        whitespace.  Each distinct word is looked up once, in order of
        first appearance; words with a matching row are returned as unicode
        text in that same order.

        No database connection is opened when there are no words to check.
        """
        tokens = self._tokenize(lsttmp)
        if not tokens:
            return []

        # Imported lazily so the module can be loaded without the driver.
        import logging
        import psycopg

        log = logging.getLogger(__name__)
        result = []
        connection = psycopg.connect(self.dsn, serialize=0)
        try:
            cursor = connection.cursor()
            try:
                for token in tokens:
                    # Value is passed as a parameter, never spliced into SQL.
                    cursor.execute(
                        "select lastname from persons where lastname = %s",
                        (token,))
                    # NOTE(review): the word is kept only on a database hit,
                    # matching the "check for database entry" intent --
                    # confirm.
                    if cursor.fetchone():
                        log.debug("found %s", token)
                        result.append(token)
            finally:
                cursor.close()
        finally:
            connection.close()
        return result

    def _tokenize(self, lsttmp):
        """Split *lsttmp* into distinct unicode words, in first-seen order."""
        text_type = type(u"")
        text = self._separators.sub(" ", " ".join(lsttmp))
        seen = set()
        tokens = []
        for token in text.split():
            if not isinstance(token, text_type):
                # Byte string: decode it; undecodable bytes are replaced
                # rather than aborting the whole run.
                token = token.decode(self.default_encoding, "replace")
            if token not in seen:
                seen.add(token)
                tokens.append(token)
        return tokens
66:
67:
# Register nameSplitter with the ZCTextIndex element factory when this
# module is imported.
try:
    element_factory.registerFactory('Word Splitter',
                                    'MPIWG Name Splitter', nameSplitter)
except ValueError:
    # in case the splitter is already registered, ValueError is raised
    pass
74:
if __name__ == '__main__':
    # Manual smoke test; it needs the database that nameSplitter.process
    # queries.
    # NOTE(review): the byte values below come from a literal that appeared
    # as '\U00CE\U00D2...'; presumably these were raw GBK bytes -- confirm.
    a = b'abc def\xce\xd2\xd3\xc7\xb5\xc4 \xdc\xc3\xa1\xa3'
    u = a.decode('gbk')
    s = nameSplitter()
    # process() accepts exactly one argument: the list of strings to split.
    print(s.process([u]))
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>