annotate nameSplitter.py @ 222:95e0087b9e19

removed some comparisons that ignore person-id case.
author casties
date Mon, 28 Oct 2013 18:42:24 +0100
parents bca61e893fcc
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
1 """
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
2 name splitter
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
3 """
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
4
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
5 from Products.ZCTextIndex.ISplitter import ISplitter
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
6 from Products.ZCTextIndex.PipelineFactory import element_factory
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
7
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
8
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
9
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
10 import re
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
11 from types import StringType
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
12
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
13 def getSupportedEncoding(encodings):
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
14 for encoding in encodings:
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
15 try:
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
16 unicode('A', encoding)
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
17 return encoding
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
18 except:
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
19 pass
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
20 return 'utf-8'
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
21
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
22 # CJK charsets ranges, see this following pages:
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
23 #
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
24 # http://jrgraphix.net/research/unicode_blocks.php?block=87
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
25 # http://jrgraphix.net/research/unicode_blocks.php?block=85
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
26 # http://jrgraphix.net/research/unicode_blocks.php?block=95
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
27 # http://jrgraphix.net/research/unicode_blocks.php?block=76
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
28 # http://jrgraphix.net/research/unicode_blocks.php?block=90
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
29
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
30 """
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
31 splitter for lastnames in database
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
32
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
33 """
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
34 import re
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
35
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
36 def quote(str):
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
37 str=str.replace("'","\\\'")
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
38 return str
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
39 class nameSplitter:
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
40
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
41 default_encoding = "utf-8"
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
42
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
43 def process(self, lsttmp):
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
44 import psycopg
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
45 result = []
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
46 o = psycopg.connect('dbname=authorities user=dwinter password=3333',serialize=0)
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
47 c = o.cursor()
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
48 # replaceStr="<>;.:(),"
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
49 lst=" ".join(lsttmp)
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
50 # for x in replaceStr:
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
51 # lst=lst.replace(x," ")
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
52 lst=re.sub("[<|>|;|.|:|\(|\|)|,]", " ", lst)
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
53 for s in lst.split():
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
54
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
55 if type(s) is not StringType: # not unicode
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
56 s = s.decode(self.default_encoding)
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
57
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
58 if s not in result: # check for database entry
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
59
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
60 #c.execute("select lastname from persons where lower(lastname) = '%s'"%quote(s.lower()))
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
61 c.execute("select lastname from persons where lastname = '%s'"%quote(s))
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
62 if c.fetchone():
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
63 print "found",s
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
64 result.append(s)
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
65 return result
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
66
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
67
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
68 try:
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
69 element_factory.registerFactory('Word Splitter',
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
70 'MPIWG Name Splitter', nameSplitter)
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
71 except:
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
72 # in case the splitter is already registered, ValueError is raised
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
73 pass
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
74
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
75 if __name__ == '__main__':
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
76 a = 'abc def\U00CE\U00D2\U00D3\U00C7\U00B5\U00C4 \U00DC\U00C3\U00A1\U00A3'
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
77 u = unicode(a, 'gbk')
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
78 s = authorSplitter()
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
79 print s.process([u])
bca61e893fcc first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff changeset
80 print s.process([u], 1)