Mercurial > hg > MPIWGWeb
annotate nameSplitter.py @ 215:c0dcb747cc41
handle problem of MPIWG: keys, MPIWG: in the keys is now removed.
| author | dwinter |
|---|---|
| date | Thu, 29 Aug 2013 09:10:01 +0200 |
| parents | bca61e893fcc |
| children |
| rev | line source |
|---|---|
|
0
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
1 """ |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
2 name splitter |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
3 """ |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
4 |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
5 from Products.ZCTextIndex.ISplitter import ISplitter |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
6 from Products.ZCTextIndex.PipelineFactory import element_factory |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
7 |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
8 |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
9 |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
10 import re |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
11 from types import StringType |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
12 |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
13 def getSupportedEncoding(encodings): |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
14 for encoding in encodings: |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
15 try: |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
16 unicode('A', encoding) |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
17 return encoding |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
18 except: |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
19 pass |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
20 return 'utf-8' |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
21 |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
22 # CJK charsets ranges, see this following pages: |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
23 # |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
24 # http://jrgraphix.net/research/unicode_blocks.php?block=87 |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
25 # http://jrgraphix.net/research/unicode_blocks.php?block=85 |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
26 # http://jrgraphix.net/research/unicode_blocks.php?block=95 |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
27 # http://jrgraphix.net/research/unicode_blocks.php?block=76 |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
28 # http://jrgraphix.net/research/unicode_blocks.php?block=90 |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
29 |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
30 """ |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
31 splitter for lastnames in database |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
32 |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
33 """ |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
34 import re |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
35 |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
36 def quote(str): |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
37 str=str.replace("'","\\\'") |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
38 return str |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
39 class nameSplitter: |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
40 |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
41 default_encoding = "utf-8" |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
42 |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
43 def process(self, lsttmp): |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
44 import psycopg |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
45 result = [] |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
46 o = psycopg.connect('dbname=authorities user=dwinter password=3333',serialize=0) |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
47 c = o.cursor() |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
48 # replaceStr="<>;.:()," |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
49 lst=" ".join(lsttmp) |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
50 # for x in replaceStr: |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
51 # lst=lst.replace(x," ") |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
52 lst=re.sub("[<|>|;|.|:|\(|\|)|,]", " ", lst) |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
53 for s in lst.split(): |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
54 |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
55 if type(s) is not StringType: # not unicode |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
56 s = s.decode(self.default_encoding) |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
57 |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
58 if s not in result: # check for database entry |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
59 |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
60 #c.execute("select lastname from persons where lower(lastname) = '%s'"%quote(s.lower())) |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
61 c.execute("select lastname from persons where lastname = '%s'"%quote(s)) |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
62 if c.fetchone(): |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
63 print "found",s |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
64 result.append(s) |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
65 return result |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
66 |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
67 |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
68 try: |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
69 element_factory.registerFactory('Word Splitter', |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
70 'MPIWG Name Splitter', nameSplitter) |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
71 except: |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
72 # in case the splitter is already registered, ValueError is raised |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
73 pass |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
74 |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
75 if __name__ == '__main__': |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
76 a = 'abc def\U00CE\U00D2\U00D3\U00C7\U00B5\U00C4 \U00DC\U00C3\U00A1\U00A3' |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
77 u = unicode(a, 'gbk') |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
78 s = authorSplitter() |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
79 print s.process([u]) |
|
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
80 print s.process([u], 1) |
