Mercurial > hg > MPIWGWeb
annotate nameSplitter.py @ 170:485bf377913a
fix staff editing pages.
author | casties |
---|---|
date | Fri, 07 Jun 2013 16:29:34 +0200 |
parents | bca61e893fcc |
children |
rev | line source |
---|---|
0
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
1 """ |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
2 name splitter |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
3 """ |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
4 |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
5 from Products.ZCTextIndex.ISplitter import ISplitter |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
6 from Products.ZCTextIndex.PipelineFactory import element_factory |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
7 |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
8 |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
9 |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
10 import re |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
11 from types import StringType |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
12 |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
13 def getSupportedEncoding(encodings): |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
14 for encoding in encodings: |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
15 try: |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
16 unicode('A', encoding) |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
17 return encoding |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
18 except: |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
19 pass |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
20 return 'utf-8' |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
21 |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
22 # CJK charsets ranges, see this following pages: |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
23 # |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
24 # http://jrgraphix.net/research/unicode_blocks.php?block=87 |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
25 # http://jrgraphix.net/research/unicode_blocks.php?block=85 |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
26 # http://jrgraphix.net/research/unicode_blocks.php?block=95 |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
27 # http://jrgraphix.net/research/unicode_blocks.php?block=76 |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
28 # http://jrgraphix.net/research/unicode_blocks.php?block=90 |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
29 |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
30 """ |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
31 splitter for lastnames in database |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
32 |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
33 """ |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
34 import re |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
35 |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
36 def quote(str): |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
37 str=str.replace("'","\\\'") |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
38 return str |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
39 class nameSplitter: |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
40 |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
41 default_encoding = "utf-8" |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
42 |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
43 def process(self, lsttmp): |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
44 import psycopg |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
45 result = [] |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
46 o = psycopg.connect('dbname=authorities user=dwinter password=3333',serialize=0) |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
47 c = o.cursor() |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
48 # replaceStr="<>;.:()," |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
49 lst=" ".join(lsttmp) |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
50 # for x in replaceStr: |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
51 # lst=lst.replace(x," ") |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
52 lst=re.sub("[<|>|;|.|:|\(|\|)|,]", " ", lst) |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
53 for s in lst.split(): |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
54 |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
55 if type(s) is not StringType: # not unicode |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
56 s = s.decode(self.default_encoding) |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
57 |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
58 if s not in result: # check for database entry |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
59 |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
60 #c.execute("select lastname from persons where lower(lastname) = '%s'"%quote(s.lower())) |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
61 c.execute("select lastname from persons where lastname = '%s'"%quote(s)) |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
62 if c.fetchone(): |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
63 print "found",s |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
64 result.append(s) |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
65 return result |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
66 |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
67 |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
68 try: |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
69 element_factory.registerFactory('Word Splitter', |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
70 'MPIWG Name Splitter', nameSplitter) |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
71 except: |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
72 # in case the splitter is already registered, ValueError is raised |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
73 pass |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
74 |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
75 if __name__ == '__main__': |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
76 a = 'abc def\U00CE\U00D2\U00D3\U00C7\U00B5\U00C4 \U00DC\U00C3\U00A1\U00A3' |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
77 u = unicode(a, 'gbk') |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
78 s = authorSplitter() |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
79 print s.process([u]) |
bca61e893fcc
first checkin of MPIWGWeb r2 branch from CVS into mercurial
casties
parents:
diff
changeset
|
80 print s.process([u], 1) |