version 1.1, 2006/08/27 05:40:45
|
version 1.1.2.1, 2006/08/27 05:40:45
|
Line 0
|
Line 1
|
|
""" |
|
Name splitter: a ZCTextIndex word splitter that keeps only words matching a last name in the persons table.
|
""" |
|
|
|
from Products.ZCTextIndex.ISplitter import ISplitter |
|
from Products.ZCTextIndex.PipelineFactory import element_factory |
|
|
|
import psycopg |
|
|
|
import re |
|
from types import StringType |
|
|
|
def getSupportedEncoding(encodings):
    """Return the first usable encoding from *encodings*.

    Each candidate is probed by decoding a single ASCII byte with it; the
    first candidate for which that succeeds is returned.

    encodings -- iterable of codec names to try, in order of preference.

    Returns the first codec name that decodes the probe, or 'utf-8' when
    none does (including when *encodings* is empty).
    """
    # One ASCII byte, built so that the expression yields a byte string on
    # both Python 2 and Python 3.
    probe = 'A'.encode('ascii')
    for encoding in encodings:
        try:
            probe.decode(encoding)
        except (LookupError, ValueError, TypeError):
            # Unknown codec name, a codec that cannot decode the probe, or
            # a value that is not a codec name at all: try the next one.
            # Deliberately not a bare except, so KeyboardInterrupt and
            # SystemExit still propagate.
            continue
        return encoding
    return 'utf-8'
|
|
|
# CJK charset ranges; see the following pages:
|
# |
|
# http://jrgraphix.net/research/unicode_blocks.php?block=87 |
|
# http://jrgraphix.net/research/unicode_blocks.php?block=85 |
|
# http://jrgraphix.net/research/unicode_blocks.php?block=95 |
|
# http://jrgraphix.net/research/unicode_blocks.php?block=76 |
|
# http://jrgraphix.net/research/unicode_blocks.php?block=90 |
|
|
|
""" |
|
Splitter for last names stored in the database.
|
|
|
""" |
|
|
|
def quote(str):
    """Backslash-escape a value for use inside a single-quoted SQL literal.

    str -- the text to escape (the parameter name shadows the builtin but
           is kept so existing keyword callers continue to work).

    Returns the text with every backslash doubled and every single quote
    preceded by a backslash.

    NOTE(review): this assumes the server interprets backslash escapes in
    string literals -- confirm against the database configuration. Passing
    values as query parameters is preferable to building SQL by hand.
    """
    # Backslashes must be escaped first; otherwise an input such as \' would
    # turn into \\' and the quote would terminate the literal.
    return str.replace("\\", "\\\\").replace("'", "\\'")
|
class nameSplitter:
    """Splitter that keeps only words found as last names in the database.

    The input strings are joined, a fixed set of punctuation characters is
    turned into spaces, and every resulting word is looked up in the
    ``persons`` table; words with a matching row are returned.
    """

    # Encoding used to decode byte-string words before the lookup.
    default_encoding = "utf-8"

    # Connection string handed to psycopg.connect().
    # NOTE(review): credentials are hard-coded in source -- consider moving
    # them to configuration.
    dsn = 'dbname=authorities user=dwinter password=3333'

    # Characters treated as word separators in addition to whitespace.
    separators = "<>;.:()"

    def process(self, lsttmp):
        """Return the words of *lsttmp* that match a last name in the database.

        lsttmp -- sequence of strings; they are joined with spaces before
                  being split into words.

        Returns a list of the matching words, each at most once, in order
        of first appearance. Byte-string words are decoded with
        ``default_encoding`` before the lookup.

        Opens one database connection per call and closes it before
        returning, even when a query fails.
        """
        text = " ".join(lsttmp)
        for ch in self.separators:
            text = text.replace(ch, " ")

        result = []
        checked = set()  # every word already looked up, matched or not
        conn = psycopg.connect(self.dsn, serialize=0)
        try:
            cur = conn.cursor()
            for s in text.split():
                if isinstance(s, StringType):  # byte string, not unicode
                    s = s.decode(self.default_encoding)
                if s in checked:
                    continue
                checked.add(s)
                # The value is passed as a query parameter so the driver
                # does the quoting; no SQL is assembled from input text.
                # NOTE(review): the word is compared against
                # lower(lastname) without being lower-cased here, so
                # mixed-case words only match if they are lower-cased
                # upstream -- confirm.
                cur.execute(
                    "select lastname from persons where lower(lastname) = %s",
                    (s,))
                if cur.fetchone():
                    # Single-argument form prints identically under the
                    # Python 2 print statement and the Python 3 function.
                    print("found %s" % s)
                    result.append(s)
        finally:
            conn.close()
        return result
|
|
|
|
|
# Register the splitter with the ZCTextIndex pipeline factory under the
# 'Word Splitter' group.
try:
    element_factory.registerFactory('Word Splitter',
                                    'MPIWG Name Splitter', nameSplitter)
except ValueError:
    # In case the splitter is already registered, ValueError is raised;
    # that is harmless. Any other error is allowed to propagate.
    pass
|
|
|
if __name__ == '__main__':
    # Manual smoke test: runs the splitter over a sample mixing ASCII words
    # with CJK characters. It performs real database lookups.
    # The sample is spelled with unicode escapes so it does not depend on
    # the encoding this source file happens to be saved in.
    u = u'abc def\u6211\u4eec\u7684\u5f88 \u597d\u3002'
    s = nameSplitter()
    # process() takes exactly one argument, the list of input strings.
    print(s.process([u]))