"""
name splitter
"""
from Products.ZCTextIndex.ISplitter import ISplitter
from Products.ZCTextIndex.PipelineFactory import element_factory
import psycopg
import re
from types import StringType
def getSupportedEncoding(encodings):
    """Return the first codec name in *encodings* that can decode ASCII.

    Each candidate is probed by decoding the single byte ``A`` with it.
    Candidates that are unknown to the interpreter, that fail to decode the
    probe, or that are not valid codec names at all are skipped.

    :param encodings: iterable of codec names, tried in order.
    :returns: the first usable codec name, or ``'utf-8'`` when none of the
        candidates works (including when *encodings* is empty).
    """
    for encoding in encodings:
        try:
            # b'A'.decode() is the same probe on Python 2 and Python 3.
            decoded = b'A'.decode(encoding)
        except (LookupError, ValueError, TypeError):
            # LookupError: unknown codec; ValueError (incl. UnicodeError):
            # the probe byte cannot be decoded; TypeError: not a codec name.
            continue
        if isinstance(decoded, bytes):
            # Python 2 byte-to-byte codecs return a byte string, not text.
            continue
        return encoding
    return 'utf-8'
# CJK character-set ranges; see the following pages:
#
# http://jrgraphix.net/research/unicode_blocks.php?block=87
# http://jrgraphix.net/research/unicode_blocks.php?block=85
# http://jrgraphix.net/research/unicode_blocks.php?block=95
# http://jrgraphix.net/research/unicode_blocks.php?block=76
# http://jrgraphix.net/research/unicode_blocks.php?block=90
"""
splitter for lastnames in database
"""
def quote(str):
    """Return *str* with a backslash inserted before every single quote.

    NOTE(review): backslashes already present in the input are passed
    through unchanged, so this is not a complete SQL escaping routine.
    """
    # Splitting on the quote and re-joining with the escaped form touches
    # exactly the same characters as a plain replace would.
    pieces = str.split("'")
    return "\\'".join(pieces)
class nameSplitter:
    """Word splitter that keeps only tokens found as last names in a database.

    The input strings are joined, a fixed set of punctuation characters is
    replaced by spaces, and the text is split on whitespace.  Every distinct
    token is then looked up in the ``persons`` table; tokens for which a row
    exists are returned.
    """

    # Encoding used to decode byte-string input.
    default_encoding = "utf-8"

    # Connection string handed to psycopg.connect().
    # NOTE(review): credentials are hard-coded in source; consider moving
    # them to configuration.
    _dsn = 'dbname=authorities user=dwinter password=3333'

    # Characters that are turned into spaces before splitting.
    _separators = "<>;.:()"

    def _tokens(self, lsttmp):
        """Return the whitespace-separated tokens of all strings in *lsttmp*."""
        texts = []
        for text in lsttmp:
            if isinstance(text, bytes):
                # Only byte strings need decoding; text is used unchanged.
                text = text.decode(self.default_encoding)
            texts.append(text)
        joined = " ".join(texts)
        for char in self._separators:
            joined = joined.replace(char, " ")
        return joined.split()

    def process(self, lsttmp):
        """Return the tokens of *lsttmp* that exist as a last name.

        :param lsttmp: sequence of strings (text or byte strings encoded in
            ``default_encoding``).
        :returns: list of matching tokens, each at most once, in order of
            first appearance.  No database connection is opened when the
            input contains no tokens.
        :raises UnicodeDecodeError: if a byte string cannot be decoded with
            ``default_encoding``.
        """
        tokens = self._tokens(lsttmp)
        if not tokens:
            return []

        result = []
        seen = set()  # every token is queried only once
        connection = psycopg.connect(self._dsn, serialize=0)
        try:
            cursor = connection.cursor()
            for s in tokens:
                if s in seen:
                    continue
                seen.add(s)
                # Bound parameter instead of string interpolation, so that
                # quotes in the token cannot alter the statement.
                # NOTE(review): the column is lower-cased but the token is
                # not, so only all-lowercase tokens can match -- confirm
                # whether the token should be lower-cased as well.
                cursor.execute(
                    "select lastname from persons where lower(lastname) = %s",
                    (s,))
                if cursor.fetchone():
                    print("found %s" % s)
                    result.append(s)
        finally:
            # Release the connection even when a query fails.
            connection.close()
        return result
# Make the splitter selectable as a "Word Splitter" pipeline element.
try:
    element_factory.registerFactory('Word Splitter',
                                    'MPIWG Name Splitter', nameSplitter)
except ValueError:
    # in case the splitter is already registered, ValueError is raised;
    # any other error is allowed to propagate instead of being hidden
    pass
if __name__ == '__main__':
    # Manual smoke test; process() queries the database for every token.
    # The sample text is written with unicode escapes so the module needs no
    # source-encoding declaration and no decoding step.
    u = u'abc def\u6211\u4eec\u7684\u5f88 \u597d\u3002'
    s = nameSplitter()
    # process() accepts exactly one argument: the list of strings to split.
    print(s.process([u]))
# FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>