"""
Name splitter.

Splitter for last names in the database: a ZCTextIndex pipeline element that
keeps only those words of its input for which a row with the same
``lastname`` exists in the ``persons`` table.
"""

import logging
import re
from types import StringType

import psycopg

from Products.ZCTextIndex.ISplitter import ISplitter
from Products.ZCTextIndex.PipelineFactory import element_factory

logger = logging.getLogger(__name__)

# Punctuation treated as a word separator: each occurrence is replaced by a
# blank before the text is split on whitespace.  "|" belongs to the set
# because the pattern used before ("[<|>|;|.|:|\(|\|)|,]") matched it too.
_SEPARATORS = re.compile(r"[<>;.:()|,]")


def getSupportedEncoding(encodings):
    """Return the first entry of *encodings* that can decode the text 'A'.

    Returns 'utf-8' when *encodings* is empty or none of its entries works.
    """
    for encoding in encodings:
        try:
            unicode('A', encoding)
        except (LookupError, TypeError, ValueError):
            # Unknown codec name, an entry that is not a string, or a codec
            # that cannot decode 'A': try the next candidate.
            continue
        return encoding
    return 'utf-8'


# CJK charsets ranges, see the following pages:
#
# http://jrgraphix.net/research/unicode_blocks.php?block=87
# http://jrgraphix.net/research/unicode_blocks.php?block=85
# http://jrgraphix.net/research/unicode_blocks.php?block=95
# http://jrgraphix.net/research/unicode_blocks.php?block=76
# http://jrgraphix.net/research/unicode_blocks.php?block=90


def quote(str):
    """Return *str* with a backslash inserted before every single quote.

    Only single quotes are handled; backslashes are left untouched, so the
    result is not sufficient for building SQL statements from arbitrary
    input.  ``nameSplitter.process`` uses bound query parameters instead.
    """
    # The parameter name is part of the existing signature and is kept as is.
    return str.replace("'", "\\'")


class nameSplitter:
    """Pipeline element that filters words down to known last names."""

    # Encoding applied to unicode words before they are sent to the database.
    default_encoding = "utf-8"

    # Connection string handed to psycopg.connect().
    # NOTE(review): the credentials are hard-coded in the source; consider
    # moving them into configuration.
    dsn = 'dbname=authorities user=dwinter password=3333'

    def process(self, lsttmp):
        """Return the words of *lsttmp* that are last names in the database.

        The strings in *lsttmp* are joined with blanks, the separator
        characters ``< > ; . : ( ) | ,`` are replaced by blanks and the text
        is split on whitespace.  Every distinct word is looked up once in
        ``persons.lastname``; words with a matching row are returned in the
        order of their first occurrence and in the type (byte string or
        unicode) in which they were passed in.

        A database connection is opened for the call and closed before
        returning; no connection is opened when the input holds no words.
        Errors raised by psycopg are propagated to the caller.
        """
        words = _SEPARATORS.sub(" ", " ".join(lsttmp)).split()
        if not words:
            return []

        result = []
        checked = {}  # words already looked up (dict used as a set)
        connection = psycopg.connect(self.dsn, serialize=0)
        try:
            cursor = connection.cursor()
            try:
                for word in words:
                    if word in checked:
                        continue
                    checked[word] = True

                    if isinstance(word, StringType):
                        value = word
                    else:
                        # NOTE(review): assumes the database expects
                        # default_encoding for unicode words -- confirm.
                        value = word.encode(self.default_encoding)

                    # Bound parameter: the driver does the quoting, so a
                    # word containing quotes or backslashes cannot alter
                    # the statement.
                    cursor.execute(
                        "select lastname from persons where lastname = %s",
                        (value,))
                    if cursor.fetchone():
                        logger.debug("found %r", word)
                        result.append(word)
            finally:
                cursor.close()
        finally:
            connection.close()
        return result


try:
    element_factory.registerFactory('Word Splitter',
                                    'MPIWG Name Splitter', nameSplitter)
except ValueError:
    # in case the splitter is already registered, ValueError is raised
    pass


if __name__ == '__main__':
    # Sample text written with unicode escapes so that the module does not
    # depend on a source-encoding declaration.
    u = u'abc def\u6211\u4eec\u7684\u5f88 \u597d\u3002'
    s = nameSplitter()
    print(s.process([u]))