# Reconstructed from the CVS diff of MPIWGWeb/Attic/nameSplitter.py
# (revision 1.1 -> 1.1.2.2, 2006/09/13 08:17:33).
"""
name splitter

ZCTextIndex word splitter that keeps only those tokens which occur as a
``lastname`` in the ``persons`` table of the authorities database.
"""

import logging
import re
from types import StringType

from Products.ZCTextIndex.ISplitter import ISplitter
from Products.ZCTextIndex.PipelineFactory import element_factory

import psycopg

logger = logging.getLogger(__name__)


def getSupportedEncoding(encodings):
    """Return the first entry of *encodings* that can decode the string 'A'.

    Each candidate is probed with ``unicode('A', encoding)``; candidates
    that fail are skipped. Returns 'utf-8' when no candidate works.
    """
    for encoding in encodings:
        try:
            unicode('A', encoding)
        except (LookupError, ValueError, TypeError):
            # Unknown codec, codec unable to decode 'A', or a non-string
            # entry: best effort, move on to the next candidate.
            continue
        return encoding
    return 'utf-8'


# CJK charsets ranges, see the following pages:
#
# http://jrgraphix.net/research/unicode_blocks.php?block=87
# http://jrgraphix.net/research/unicode_blocks.php?block=85
# http://jrgraphix.net/research/unicode_blocks.php?block=95
# http://jrgraphix.net/research/unicode_blocks.php?block=76
# http://jrgraphix.net/research/unicode_blocks.php?block=90


def quote(str):
    """Return *str* with every single quote prefixed by a backslash.

    NOTE: backslashes already present in the input are not escaped, so the
    result must not be used to build SQL statements; ``nameSplitter``
    passes values as bound parameters instead. The function (and its
    parameter name, which shadows the builtin) is kept unchanged for
    callers that may still use it.
    """
    return str.replace("'", "\\'")


# Characters replaced by a space before the text is split on whitespace.
# The original pattern "[<|>|;|.|:|\(|\|)|,]" matched '|' as well as the
# punctuation listed; the same character set is kept here.
_SEPARATORS = re.compile(r"[<>;.:()|,]")


class nameSplitter:
    """Splitter for lastnames in the database.

    ``process`` tokenizes the given strings and returns only the tokens
    for which a row with an identical ``lastname`` exists in ``persons``.
    """

    default_encoding = "utf-8"

    # Connection string handed to psycopg.connect(); override on a subclass
    # or instance to use a different database.
    # NOTE(review): credentials are hard-coded in source -- they should be
    # moved to configuration.
    connection_string = 'dbname=authorities user=dwinter password=3333'

    def process(self, lsttmp):
        """Return the distinct tokens of *lsttmp* that are known lastnames.

        lsttmp -- sequence of strings; they are joined with spaces, the
                  separator characters are blanked out and the text is
                  split on whitespace.

        Returns a list of tokens in order of first occurrence, each in the
        same string type in which it was received. No database connection
        is opened when there are no tokens.
        """
        tokens = _SEPARATORS.sub(" ", " ".join(lsttmp)).split()
        if not tokens:
            return []

        result = []
        # Every token is looked up at most once, whether or not it matched.
        checked = set()
        connection = psycopg.connect(self.connection_string, serialize=0)
        try:
            cursor = connection.cursor()
            try:
                for token in tokens:
                    if token in checked:
                        continue
                    checked.add(token)
                    if self._isLastname(cursor, token):
                        logger.debug("found %r", token)
                        result.append(token)
            finally:
                cursor.close()
        finally:
            # Always release the connection, also when a query fails.
            connection.close()
        return result

    def _isLastname(self, cursor, token):
        """Return True if a ``persons`` row has ``lastname`` equal to *token*."""
        if not isinstance(token, StringType):
            # Unicode token: send it as bytes in default_encoding, like the
            # byte strings that are passed through untouched.
            # NOTE(review): assumes the database expects this encoding --
            # confirm against the server/client encoding.
            token = token.encode(self.default_encoding)
        # Bound parameter instead of string interpolation: no quoting
        # problems and no SQL injection through indexed text.
        cursor.execute(
            "select lastname from persons where lastname = %s", (token,))
        return cursor.fetchone() is not None


try:
    element_factory.registerFactory('Word Splitter',
                                    'MPIWG Name Splitter', nameSplitter)
except ValueError:
    # in case the splitter is already registered, ValueError is raised
    pass

if __name__ == '__main__':
    # Manual smoke test; needs a reachable authorities database.
    logging.basicConfig(level=logging.DEBUG)
    # Same sample text as before, written with escapes so the source file
    # stays pure ASCII and needs no coding declaration.
    u = u'abc def\u6211\u4eec\u7684\u5f88 \u597d\u3002'
    s = nameSplitter()
    print(s.process([u]))