# Mercurial > hg > MPIWGWeb

"""
name splitter
"""

from Products.ZCTextIndex.ISplitter import ISplitter
from Products.ZCTextIndex.PipelineFactory import element_factory


import re
from types import StringType

def getSupportedEncoding(encodings):
    """Return the first usable text encoding from *encodings*.

    Each candidate is probed by decoding the single byte ``A`` with it.
    A candidate is accepted when the decode succeeds and yields text.
    If no candidate qualifies (or *encodings* is empty), ``'utf-8'`` is
    returned as the fallback.
    """
    for encoding in encodings:
        try:
            # bytes.decode works on Python 2.6+ and 3 alike; the former
            # unicode('A', encoding) probe only existed on Python 2.
            decoded = b'A'.decode(encoding)
        except Exception:
            # Best-effort probe: an unknown codec (LookupError), a bad
            # argument type (TypeError), undecodable data (UnicodeError)
            # or a codec-specific error all just mean "not supported".
            # Unlike a bare except, this lets KeyboardInterrupt and
            # SystemExit propagate.
            continue
        # On Python 2 a bytes-to-bytes codec may return a byte string;
        # that is not a text encoding, so skip it.
        if not isinstance(decoded, bytes):
            return encoding
    return 'utf-8'

# CJK charset ranges; see the following pages:
#
# http://jrgraphix.net/research/unicode_blocks.php?block=87
# http://jrgraphix.net/research/unicode_blocks.php?block=85
# http://jrgraphix.net/research/unicode_blocks.php?block=95
# http://jrgraphix.net/research/unicode_blocks.php?block=76
# http://jrgraphix.net/research/unicode_blocks.php?block=90

"""
splitter for lastnames in database

"""
import re

def quote(str):
    """Return *str* with each single quote preceded by a backslash.

    Only the single-quote character is touched; every other character,
    including existing backslashes, is passed through unchanged.
    """
    return str.replace("'", "\\\'")
class nameSplitter:
    """Word splitter that keeps only tokens found as last names in the DB.

    ``process`` joins the incoming text chunks, replaces a fixed set of
    punctuation characters with spaces, splits on whitespace and returns
    those tokens for which the ``persons`` table has a row with an equal
    ``lastname``.
    """

    # Encoding used to decode byte-string input into text.
    default_encoding = "utf-8"

    # Connection string handed to psycopg.connect(); override on a
    # subclass or instance to point at a different database.
    # NOTE(review): credentials are hard-coded in source -- consider
    # moving them to configuration.
    dsn = 'dbname=authorities user=dwinter password=3333'

    # Characters replaced by a space before splitting.  Same set as the
    # previous pattern "[<|>|;|.|:|\(|\|)|,]", which (being a character
    # class) also matched the literal "|".
    _separators = re.compile(r"[<>;.:(),|]")

    def _tokenize(self, lsttmp):
        """Return the distinct tokens of *lsttmp* in first-seen order."""
        chunks = []
        for chunk in lsttmp:
            # Decode byte strings up front so that joining never mixes
            # bytes and text.  The previous check was inverted: it tried
            # to decode text objects and left byte strings alone.
            # 'replace' keeps undecodable input from aborting the run.
            if isinstance(chunk, bytes):
                chunk = chunk.decode(self.default_encoding, 'replace')
            chunks.append(chunk)
        text = self._separators.sub(" ", " ".join(chunks))

        tokens = []
        seen = set()
        for token in text.split():
            if token not in seen:
                seen.add(token)
                tokens.append(token)
        return tokens

    def process(self, lsttmp):
        """Return the tokens of *lsttmp* that are known last names.

        *lsttmp* is a sequence of strings (byte strings are decoded with
        ``default_encoding``).  The result is a list of distinct tokens,
        in order of first appearance, that match ``persons.lastname``
        exactly.  Input without any token returns ``[]`` without opening
        a database connection.
        """
        candidates = self._tokenize(lsttmp)
        if not candidates:
            return []

        # Imported lazily, as before, so the module can be loaded without
        # the database driver being importable.
        import logging
        import psycopg

        log = logging.getLogger(__name__)
        result = []
        # NOTE(review): confirm the installed psycopg adapts unicode
        # parameters as expected for this database.
        connection = psycopg.connect(self.dsn, serialize=0)
        try:
            cursor = connection.cursor()
            for s in candidates:
                # Let the driver quote the value instead of building the
                # SQL by string interpolation.
                cursor.execute(
                    "select lastname from persons where lastname = %s",
                    (s,))
                if cursor.fetchone():
                    log.debug("found %s", s)
                    result.append(s)
        finally:
            # Previously the connection was never closed.
            connection.close()
        return result


# Register the splitter with ZCTextIndex under the 'Word Splitter' group.
try:
    element_factory.registerFactory('Word Splitter',
          'MPIWG Name Splitter', nameSplitter)
except ValueError:
    # in case the splitter is already registered, ValueError is raised;
    # any other error is a real problem and is allowed to propagate
    pass

if __name__ == '__main__':
    # Manual smoke test: runs one string through the splitter and prints
    # the result.  Requires the database used by nameSplitter.process.
    # NOTE(review): the \U.. sequences look like mangled \x.. byte escapes
    # of a GBK sample string -- confirm against the original source.
    a = 'abc def\U00CE\U00D2\U00D3\U00C7\U00B5\U00C4 \U00DC\U00C3\U00A1\U00A3'
    u = unicode(a, 'gbk')
    # The class defined in this module is nameSplitter; authorSplitter
    # does not exist here.
    s = nameSplitter()
    # process() takes a single argument, so the former second call with
    # an extra positional argument has been dropped.
    print(s.process([u]))
# author: casties
# date: Thu, 25 Jun 2015 17:44:57 +0200
# parents: bca61e893fcc
# children: