# File:  [Repository] / MPIWGWeb / Attic / nameSplitter.py
# Revision 1.1.2.1: download - view: text, annotated - select for diffs - revision graph
# Sun Aug 27 05:40:45 2006 UTC (17 years, 9 months ago) by dwinter
# Branches: r2
# index

"""
name splitter
"""

from Products.ZCTextIndex.ISplitter import ISplitter
from Products.ZCTextIndex.PipelineFactory import element_factory

import psycopg

import re
from types import StringType

def getSupportedEncoding(encodings):
    """Return the first usable encoding from *encodings*.

    Each candidate is probed by decoding the single ASCII byte 'A' with it.
    A candidate is skipped when the codec is unknown, cannot decode that
    byte, or is not a valid encoding name at all.

    Parameters:
        encodings -- iterable of encoding names, tried in order.

    Returns:
        The first encoding that passes the probe, or 'utf-8' when none does
        (including when *encodings* is empty).
    """
    for encoding in encodings:
        try:
            # 'A'.encode('ascii') yields a byte string on every Python
            # version, so the decode below is a real codec round trip.
            'A'.encode('ascii').decode(encoding)
        except (LookupError, ValueError, TypeError):
            # LookupError: unknown codec; ValueError (incl. UnicodeError):
            # codec cannot decode the probe; TypeError: bad encoding name.
            # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
            continue
        return encoding
    return 'utf-8'

# CJK charset ranges; see the following pages:
#
# http://jrgraphix.net/research/unicode_blocks.php?block=87
# http://jrgraphix.net/research/unicode_blocks.php?block=85
# http://jrgraphix.net/research/unicode_blocks.php?block=95
# http://jrgraphix.net/research/unicode_blocks.php?block=76
# http://jrgraphix.net/research/unicode_blocks.php?block=90

"""
splitter for lastnames in database

"""

def quote(str):
    """Backslash-escape *str* for embedding in a single-quoted SQL literal.

    Backslashes are doubled first and single quotes are then prefixed with
    a backslash. Escaping the backslashes first matters: otherwise an input
    such as \\' would turn into \\\\' and leave the quote unescaped.

    Parameters:
        str -- the text to escape (the name shadows the builtin; it is kept
               so that existing keyword callers continue to work).

    Returns:
        The escaped text.

    NOTE(review): this only helps when the server treats backslash as an
    escape character inside string literals; bound query parameters are the
    safer choice wherever the database driver supports them.
    """
    escaped = str.replace("\\", "\\\\")
    return escaped.replace("'", "\\'")
class nameSplitter:
    """Word splitter that keeps only words found as last names in a database.

    The input strings are joined, a fixed set of punctuation characters is
    replaced by blanks, and every resulting word is looked up in the
    ``persons`` table; only words with a matching ``lastname`` are returned.
    """

    # Encoding used to decode byte-string words before the lookup.
    default_encoding = "utf-8"

    # Connection string handed to psycopg.connect().
    # NOTE(review): credentials are hard-coded in source; they should come
    # from configuration instead.
    dsn = 'dbname=authorities user=dwinter password=3333'

    def process(self, lsttmp):
        """Return the words from *lsttmp* that exist as last names.

        Parameters:
            lsttmp -- sequence of strings to split.

        Returns:
            A list of the distinct words (in order of first appearance) for
            which the ``persons`` table has a row whose lower-cased
            ``lastname`` equals the lower-cased word. Empty when the input
            contains no words; in that case the database is not contacted.
        """
        text = " ".join(lsttmp)
        for separator in "<>;.:()":
            text = text.replace(separator, " ")

        words = text.split()
        if not words:
            # Nothing to look up: skip the database round trip entirely.
            return []

        result = []
        seen = set()
        o = psycopg.connect(self.dsn, serialize=0)
        try:
            c = o.cursor()
            try:
                for s in words:
                    if isinstance(s, StringType):  # byte string, not unicode
                        s = s.decode(self.default_encoding)
                    if s in seen:
                        # Already looked up; never report a word twice.
                        continue
                    seen.add(s)
                    # Bound parameter instead of string interpolation, and
                    # both sides lower-cased so capitalised names match.
                    # NOTE(review): confirm the installed psycopg accepts
                    # unicode objects as query parameters.
                    c.execute(
                        "select lastname from persons"
                        " where lower(lastname) = lower(%s)", (s,))
                    if c.fetchone():
                        result.append(s)
            finally:
                c.close()
        finally:
            # Always release the connection, even when a query fails.
            o.close()
        return result

 
# Register the splitter with ZCTextIndex under the 'Word Splitter' group.
try:
    element_factory.registerFactory('Word Splitter',
          'MPIWG Name Splitter', nameSplitter)
except ValueError:
    # in case the splitter is already registered, ValueError is raised;
    # any other failure is a real error and is allowed to propagate
    pass

if __name__ == '__main__':
   a = 'abc def我们的很 好。'
   u = unicode(a, 'gbk')
   s = authorSplitter()
   print s.process([u])
   print s.process([u], 1)

# FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>