Diff for /MPIWGWeb/Attic/nameSplitter.py between versions 1.1 and 1.1.2.1

version 1.1, 2006/08/27 05:40:45 version 1.1.2.1, 2006/08/27 05:40:45
Line 0 Line 1
   """
   name splitter
   """
   
   from Products.ZCTextIndex.ISplitter import ISplitter
   from Products.ZCTextIndex.PipelineFactory import element_factory
   
   import psycopg
   
   import re
   from types import StringType
   
   def getSupportedEncoding(encodings):
       for encoding in encodings:
           try:
               unicode('A', encoding)
               return encoding
           except:
               pass
       return 'utf-8'
   
   # CJK character-set ranges; see the following pages:
   #
   # http://jrgraphix.net/research/unicode_blocks.php?block=87
   # http://jrgraphix.net/research/unicode_blocks.php?block=85
   # http://jrgraphix.net/research/unicode_blocks.php?block=95
   # http://jrgraphix.net/research/unicode_blocks.php?block=76
   # http://jrgraphix.net/research/unicode_blocks.php?block=90
   
   """
   splitter for lastnames in database
   
   """
   
   def quote(str):
       str=str.replace("'","\\\'")
       return str
   class nameSplitter:
   
       default_encoding = "utf-8"
   
       def process(self, lsttmp):
           result = []
           o = psycopg.connect('dbname=authorities user=dwinter password=3333',serialize=0) 
           c = o.cursor() 
           replaceStr="<>;.:()"
           lst=" ".join(lsttmp)
           for x in replaceStr:
               lst=lst.replace(x," ")
           
           for s in lst.split():
               if type(s) is not StringType: # not unicode
                   s = s.decode(self.default_encoding)
               if s not in result: # check for database entry
                   c.execute("select lastname from persons where lower(lastname) = '%s'"%quote(s))
                   if c.fetchone():
                       print "found",s
                       result.append(lastname)
           return result
   
    
   try:
       element_factory.registerFactory('Word Splitter',
             'MPIWG Name Splitter', nameSplitter)
   except:
       # in case the splitter is already registered, ValueError is raised
       pass
   
   if __name__ == '__main__':
      a = 'abc def我们的很 好。'
      u = unicode(a, 'gbk')
      s = authorSplitter()
      print s.process([u])
      print s.process([u], 1)

Removed from v.1.1  
changed lines
  Added in v.1.1.2.1


FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>