MPIWGWeb/nameSplitter.py - view

File: [Repository] / MPIWGWeb / Attic / nameSplitter.py
Revision 1.1.2.1: download - view: text, annotated - select for diffs - revision graph
Sun Aug 27 05:40:45 2006 UTC (17 years, 10 months ago) by dwinter
Branches: r2

index

"""
name splitter

Word splitter for ZCTextIndex that keeps only those words of a text which
occur as a last name in the ``persons`` table of the authorities database.
"""

from Products.ZCTextIndex.ISplitter import ISplitter
from Products.ZCTextIndex.PipelineFactory import element_factory

import psycopg

import logging
import re
from types import StringType

logger = logging.getLogger('MPIWGWeb.nameSplitter')


def getSupportedEncoding(encodings):
    """Return the first entry of *encodings* usable as a codec name.

    Each candidate is probed by decoding the ASCII string 'A' with it.
    'utf-8' is returned when no candidate works (or the list is empty).
    """
    for encoding in encodings:
        try:
            unicode('A', encoding)
        except (LookupError, TypeError, ValueError):
            # Unknown codec name, an entry that is not a string, or a codec
            # that cannot decode plain ASCII: try the next candidate.
            continue
        return encoding
    return 'utf-8'


# CJK charsets ranges, see this following pages:
#
# http://jrgraphix.net/research/unicode_blocks.php?block=87
# http://jrgraphix.net/research/unicode_blocks.php?block=85
# http://jrgraphix.net/research/unicode_blocks.php?block=95
# http://jrgraphix.net/research/unicode_blocks.php?block=76
# http://jrgraphix.net/research/unicode_blocks.php?block=90
#
# NOTE(review): nothing in this module refers to these ranges.


def quote(str):
    """Return *str* with every single quote preceded by a backslash.

    This is not a complete SQL escaping routine (backslashes themselves are
    left untouched), so it must not be used to build SQL from untrusted
    text.  nameSplitter.process uses bound query parameters instead; the
    function is kept so that existing importers keep working.

    The parameter name shadows the builtin ``str``; it is left unchanged so
    that keyword callers are not broken.
    """
    str = str.replace("'", "\\\'")
    return str


class nameSplitter:
    """Splitter for last names in the database.

    ``process`` splits the given text into words and returns those words
    for which a row exists in the ``persons`` table.
    """

    default_encoding = "utf-8"

    # Connection string handed to psycopg.connect().
    # NOTE(review): the credentials are hard-coded in the source; they should
    # come from configuration.  Subclasses or instances may override ``dsn``.
    dsn = 'dbname=authorities user=dwinter password=3333'

    # Every one of these characters is replaced by a blank before the text
    # is split on whitespace.
    separators = "<>;.:()"

    def process(self, lsttmp):
        """Return the words of *lsttmp* that are last names in the database.

        lsttmp -- sequence of strings; the items are joined with blanks,
                  the characters in ``separators`` are turned into blanks
                  and the result is split on whitespace.

        Byte strings are decoded with ``default_encoding`` (undecodable
        bytes are replaced), so the returned words are unicode objects.
        A word is looked up case-insensitively and is returned at most
        once, in order of first appearance.  An empty list is returned,
        without touching the database, when there are no words.
        """
        text = " ".join(lsttmp)
        for char in self.separators:
            text = text.replace(char, " ")

        words = text.split()
        if not words:
            return []

        result = []
        # Words already looked up (found or not), so that a repeated word
        # costs no second query.  A dict is used instead of a set to stay
        # compatible with old Python 2 releases.
        checked = {}

        connection = psycopg.connect(self.dsn, serialize=0)
        try:
            cursor = connection.cursor()
            for s in words:
                if isinstance(s, StringType):  # byte string, not unicode
                    s = s.decode(self.default_encoding, 'replace')
                if s in checked:
                    continue
                checked[s] = 1

                # The column is lower-cased in SQL, so the word has to be
                # lower-cased as well or capitalised names never match.
                # The value is passed as a bound parameter, never pasted
                # into the SQL text.
                # NOTE(review): assumes the database client encoding is
                # default_encoding -- confirm.
                needle = s.lower().encode(self.default_encoding)
                cursor.execute(
                    "select lastname from persons"
                    " where lower(lastname) = %s",
                    (needle,))
                if cursor.fetchone():
                    logger.debug("found %s", s)
                    result.append(s)
        finally:
            # Release the connection even when a query fails.
            connection.close()
        return result


try:
    element_factory.registerFactory('Word Splitter',
          'MPIWG Name Splitter', nameSplitter)
except ValueError:
    # in case the splitter is already registered, ValueError is raised
    pass

if __name__ == '__main__':
    # Manual smoke test; needs access to the authorities database.
    # The sample text is spelled with escapes so that it does not depend on
    # the encoding this file is stored in.
    u = u'abc def\u6211\u4eec\u7684\u5f88\u597d\u3002'
    s = nameSplitter()
    print(s.process([u]))