MPIWGWeb/nameSplitter.py - view

File: [Repository] / MPIWGWeb / Attic / nameSplitter.py
Revision 1.1.2.4: download - view: text, annotated - select for diffs - revision graph
Mon Feb 15 19:04:24 2010 UTC (14 years, 4 months ago) by casties
Branches: r2

fixing small errors for zope 2.12

"""
name splitter

Splitter for last names: of all the words handed to it, it keeps only
those that occur as ``lastname`` in the ``persons`` table of the
authorities database. The splitter is registered with ZCTextIndex under
the group 'Word Splitter' as 'MPIWG Name Splitter'.
"""

import logging
import re
from types import StringType

from Products.ZCTextIndex.ISplitter import ISplitter
from Products.ZCTextIndex.PipelineFactory import element_factory


logger = logging.getLogger('MPIWGWeb.nameSplitter')

# Characters that are replaced by a blank before the text is split into
# words.  NOTE(review): inside a character class "|" is a literal, so the
# pipe character is blanked out as well; the commented-out predecessor of
# this pattern only listed "<>;.:(),".  Pattern kept unchanged on purpose.
_SEPARATORS = re.compile(r"[<|>|;|.|:|\(|\|)|,]")


def getSupportedEncoding(encodings):
    """Return the first encoding in *encodings* that can decode 'A'.

    Candidates that are unknown to the interpreter (or otherwise unusable)
    are skipped.  Falls back to 'utf-8' when no candidate works.
    """
    for encoding in encodings:
        try:
            unicode('A', encoding)
        except (LookupError, TypeError, ValueError):
            # unknown codec name, non-string entry or undecodable -> next
            continue
        return encoding
    return 'utf-8'


# CJK charsets ranges, see this following pages:
#
# http://jrgraphix.net/research/unicode_blocks.php?block=87
# http://jrgraphix.net/research/unicode_blocks.php?block=85
# http://jrgraphix.net/research/unicode_blocks.php?block=95
# http://jrgraphix.net/research/unicode_blocks.php?block=76
# http://jrgraphix.net/research/unicode_blocks.php?block=90


def quote(str):
    """Return *str* with every single quote prefixed by a backslash.

    Kept for callers outside this module; `nameSplitter.process` no longer
    uses it because it passes values as bound query parameters instead.
    The parameter name is unchanged for backward compatibility.
    """
    str = str.replace("'", "\\\'")
    return str


class nameSplitter:
    """Splitter that keeps only words known as last names in the database."""

    default_encoding = "utf-8"

    # Connection string handed to psycopg.connect().
    # NOTE(review): credentials are hard-coded in source; consider moving
    # them to configuration.  Subclasses/instances may override `dsn`.
    dsn = 'dbname=authorities user=dwinter password=3333'

    def process(self, lsttmp):
        """Return the words of *lsttmp* that are last names in the database.

        lsttmp -- sequence of strings; they are joined with blanks, the
                  separator characters are blanked out and the result is
                  split on whitespace.

        Returns a list of the distinct words, in order of first occurrence,
        for which ``persons.lastname`` has an exactly matching row.
        Raises whatever psycopg raises when connecting or querying fails.
        """
        words = _SEPARATORS.sub(" ", " ".join(lsttmp)).split()
        if not words:
            # nothing to look up, so do not open a database connection
            return []

        # imported lazily so the module can be loaded without psycopg
        import psycopg

        result = []
        checked = set()  # every word is looked up at most once per call
        connection = psycopg.connect(self.dsn, serialize=0)
        try:
            cursor = connection.cursor()
            try:
                for s in words:
                    # NOTE(review): this branch runs for values that are
                    # NOT plain byte strings (i.e. unicode) and calls
                    # .decode() on them, which fails for non-ASCII text in
                    # Python 2.  The condition looks inverted (or .encode
                    # was meant) -- confirm the intent before changing it.
                    if type(s) is not StringType:
                        s = s.decode(self.default_encoding)

                    if s in checked:
                        continue
                    checked.add(s)

                    # Bound parameter instead of string interpolation, so
                    # the driver does the quoting (no SQL injection).
                    cursor.execute(
                        "select lastname from persons where lastname = %s",
                        (s,))
                    if cursor.fetchone():
                        logger.debug("found %s", s)
                        result.append(s)
            finally:
                cursor.close()
        finally:
            # always give the connection back, even if a query failed
            connection.close()
        return result


try:
    element_factory.registerFactory('Word Splitter',
                                    'MPIWG Name Splitter', nameSplitter)
except ValueError:
    # in case the splitter is already registered, ValueError is raised
    pass


if __name__ == '__main__':
    # Manual smoke test; needs a reachable authorities database.
    # NOTE(review): the sample literal was reconstructed from a garbled
    # listing (high bytes were shown as \U00XX); it is decoded as GBK below.
    a = 'abc def\xce\xd2\xd3\xc7\xb5\xc4 \xdc\xc3\xa1\xa3'
    u = unicode(a, 'gbk')
    s = nameSplitter()
    print(s.process([u]))