diff nameSplitter.py @ 0:bca61e893fcc

first checkin of MPIWGWeb r2 branch from CVS into mercurial
author casties
date Thu, 10 Jan 2013 17:52:13 +0100
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/nameSplitter.py	Thu Jan 10 17:52:13 2013 +0100
@@ -0,0 +1,80 @@
+"""
+name splitter
+"""
+
+from Products.ZCTextIndex.ISplitter import ISplitter
+from Products.ZCTextIndex.PipelineFactory import element_factory
+
+
+
+import re
+from types import StringType
+
+def getSupportedEncoding(encodings):
+    """Return the first usable encoding name from `encodings`.
+
+    Each candidate is probed by decoding the byte string 'A' with it; the
+    first candidate that decodes without error is returned.  If no
+    candidate works (or `encodings` is empty) 'utf-8' is returned.
+    """
+    for encoding in encodings:
+        try:
+            unicode('A', encoding)
+        except (LookupError, TypeError, ValueError):
+            # LookupError: unknown codec name; TypeError: candidate is not
+            # a string; ValueError (incl. UnicodeError): the codec cannot
+            # decode the probe.  Anything else (e.g. KeyboardInterrupt) is
+            # deliberately allowed to propagate.
+            continue
+        return encoding
+    return 'utf-8'
+
+# CJK charset ranges; see the following pages:
+#
+# http://jrgraphix.net/research/unicode_blocks.php?block=87
+# http://jrgraphix.net/research/unicode_blocks.php?block=85
+# http://jrgraphix.net/research/unicode_blocks.php?block=95
+# http://jrgraphix.net/research/unicode_blocks.php?block=76
+# http://jrgraphix.net/research/unicode_blocks.php?block=90
+
+"""
+splitter for lastnames in database
+
+"""
+import re
+
+def quote(str):
+    # Backslash-escape `str` so it can be embedded in a single-quoted SQL
+    # string literal; returns the escaped copy.
+    #
+    # Backslashes are escaped first: otherwise an input that already
+    # contains a backslash before a quote would come out as an escaped
+    # backslash followed by a bare quote, which terminates the literal.
+    #
+    # NOTE(review): backslash escapes are only honoured by the server when
+    # it is configured to accept them; prefer bound query parameters over
+    # this helper wherever possible.
+    # NOTE: the parameter name shadows the builtin `str`; it is kept so that
+    # callers passing it by keyword keep working.
+    return str.replace("\\", "\\\\").replace("'", "\\'")
+class nameSplitter:
+    """Word splitter that keeps only words stored as a last name.
+
+    The input strings are joined, punctuation is blanked out, and every
+    resulting word is looked up in the `lastname` column of the `persons`
+    table; only words with a matching row are returned.
+    """
+
+    # Encoding used to decode byte-string words before the lookup.
+    default_encoding = "utf-8"
+
+    # Connection string for the lookup database.
+    # NOTE(review): credentials are hard-coded in source; they should come
+    # from configuration instead.
+    dsn = 'dbname=authorities user=dwinter password=3333'
+
+    # Characters replaced by a blank before splitting on whitespace.  This
+    # is the same set the previous pattern "[<|>|;|.|:|\(|\|)|,]" matched:
+    # inside a character class '|' is a literal, so '|' is part of the set.
+    _punctuation = re.compile(r"[<>;.:()|,]")
+
+    def process(self, lsttmp):
+        """Return the distinct words of `lsttmp` that are known last names.
+
+        lsttmp -- sequence of strings (byte strings or unicode).
+
+        Returns a list of unicode words in order of first appearance, each
+        of which matched `persons.lastname` exactly (case-sensitive).
+        Errors raised by the database driver propagate to the caller.
+        """
+        import logging
+        import psycopg
+
+        words = self._punctuation.sub(" ", " ".join(lsttmp)).split()
+        if not words:
+            # nothing to look up, so do not open a connection at all
+            return []
+
+        log = logging.getLogger('nameSplitter')
+        result = []
+        seen = set()  # every word already looked up, found or not
+        o = psycopg.connect(self.dsn, serialize=0)
+        try:
+            c = o.cursor()
+            for s in words:
+                if isinstance(s, str):  # byte string, not unicode
+                    s = s.decode(self.default_encoding, 'replace')
+
+                if s in seen:
+                    continue
+                seen.add(s)
+
+                # Bound parameter: the driver does the quoting, so words
+                # containing quotes or backslashes cannot alter the query.
+                c.execute("select lastname from persons where lastname = %s",
+                          (s,))
+                if c.fetchone():
+                    log.debug("found %s", s)
+                    result.append(s)
+        finally:
+            # always release the connection, even if a query fails
+            o.close()
+        return result
+
+ 
+try:
+    element_factory.registerFactory('Word Splitter',
+          'MPIWG Name Splitter', nameSplitter)
+except ValueError:
+    # ValueError is raised when the splitter is already registered (e.g. on
+    # module reload); that is harmless.  Any other error is a real problem
+    # and is allowed to propagate.
+    pass
+
+if __name__ == '__main__':
+   # Manual smoke test: split one sample string and print the words that
+   # were found.  process() connects to the database, so this needs a
+   # reachable server.
+   # NOTE(review): in a Python 2 byte string '\U....' is not an escape, so
+   # the sample below contains literal backslash sequences; presumably
+   # '\x..' GBK bytes were intended -- confirm before relying on it.
+   a = 'abc def\U00CE\U00D2\U00D3\U00C7\U00B5\U00C4 \U00DC\U00C3\U00A1\U00A3'
+   u = unicode(a, 'gbk')
+   # The class defined in this module is nameSplitter, and process() takes
+   # exactly one argument.
+   s = nameSplitter()
+   print s.process([u])