changeset 16:c009ce2e60be

Work in progress on the Arabic transliteration normalizer.
author casties
date Tue, 12 May 2015 15:39:29 +0000
parents 728549225b02
children ac466a164b61
files src/main/java/org/mpi/openmind/repository/utils/ArabicTranslitNormalizer.java src/main/java/org/mpi/openmind/scripts/NormalizeOW.java
diffstat 2 files changed, 146 insertions(+), 43 deletions(-) [+]
line wrap: on
line diff
--- a/src/main/java/org/mpi/openmind/repository/utils/ArabicTranslitNormalizer.java	Tue May 12 08:32:23 2015 +0000
+++ b/src/main/java/org/mpi/openmind/repository/utils/ArabicTranslitNormalizer.java	Tue May 12 15:39:29 2015 +0000
@@ -1,22 +1,105 @@
 package org.mpi.openmind.repository.utils;
 
+import java.text.Normalizer;
+import java.text.Normalizer.Form;
 import java.util.HashMap;
 import java.util.Map;
+import java.util.Map.Entry;
 import java.util.regex.Pattern;
 
+/**
+ * @author casties
+ *
+ * @see <a href="https://it-dev.mpiwg-berlin.mpg.de/tracs/OpenMind3/wiki/normalize_arabic_translit">normalize_arabic_translit (project wiki)</a>
+ * 
+ */
 public class ArabicTranslitNormalizer {
 
-    protected static Map<String, Pattern> multiRepPat = new HashMap<String, Pattern>();
-    {
-        multiRepPat.put("j", Pattern.compile("ch"));
-        multiRepPat.put("j", Pattern.compile("dj"));
-        multiRepPat.put("t", Pattern.compile("th"));
-        multiRepPat.put("h", Pattern.compile("kh"));
-        multiRepPat.put("d", Pattern.compile("dh"));
-        multiRepPat.put("s", Pattern.compile("sh"));
-        multiRepPat.put("g", Pattern.compile("gh"));
-        multiRepPat.put("j", Pattern.compile("ch"));
-        // aẗ\b, at\b, ah\b -> a 
-        multiRepPat.put("a", Pattern.compile("a\u1E97\\b|at\\b|ah\\b"));
+    protected static Map<String, Pattern> apostrophePatterns = new HashMap<String, Pattern>();
+    static {
+        // `, ʾ (U+02BE), ʿ (U+02BF), ‘, ’ -> '
+        apostrophePatterns.put("'", Pattern.compile("\u0060|\u02BE|\u02BF|\u2018|\u2019"));
+    }    
+    
+    protected static Map<String, Pattern> twoletterPatterns = new HashMap<String, Pattern>();
+    static {
+        twoletterPatterns.put("j", Pattern.compile("ch"));
+        twoletterPatterns.put("j", Pattern.compile("dj"));
+        twoletterPatterns.put("t", Pattern.compile("th"));
+        twoletterPatterns.put("h", Pattern.compile("kh"));
+        twoletterPatterns.put("d", Pattern.compile("dh"));
+        twoletterPatterns.put("s", Pattern.compile("sh"));
+        twoletterPatterns.put("g", Pattern.compile("gh"));
+    }
+
+    protected static Map<String, Pattern> wordpartPatterns = new HashMap<String, Pattern>();
+    static {
+        // aẗ\b, at\b, ah\b -> a
+        wordpartPatterns.put("a", Pattern.compile("a\u1E97\\b|at\\b|ah\\b"));
+        // 'abd + space -> 'abd
+        wordpartPatterns.put("'abd", Pattern.compile("'abd "));
+    }
+
+    protected static Map<String, Pattern> letterdiacritPatterns = new HashMap<String, Pattern>();
+    static {
+        // ỳ -> a
+        letterdiacritPatterns.put("a", Pattern.compile("\u1EF3"));
+    }
+
+    protected static Map<String, Pattern> letterPatterns = new HashMap<String, Pattern>();
+    static {
+        letterPatterns.put("j", Pattern.compile("g|c"));
+    }
+    
+    protected static Pattern diacriticsPattern = Pattern.compile("\\p{InCombiningDiacriticalMarks}+");
+    
+    public static String normalize(String text) {
+        if (text == null || text.isEmpty()) {
+            return text;
+        }
+        
+        // lowercase everything first. TODO: pass an explicit locale (toLowerCase() uses the default locale)
+        text = text.toLowerCase();
+        
+        // replace "apostrophes"
+        for (Entry<String, Pattern> entry : apostrophePatterns.entrySet()) {
+            Pattern pattern = entry.getValue();
+            String replacement = entry.getKey();
+            text = pattern.matcher(text).replaceAll(replacement);
+        }
+        
+        // replace two-letter combinations
+        for (Entry<String, Pattern> entry : twoletterPatterns.entrySet()) {
+            Pattern pattern = entry.getValue();
+            String replacement = entry.getKey();
+            text = pattern.matcher(text).replaceAll(replacement);
+        }
+        
+        // replace word-parts
+        for (Entry<String, Pattern> entry : wordpartPatterns.entrySet()) {
+            Pattern pattern = entry.getValue();
+            String replacement = entry.getKey();
+            text = pattern.matcher(text).replaceAll(replacement);
+        }
+
+        // replace letters with diacritics
+        for (Entry<String, Pattern> entry : letterdiacritPatterns.entrySet()) {
+            Pattern pattern = entry.getValue();
+            String replacement = entry.getKey();
+            text = pattern.matcher(text).replaceAll(replacement);
+        }
+
+        // remove diacritics by de-composing and removing diacritical marks
+        text = Normalizer.normalize(text, Form.NFD);
+        text = diacriticsPattern.matcher(text).replaceAll("");
+        
+        // replace letters
+        for (Entry<String, Pattern> entry : letterPatterns.entrySet()) {
+            Pattern pattern = entry.getValue();
+            String replacement = entry.getKey();
+            text = pattern.matcher(text).replaceAll(replacement);
+        }
+
+        return text;
     }
 }
--- a/src/main/java/org/mpi/openmind/scripts/NormalizeOW.java	Tue May 12 08:32:23 2015 +0000
+++ b/src/main/java/org/mpi/openmind/scripts/NormalizeOW.java	Tue May 12 15:39:29 2015 +0000
@@ -5,29 +5,27 @@
 import java.sql.ResultSet;
 import java.sql.SQLException;
 import java.sql.Statement;
-import java.util.ArrayList;
 import java.util.HashMap;
-import java.util.List;
 import java.util.Map;
 
-import org.apache.commons.lang.StringUtils;
 import org.mpi.openmind.repository.utils.ArabicNormalizerUtils;
+import org.mpi.openmind.repository.utils.ArabicTranslitNormalizer;
 import org.mpi.openmind.repository.utils.NormalizerUtils;
 
 public class NormalizeOW {
-	public static void execute(String type) {
+	public static void execute(String type, String dbUser, String dbPw) {
 		try {
 			System.out.println("Normalizing own values for all: " + type +"S.");
-			System.out.println("INFO: only for the CURRENT_VERSION of the mentioned nodes will be affected.");
+			System.out.println("INFO: only the CURRENT_VERSION of the nodes will be affected.");
 			Connection conn;
 
 			Class.forName("com.mysql.jdbc.Driver").newInstance();
 			String url = "jdbc:mysql://localhost/openmind?characterEncoding=UTF-8";
-			conn = DriverManager.getConnection(url, "root", "admin");
+			conn = DriverManager.getConnection(url, dbUser, dbPw);
 			
-			Map<Long, String> selectedMap = select(conn, type);
+			Map<Long, String[]> selectedMap = select(conn, type);
 			
-			System.out.println("Amount of nodes=" + selectedMap.size());
+			System.out.println("Number of nodes=" + selectedMap.size());
 			change(conn, selectedMap);
 			System.out.println("End");
 			
@@ -44,17 +42,25 @@
 
 	}
 	
-	public static void change(Connection conn, Map<Long, String> map){
+	public static void change(Connection conn, Map<Long, String[]> map){
 		String s = new String();
 		for(Long id : map.keySet()){
 			try {
-		    	Statement st = conn.createStatement();
-		    	String normalizedOW = NormalizerUtils.normalize(map.get(id));
-		    	String normalizedArabicOW = ArabicNormalizerUtils.normalize(map.get(id));
-				st.executeUpdate("UPDATE node SET normalized_own_value='" + normalizedOW + "' WHERE row_id='"+ id +"'");
+		    	String[] ows = map.get(id);
+		    	String ow = ows[0];
+		    	String oldNormalizedOW = ows[1];
+		    	String oldNormalizedArabicOW = ows[2];
+		    	String normalizedOW = ArabicTranslitNormalizer.normalize(ow);
+		    	String normalizedArabicOW = ArabicNormalizerUtils.normalize(ow);
+		    	if (normalizedOW != null && !normalizedOW.equals(oldNormalizedOW)) {
+		    	    System.out.println("normOW changes ("+id+"): old="+oldNormalizedOW+" new="+normalizedOW);
+		    	}
+                Statement st = conn.createStatement();
+				/* st.executeUpdate("UPDATE node SET normalized_own_value='" + normalizedOW + "' WHERE row_id='"+ id +"'");
 				s = "UPDATE node SET normalized_arabic_own_value='" + normalizedArabicOW + "' WHERE row_id='"+ id +"'";
 				//System.out.println(s);
 				st.executeUpdate(s);
+				*/
 			} catch (SQLException e) {
 				System.err.println(s);
 				e.printStackTrace();
@@ -62,38 +68,52 @@
 		}
 	}
 	
-	@SuppressWarnings("finally")
-	public static Map<Long, String> select(Connection conn, String type){
-		Map<Long, String> map = new HashMap<Long, String>();
-		String query = "select row_id, own_value " +
+	public static Map<Long, String[]> select(Connection conn, String type){
+		Map<Long, String[]> map = new HashMap<Long, String[]>();
+		String query = "select row_id, own_value, normalized_own_value, normalized_arabic_own_value " +
 				"from node " +
 				"where system_status = 'CURRENT_VERSION'";
 				
-		if(type.equals("ATTRIBUTE") || type.equals("ENTITY")){
+		if (type.equals("ATTRIBUTE") || type.equals("ENTITY")) {
 			query += " AND node_type = '"+ type +"'";
 		}
 				
-		try
-	    {
+		try {
 	      Statement st = conn.createStatement();
 	      ResultSet rs = st.executeQuery(query);
-	      while (rs.next())
-	      {
+	      while (rs.next()) {
 	        String id = rs.getString("row_id");
-	        String ow = rs.getString("own_value");
-	        map.put(new Long(id), ow);
+	        String[] ows = new String[3];
+	        ows[0] = rs.getString("own_value");
+	        ows[1] = rs.getString("normalized_own_value");
+	        ows[2] = rs.getString("normalized_arabic_own_value");
+	        map.put(new Long(id), ows);
 	      }
+	    } catch (SQLException ex) {
+	        ex.printStackTrace();
+	        System.err.println(ex.getMessage());
 	    }
-	    catch (SQLException ex){
-	    	ex.printStackTrace();
-	      //System.err.println(ex.getMessage());
-	    }finally{
-	    	return map;
-	    }
+        return map;
 	}
 
 	public static void main(String[] args) {
-		NormalizeOW.execute("all");
+        int rc = 0;
+        if (args.length > 1 && args.length < 4) {
+            String user = args[1];
+            String pw = (args.length == 3) ? args[2] : null;
+            if (args[0].equalsIgnoreCase("fix")) {
+                //rc = repair(user, pw);
+            } else {
+                execute("all", user, pw);
+            }
+        } else {
+            System.out.println("Parameter/s not found! Should be: mode(SHOW/FIX), mysql_user, mysql_password");
+            System.out.println("  got: "+args.toString() + "("+args.length+")");
+            System.exit(1);
+        }
+        System.exit(rc);
+        
+        //NormalizeOW.execute("all");
 		/*
 		String arg = args[0];
 		if(StringUtils.isNotEmpty(arg)){