Mercurial > hg > openmind
changeset 16:c009ce2e60be
Working on the Arabic transliteration normalizer.
author | casties |
---|---|
date | Tue, 12 May 2015 15:39:29 +0000 |
parents | 728549225b02 |
children | ac466a164b61 |
files | src/main/java/org/mpi/openmind/repository/utils/ArabicTranslitNormalizer.java src/main/java/org/mpi/openmind/scripts/NormalizeOW.java |
diffstat | 2 files changed, 146 insertions(+), 43 deletions(-) [+] |
line wrap: on
line diff
--- a/src/main/java/org/mpi/openmind/repository/utils/ArabicTranslitNormalizer.java Tue May 12 08:32:23 2015 +0000 +++ b/src/main/java/org/mpi/openmind/repository/utils/ArabicTranslitNormalizer.java Tue May 12 15:39:29 2015 +0000 @@ -1,22 +1,105 @@ package org.mpi.openmind.repository.utils; +import java.text.Normalizer; +import java.text.Normalizer.Form; import java.util.HashMap; import java.util.Map; +import java.util.Map.Entry; import java.util.regex.Pattern; +/** + * @author casties + * + * @see https://it-dev.mpiwg-berlin.mpg.de/tracs/OpenMind3/wiki/normalize_arabic_translit + * + */ public class ArabicTranslitNormalizer { - protected static Map<String, Pattern> multiRepPat = new HashMap<String, Pattern>(); - { - multiRepPat.put("j", Pattern.compile("ch")); - multiRepPat.put("j", Pattern.compile("dj")); - multiRepPat.put("t", Pattern.compile("th")); - multiRepPat.put("h", Pattern.compile("kh")); - multiRepPat.put("d", Pattern.compile("dh")); - multiRepPat.put("s", Pattern.compile("sh")); - multiRepPat.put("g", Pattern.compile("gh")); - multiRepPat.put("j", Pattern.compile("ch")); - // aẗ\b, at\b, ah\b -> a - multiRepPat.put("a", Pattern.compile("a\u1E97\\b|at\\b|ah\\b")); + protected static Map<String, Pattern> apostrophePatterns = new HashMap<String, Pattern>(); + static { + // `, ʿ, ʾ, ‘, ’ -> ' + apostrophePatterns.put("'", Pattern.compile("\u0060|\u02BE|\u02BF|\u2018|\u2019")); + } + + protected static Map<String, Pattern> twoletterPatterns = new HashMap<String, Pattern>(); + static { + twoletterPatterns.put("j", Pattern.compile("ch")); + twoletterPatterns.put("j", Pattern.compile("dj")); + twoletterPatterns.put("t", Pattern.compile("th")); + twoletterPatterns.put("h", Pattern.compile("kh")); + twoletterPatterns.put("d", Pattern.compile("dh")); + twoletterPatterns.put("s", Pattern.compile("sh")); + twoletterPatterns.put("g", Pattern.compile("gh")); + } + + protected static Map<String, Pattern> wordpartPatterns = new HashMap<String, Pattern>(); + static { + 
// aẗ\b, at\b, ah\b -> a + wordpartPatterns.put("a", Pattern.compile("a\u1E97\\b|at\\b|ah\\b")); + // 'abd + space -> 'abd + wordpartPatterns.put("'abd", Pattern.compile("'abd ")); + } + + protected static Map<String, Pattern> letterdiacritPatterns = new HashMap<String, Pattern>(); + static { + // ỳ -> a + letterdiacritPatterns.put("a", Pattern.compile("\u1EF3")); + } + + protected static Map<String, Pattern> letterPatterns = new HashMap<String, Pattern>(); + static { + letterPatterns.put("j", Pattern.compile("g|c")); + } + + protected static Pattern diacriticsPattern = Pattern.compile("\\p{InCombiningDiacriticalMarks}+"); + + public static String normalize(String text) { + if (text == null || text.isEmpty()) { + return text; + } + + // everything is lowercase TODO: locale? + text = text.toLowerCase(); + + // replace "apostrophes" + for (Entry<String, Pattern> entry : apostrophePatterns.entrySet()) { + Pattern pattern = entry.getValue(); + String replacement = entry.getKey(); + text = pattern.matcher(text).replaceAll(replacement); + } + + // replace two-letter combinations + for (Entry<String, Pattern> entry : twoletterPatterns.entrySet()) { + Pattern pattern = entry.getValue(); + String replacement = entry.getKey(); + text = pattern.matcher(text).replaceAll(replacement); + } + + // replace word-parts + for (Entry<String, Pattern> entry : wordpartPatterns.entrySet()) { + Pattern pattern = entry.getValue(); + String replacement = entry.getKey(); + text = pattern.matcher(text).replaceAll(replacement); + } + + // replace letters with diacritics + for (Entry<String, Pattern> entry : letterdiacritPatterns.entrySet()) { + Pattern pattern = entry.getValue(); + String replacement = entry.getKey(); + text = pattern.matcher(text).replaceAll(replacement); + } + + // remove diacritics by de-composing and removing diacritical marks + text = Normalizer.normalize(text, Form.NFD); + text = diacriticsPattern.matcher(text).replaceAll(""); + + // replace letters + for (Entry<String, 
Pattern> entry : letterPatterns.entrySet()) { + Pattern pattern = entry.getValue(); + String replacement = entry.getKey(); + text = pattern.matcher(text).replaceAll(replacement); + } + + return text; } }
--- a/src/main/java/org/mpi/openmind/scripts/NormalizeOW.java Tue May 12 08:32:23 2015 +0000 +++ b/src/main/java/org/mpi/openmind/scripts/NormalizeOW.java Tue May 12 15:39:29 2015 +0000 @@ -5,29 +5,27 @@ import java.sql.ResultSet; import java.sql.SQLException; import java.sql.Statement; -import java.util.ArrayList; import java.util.HashMap; -import java.util.List; import java.util.Map; -import org.apache.commons.lang.StringUtils; import org.mpi.openmind.repository.utils.ArabicNormalizerUtils; +import org.mpi.openmind.repository.utils.ArabicTranslitNormalizer; import org.mpi.openmind.repository.utils.NormalizerUtils; public class NormalizeOW { - public static void execute(String type) { + public static void execute(String type, String dbUser, String dbPw) { try { System.out.println("Normalizing own values for all: " + type +"S."); - System.out.println("INFO: only for the CURRENT_VERSION of the mentioned nodes will be affected."); + System.out.println("INFO: only the CURRENT_VERSION of the nodes will be affected."); Connection conn; Class.forName("com.mysql.jdbc.Driver").newInstance(); String url = "jdbc:mysql://localhost/openmind?characterEncoding=UTF-8"; - conn = DriverManager.getConnection(url, "root", "admin"); + conn = DriverManager.getConnection(url, dbUser, dbPw); - Map<Long, String> selectedMap = select(conn, type); + Map<Long, String[]> selectedMap = select(conn, type); - System.out.println("Amount of nodes=" + selectedMap.size()); + System.out.println("Number of nodes=" + selectedMap.size()); change(conn, selectedMap); System.out.println("End"); @@ -44,17 +42,25 @@ } - public static void change(Connection conn, Map<Long, String> map){ + public static void change(Connection conn, Map<Long, String[]> map){ String s = new String(); for(Long id : map.keySet()){ try { - Statement st = conn.createStatement(); - String normalizedOW = NormalizerUtils.normalize(map.get(id)); - String normalizedArabicOW = ArabicNormalizerUtils.normalize(map.get(id)); - 
st.executeUpdate("UPDATE node SET normalized_own_value='" + normalizedOW + "' WHERE row_id='"+ id +"'"); + String[] ows = map.get(id); + String ow = ows[0]; + String oldNormalizedOW = ows[1]; + String oldNormalizedArabicOW = ows[2]; + String normalizedOW = ArabicTranslitNormalizer.normalize(ow); + String normalizedArabicOW = ArabicNormalizerUtils.normalize(ow); + if (normalizedOW != null && !normalizedOW.equals(oldNormalizedOW)) { + System.out.println("normOW changes ("+id+"): old="+oldNormalizedOW+" new="+normalizedOW); + } + Statement st = conn.createStatement(); + /* st.executeUpdate("UPDATE node SET normalized_own_value='" + normalizedOW + "' WHERE row_id='"+ id +"'"); s = "UPDATE node SET normalized_arabic_own_value='" + normalizedArabicOW + "' WHERE row_id='"+ id +"'"; //System.out.println(s); st.executeUpdate(s); + */ } catch (SQLException e) { System.err.println(s); e.printStackTrace(); @@ -62,38 +68,52 @@ } } - @SuppressWarnings("finally") - public static Map<Long, String> select(Connection conn, String type){ - Map<Long, String> map = new HashMap<Long, String>(); - String query = "select row_id, own_value " + + public static Map<Long, String[]> select(Connection conn, String type){ + Map<Long, String[]> map = new HashMap<Long, String[]>(); + String query = "select row_id, own_value, normalized_own_value, normalized_arabic_own_value " + "from node " + "where system_status = 'CURRENT_VERSION'"; - if(type.equals("ATTRIBUTE") || type.equals("ENTITY")){ + if (type.equals("ATTRIBUTE") || type.equals("ENTITY")) { query += " AND node_type = '"+ type +"'"; } - try - { + try { Statement st = conn.createStatement(); ResultSet rs = st.executeQuery(query); - while (rs.next()) - { + while (rs.next()) { String id = rs.getString("row_id"); - String ow = rs.getString("own_value"); - map.put(new Long(id), ow); + String[] ows = new String[3]; + ows[0] = rs.getString("own_value"); + ows[1] = rs.getString("normalized_own_value"); + ows[2] = 
rs.getString("normalized_arabic_own_value"); + map.put(new Long(id), ows); } + } catch (SQLException ex) { + ex.printStackTrace(); + System.err.println(ex.getMessage()); } - catch (SQLException ex){ - ex.printStackTrace(); - //System.err.println(ex.getMessage()); - }finally{ - return map; - } + return map; } public static void main(String[] args) { - NormalizeOW.execute("all"); + int rc = 0; + if (args.length > 1 && args.length < 4) { + String user = args[1]; + String pw = (args.length == 3) ? args[2] : null; + if (args[0].equalsIgnoreCase("fix")) { + //rc = repair(user, pw); + } else { + execute("all", user, pw); + } + } else { + System.out.println("Parameter/s not found! Should be: mode(SHOW/FIX), mysql_user, mysql_password"); + System.out.println(" got: "+args.toString() + "("+args.length+")"); + System.exit(1); + } + System.exit(rc); + + //NormalizeOW.execute("all"); /* String arg = args[0]; if(StringUtils.isNotEmpty(arg)){