Mercurial > hg > openmind
changeset 71:aeb29e362a67
New ArabicNormalizer. NormalizerUtils.normalize() now does both translit and arabic normalization.
108: arabic normalization is not applied
Task-Url: https://it-dev.mpiwg-berlin.mpg.de/tracs/ismi/ticket/108
line wrap: on
line diff
--- a/src/main/java/org/mpi/openmind/cache/WrapperService.java Thu Feb 02 11:58:23 2017 +0100 +++ b/src/main/java/org/mpi/openmind/cache/WrapperService.java Thu Feb 02 17:58:52 2017 +0100 @@ -25,7 +25,6 @@ import org.mpi.openmind.repository.services.PersistenceService; import org.mpi.openmind.repository.services.utils.AttributeFilter; import org.mpi.openmind.repository.services.utils.EditIntent; -import org.mpi.openmind.repository.utils.ArabicTranslitNormalizer; import org.mpi.openmind.repository.utils.ImportOM3Util; import org.mpi.openmind.repository.utils.NormalizerUtils; import org.mpi.openmind.repository.utils.RomanizationLoC; @@ -161,7 +160,7 @@ int count = 0; if (StringUtils.isNotEmpty(term)) { // TODO: better normalization - String normalizedTerm = ArabicTranslitNormalizer.normalize(term); + String normalizedTerm = NormalizerUtils.normalize(term); for (AttributeFilter filter : filters) { if (mustBreak) { break;
--- a/src/main/java/org/mpi/openmind/repository/bo/Node.java Thu Feb 02 11:58:23 2017 +0100 +++ b/src/main/java/org/mpi/openmind/repository/bo/Node.java Thu Feb 02 17:58:52 2017 +0100 @@ -21,7 +21,6 @@ import javax.persistence.Transient; import org.apache.commons.codec.binary.Base64; -import org.mpi.openmind.repository.utils.ArabicNormalizerUtils; import org.mpi.openmind.repository.utils.NormalizerUtils; import org.mpi.openmind.repository.utils.RomanizationLoC; @@ -198,12 +197,12 @@ public void setOwnValue(String ownValue) { this.ownValue = ownValue; this.normalizedOwnValue = NormalizerUtils.normalize(ownValue); - this.normalizedArabicOwnValue = ArabicNormalizerUtils.normalize(ownValue); + this.normalizedArabicOwnValue = NormalizerUtils.normalizeArabic(ownValue); } public void autoNormalize(){ this.normalizedOwnValue = NormalizerUtils.normalize(ownValue); - this.normalizedArabicOwnValue = ArabicNormalizerUtils.normalize(ownValue); + this.normalizedArabicOwnValue = NormalizerUtils.normalizeArabic(ownValue); } public String getRomanizationLoC(){
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/main/java/org/mpi/openmind/repository/utils/ArabicNormalizer.java Thu Feb 02 17:58:52 2017 +0100 @@ -0,0 +1,38 @@ +package org.mpi.openmind.repository.utils; + +import java.text.Normalizer; +import java.text.Normalizer.Form; +import java.util.regex.Pattern; + +import org.apache.commons.lang.StringUtils; + +/** + * @author casties + * + */ +public class ArabicNormalizer { + + /** match all of Unicode mark category */ + protected static Pattern markPattern = Pattern.compile("\\p{M}+"); + + /** + * Returns String of normalized arabic. + * + * Normalization means de-vowelisation using Unicode tables. + * Removes all Unicode mark characters from decomposed form. + * + * @param text + * @return + */ + public static String normalize(String text) { + if (StringUtils.isEmpty(text)) { + return text; + } + + // remove vowels by de-composing and removing diacritical marks + text = Normalizer.normalize(text, Form.NFKD); + text = markPattern.matcher(text).replaceAll(""); + + return text; + } +}
--- a/src/main/java/org/mpi/openmind/repository/utils/ArabicNormalizerUtils.java Thu Feb 02 11:58:23 2017 +0100 +++ b/src/main/java/org/mpi/openmind/repository/utils/ArabicNormalizerUtils.java Thu Feb 02 17:58:52 2017 +0100 @@ -1,6 +1,5 @@ package org.mpi.openmind.repository.utils; -import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.List; @@ -78,16 +77,9 @@ if(StringUtils.isEmpty(w)) return w; - /* * Replacing combination of vowels - - for(String key : wildCardStringMap.keySet()){ - List<String> list = wildCardStringMap.get(key); - for(String term : list){ - w = w.replace(term, key); - } - }*/ + */ for(String key : wildCardCharMap.keySet()){ Character[] list = wildCardCharMap.get(key);
--- a/src/main/java/org/mpi/openmind/repository/utils/ArabicTranslitNormalizer.java Thu Feb 02 11:58:23 2017 +0100 +++ b/src/main/java/org/mpi/openmind/repository/utils/ArabicTranslitNormalizer.java Thu Feb 02 17:58:52 2017 +0100 @@ -2,11 +2,12 @@ import java.text.Normalizer; import java.text.Normalizer.Form; -import java.util.HashMap; -import java.util.Map; -import java.util.Map.Entry; +import java.util.ArrayList; +import java.util.List; import java.util.regex.Pattern; +import org.apache.commons.lang.StringUtils; + /** * @author casties * @@ -15,49 +16,57 @@ */ public class ArabicTranslitNormalizer { - protected static Map<String, Pattern> apostrophePatterns = new HashMap<String, Pattern>(); + protected static List<ReplacementPattern> apostrophePatterns = new ArrayList<ReplacementPattern>(); static { // `, ʿ, ʾ, ‘, ’ -> ' //apostrophePatterns.put("'", Pattern.compile("\u0060|\u02BE|\u02BF|\u2018|\u2019")); - // remove `, ʿ, ʾ, ‘, ’, ' - apostrophePatterns.put("", Pattern.compile("'|\u0060|\u02BE|\u02BF|\u2018|\u2019")); + // remove apostrophes `, ʿ, ʾ, ‘, ’, ' + apostrophePatterns.add(new ReplacementPattern("", Pattern.compile("'|\u0060|\u02BE|\u02BF|\u2018|\u2019"))); } - protected static Map<String, Pattern> twoletterPatterns = new HashMap<String, Pattern>(); + protected static List<ReplacementPattern> twoletterPatterns = new ArrayList<ReplacementPattern>(); static { - twoletterPatterns.put("j", Pattern.compile("ch")); - twoletterPatterns.put("j", Pattern.compile("dj")); - twoletterPatterns.put("t", Pattern.compile("th")); - twoletterPatterns.put("h", Pattern.compile("kh")); - twoletterPatterns.put("d", Pattern.compile("dh")); - twoletterPatterns.put("s", Pattern.compile("sh")); - twoletterPatterns.put("g", Pattern.compile("gh")); + twoletterPatterns.add(new ReplacementPattern("j", Pattern.compile("ch"))); + twoletterPatterns.add(new ReplacementPattern("j", Pattern.compile("dj"))); + twoletterPatterns.add(new ReplacementPattern("t", Pattern.compile("th"))); + twoletterPatterns.add(new ReplacementPattern("h", Pattern.compile("kh"))); + twoletterPatterns.add(new ReplacementPattern("d", Pattern.compile("dh"))); + twoletterPatterns.add(new ReplacementPattern("s", Pattern.compile("sh"))); + twoletterPatterns.add(new ReplacementPattern("g", Pattern.compile("gh"))); } - protected static Map<String, Pattern> wordpartPatterns = new HashMap<String, Pattern>(); + protected static List<ReplacementPattern> wordpartPatterns = new ArrayList<ReplacementPattern>(); static { // aẗ\b, at\b, ah\b -> a - wordpartPatterns.put("a", Pattern.compile("a\u1E97\\b|at\\b|ah\\b")); + wordpartPatterns.add(new ReplacementPattern("a", Pattern.compile("a\u1E97\\b|at\\b|ah\\b"))); // 'abd + space -> 'abd - //wordpartPatterns.put("'abd", Pattern.compile("'abd ")); - wordpartPatterns.put("abd", Pattern.compile("abd ")); + // now without apostrophe + wordpartPatterns.add(new ReplacementPattern("abd", Pattern.compile("abd "))); } - protected static Map<String, Pattern> letterdiacritPatterns = new HashMap<String, Pattern>(); + protected static List<ReplacementPattern> letterdiacritPatterns = new ArrayList<ReplacementPattern>(); static { // ỳ -> a - letterdiacritPatterns.put("a", Pattern.compile("\u1EF3")); + letterdiacritPatterns.add(new ReplacementPattern("a", Pattern.compile("\u1EF3"))); } - protected static Map<String, Pattern> letterPatterns = new HashMap<String, Pattern>(); + protected static List<ReplacementPattern> letterPatterns = new ArrayList<ReplacementPattern>(); static { - letterPatterns.put("j", Pattern.compile("g|c")); + letterPatterns.add(new ReplacementPattern("j", Pattern.compile("g|c"))); } protected static Pattern diacriticsPattern = Pattern.compile("\\p{InCombiningDiacriticalMarks}+"); + /** + * Returns String normalized according to arabic transliteration rules. + * + * @see https://it-dev.mpiwg-berlin.mpg.de/tracs/OpenMind3/wiki/normalize_arabic_translit + * + * @param text + * @return + */ public static String normalize(String text) { - if (text == null || text.isEmpty()) { + if (StringUtils.isEmpty(text)) { return text; } @@ -65,31 +74,23 @@ text = text.toLowerCase(); // replace "apostrophes" - for (Entry<String, Pattern> entry : apostrophePatterns.entrySet()) { - Pattern pattern = entry.getValue(); - String replacement = entry.getKey(); - text = pattern.matcher(text).replaceAll(replacement); + for (ReplacementPattern entry : apostrophePatterns) { + text = entry.getPattern().matcher(text).replaceAll(entry.getReplacement()); } // replace two-letter combinations - for (Entry<String, Pattern> entry : twoletterPatterns.entrySet()) { - Pattern pattern = entry.getValue(); - String replacement = entry.getKey(); - text = pattern.matcher(text).replaceAll(replacement); + for (ReplacementPattern entry : twoletterPatterns) { + text = entry.getPattern().matcher(text).replaceAll(entry.getReplacement()); } // replace word-parts - for (Entry<String, Pattern> entry : wordpartPatterns.entrySet()) { - Pattern pattern = entry.getValue(); - String replacement = entry.getKey(); - text = pattern.matcher(text).replaceAll(replacement); + for (ReplacementPattern entry : wordpartPatterns) { + text = entry.getPattern().matcher(text).replaceAll(entry.getReplacement()); } // replace letters with diacritics - for (Entry<String, Pattern> entry : letterdiacritPatterns.entrySet()) { - Pattern pattern = entry.getValue(); - String replacement = entry.getKey(); - text = pattern.matcher(text).replaceAll(replacement); + for (ReplacementPattern entry : letterdiacritPatterns) { + text = entry.getPattern().matcher(text).replaceAll(entry.getReplacement()); } // remove diacritics by de-composing and removing diacritical marks @@ -97,10 +98,8 @@ text = diacriticsPattern.matcher(text).replaceAll(""); // replace letters - for (Entry<String, Pattern> entry : letterPatterns.entrySet()) { - Pattern pattern = entry.getValue(); - String replacement = entry.getKey(); - text = pattern.matcher(text).replaceAll(replacement); + for (ReplacementPattern entry : letterPatterns) { + text = entry.getPattern().matcher(text).replaceAll(entry.getReplacement()); } return text;
--- a/src/main/java/org/mpi/openmind/repository/utils/NormalizerUtils.java Thu Feb 02 11:58:23 2017 +0100 +++ b/src/main/java/org/mpi/openmind/repository/utils/NormalizerUtils.java Thu Feb 02 17:58:52 2017 +0100 @@ -1,291 +1,27 @@ package org.mpi.openmind.repository.utils; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -import org.apache.commons.lang.StringUtils; - public class NormalizerUtils { - public static Map<String, List<String>> wildCardStringMap = new HashMap<String, List<String>>(); - static{ - List<String> list; - StringBuilder sb = new StringBuilder(); - - list = new ArrayList<String>(); - Character c = 0x1E6F; - sb.append(c); - list.add(sb.toString());//ṯ - list.add("th"); - wildCardStringMap.put("T", list); - - list = new ArrayList<String>(); - c = 0x1E2b; - list.add(c + "");//ḫ - list.add("kh"); - wildCardStringMap.put("H", list); - - list = new ArrayList<String>(); - c = 0x1E0f; - list.add(c + "");//ḏ - list.add("dh"); - wildCardStringMap.put("D", list); - - list = new ArrayList<String>(); - c = 0x0161; - list.add(c + "");//š - list.add("sh"); - wildCardStringMap.put("S", list); - - list = new ArrayList<String>(); - c = 0x0121; - list.add(c + "");//ġ - list.add("gh"); - wildCardStringMap.put("G", list); - - list = new ArrayList<String>(); - c = 0x1E97; - list.add("a" + c + " ");//aẗSPACE - list.add("at "); - list.add("ah "); - list.add("a "); - wildCardStringMap.put("A ", list); - - list = new ArrayList<String>(); - c = 0x1ef3; - list.add(c + "");//ỳ - c = 0x00E1; - list.add(c + "");//á - c = 0x0101; - list.add(c + "");//ā - c = 0x00E0; - list.add(c + "");//à - /* - //Chantal list for A - c = 0x0065; - list.add(c + "");//e - c = 0x0101; - list.add(c + "");//ā - c = 0x00E2; - list.add(c + "");//â - */ - wildCardStringMap.put("A", list); - - /* - list = new ArrayList<String>(); - c = 0x0062; - list.add(c + "");//b - c = 0x0070; - list.add(c + "");//p - wildCardStringMap.put("B", list); - */ + public static String normalize(String w) { + String atn = ArabicTranslitNormalizer.normalize(w); + String an = ArabicNormalizer.normalize(atn); + return an; } - public static Map<String, Character[]> wildCardCharMap = new HashMap<String, Character[]>(); - - // " ` ′ ‘ ’ ‛ ' ʻ ʼ ʽ ˋ ʾ ʿ - public static Character[] apostrophes = { - 0x22, 0x60, 0x2032, 0x2018, 0x2019, 0x201B, 0x27, 0x2BB, 0x2BC, 0x2BD, 0x2CB, 0x2BE, 0x2BF }; - //IN: Aa Áá Àà Ââ Ǎǎ Ăă Ãã Ảả Ȧȧ Ạạ Ää Åå Ḁḁ Āā Ąą - //OUT: ᶏ Ⱥⱥ Ȁȁ Ấấ Ầầ Ẫẫ Ẩẩ Ậậ Ắắ Ằằ Ẵẵ Ẳẳ Ặặ Ǻǻ Ǡǡ Ǟǟ Ȁȁ Ȃȃ - public static Character[] AList = { - 0x41, 0x61, 0xC1, 0xE1, 0xC0, 0xE0, 0xC2, 0xE2, 0x1CD, - 0x1CE, 0x102, 0x103, 0xC3, 0xE3, 0x1EA2, 0x1EA3, 0x226, - 0x227, 0x1EA0, 0x1EA1, 0xC4, 0xE4, 0xC5, 0xE5, 0x1E00, - 0x1E01, 0x100, 0x101, 0x104, 0x105 }; - - static{ - - wildCardCharMap.put("", apostrophes); - wildCardCharMap.put("A", AList); - - //IN: Bb Ḃḃ Ḅḅ Ḇḇ Ɓɓ ʙ Bb - //OUT: Ƃƃ ᵬ ᶀ ʙ Bb ȸ Ƀƀ - Character[] BList = { - 0x42, 0x62, 0x1E02, 0x1E03, 0x1E04, 0x1E05, 0x1E06, - 0x1E07, 0x181, 0x253, 0x299, 0xFF22, 0xFF42, - }; - wildCardCharMap.put("B", BList); - - //Ćć Ĉĉ Čč Ċċ C̄c̄ Ç(ç problem with this) Ḉḉ Ȼȼ Ƈƈ ɕ ᴄ Cc - Character[] CList = { - 0x43, 0x63, 0x106, 0x107, 0x108, 0x109, 0x10C, 0x10D, - 0x10A, 0x10B, 0x43, 0xC7, 0xE7, 0x1E08, 0x1E09, 0x23B, - 0x23C, 0x187, 0x188, 0x255, 0x1D04, 0xFF23, 0xFF43 - }; - wildCardCharMap.put("C", CList); - - //IN: Dd Ďď Ḋḋ Ḑḑ Ḍḍ Ḓḓ Ḏḏ Dd - //OUT: Đđ D̦d̦ Ɖɖ Ɗɗ Ƌƌ ᵭ ᶁ ᶑ ȡ ᴅ - Character[] DList = { - 0x44, 0x64, 0x10E, 0x10F, 0x1E0A, 0x1E0B, 0x1E10, - 0x1E11, 0x1E0C, 0x1E0D, 0x1E12, 0x1E13, 0x1E0E, - 0x1E0F, 0xFF24, 0xFF44 - }; - wildCardCharMap.put("D", DList); - - //IN: Ee Éé Èè Êê Ḙḙ Ěě Ĕĕ Ẽẽ Ḛḛ Ẻẻ Ėė Ëë Ēē Ȩȩ Ęę Ȅȅ Ếế Ềề Ễễ Ểể Ḝḝ Ḗḗ Ḕḕ Ȇȇ Ẹẹ Ệệ ᴇ Ee - //OUT: Ææ Ǽǽ Ǣǣ Œœ ᶒ Ɇɇ - Character[] EList = { - 0x45, 0x65, 0xC9, 0xE9, 0xC8, 0xE8, 0xCA, 0xEA, - 0x1E18, 0x1E19, 0x11A, 0x11B, 0x114, 0x115, - 0x1EBC, 0x1EBD, 0x1E1A, 0x1E1B, 0x1EBA, 0x1EBB, - 0x116, 0x117, 0xCB, 0xEB, 0x112, 0x113, 0x228, - 0x229, 0x118, 0x119, 0x204, 0x205, 0x1EBE, 0x1EBF, - 0x1EC0, 0x1EC1, 0x1EC4, 0x1EC5, 0x1EC2, 0x1EC3, - 0x1E1C, 0x1E1D, 0x1E16, 0x1E17, 0x1E14, 0x1E15, - 0x206, 0x207, 0x1EB8, 0x1EB9, 0x1EC6, 0x1EC7, - 0x1D07, 0xFF25, 0xFF45 - }; - wildCardCharMap.put("E", EList); - - //Ii Íí Ìì Ĭĭ Îî Ǐǐ Ïï Ḯḯ Ĩĩ Įį Īī Ỉỉ Ȉȉ Ȋȋ Ịị Ḭḭ - Character[] IList = { - 0x49, 0x69, 0xCD, 0xED, 0xCC, 0xEC, 0x12C, 0x12D, 0xCE, - 0xEE, 0x1CF, 0x1D0, 0xCF, 0xEF, 0x1E2E, 0x1E2F, 0x128, - 0x129, 0x12E, 0x12F, 0x12A, 0x12B, 0x1EC8, 0x1EC9, 0x208, - 0x209, 0x20A, 0x20B, 0x1ECA, 0x1ECB, 0x1E2C, 0x1E2D - }; - wildCardCharMap.put("I", IList); - - //IN: Gg Ǵǵ Ğğ Ĝĝ Ǧǧ Ġġ Ģģ Ḡḡ Ǥǥ Gg - //OUT: Ɠɠ ᶃ ɢ - Character[] GList = { - 0x47, 0x67, 0x1F4, 0x1F5, 0x11E, 0x11F, 0x11C, 0x11D, - 0x1E6, 0x1E7, 0x120, 0x121, 0x122, 0x123, 0x1E20, 0x1E21, - 0x1E4, 0x1E5, 0xFF27, 0xFF47 - }; - wildCardCharMap.put("G", GList); - - //Nn Ńń Ǹǹ Ňň Ññ Ṅṅ Ņņ Ṇṇ Ṋṋ Ṉṉ - Character[] NList = { - 0x4E, 0x6E, 0x143, 0x144, 0x1F8, 0x1F9, 0x147, 0x148, - 0xD1, 0xF1, 0x1E44, 0x1E45, 0x145, 0x146, 0x1E46, - 0x1E47, 0x1E4A, 0x1E4B, 0x1E48, 0x1E49 - }; - wildCardCharMap.put("N", NList); - - //H h Ĥ ĥ Ȟ ȟ Ḧ ḧ Ḣ ḣ Ḩ ḩ Ḥ ḥ Ḫ ḫ H ̱ ẖ Ħ ħ Ⱨ ⱨ - Character[] HList = { - 0x48, 0x68, 0x124, 0x125, 0x21E, 0x21F, 0x1E26, 0x1E27, - 0x1E22, 0x1E23, 0x1E28, 0x1E29, 0x1E24, 0x1E25, 0x1E2A, - 0x1E2B, 0x48, 0x1E96, 0x126, 0x127, 0x2C67, 0x2C68 - }; - wildCardCharMap.put("H", HList); - - //Oo Óó Òò Ŏŏ Ôô Ốố Ồồ Ỗỗ Ổổ Ǒǒ Öö Ȫȫ Őő Õõ Ṍṍ Ṏṏ Ȭȭ Ȯȯ Ȱȱ Øø Ǿǿ Ǫǫ Ǭǭ Ōō Ṓṓ Ṑṑ Ỏỏ Ȍȍ Ȏȏ Ơơ Ớớ Ờờ Ỡỡ Ởở Ợợ Ọọ Ộộ - Character[] OLIST = { - 0x4F, 0x6F, 0xD3, 0xF3, 0xD2, 0xF2, 0x14E, 0x14F, 0xD4, - 0xF4, 0x1ED0, 0x1ED1, 0x1ED2, 0x1ED3, 0x1ED6, 0x1ED7, - 0x1ED4, 0x1ED5, 0x1D1, 0x1D2, 0xD6, 0xF6, 0x22A, 0x22B, - 0x150, 0x151, 0xD5, 0xF5, 0x1E4C, 0x1E4D, 0x1E4E, 0x1E4F, - 0x22C, 0x22D, 0x22E, 0x22F, 0x230, 0x231, 0xD8, 0xF8, 0x1FE, - 0x1FF, 0x1EA, 0x1EB, 0x1EC, 0x1ED, 0x14C, 0x14D, 0x1E52, - 0x1E53, 0x1E50, 0x1E51, 0x1ECE, 0x1ECF, 0x20C, 0x20D, - 0x20E, 0x20F, 0x1A0, 0x1A1, 0x1EDA, 0x1EDB, 0x1EDC, 0x1EDD, - 0x1EE0, 0x1EE1, 0x1EDE, 0x1EDF, 0x1EE2, 0x1EE3, 0x1ECC, - 0x1ECD, 0x1ED8, 0x1ED9 - }; - wildCardCharMap.put("O", OLIST); - - Character[] RList = { - 0x52, 0x72, 0x154, 0x155, 0x158, 0x159, 0x1E58, 0x1E59, - 0x156, 0x157, 0x210, 0x211, 0x212, 0x213, 0x1E5A, 0x1E5B, - 0x1E5C, 0x1E5D, 0x1E5E, 0x1E5F, 0x27C, 0x27E, 0x280, 0xFF32, 0xFF52 - }; - wildCardCharMap.put("R", RList); - - - //IN: Ss Śś Ṥṥ Ŝŝ Šš Ṧṧ Ṡṡẛ Şş Ṣṣ Ṩṩ Șș S̩̩ - //OUT: ᵴ ᶊ ʂ ȿ ꜱ Ss s - Character[] SList = { - 0x53, 0x73, 0x15A, 0x15B, 0x1E64, 0x1E65, 0x15C, 0x15D, - 0x160, 0x161, 0x1E66, 0x1E67, 0x1E60, 0x1E61, 0x15E, 0x15F, - 0x1E62, 0x1E63, 0x1E68, 0x1E69, 0x218, 0x219, 0x53 - }; - wildCardCharMap.put("S", SList); - - - //IN: Tt Ťť Ṫṫ Ţţ Ṭṭ Țț Ṱṱ Ṯṯ Tt - //OUT: Ŧŧ Ⱦⱦ Ƭƭ Ʈʈ T̈ẗ ᵵ ƫ ȶ ᶙ ᴛ - Character[] TList = { - 0x54, 0x74, 0x164, 0x165, 0x1E6A, 0x1E6B, 0x162, 0x163, - 0x1E6C, 0x1E6D, 0x21A, 0x21B, 0x1E70, 0x1E71, 0x1E6E, - 0x1E6F, 0xFF34, 0xFF54 - }; - wildCardCharMap.put("T", TList); - - //IN: Uu Úú Ùù Ŭŭ Ûû Ǔǔ Ůů Üü Ǘǘ Ǜǜ Ǚǚ Ǖǖ Űű Ũũ Ṹṹ Ųų Ūū - //OUT: Ṻṻ Ủủ Ȕȕ Ȗȗ Ưư Ứứ Ừừ Ữữ Ửử Ựự Ụụ Ṳṳ Ṷṷ Ṵṵ Ʉʉ ᵾ ᶙ ᴜ Uu - Character[] UList ={ - 0x55, 0x75, 0xDA, 0xFA, 0xD9, 0xF9, 0x16C, 0x16D, 0xDB, 0xFB, 0x1D3, - 0x1D4, 0x16E, 0x16F, 0xDC, 0xFC, 0x1D7, 0x1D8, 0x1DB, 0x1DC, 0x1D9, - 0x1DA, 0x1D5, 0x1D6, 0x170, 0x171, 0x168, 0x169, 0x1E78, 0x1E79, - 0x172, 0x173, 0x16A, 0x16B - }; - wildCardCharMap.put("U", UList); - - Character[] VList = { - 0x1E7C, 0x1E7D, 0x1E7E, 0x1E7F, 0x1B2, - 0x28B, 0x1D20, 0xFF36, 0xFF56 - }; - wildCardCharMap.put("V", VList); - - //IN: Zz Źź Ẑẑ Žž Żż Ẓẓ Ẕẕ Ƶƶ Ȥȥ - //OUT: Ⱬⱬ ᵶ ᶎ ʐ ʑ ɀ ᴢ Zz - Character[] ZList = { - 0x5A, 0x7A, 0x179, 0x17A, 0x1E90, 0x1E91, 0x17D, - 0x17E, 0x17B, 0x17C, 0x1E92, 0x1E93, 0x1E94, - 0x1E95, 0x1B5, 0x1B6, 0x1D22, 0xFF3A, 0xFF5A - }; - wildCardCharMap.put("Z", ZList); + /** + * Returns String normalized according to arabic transliteration rules. + * + * @see https://it-dev.mpiwg-berlin.mpg.de/tracs/OpenMind3/wiki/normalize_arabic_translit + * + * @param w + * @return + */ + public static String normalizeArabicTranslit(String w) { + return ArabicTranslitNormalizer.normalize(w); } - public static String normalize(String w) { - return ArabicTranslitNormalizer.normalize(w); - } - - public static String old_normalize(String w){ - if(StringUtils.isEmpty(w)) - return w; - - w = w.toLowerCase(); - /* - * Replacing combination of vowels - */ - for(String key : wildCardStringMap.keySet()){ - List<String> list = wildCardStringMap.get(key); - for(String term : list){ - w = w.replace(term, key); - } - } - - for(String key : wildCardCharMap.keySet()){ - Character[] list = wildCardCharMap.get(key); - for(int i=0; i< list.length; i++){ - w = w.replace(list[i] + "", key); - } - } - return w.toLowerCase(); - } - - public static String normalizedToCompare(String s1){ - s1 = s1.replace("#", ""); - s1 = s1.replace("-", ""); - s1 = s1.replace("(", ""); - s1 = s1.replace(")", ""); - s1 = s1.replace("[", ""); - s1 = s1.replace("]", ""); - s1 = s1.replace("_", ""); - - return s1; - } - - public static void main(String[] args){ - String s = NormalizerUtils.normalize("ṯ"); - System.out.println(s); + public static String normalizeArabic(String w) { + return ArabicNormalizer.normalize(w); } }
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/main/java/org/mpi/openmind/repository/utils/OldNormalizerUtils.java Thu Feb 02 17:58:52 2017 +0100 @@ -0,0 +1,291 @@ +package org.mpi.openmind.repository.utils; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.commons.lang.StringUtils; + +public class OldNormalizerUtils { + + public static Map<String, List<String>> wildCardStringMap = new HashMap<String, List<String>>(); + + static{ + List<String> list; + StringBuilder sb = new StringBuilder(); + + list = new ArrayList<String>(); + Character c = 0x1E6F; + sb.append(c); + list.add(sb.toString());//ṯ + list.add("th"); + wildCardStringMap.put("T", list); + + list = new ArrayList<String>(); + c = 0x1E2b; + list.add(c + "");//ḫ + list.add("kh"); + wildCardStringMap.put("H", list); + + list = new ArrayList<String>(); + c = 0x1E0f; + list.add(c + "");//ḏ + list.add("dh"); + wildCardStringMap.put("D", list); + + list = new ArrayList<String>(); + c = 0x0161; + list.add(c + "");//š + list.add("sh"); + wildCardStringMap.put("S", list); + + list = new ArrayList<String>(); + c = 0x0121; + list.add(c + "");//ġ + list.add("gh"); + wildCardStringMap.put("G", list); + + list = new ArrayList<String>(); + c = 0x1E97; + list.add("a" + c + " ");//aẗSPACE + list.add("at "); + list.add("ah "); + list.add("a "); + wildCardStringMap.put("A ", list); + + list = new ArrayList<String>(); + c = 0x1ef3; + list.add(c + "");//ỳ + c = 0x00E1; + list.add(c + "");//á + c = 0x0101; + list.add(c + "");//ā + c = 0x00E0; + list.add(c + "");//à + /* + //Chantal list for A + c = 0x0065; + list.add(c + "");//e + c = 0x0101; + list.add(c + "");//ā + c = 0x00E2; + list.add(c + "");//â + */ + wildCardStringMap.put("A", list); + + /* + list = new ArrayList<String>(); + c = 0x0062; + list.add(c + "");//b + c = 0x0070; + list.add(c + "");//p + wildCardStringMap.put("B", list); + */ + } + + public static Map<String, Character[]> wildCardCharMap = new HashMap<String, Character[]>(); + + // " ` ′ ‘ ’ ‛ ' ʻ ʼ ʽ ˋ ʾ ʿ + public static Character[] apostrophes = { + 0x22, 0x60, 0x2032, 0x2018, 0x2019, 0x201B, 0x27, 0x2BB, 0x2BC, 0x2BD, 0x2CB, 0x2BE, 0x2BF }; + //IN: Aa Áá Àà Ââ Ǎǎ Ăă Ãã Ảả Ȧȧ Ạạ Ää Åå Ḁḁ Āā Ąą + //OUT: ᶏ Ⱥⱥ Ȁȁ Ấấ Ầầ Ẫẫ Ẩẩ Ậậ Ắắ Ằằ Ẵẵ Ẳẳ Ặặ Ǻǻ Ǡǡ Ǟǟ Ȁȁ Ȃȃ + public static Character[] AList = { + 0x41, 0x61, 0xC1, 0xE1, 0xC0, 0xE0, 0xC2, 0xE2, 0x1CD, + 0x1CE, 0x102, 0x103, 0xC3, 0xE3, 0x1EA2, 0x1EA3, 0x226, + 0x227, 0x1EA0, 0x1EA1, 0xC4, 0xE4, 0xC5, 0xE5, 0x1E00, + 0x1E01, 0x100, 0x101, 0x104, 0x105 }; + + static{ + + wildCardCharMap.put("", apostrophes); + wildCardCharMap.put("A", AList); + + //IN: Bb Ḃḃ Ḅḅ Ḇḇ Ɓɓ ʙ Bb + //OUT: Ƃƃ ᵬ ᶀ ʙ Bb ȸ Ƀƀ + Character[] BList = { + 0x42, 0x62, 0x1E02, 0x1E03, 0x1E04, 0x1E05, 0x1E06, + 0x1E07, 0x181, 0x253, 0x299, 0xFF22, 0xFF42, + }; + wildCardCharMap.put("B", BList); + + //Ćć Ĉĉ Čč Ċċ C̄c̄ Ç(ç problem with this) Ḉḉ Ȼȼ Ƈƈ ɕ ᴄ Cc + Character[] CList = { + 0x43, 0x63, 0x106, 0x107, 0x108, 0x109, 0x10C, 0x10D, + 0x10A, 0x10B, 0x43, 0xC7, 0xE7, 0x1E08, 0x1E09, 0x23B, + 0x23C, 0x187, 0x188, 0x255, 0x1D04, 0xFF23, 0xFF43 + }; + wildCardCharMap.put("C", CList); + + //IN: Dd Ďď Ḋḋ Ḑḑ Ḍḍ Ḓḓ Ḏḏ Dd + //OUT: Đđ D̦d̦ Ɖɖ Ɗɗ Ƌƌ ᵭ ᶁ ᶑ ȡ ᴅ + Character[] DList = { + 0x44, 0x64, 0x10E, 0x10F, 0x1E0A, 0x1E0B, 0x1E10, + 0x1E11, 0x1E0C, 0x1E0D, 0x1E12, 0x1E13, 0x1E0E, + 0x1E0F, 0xFF24, 0xFF44 + }; + wildCardCharMap.put("D", DList); + + //IN: Ee Éé Èè Êê Ḙḙ Ěě Ĕĕ Ẽẽ Ḛḛ Ẻẻ Ėė Ëë Ēē Ȩȩ Ęę Ȅȅ Ếế Ềề Ễễ Ểể Ḝḝ Ḗḗ Ḕḕ Ȇȇ Ẹẹ Ệệ ᴇ Ee + //OUT: Ææ Ǽǽ Ǣǣ Œœ ᶒ Ɇɇ + Character[] EList = { + 0x45, 0x65, 0xC9, 0xE9, 0xC8, 0xE8, 0xCA, 0xEA, + 0x1E18, 0x1E19, 0x11A, 0x11B, 0x114, 0x115, + 0x1EBC, 0x1EBD, 0x1E1A, 0x1E1B, 0x1EBA, 0x1EBB, + 0x116, 0x117, 0xCB, 0xEB, 0x112, 0x113, 0x228, + 0x229, 0x118, 0x119, 0x204, 0x205, 0x1EBE, 0x1EBF, + 0x1EC0, 0x1EC1, 0x1EC4, 0x1EC5, 0x1EC2, 0x1EC3, + 0x1E1C, 0x1E1D, 0x1E16, 0x1E17, 0x1E14, 0x1E15, + 0x206, 0x207, 0x1EB8, 0x1EB9, 0x1EC6, 0x1EC7, + 0x1D07, 0xFF25, 0xFF45 + }; + wildCardCharMap.put("E", EList); + + //Ii Íí Ìì Ĭĭ Îî Ǐǐ Ïï Ḯḯ Ĩĩ Įį Īī Ỉỉ Ȉȉ Ȋȋ Ịị Ḭḭ + Character[] IList = { + 0x49, 0x69, 0xCD, 0xED, 0xCC, 0xEC, 0x12C, 0x12D, 0xCE, + 0xEE, 0x1CF, 0x1D0, 0xCF, 0xEF, 0x1E2E, 0x1E2F, 0x128, + 0x129, 0x12E, 0x12F, 0x12A, 0x12B, 0x1EC8, 0x1EC9, 0x208, + 0x209, 0x20A, 0x20B, 0x1ECA, 0x1ECB, 0x1E2C, 0x1E2D + }; + wildCardCharMap.put("I", IList); + + //IN: Gg Ǵǵ Ğğ Ĝĝ Ǧǧ Ġġ Ģģ Ḡḡ Ǥǥ Gg + //OUT: Ɠɠ ᶃ ɢ + Character[] GList = { + 0x47, 0x67, 0x1F4, 0x1F5, 0x11E, 0x11F, 0x11C, 0x11D, + 0x1E6, 0x1E7, 0x120, 0x121, 0x122, 0x123, 0x1E20, 0x1E21, + 0x1E4, 0x1E5, 0xFF27, 0xFF47 + }; + wildCardCharMap.put("G", GList); + + //Nn Ńń Ǹǹ Ňň Ññ Ṅṅ Ņņ Ṇṇ Ṋṋ Ṉṉ + Character[] NList = { + 0x4E, 0x6E, 0x143, 0x144, 0x1F8, 0x1F9, 0x147, 0x148, + 0xD1, 0xF1, 0x1E44, 0x1E45, 0x145, 0x146, 0x1E46, + 0x1E47, 0x1E4A, 0x1E4B, 0x1E48, 0x1E49 + }; + wildCardCharMap.put("N", NList); + + //H h Ĥ ĥ Ȟ ȟ Ḧ ḧ Ḣ ḣ Ḩ ḩ Ḥ ḥ Ḫ ḫ H ̱ ẖ Ħ ħ Ⱨ ⱨ + Character[] HList = { + 0x48, 0x68, 0x124, 0x125, 0x21E, 0x21F, 0x1E26, 0x1E27, + 0x1E22, 0x1E23, 0x1E28, 0x1E29, 0x1E24, 0x1E25, 0x1E2A, + 0x1E2B, 0x48, 0x1E96, 0x126, 0x127, 0x2C67, 0x2C68 + }; + wildCardCharMap.put("H", HList); + + //Oo Óó Òò Ŏŏ Ôô Ốố Ồồ Ỗỗ Ổổ Ǒǒ Öö Ȫȫ Őő Õõ Ṍṍ Ṏṏ Ȭȭ Ȯȯ Ȱȱ Øø Ǿǿ Ǫǫ Ǭǭ Ōō Ṓṓ Ṑṑ Ỏỏ Ȍȍ Ȏȏ Ơơ Ớớ Ờờ Ỡỡ Ởở Ợợ Ọọ Ộộ + Character[] OLIST = { + 0x4F, 0x6F, 0xD3, 0xF3, 0xD2, 0xF2, 0x14E, 0x14F, 0xD4, + 0xF4, 0x1ED0, 0x1ED1, 0x1ED2, 0x1ED3, 0x1ED6, 0x1ED7, + 0x1ED4, 0x1ED5, 0x1D1, 0x1D2, 0xD6, 0xF6, 0x22A, 0x22B, + 0x150, 0x151, 0xD5, 0xF5, 0x1E4C, 0x1E4D, 0x1E4E, 0x1E4F, + 0x22C, 0x22D, 0x22E, 0x22F, 0x230, 0x231, 0xD8, 0xF8, 0x1FE, + 0x1FF, 0x1EA, 0x1EB, 0x1EC, 0x1ED, 0x14C, 0x14D, 0x1E52, + 0x1E53, 0x1E50, 0x1E51, 0x1ECE, 0x1ECF, 0x20C, 0x20D, + 0x20E, 0x20F, 0x1A0, 0x1A1, 0x1EDA, 0x1EDB, 0x1EDC, 0x1EDD, + 0x1EE0, 0x1EE1, 0x1EDE, 0x1EDF, 0x1EE2, 0x1EE3, 0x1ECC, + 0x1ECD, 0x1ED8, 0x1ED9 + }; + wildCardCharMap.put("O", OLIST); + + Character[] RList = { + 0x52, 0x72, 0x154, 0x155, 0x158, 0x159, 0x1E58, 0x1E59, + 0x156, 0x157, 0x210, 0x211, 0x212, 0x213, 0x1E5A, 0x1E5B, + 0x1E5C, 0x1E5D, 0x1E5E, 0x1E5F, 0x27C, 0x27E, 0x280, 0xFF32, 0xFF52 + }; + wildCardCharMap.put("R", RList); + + + //IN: Ss Śś Ṥṥ Ŝŝ Šš Ṧṧ Ṡṡẛ Şş Ṣṣ Ṩṩ Șș S̩̩ + //OUT: ᵴ ᶊ ʂ ȿ ꜱ Ss s + Character[] SList = { + 0x53, 0x73, 0x15A, 0x15B, 0x1E64, 0x1E65, 0x15C, 0x15D, + 0x160, 0x161, 0x1E66, 0x1E67, 0x1E60, 0x1E61, 0x15E, 0x15F, + 0x1E62, 0x1E63, 0x1E68, 0x1E69, 0x218, 0x219, 0x53 + }; + wildCardCharMap.put("S", SList); + + + //IN: Tt Ťť Ṫṫ Ţţ Ṭṭ Țț Ṱṱ Ṯṯ Tt + //OUT: Ŧŧ Ⱦⱦ Ƭƭ Ʈʈ T̈ẗ ᵵ ƫ ȶ ᶙ ᴛ + Character[] TList = { + 0x54, 0x74, 0x164, 0x165, 0x1E6A, 0x1E6B, 0x162, 0x163, + 0x1E6C, 0x1E6D, 0x21A, 0x21B, 0x1E70, 0x1E71, 0x1E6E, + 0x1E6F, 0xFF34, 0xFF54 + }; + wildCardCharMap.put("T", TList); + + //IN: Uu Úú Ùù Ŭŭ Ûû Ǔǔ Ůů Üü Ǘǘ Ǜǜ Ǚǚ Ǖǖ Űű Ũũ Ṹṹ Ųų Ūū + //OUT: Ṻṻ Ủủ Ȕȕ Ȗȗ Ưư Ứứ Ừừ Ữữ Ửử Ựự Ụụ Ṳṳ Ṷṷ Ṵṵ Ʉʉ ᵾ ᶙ ᴜ Uu + Character[] UList ={ + 0x55, 0x75, 0xDA, 0xFA, 0xD9, 0xF9, 0x16C, 0x16D, 0xDB, 0xFB, 0x1D3, + 0x1D4, 0x16E, 0x16F, 0xDC, 0xFC, 0x1D7, 0x1D8, 0x1DB, 0x1DC, 0x1D9, + 0x1DA, 0x1D5, 0x1D6, 0x170, 0x171, 0x168, 0x169, 0x1E78, 0x1E79, + 0x172, 0x173, 0x16A, 0x16B + }; + wildCardCharMap.put("U", UList); + + Character[] VList = { + 0x1E7C, 0x1E7D, 0x1E7E, 0x1E7F, 0x1B2, + 0x28B, 0x1D20, 0xFF36, 0xFF56 + }; + wildCardCharMap.put("V", VList); + + //IN: Zz Źź Ẑẑ Žž Żż Ẓẓ Ẕẕ Ƶƶ Ȥȥ + //OUT: Ⱬⱬ ᵶ ᶎ ʐ ʑ ɀ ᴢ Zz + Character[] ZList = { + 0x5A, 0x7A, 0x179, 0x17A, 0x1E90, 0x1E91, 0x17D, + 0x17E, 0x17B, 0x17C, 0x1E92, 0x1E93, 0x1E94, + 0x1E95, 0x1B5, 0x1B6, 0x1D22, 0xFF3A, 0xFF5A + }; + wildCardCharMap.put("Z", ZList); + } + + public static String normalize(String w) { + return ArabicTranslitNormalizer.normalize(w); + } + + public static String old_normalize(String w){ + if(StringUtils.isEmpty(w)) + return w; + + w = w.toLowerCase(); + /* + * Replacing combination of vowels + */ + for(String key : wildCardStringMap.keySet()){ + List<String> list = wildCardStringMap.get(key); + for(String term : list){ + w = w.replace(term, key); + } + } + + for(String key : wildCardCharMap.keySet()){ + Character[] list = wildCardCharMap.get(key); + for(int i=0; i< list.length; i++){ + w = w.replace(list[i] + "", key); + } + } + return w.toLowerCase(); + } + + public static String normalizedToCompare(String s1){ + s1 = s1.replace("#", ""); + s1 = s1.replace("-", ""); + s1 = s1.replace("(", ""); + s1 = s1.replace(")", ""); + s1 = s1.replace("[", ""); + s1 = s1.replace("]", ""); + s1 = s1.replace("_", ""); + + return s1; + } + + public static void main(String[] args){ + String s = OldNormalizerUtils.normalize("ṯ"); + System.out.println(s); + } +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/main/java/org/mpi/openmind/repository/utils/ReplacementPattern.java Thu Feb 02 17:58:52 2017 +0100 @@ -0,0 +1,53 @@ +/** + * + */ +package org.mpi.openmind.repository.utils; + +import java.util.regex.Pattern; + +/** + * @author casties + * + */ +public class ReplacementPattern { + public Pattern pattern; + public String replacement; + + /** + * @param replacement + * @param pattern + */ + public ReplacementPattern(String replacement, Pattern pattern) { + super(); + this.pattern = pattern; + this.replacement = replacement; + } + + /** + * @return the pattern + */ + public Pattern getPattern() { + return pattern; + } + + /** + * @param pattern the pattern to set + */ + public void setPattern(Pattern pattern) { + this.pattern = pattern; + } + + /** + * @return the replacement + */ + public String getReplacement() { + return replacement; + } + + /** + * @param replacement the replacement to set + */ + public void setReplacement(String replacement) { + this.replacement = replacement; + } +}
--- a/src/main/java/org/mpi/openmind/repository/utils/RomanizationLoC.java Thu Feb 02 11:58:23 2017 +0100 +++ b/src/main/java/org/mpi/openmind/repository/utils/RomanizationLoC.java Thu Feb 02 17:58:52 2017 +0100 @@ -58,7 +58,7 @@ public static char APOSTROPHE = 0x27; public static String apostrophesNormalization(String text){ String result = text; - for(Character apostrophe : NormalizerUtils.apostrophes){ + for(Character apostrophe : OldNormalizerUtils.apostrophes){ result = result.replace(apostrophe, APOSTROPHE); } return result; @@ -67,7 +67,7 @@ public static char a = 0x61; public static String aNormalization(String text){ String result = text; - for(Character item : NormalizerUtils.AList){ + for(Character item : OldNormalizerUtils.AList){ result = result.replace(item, a); } return result;
--- a/src/main/java/org/mpi/openmind/scripts/NormalizeOW.java Thu Feb 02 11:58:23 2017 +0100 +++ b/src/main/java/org/mpi/openmind/scripts/NormalizeOW.java Thu Feb 02 17:58:52 2017 +0100 @@ -9,8 +9,7 @@ import java.util.HashMap; import java.util.Map; -import org.mpi.openmind.repository.utils.ArabicNormalizerUtils; -import org.mpi.openmind.repository.utils.ArabicTranslitNormalizer; +import org.mpi.openmind.repository.utils.NormalizerUtils; public class NormalizeOW { public static void execute(String type, String dbUser, String dbPw, boolean modify) { @@ -64,8 +63,8 @@ String ow = ows[0]; String oldNormalizedOW = ows[1]; String oldNormalizedArabicOW = ows[2]; - String normalizedOW = ArabicTranslitNormalizer.normalize(ow); - String normalizedArabicOW = ArabicNormalizerUtils.normalize(ow); + String normalizedOW = NormalizerUtils.normalize(ow); + String normalizedArabicOW = NormalizerUtils.normalizeArabic(ow); boolean changes = false; if (normalizedOW != null && !normalizedOW.equals(oldNormalizedOW)) { System.out.println("normOW changes (#"+cnt+" @"+id+"): old="+oldNormalizedOW+" new="+normalizedOW);