Mercurial > hg > openmind
view src/main/java/org/mpi/openmind/repository/utils/OldNormalizerUtils.java @ 71:aeb29e362a67
New ArabicNormalizer. NormalizerUtils.normalize() now does both translit and arabic normalization.
108: arabic normalization is not applied
Task-Url: https://it-dev.mpiwg-berlin.mpg.de/tracs/ismi/ticket/108
author | casties |
---|---|
date | Thu, 02 Feb 2017 17:58:52 +0100 |
parents | |
children |
line wrap: on
line source
package org.mpi.openmind.repository.utils;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;

import org.apache.commons.lang.StringUtils;

/**
 * Legacy table-driven normalizer for transliterated text.
 *
 * <p>{@link #normalize(String)} delegates to {@code ArabicTranslitNormalizer};
 * the original table-based algorithm is kept as {@link #old_normalize(String)}.
 * The lookup tables are built once in the static initializers below.
 */
public class OldNormalizerUtils {

    /**
     * Multi-character replacements: every term in the value list is replaced
     * by the map key (an upper-case placeholder) in {@link #old_normalize(String)}.
     */
    public static Map<String, List<String>> wildCardStringMap = new HashMap<String, List<String>>();

    static {
        wildCardStringMap.put("T", variants("\u1E6F", "th")); // ṯ
        wildCardStringMap.put("H", variants("\u1E2B", "kh")); // ḫ
        wildCardStringMap.put("D", variants("\u1E0F", "dh")); // ḏ
        wildCardStringMap.put("S", variants("\u0161", "sh")); // š
        wildCardStringMap.put("G", variants("\u0121", "gh")); // ġ

        // aẗSPACE and the other word-final spellings (note the trailing space in each term and in the key)
        wildCardStringMap.put("A ", variants("a\u1E97 ", "at ", "ah ", "a "));

        // ỳ á ā à
        // Disabled alternative ("Chantal list for A"): e (0x0065), ā (0x0101), â (0x00E2).
        wildCardStringMap.put("A", variants("\u1EF3", "\u00E1", "\u0101", "\u00E0"));

        // Disabled entry: "B" -> b (0x0062), p (0x0070).
    }

    /**
     * Single-character replacements: every character in the value array is
     * replaced by the map key in {@link #old_normalize(String)}. The empty key
     * removes the character altogether.
     */
    public static Map<String, Character[]> wildCardCharMap = new HashMap<String, Character[]>();

    // " ` ′ ‘ ’ ‛ ' ʻ ʼ ʽ ˋ ʾ ʿ
    public static Character[] apostrophes = {
        0x22, 0x60, 0x2032, 0x2018, 0x2019, 0x201B, 0x27,
        0x2BB, 0x2BC, 0x2BD, 0x2CB, 0x2BE, 0x2BF
    };

    //IN: Aa Áá Àà Ââ Ǎǎ Ăă Ãã Ảả Ȧȧ Ạạ Ää Åå Ḁḁ Āā Ąą
    //OUT: ᶏ Ⱥⱥ Ȁȁ Ấấ Ầầ Ẫẫ Ẩẩ Ậậ Ắắ Ằằ Ẵẵ Ẳẳ Ặặ Ǻǻ Ǡǡ Ǟǟ Ȁȁ Ȃȃ
    public static Character[] AList = {
        0x41, 0x61, 0xC1, 0xE1, 0xC0, 0xE0, 0xC2, 0xE2, 0x1CD, 0x1CE,
        0x102, 0x103, 0xC3, 0xE3, 0x1EA2, 0x1EA3, 0x226, 0x227, 0x1EA0, 0x1EA1,
        0xC4, 0xE4, 0xC5, 0xE5, 0x1E00, 0x1E01, 0x100, 0x101, 0x104, 0x105
    };

    static {
        // Apostrophe-like characters map to the empty string, i.e. they are dropped.
        wildCardCharMap.put("", apostrophes);
        wildCardCharMap.put("A", AList);

        //IN: Bb Ḃḃ Ḅḅ Ḇḇ Ɓɓ ʙ Bb
        //OUT: Ƃƃ ᵬ ᶀ ʙ Bb ȸ Ƀƀ
        Character[] BList = {
            0x42, 0x62, 0x1E02, 0x1E03, 0x1E04, 0x1E05, 0x1E06, 0x1E07,
            0x181, 0x253, 0x299, 0xFF22, 0xFF42,
        };
        wildCardCharMap.put("B", BList);

        //Ćć Ĉĉ Čč Ċċ C̄c̄ Ç(ç problem with this) Ḉḉ Ȼȼ Ƈƈ ɕ ᴄ Cc
        Character[] CList = {
            0x43, 0x63, 0x106, 0x107, 0x108, 0x109, 0x10C, 0x10D, 0x10A, 0x10B,
            0x43, 0xC7, 0xE7, 0x1E08, 0x1E09, 0x23B, 0x23C, 0x187, 0x188, 0x255,
            0x1D04, 0xFF23, 0xFF43
        };
        wildCardCharMap.put("C", CList);

        //IN: Dd Ďď Ḋḋ Ḑḑ Ḍḍ Ḓḓ Ḏḏ Dd
        //OUT: Đđ D̦d̦ Ɖɖ Ɗɗ Ƌƌ ᵭ ᶁ ᶑ ȡ ᴅ
        Character[] DList = {
            0x44, 0x64, 0x10E, 0x10F, 0x1E0A, 0x1E0B, 0x1E10, 0x1E11,
            0x1E0C, 0x1E0D, 0x1E12, 0x1E13, 0x1E0E, 0x1E0F, 0xFF24, 0xFF44
        };
        wildCardCharMap.put("D", DList);

        //IN: Ee Éé Èè Êê Ḙḙ Ěě Ĕĕ Ẽẽ Ḛḛ Ẻẻ Ėė Ëë Ēē Ȩȩ Ęę Ȅȅ Ếế Ềề Ễễ Ểể Ḝḝ Ḗḗ Ḕḕ Ȇȇ Ẹẹ Ệệ ᴇ Ee
        //OUT: Ææ Ǽǽ Ǣǣ Œœ ᶒ Ɇɇ
        Character[] EList = {
            0x45, 0x65, 0xC9, 0xE9, 0xC8, 0xE8, 0xCA, 0xEA, 0x1E18, 0x1E19,
            0x11A, 0x11B, 0x114, 0x115, 0x1EBC, 0x1EBD, 0x1E1A, 0x1E1B, 0x1EBA, 0x1EBB,
            0x116, 0x117, 0xCB, 0xEB, 0x112, 0x113, 0x228, 0x229, 0x118, 0x119,
            0x204, 0x205, 0x1EBE, 0x1EBF, 0x1EC0, 0x1EC1, 0x1EC4, 0x1EC5, 0x1EC2, 0x1EC3,
            0x1E1C, 0x1E1D, 0x1E16, 0x1E17, 0x1E14, 0x1E15, 0x206, 0x207, 0x1EB8, 0x1EB9,
            0x1EC6, 0x1EC7, 0x1D07, 0xFF25, 0xFF45
        };
        wildCardCharMap.put("E", EList);

        //Ii Íí Ìì Ĭĭ Îî Ǐǐ Ïï Ḯḯ Ĩĩ Įį Īī Ỉỉ Ȉȉ Ȋȋ Ịị Ḭḭ
        Character[] IList = {
            0x49, 0x69, 0xCD, 0xED, 0xCC, 0xEC, 0x12C, 0x12D, 0xCE, 0xEE,
            0x1CF, 0x1D0, 0xCF, 0xEF, 0x1E2E, 0x1E2F, 0x128, 0x129, 0x12E, 0x12F,
            0x12A, 0x12B, 0x1EC8, 0x1EC9, 0x208, 0x209, 0x20A, 0x20B, 0x1ECA, 0x1ECB,
            0x1E2C, 0x1E2D
        };
        wildCardCharMap.put("I", IList);

        //IN: Gg Ǵǵ Ğğ Ĝĝ Ǧǧ Ġġ Ģģ Ḡḡ Ǥǥ Gg
        //OUT: Ɠɠ ᶃ ɢ
        Character[] GList = {
            0x47, 0x67, 0x1F4, 0x1F5, 0x11E, 0x11F, 0x11C, 0x11D, 0x1E6, 0x1E7,
            0x120, 0x121, 0x122, 0x123, 0x1E20, 0x1E21, 0x1E4, 0x1E5, 0xFF27, 0xFF47
        };
        wildCardCharMap.put("G", GList);

        //Nn Ńń Ǹǹ Ňň Ññ Ṅṅ Ņņ Ṇṇ Ṋṋ Ṉṉ
        Character[] NList = {
            0x4E, 0x6E, 0x143, 0x144, 0x1F8, 0x1F9, 0x147, 0x148, 0xD1, 0xF1,
            0x1E44, 0x1E45, 0x145, 0x146, 0x1E46, 0x1E47, 0x1E4A, 0x1E4B, 0x1E48, 0x1E49
        };
        wildCardCharMap.put("N", NList);

        //H h Ĥ ĥ Ȟ ȟ Ḧ ḧ Ḣ ḣ Ḩ ḩ Ḥ ḥ Ḫ ḫ H ̱ ẖ Ħ ħ Ⱨ ⱨ
        Character[] HList = {
            0x48, 0x68, 0x124, 0x125, 0x21E, 0x21F, 0x1E26, 0x1E27, 0x1E22, 0x1E23,
            0x1E28, 0x1E29, 0x1E24, 0x1E25, 0x1E2A, 0x1E2B, 0x48, 0x1E96, 0x126, 0x127,
            0x2C67, 0x2C68
        };
        wildCardCharMap.put("H", HList);

        //Oo Óó Òò Ŏŏ Ôô Ốố Ồồ Ỗỗ Ổổ Ǒǒ Öö Ȫȫ Őő Õõ Ṍṍ Ṏṏ Ȭȭ Ȯȯ Ȱȱ Øø Ǿǿ Ǫǫ Ǭǭ Ōō Ṓṓ Ṑṑ Ỏỏ Ȍȍ Ȏȏ Ơơ Ớớ Ờờ Ỡỡ Ởở Ợợ Ọọ Ộộ
        Character[] OLIST = {
            0x4F, 0x6F, 0xD3, 0xF3, 0xD2, 0xF2, 0x14E, 0x14F, 0xD4, 0xF4,
            0x1ED0, 0x1ED1, 0x1ED2, 0x1ED3, 0x1ED6, 0x1ED7, 0x1ED4, 0x1ED5, 0x1D1, 0x1D2,
            0xD6, 0xF6, 0x22A, 0x22B, 0x150, 0x151, 0xD5, 0xF5, 0x1E4C, 0x1E4D,
            0x1E4E, 0x1E4F, 0x22C, 0x22D, 0x22E, 0x22F, 0x230, 0x231, 0xD8, 0xF8,
            0x1FE, 0x1FF, 0x1EA, 0x1EB, 0x1EC, 0x1ED, 0x14C, 0x14D, 0x1E52, 0x1E53,
            0x1E50, 0x1E51, 0x1ECE, 0x1ECF, 0x20C, 0x20D, 0x20E, 0x20F, 0x1A0, 0x1A1,
            0x1EDA, 0x1EDB, 0x1EDC, 0x1EDD, 0x1EE0, 0x1EE1, 0x1EDE, 0x1EDF, 0x1EE2, 0x1EE3,
            0x1ECC, 0x1ECD, 0x1ED8, 0x1ED9
        };
        wildCardCharMap.put("O", OLIST);

        Character[] RList = {
            0x52, 0x72, 0x154, 0x155, 0x158, 0x159, 0x1E58, 0x1E59, 0x156, 0x157,
            0x210, 0x211, 0x212, 0x213, 0x1E5A, 0x1E5B, 0x1E5C, 0x1E5D, 0x1E5E, 0x1E5F,
            0x27C, 0x27E, 0x280, 0xFF32, 0xFF52
        };
        wildCardCharMap.put("R", RList);

        //IN: Ss Śś Ṥṥ Ŝŝ Šš Ṧṧ Ṡṡẛ Şş Ṣṣ Ṩṩ Șș S̩̩
        //OUT: ᵴ ᶊ ʂ ȿ ꜱ Ss s
        Character[] SList = {
            0x53, 0x73, 0x15A, 0x15B, 0x1E64, 0x1E65, 0x15C, 0x15D, 0x160, 0x161,
            0x1E66, 0x1E67, 0x1E60, 0x1E61, 0x15E, 0x15F, 0x1E62, 0x1E63, 0x1E68, 0x1E69,
            0x218, 0x219, 0x53
        };
        wildCardCharMap.put("S", SList);

        //IN: Tt Ťť Ṫṫ Ţţ Ṭṭ Țț Ṱṱ Ṯṯ Tt
        //OUT: Ŧŧ Ⱦⱦ Ƭƭ Ʈʈ T̈ẗ ᵵ ƫ ȶ ᶙ ᴛ
        Character[] TList = {
            0x54, 0x74, 0x164, 0x165, 0x1E6A, 0x1E6B, 0x162, 0x163, 0x1E6C, 0x1E6D,
            0x21A, 0x21B, 0x1E70, 0x1E71, 0x1E6E, 0x1E6F, 0xFF34, 0xFF54
        };
        wildCardCharMap.put("T", TList);

        //IN: Uu Úú Ùù Ŭŭ Ûû Ǔǔ Ůů Üü Ǘǘ Ǜǜ Ǚǚ Ǖǖ Űű Ũũ Ṹṹ Ųų Ūū
        //OUT: Ṻṻ Ủủ Ȕȕ Ȗȗ Ưư Ứứ Ừừ Ữữ Ửử Ựự Ụụ Ṳṳ Ṷṷ Ṵṵ Ʉʉ ᵾ ᶙ ᴜ Uu
        Character[] UList = {
            0x55, 0x75, 0xDA, 0xFA, 0xD9, 0xF9, 0x16C, 0x16D, 0xDB, 0xFB,
            0x1D3, 0x1D4, 0x16E, 0x16F, 0xDC, 0xFC, 0x1D7, 0x1D8, 0x1DB, 0x1DC,
            0x1D9, 0x1DA, 0x1D5, 0x1D6, 0x170, 0x171, 0x168, 0x169, 0x1E78, 0x1E79,
            0x172, 0x173, 0x16A, 0x16B
        };
        wildCardCharMap.put("U", UList);

        // Note: plain V/v (0x56, 0x76) are not part of this list.
        Character[] VList = {
            0x1E7C, 0x1E7D, 0x1E7E, 0x1E7F, 0x1B2, 0x28B, 0x1D20, 0xFF36, 0xFF56
        };
        wildCardCharMap.put("V", VList);

        //IN: Zz Źź Ẑẑ Žž Żż Ẓẓ Ẕẕ Ƶƶ Ȥȥ
        //OUT: Ⱬⱬ ᵶ ᶎ ʐ ʑ ɀ ᴢ Zz
        Character[] ZList = {
            0x5A, 0x7A, 0x179, 0x17A, 0x1E90, 0x1E91, 0x17D, 0x17E, 0x17B, 0x17C,
            0x1E92, 0x1E93, 0x1E94, 0x1E95, 0x1B5, 0x1B6, 0x1D22, 0xFF3A, 0xFF5A
        };
        wildCardCharMap.put("Z", ZList);
    }

    /** Substrings stripped by {@link #normalizedToCompare(String)}. */
    private static final String[] COMPARE_IGNORED = { "#", "-", "(", ")", "[", "]", "_" };

    /** Builds a mutable list holding the given terms in order. */
    private static List<String> variants(String... terms) {
        List<String> result = new ArrayList<String>();
        for (String term : terms) {
            result.add(term);
        }
        return result;
    }

    /**
     * Normalizes a string by delegating to {@code ArabicTranslitNormalizer.normalize}.
     *
     * @param w the string to normalize
     * @return whatever {@code ArabicTranslitNormalizer.normalize(w)} returns
     */
    public static String normalize(String w) {
        return ArabicTranslitNormalizer.normalize(w);
    }

    /**
     * Original table-driven normalization.
     *
     * <p>The input is lower-cased, every term of {@link #wildCardStringMap} and
     * then every character of {@link #wildCardCharMap} is replaced by its map
     * key, and the result is lower-cased again (the keys are upper-case
     * placeholders). Case folding uses {@link Locale#ROOT} so the result does
     * not depend on the JVM default locale (e.g. the Turkish dotless i).
     *
     * @param w the string to normalize; may be {@code null} or empty
     * @return the normalized string, or {@code w} itself if it is {@code null} or empty
     */
    public static String old_normalize(String w) {
        if (StringUtils.isEmpty(w)) {
            return w;
        }

        String result = w.toLowerCase(Locale.ROOT);

        // Multi-character combinations first, so that e.g. "th" is seen before
        // its letters are handled individually.
        for (Map.Entry<String, List<String>> entry : wildCardStringMap.entrySet()) {
            String replacement = entry.getKey();
            for (String term : entry.getValue()) {
                result = result.replace(term, replacement);
            }
        }

        // Then single characters (diacritic variants, apostrophes).
        for (Map.Entry<String, Character[]> entry : wildCardCharMap.entrySet()) {
            String replacement = entry.getKey();
            for (Character variant : entry.getValue()) {
                result = result.replace(String.valueOf(variant), replacement);
            }
        }

        return result.toLowerCase(Locale.ROOT);
    }

    /**
     * Removes the characters {@code # - ( ) [ ] _} from a string.
     *
     * @param s1 the string to strip; may be {@code null}
     * @return the stripped string, or {@code null} if {@code s1} is {@code null}
     */
    public static String normalizedToCompare(String s1) {
        if (s1 == null) {
            return null;
        }
        String stripped = s1;
        for (String ignored : COMPARE_IGNORED) {
            stripped = stripped.replace(ignored, "");
        }
        return stripped;
    }

    /** Manual smoke test: prints the normalized form of "ṯ". */
    public static void main(String[] args) {
        String s = OldNormalizerUtils.normalize("\u1E6F");
        System.out.println(s);
    }
}