1
|
1 package org.mpi.openmind.repository.utils;
|
|
2
|
|
3 import java.util.ArrayList;
|
|
4 import java.util.HashMap;
|
|
5 import java.util.List;
|
|
6 import java.util.Map;
|
|
7
|
|
8 import org.apache.commons.lang.StringUtils;
|
|
9
|
|
/**
 * Static helpers that reduce transliterated text to a plain lower-case form so
 * that spelling variants (diacritics, digraphs such as "th"/"kh", different
 * apostrophe glyphs) compare as equal.
 *
 * <p>The two public maps are mutable and are read on every call to
 * {@link #normalize(String)}; the class performs no synchronisation on them.
 */
public class NormalizerUtils {

    /** Characters removed by {@link #normalizedToCompare(String)}. */
    private static final String STRIPPED_FOR_COMPARE = "#-()[]_";

    /**
     * Multi-character rules: each key is the replacement text, each value the
     * list of (lower-case) spellings that {@link #normalize(String)} rewrites
     * to that key.
     */
    public static Map<String, List<String>> wildCardStringMap = new HashMap<String, List<String>>();

    static {
        registerStrings("T", "\u1E6F", "th");                          // ṯ, th
        registerStrings("H", "\u1E2B", "kh");                          // ḫ, kh
        registerStrings("D", "\u1E0F", "dh");                          // ḏ, dh
        registerStrings("S", "\u0161", "sh");                          // š, sh
        registerStrings("G", "\u0121", "gh");                          // ġ, gh
        // Endings followed by a space: aẗSPACE, atSPACE, ahSPACE, aSPACE.
        registerStrings("A ", "a\u1E97 ", "at ", "ah ", "a ");
        // ỳ, á, ā, à
        registerStrings("A", "\u1EF3", "\u00E1", "\u0101", "\u00E0");
        // Disabled in the original source and therefore not registered:
        // an alternative list for "A" (e, ā, â; labelled "Chantal list for A")
        // and a rule mapping b and p to "B".
    }

    /**
     * Single-character rules: each key is the replacement text, each value the
     * characters that {@link #normalize(String)} rewrites to that key. The
     * empty key removes the character altogether.
     */
    public static Map<String, Character[]> wildCardCharMap = new HashMap<String, Character[]>();

    // " ` ′ ‘ ’ ‛ ' ʻ ʼ ʽ ˋ ʾ ʿ
    /** Quote and apostrophe glyphs; registered under the empty key, i.e. deleted. */
    public static Character[] apostrophes = {
        0x22, 0x60, 0x2032, 0x2018, 0x2019, 0x201B, 0x27, 0x2BB, 0x2BC, 0x2BD, 0x2CB, 0x2BE, 0x2BF
    };

    //IN:  Aa Áá Àà Ââ Ǎǎ Ăă Ãã Ảả Ȧȧ Ạạ Ää Åå Ḁḁ Āā Ąą
    //OUT: ᶏ Ⱥⱥ Ȁȁ Ấấ Ầầ Ẫẫ Ẩẩ Ậậ Ắắ Ằằ Ẵẵ Ẳẳ Ặặ Ǻǻ Ǡǡ Ǟǟ Ȁȁ Ȃȃ
    /** Variants of the letter A that are rewritten to "A". */
    public static Character[] AList = {
        0x41, 0x61, 0xC1, 0xE1, 0xC0, 0xE0, 0xC2, 0xE2, 0x1CD,
        0x1CE, 0x102, 0x103, 0xC3, 0xE3, 0x1EA2, 0x1EA3, 0x226,
        0x227, 0x1EA0, 0x1EA1, 0xC4, 0xE4, 0xC5, 0xE5, 0x1E00,
        0x1E01, 0x100, 0x101, 0x104, 0x105
    };

    static {
        // The insertion order below is the same as in the original source.
        // In the notes, "IN" lists the glyphs the author included; "OUT"
        // presumably lists glyphs that were deliberately left unmapped
        // (NOTE(review): a few glyphs appear in both notes - confirm intent).
        wildCardCharMap.put("", apostrophes);
        wildCardCharMap.put("A", AList);

        //IN:  Bb Ḃḃ Ḅḅ Ḇḇ Ɓɓ ʙ Bb
        //OUT: Ƃƃ ᵬ ᶀ ʙ Bb ȸ Ƀƀ
        wildCardCharMap.put("B", new Character[] {
            0x42, 0x62, 0x1E02, 0x1E03, 0x1E04, 0x1E05, 0x1E06,
            0x1E07, 0x181, 0x253, 0x299, 0xFF22, 0xFF42
        });

        //Ćć Ĉĉ Čč Ċċ C̄c̄ Ç(ç problem with this) Ḉḉ Ȼȼ Ƈƈ ɕ ᴄ Cc
        wildCardCharMap.put("C", new Character[] {
            0x43, 0x63, 0x106, 0x107, 0x108, 0x109, 0x10C, 0x10D,
            0x10A, 0x10B, 0x43, 0xC7, 0xE7, 0x1E08, 0x1E09, 0x23B,
            0x23C, 0x187, 0x188, 0x255, 0x1D04, 0xFF23, 0xFF43
        });

        //IN:  Dd Ďď Ḋḋ Ḑḑ Ḍḍ Ḓḓ Ḏḏ Dd
        //OUT: Đđ D̦d̦ Ɖɖ Ɗɗ Ƌƌ ᵭ ᶁ ᶑ ȡ ᴅ
        wildCardCharMap.put("D", new Character[] {
            0x44, 0x64, 0x10E, 0x10F, 0x1E0A, 0x1E0B, 0x1E10,
            0x1E11, 0x1E0C, 0x1E0D, 0x1E12, 0x1E13, 0x1E0E,
            0x1E0F, 0xFF24, 0xFF44
        });

        //IN:  Ee Éé Èè Êê Ḙḙ Ěě Ĕĕ Ẽẽ Ḛḛ Ẻẻ Ėė Ëë Ēē Ȩȩ Ęę Ȅȅ Ếế Ềề Ễễ Ểể Ḝḝ Ḗḗ Ḕḕ Ȇȇ Ẹẹ Ệệ ᴇ Ee
        //OUT: Ææ Ǽǽ Ǣǣ Œœ ᶒ Ɇɇ
        wildCardCharMap.put("E", new Character[] {
            0x45, 0x65, 0xC9, 0xE9, 0xC8, 0xE8, 0xCA, 0xEA,
            0x1E18, 0x1E19, 0x11A, 0x11B, 0x114, 0x115,
            0x1EBC, 0x1EBD, 0x1E1A, 0x1E1B, 0x1EBA, 0x1EBB,
            0x116, 0x117, 0xCB, 0xEB, 0x112, 0x113, 0x228,
            0x229, 0x118, 0x119, 0x204, 0x205, 0x1EBE, 0x1EBF,
            0x1EC0, 0x1EC1, 0x1EC4, 0x1EC5, 0x1EC2, 0x1EC3,
            0x1E1C, 0x1E1D, 0x1E16, 0x1E17, 0x1E14, 0x1E15,
            0x206, 0x207, 0x1EB8, 0x1EB9, 0x1EC6, 0x1EC7,
            0x1D07, 0xFF25, 0xFF45
        });

        //Ii Íí Ìì Ĭĭ Îî Ǐǐ Ïï Ḯḯ Ĩĩ Įį Īī Ỉỉ Ȉȉ Ȋȋ Ịị Ḭḭ
        wildCardCharMap.put("I", new Character[] {
            0x49, 0x69, 0xCD, 0xED, 0xCC, 0xEC, 0x12C, 0x12D, 0xCE,
            0xEE, 0x1CF, 0x1D0, 0xCF, 0xEF, 0x1E2E, 0x1E2F, 0x128,
            0x129, 0x12E, 0x12F, 0x12A, 0x12B, 0x1EC8, 0x1EC9, 0x208,
            0x209, 0x20A, 0x20B, 0x1ECA, 0x1ECB, 0x1E2C, 0x1E2D
        });

        //IN:  Gg Ǵǵ Ğğ Ĝĝ Ǧǧ Ġġ Ģģ Ḡḡ Ǥǥ Gg
        //OUT: Ɠɠ ᶃ ɢ
        wildCardCharMap.put("G", new Character[] {
            0x47, 0x67, 0x1F4, 0x1F5, 0x11E, 0x11F, 0x11C, 0x11D,
            0x1E6, 0x1E7, 0x120, 0x121, 0x122, 0x123, 0x1E20, 0x1E21,
            0x1E4, 0x1E5, 0xFF27, 0xFF47
        });

        //Nn Ńń Ǹǹ Ňň Ññ Ṅṅ Ņņ Ṇṇ Ṋṋ Ṉṉ
        wildCardCharMap.put("N", new Character[] {
            0x4E, 0x6E, 0x143, 0x144, 0x1F8, 0x1F9, 0x147, 0x148,
            0xD1, 0xF1, 0x1E44, 0x1E45, 0x145, 0x146, 0x1E46,
            0x1E47, 0x1E4A, 0x1E4B, 0x1E48, 0x1E49
        });

        //H h Ĥ ĥ Ȟ ȟ Ḧ ḧ Ḣ ḣ Ḩ ḩ Ḥ ḥ Ḫ ḫ H ̱ ẖ Ħ ħ Ⱨ ⱨ
        wildCardCharMap.put("H", new Character[] {
            0x48, 0x68, 0x124, 0x125, 0x21E, 0x21F, 0x1E26, 0x1E27,
            0x1E22, 0x1E23, 0x1E28, 0x1E29, 0x1E24, 0x1E25, 0x1E2A,
            0x1E2B, 0x48, 0x1E96, 0x126, 0x127, 0x2C67, 0x2C68
        });

        //Oo Óó Òò Ŏŏ Ôô Ốố Ồồ Ỗỗ Ổổ Ǒǒ Öö Ȫȫ Őő Õõ Ṍṍ Ṏṏ Ȭȭ Ȯȯ Ȱȱ Øø Ǿǿ Ǫǫ Ǭǭ Ōō Ṓṓ Ṑṑ Ỏỏ Ȍȍ Ȏȏ Ơơ Ớớ Ờờ Ỡỡ Ởở Ợợ Ọọ Ộộ
        wildCardCharMap.put("O", new Character[] {
            0x4F, 0x6F, 0xD3, 0xF3, 0xD2, 0xF2, 0x14E, 0x14F, 0xD4,
            0xF4, 0x1ED0, 0x1ED1, 0x1ED2, 0x1ED3, 0x1ED6, 0x1ED7,
            0x1ED4, 0x1ED5, 0x1D1, 0x1D2, 0xD6, 0xF6, 0x22A, 0x22B,
            0x150, 0x151, 0xD5, 0xF5, 0x1E4C, 0x1E4D, 0x1E4E, 0x1E4F,
            0x22C, 0x22D, 0x22E, 0x22F, 0x230, 0x231, 0xD8, 0xF8, 0x1FE,
            0x1FF, 0x1EA, 0x1EB, 0x1EC, 0x1ED, 0x14C, 0x14D, 0x1E52,
            0x1E53, 0x1E50, 0x1E51, 0x1ECE, 0x1ECF, 0x20C, 0x20D,
            0x20E, 0x20F, 0x1A0, 0x1A1, 0x1EDA, 0x1EDB, 0x1EDC, 0x1EDD,
            0x1EE0, 0x1EE1, 0x1EDE, 0x1EDF, 0x1EE2, 0x1EE3, 0x1ECC,
            0x1ECD, 0x1ED8, 0x1ED9
        });

        wildCardCharMap.put("R", new Character[] {
            0x52, 0x72, 0x154, 0x155, 0x158, 0x159, 0x1E58, 0x1E59,
            0x156, 0x157, 0x210, 0x211, 0x212, 0x213, 0x1E5A, 0x1E5B,
            0x1E5C, 0x1E5D, 0x1E5E, 0x1E5F, 0x27C, 0x27E, 0x280, 0xFF32, 0xFF52
        });

        //IN:  Ss Śś Ṥṥ Ŝŝ Šš Ṧṧ Ṡṡẛ Şş Ṣṣ Ṩṩ Șș S̩̩
        //OUT: ᵴ ᶊ ʂ ȿ ꜱ Ss s
        wildCardCharMap.put("S", new Character[] {
            0x53, 0x73, 0x15A, 0x15B, 0x1E64, 0x1E65, 0x15C, 0x15D,
            0x160, 0x161, 0x1E66, 0x1E67, 0x1E60, 0x1E61, 0x15E, 0x15F,
            0x1E62, 0x1E63, 0x1E68, 0x1E69, 0x218, 0x219, 0x53
        });

        //IN:  Tt Ťť Ṫṫ Ţţ Ṭṭ Țț Ṱṱ Ṯṯ Tt
        //OUT: Ŧŧ Ⱦⱦ Ƭƭ Ʈʈ T̈ẗ ᵵ ƫ ȶ ᶙ ᴛ
        wildCardCharMap.put("T", new Character[] {
            0x54, 0x74, 0x164, 0x165, 0x1E6A, 0x1E6B, 0x162, 0x163,
            0x1E6C, 0x1E6D, 0x21A, 0x21B, 0x1E70, 0x1E71, 0x1E6E,
            0x1E6F, 0xFF34, 0xFF54
        });

        //IN:  Uu Úú Ùù Ŭŭ Ûû Ǔǔ Ůů Üü Ǘǘ Ǜǜ Ǚǚ Ǖǖ Űű Ũũ Ṹṹ Ųų Ūū
        //OUT: Ṻṻ Ủủ Ȕȕ Ȗȗ Ưư Ứứ Ừừ Ữữ Ửử Ựự Ụụ Ṳṳ Ṷṷ Ṵṵ Ʉʉ ᵾ ᶙ ᴜ Uu
        wildCardCharMap.put("U", new Character[] {
            0x55, 0x75, 0xDA, 0xFA, 0xD9, 0xF9, 0x16C, 0x16D, 0xDB, 0xFB, 0x1D3,
            0x1D4, 0x16E, 0x16F, 0xDC, 0xFC, 0x1D7, 0x1D8, 0x1DB, 0x1DC, 0x1D9,
            0x1DA, 0x1D5, 0x1D6, 0x170, 0x171, 0x168, 0x169, 0x1E78, 0x1E79,
            0x172, 0x173, 0x16A, 0x16B
        });

        wildCardCharMap.put("V", new Character[] {
            0x1E7C, 0x1E7D, 0x1E7E, 0x1E7F, 0x1B2,
            0x28B, 0x1D20, 0xFF36, 0xFF56
        });

        //IN:  Zz Źź Ẑẑ Žž Żż Ẓẓ Ẕẕ Ƶƶ Ȥȥ
        //OUT: Ⱬⱬ ᵶ ᶎ ʐ ʑ ɀ ᴢ Zz
        wildCardCharMap.put("Z", new Character[] {
            0x5A, 0x7A, 0x179, 0x17A, 0x1E90, 0x1E91, 0x17D,
            0x17E, 0x17B, 0x17C, 0x1E92, 0x1E93, 0x1E94,
            0x1E95, 0x1B5, 0x1B6, 0x1D22, 0xFF3A, 0xFF5A
        });
    }

    /**
     * Registers one multi-character rule in {@link #wildCardStringMap}.
     *
     * @param key      replacement text
     * @param variants spellings to be rewritten to {@code key}, in the order
     *                 in which they are applied
     */
    private static void registerStrings(String key, String... variants) {
        List<String> terms = new ArrayList<String>();
        for (String variant : variants) {
            terms.add(variant);
        }
        wildCardStringMap.put(key, terms);
    }

    /**
     * Normalizes a word for comparison: lower-cases it, applies every rule of
     * {@link #wildCardStringMap}, then every rule of {@link #wildCardCharMap},
     * and lower-cases the result again (the rule keys are upper-case).
     *
     * <p>Case conversion uses the root locale so that the result does not
     * depend on the JVM's default locale; with a Turkish default locale the
     * no-argument {@code toLowerCase()} turns "I" into a dotless i that none
     * of the rules recognise.
     *
     * @param w the text to normalize; may be {@code null}
     * @return the normalized text, or {@code w} itself when it is
     *         {@code null} or empty
     */
    public static String normalize(String w) {
        if (w == null || w.length() == 0) {
            return w;
        }

        String result = w.toLowerCase(java.util.Locale.ROOT);

        // Multi-character spellings first, so that digraphs such as "th" are
        // collapsed before their individual letters are rewritten.
        for (Map.Entry<String, List<String>> rule : wildCardStringMap.entrySet()) {
            for (String term : rule.getValue()) {
                result = result.replace(term, rule.getKey());
            }
        }

        // Then single characters; the empty key deletes the character.
        for (Map.Entry<String, Character[]> rule : wildCardCharMap.entrySet()) {
            for (Character variant : rule.getValue()) {
                result = result.replace(String.valueOf(variant), rule.getKey());
            }
        }

        return result.toLowerCase(java.util.Locale.ROOT);
    }

    /**
     * Removes the characters {@code # - ( ) [ ] _} from a string.
     *
     * @param s1 the text to strip; may be {@code null}
     * @return {@code s1} without those characters, or {@code null} when
     *         {@code s1} is {@code null}
     */
    public static String normalizedToCompare(String s1) {
        if (s1 == null) {
            return null;
        }
        StringBuilder kept = new StringBuilder(s1.length());
        for (int i = 0; i < s1.length(); i++) {
            char ch = s1.charAt(i);
            if (STRIPPED_FOR_COMPARE.indexOf(ch) < 0) {
                kept.append(ch);
            }
        }
        return kept.toString();
    }

    /** Manual smoke test: prints the normalized form of ṯ (U+1E6F). */
    public static void main(String[] args) {
        String s = NormalizerUtils.normalize("\u1E6F");
        System.out.println(s);
    }
}
|