annotate src/main/java/org/mpi/openmind/repository/utils/NormalizerUtils.java @ 1:615d27dce9b3

(none)
author jurzua
date Wed, 29 Oct 2014 13:28:45 +0000
parents
children ac466a164b61
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
jurzua
parents:
diff changeset
1 package org.mpi.openmind.repository.utils;
jurzua
parents:
diff changeset
2
jurzua
parents:
diff changeset
3 import java.util.ArrayList;
jurzua
parents:
diff changeset
4 import java.util.HashMap;
jurzua
parents:
diff changeset
5 import java.util.List;
jurzua
parents:
diff changeset
6 import java.util.Map;
jurzua
parents:
diff changeset
7
jurzua
parents:
diff changeset
8 import org.apache.commons.lang.StringUtils;
jurzua
parents:
diff changeset
9
jurzua
parents:
diff changeset
/**
 * Folds transliterated text into a plain lower-case form so that spelling
 * variants of the same word compare equal.
 *
 * <p>Two public lookup tables drive {@link #normalize(String)}:
 * <ul>
 *   <li>{@link #wildCardStringMap}: multi-character sequences (and a few single
 *       characters) that are replaced by an upper-case placeholder key;</li>
 *   <li>{@link #wildCardCharMap}: single code points that are replaced by an
 *       upper-case placeholder key, or removed when the key is the empty string.</li>
 * </ul>
 * The placeholders are upper-case so that they cannot be confused with the
 * already lower-cased input; the result is lower-cased again before it is returned.
 *
 * <p>All case conversions use {@code Locale.ROOT}; with the JVM default locale a
 * Turkish or Azeri environment would turn the placeholder "I" into a dotless i.
 *
 * <p>This source is kept ASCII-only: special characters are written as Unicode
 * escapes in code and as U+XXXX in comments, so the class does not depend on the
 * encoding used to compile it.
 */
public class NormalizerUtils {

    /**
     * Placeholder key to the list of lower-case sequences replaced by that key.
     * The key {@code "A "} (with a trailing space) matches sequences that are
     * followed by a space.
     */
    public static Map<String, List<String>> wildCardStringMap = new HashMap<String, List<String>>();

    static {
        // U+1E6F (t with line below) and the digraph "th".
        wildCardStringMap.put("T", terms("\u1E6F", "th"));

        // U+1E2B (h with breve below) and the digraph "kh".
        wildCardStringMap.put("H", terms("\u1E2B", "kh"));

        // U+1E0F (d with line below) and the digraph "dh".
        wildCardStringMap.put("D", terms("\u1E0F", "dh"));

        // U+0161 (s with caron) and the digraph "sh".
        wildCardStringMap.put("S", terms("\u0161", "sh"));

        // U+0121 (g with dot above) and the digraph "gh".
        wildCardStringMap.put("G", terms("\u0121", "gh"));

        // "a" + U+1E97 (t with diaeresis), "at", "ah" and "a", each followed by a space.
        wildCardStringMap.put("A ", terms("a\u1E97 ", "at ", "ah ", "a "));

        // U+1EF3 (y with grave), U+00E1 (a with acute), U+0101 (a with macron),
        // U+00E0 (a with grave).
        wildCardStringMap.put("A", terms("\u1EF3", "\u00E1", "\u0101", "\u00E0"));
    }

    /**
     * Placeholder key to the code points replaced by that key. The empty key
     * removes the listed characters altogether.
     */
    public static Map<String, Character[]> wildCardCharMap = new HashMap<String, Character[]>();

    /** Quote, apostrophe, prime and half-ring characters that are stripped from the text. */
    public static Character[] apostrophes = {
        0x22, 0x60, 0x2032, 0x2018, 0x2019, 0x201B, 0x27, 0x2BB, 0x2BC, 0x2BD, 0x2CB, 0x2BE, 0x2BF
    };

    /** Code points folded to "A". */
    public static Character[] AList = {
        0x41, 0x61, 0xC1, 0xE1, 0xC0, 0xE0, 0xC2, 0xE2, 0x1CD,
        0x1CE, 0x102, 0x103, 0xC3, 0xE3, 0x1EA2, 0x1EA3, 0x226,
        0x227, 0x1EA0, 0x1EA1, 0xC4, 0xE4, 0xC5, 0xE5, 0x1E00,
        0x1E01, 0x100, 0x101, 0x104, 0x105
    };

    /** Characters removed by {@link #normalizedToCompare(String)}. */
    private static final String IGNORED_FOR_COMPARISON = "#-()[]_";

    static {
        wildCardCharMap.put("", apostrophes);
        wildCardCharMap.put("A", AList);

        // Code points folded to "B" (includes fullwidth U+FF22 / U+FF42).
        Character[] BList = {
            0x42, 0x62, 0x1E02, 0x1E03, 0x1E04, 0x1E05, 0x1E06,
            0x1E07, 0x181, 0x253, 0x299, 0xFF22, 0xFF42,
        };
        wildCardCharMap.put("B", BList);

        // Code points folded to "C". 0x43 is listed twice; the repeat is harmless
        // because replacing a character that is already gone is a no-op.
        Character[] CList = {
            0x43, 0x63, 0x106, 0x107, 0x108, 0x109, 0x10C, 0x10D,
            0x10A, 0x10B, 0x43, 0xC7, 0xE7, 0x1E08, 0x1E09, 0x23B,
            0x23C, 0x187, 0x188, 0x255, 0x1D04, 0xFF23, 0xFF43
        };
        wildCardCharMap.put("C", CList);

        // Code points folded to "D".
        Character[] DList = {
            0x44, 0x64, 0x10E, 0x10F, 0x1E0A, 0x1E0B, 0x1E10,
            0x1E11, 0x1E0C, 0x1E0D, 0x1E12, 0x1E13, 0x1E0E,
            0x1E0F, 0xFF24, 0xFF44
        };
        wildCardCharMap.put("D", DList);

        // Code points folded to "E".
        Character[] EList = {
            0x45, 0x65, 0xC9, 0xE9, 0xC8, 0xE8, 0xCA, 0xEA,
            0x1E18, 0x1E19, 0x11A, 0x11B, 0x114, 0x115,
            0x1EBC, 0x1EBD, 0x1E1A, 0x1E1B, 0x1EBA, 0x1EBB,
            0x116, 0x117, 0xCB, 0xEB, 0x112, 0x113, 0x228,
            0x229, 0x118, 0x119, 0x204, 0x205, 0x1EBE, 0x1EBF,
            0x1EC0, 0x1EC1, 0x1EC4, 0x1EC5, 0x1EC2, 0x1EC3,
            0x1E1C, 0x1E1D, 0x1E16, 0x1E17, 0x1E14, 0x1E15,
            0x206, 0x207, 0x1EB8, 0x1EB9, 0x1EC6, 0x1EC7,
            0x1D07, 0xFF25, 0xFF45
        };
        wildCardCharMap.put("E", EList);

        // Code points folded to "I".
        Character[] IList = {
            0x49, 0x69, 0xCD, 0xED, 0xCC, 0xEC, 0x12C, 0x12D, 0xCE,
            0xEE, 0x1CF, 0x1D0, 0xCF, 0xEF, 0x1E2E, 0x1E2F, 0x128,
            0x129, 0x12E, 0x12F, 0x12A, 0x12B, 0x1EC8, 0x1EC9, 0x208,
            0x209, 0x20A, 0x20B, 0x1ECA, 0x1ECB, 0x1E2C, 0x1E2D
        };
        wildCardCharMap.put("I", IList);

        // Code points folded to "G".
        Character[] GList = {
            0x47, 0x67, 0x1F4, 0x1F5, 0x11E, 0x11F, 0x11C, 0x11D,
            0x1E6, 0x1E7, 0x120, 0x121, 0x122, 0x123, 0x1E20, 0x1E21,
            0x1E4, 0x1E5, 0xFF27, 0xFF47
        };
        wildCardCharMap.put("G", GList);

        // Code points folded to "N".
        Character[] NList = {
            0x4E, 0x6E, 0x143, 0x144, 0x1F8, 0x1F9, 0x147, 0x148,
            0xD1, 0xF1, 0x1E44, 0x1E45, 0x145, 0x146, 0x1E46,
            0x1E47, 0x1E4A, 0x1E4B, 0x1E48, 0x1E49
        };
        wildCardCharMap.put("N", NList);

        // Code points folded to "H". 0x48 is listed twice (harmless repeat).
        Character[] HList = {
            0x48, 0x68, 0x124, 0x125, 0x21E, 0x21F, 0x1E26, 0x1E27,
            0x1E22, 0x1E23, 0x1E28, 0x1E29, 0x1E24, 0x1E25, 0x1E2A,
            0x1E2B, 0x48, 0x1E96, 0x126, 0x127, 0x2C67, 0x2C68
        };
        wildCardCharMap.put("H", HList);

        // Code points folded to "O".
        Character[] OLIST = {
            0x4F, 0x6F, 0xD3, 0xF3, 0xD2, 0xF2, 0x14E, 0x14F, 0xD4,
            0xF4, 0x1ED0, 0x1ED1, 0x1ED2, 0x1ED3, 0x1ED6, 0x1ED7,
            0x1ED4, 0x1ED5, 0x1D1, 0x1D2, 0xD6, 0xF6, 0x22A, 0x22B,
            0x150, 0x151, 0xD5, 0xF5, 0x1E4C, 0x1E4D, 0x1E4E, 0x1E4F,
            0x22C, 0x22D, 0x22E, 0x22F, 0x230, 0x231, 0xD8, 0xF8, 0x1FE,
            0x1FF, 0x1EA, 0x1EB, 0x1EC, 0x1ED, 0x14C, 0x14D, 0x1E52,
            0x1E53, 0x1E50, 0x1E51, 0x1ECE, 0x1ECF, 0x20C, 0x20D,
            0x20E, 0x20F, 0x1A0, 0x1A1, 0x1EDA, 0x1EDB, 0x1EDC, 0x1EDD,
            0x1EE0, 0x1EE1, 0x1EDE, 0x1EDF, 0x1EE2, 0x1EE3, 0x1ECC,
            0x1ECD, 0x1ED8, 0x1ED9
        };
        wildCardCharMap.put("O", OLIST);

        // Code points folded to "R".
        Character[] RList = {
            0x52, 0x72, 0x154, 0x155, 0x158, 0x159, 0x1E58, 0x1E59,
            0x156, 0x157, 0x210, 0x211, 0x212, 0x213, 0x1E5A, 0x1E5B,
            0x1E5C, 0x1E5D, 0x1E5E, 0x1E5F, 0x27C, 0x27E, 0x280, 0xFF32, 0xFF52
        };
        wildCardCharMap.put("R", RList);

        // Code points folded to "S". 0x53 is listed twice (harmless repeat).
        Character[] SList = {
            0x53, 0x73, 0x15A, 0x15B, 0x1E64, 0x1E65, 0x15C, 0x15D,
            0x160, 0x161, 0x1E66, 0x1E67, 0x1E60, 0x1E61, 0x15E, 0x15F,
            0x1E62, 0x1E63, 0x1E68, 0x1E69, 0x218, 0x219, 0x53
        };
        wildCardCharMap.put("S", SList);

        // Code points folded to "T".
        Character[] TList = {
            0x54, 0x74, 0x164, 0x165, 0x1E6A, 0x1E6B, 0x162, 0x163,
            0x1E6C, 0x1E6D, 0x21A, 0x21B, 0x1E70, 0x1E71, 0x1E6E,
            0x1E6F, 0xFF34, 0xFF54
        };
        wildCardCharMap.put("T", TList);

        // Code points folded to "U".
        Character[] UList = {
            0x55, 0x75, 0xDA, 0xFA, 0xD9, 0xF9, 0x16C, 0x16D, 0xDB, 0xFB, 0x1D3,
            0x1D4, 0x16E, 0x16F, 0xDC, 0xFC, 0x1D7, 0x1D8, 0x1DB, 0x1DC, 0x1D9,
            0x1DA, 0x1D5, 0x1D6, 0x170, 0x171, 0x168, 0x169, 0x1E78, 0x1E79,
            0x172, 0x173, 0x16A, 0x16B
        };
        wildCardCharMap.put("U", UList);

        // Code points folded to "V". Plain V/v are not listed here, so they are
        // only affected by the lower-casing in normalize().
        Character[] VList = {
            0x1E7C, 0x1E7D, 0x1E7E, 0x1E7F, 0x1B2,
            0x28B, 0x1D20, 0xFF36, 0xFF56
        };
        wildCardCharMap.put("V", VList);

        // Code points folded to "Z".
        Character[] ZList = {
            0x5A, 0x7A, 0x179, 0x17A, 0x1E90, 0x1E91, 0x17D,
            0x17E, 0x17B, 0x17C, 0x1E92, 0x1E93, 0x1E94,
            0x1E95, 0x1B5, 0x1B6, 0x1D22, 0xFF3A, 0xFF5A
        };
        wildCardCharMap.put("Z", ZList);
    }

    /**
     * Normalizes a word for fuzzy matching.
     *
     * <p>The input is lower-cased, every sequence listed in
     * {@link #wildCardStringMap} and every character listed in
     * {@link #wildCardCharMap} is replaced by its placeholder key, and the
     * result is lower-cased again.
     *
     * @param w the text to normalize; may be {@code null} or empty
     * @return the normalized text, or {@code w} itself when it is {@code null} or empty
     */
    public static String normalize(String w) {
        if (w == null || w.length() == 0) {
            return w;
        }

        // Locale.ROOT keeps the i / I mapping stable regardless of the JVM default locale.
        String result = w.toLowerCase(java.util.Locale.ROOT);

        // Multi-character sequences first, so that digraphs such as "th" are
        // consumed before their letters are handled one by one.
        for (Map.Entry<String, List<String>> entry : wildCardStringMap.entrySet()) {
            String placeholder = entry.getKey();
            for (String term : entry.getValue()) {
                result = result.replace(term, placeholder);
            }
        }

        // Then single characters; the empty placeholder deletes the character.
        for (Map.Entry<String, Character[]> entry : wildCardCharMap.entrySet()) {
            String placeholder = entry.getKey();
            for (Character variant : entry.getValue()) {
                result = result.replace(String.valueOf(variant), placeholder);
            }
        }

        // Placeholders are upper-case; fold them back down.
        return result.toLowerCase(java.util.Locale.ROOT);
    }

    /**
     * Removes the characters {@code # - ( ) [ ] _} from the given text.
     *
     * @param s1 the text to clean; may be {@code null}
     * @return the text without those characters, or {@code null} when {@code s1} is {@code null}
     */
    public static String normalizedToCompare(String s1) {
        if (s1 == null) {
            return null;
        }
        StringBuilder kept = new StringBuilder(s1.length());
        for (int i = 0; i < s1.length(); i++) {
            char ch = s1.charAt(i);
            if (IGNORED_FOR_COMPARISON.indexOf(ch) < 0) {
                kept.append(ch);
            }
        }
        return kept.toString();
    }

    /**
     * Manual smoke test: prints the normalized form of U+1E6F (t with line below).
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        System.out.println(NormalizerUtils.normalize("\u1E6F"));
    }

    /** Builds a mutable list holding the given terms in order. */
    private static List<String> terms(String... values) {
        List<String> list = new ArrayList<String>();
        for (String value : values) {
            list.add(value);
        }
        return list;
    }
}