Mercurial > hg > mpdl-group
comparison software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/Normalizer.java @ 19:4a3641ae14d2
Erstellung
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Wed, 09 Nov 2011 15:32:05 +0100 |
parents | |
children | 7d6d969b10cf |
comparison
equal
deleted
inserted
replaced
18:dc5e9fcb3fdc | 19:4a3641ae14d2 |
---|---|
1 package de.mpg.mpiwg.berlin.mpdl.lt.text.norm; | |
2 | |
3 import java.io.IOException; | |
4 import java.io.StringReader; | |
5 import java.util.ArrayList; | |
6 | |
7 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; | |
8 import de.mpg.mpiwg.berlin.mpdl.lt.general.Language; | |
9 import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang.MpdlNormalizerLexAR; | |
10 import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang.MpdlNormalizerLexDE; | |
11 import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang.MpdlNormalizerLexEL; | |
12 import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang.MpdlNormalizerLexEN; | |
13 import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang.MpdlNormalizerLexFR; | |
14 import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang.MpdlNormalizerLexIT; | |
15 import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang.MpdlNormalizerLexLA; | |
16 import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang.MpdlNormalizerLexNL; | |
17 import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang.MpdlNormalizerLexZH; | |
18 import de.mpg.mpiwg.berlin.mpdl.lt.text.reg.Regularization; | |
19 import de.mpg.mpiwg.berlin.mpdl.lt.text.reg.RegularizationManager; | |
20 | |
21 public class Normalizer { | |
22 public static int DISPLAY = 1; // normalization in DISPLAY mode | |
23 public static int DICTIONARY = 2; // normalization in DICTIONARY mode | |
24 public static int SEARCH = 3; // normalization in SEARCH mode; never used so far in indexing because it does not support the morph. lexicons such as CELEX (e.g. eingeschränkt would not be stemmed to eingeschraenkt) | |
25 private int normMode = DISPLAY; // Default e.g. for indexing and querying | |
26 private String[] normFunctions = {"norm"}; // default is to use the norm function | |
27 private String language; | |
28 private int[] offsets; | |
29 | |
/**
 * Creates a normalizer that applies the given normalization functions.
 *
 * @param normFunctions names of the functions to apply; this class checks for
 *        "reg", "norm" and "specialNorm"
 * @param lang language name; it is translated with Language.getLanguageId
 *        before it is stored
 */
public Normalizer(String[] normFunctions, String lang) {
  this.language = Language.getInstance().getLanguageId(lang);
  this.normFunctions = normFunctions;
}
35 | |
/**
 * Creates a normalizer that uses the default function set (only "norm").
 *
 * @param language language of the text; stored exactly as given, without a
 *        lookup through Language
 */
public Normalizer(String language) {
  this.language = language;
}
39 | |
40 public String getLanguage() { | |
41 return language; | |
42 } | |
43 | |
44 public void setNormMode(int normMode) { | |
45 this.normMode = normMode; | |
46 } | |
47 | |
48 /** | |
49 * Applies the normalization rules in <code>language</code> to | |
50 * <code>s</code>, without offset tracking. | |
51 * | |
52 * @param s source string | |
53 * @return normalized string | |
54 */ | |
55 public String normalize(String s) throws ApplicationException { | |
56 String normStr = s; | |
57 if (useSpecialNormFunction()) | |
58 normStr = removeSpecialNWDMarks(normStr); | |
59 if (useRegFunction()) { | |
60 // try to regularize the string to the norm form over predefined regularizations | |
61 RegularizationManager regManager = RegularizationManager.getInstance(); | |
62 ArrayList<Regularization> regs = regManager.findRegsByOrig(language, s); | |
63 if (regs != null && regs.size() > 0) { | |
64 Regularization reg = regs.get(0); // only one: the first one | |
65 String regNormStr = reg.getNorm(); | |
66 normStr = regNormStr; | |
67 } | |
68 } | |
69 if (useNormFunction()) { | |
70 // normalize the string by string replacements | |
71 if (normMode == DICTIONARY) { | |
72 normStr = normalize(normStr, DICTIONARY); | |
73 } else if (normMode == DISPLAY) { | |
74 normStr = normalize(normStr, DISPLAY); | |
75 } else if (normMode == SEARCH) { | |
76 normStr = normalize(normStr, SEARCH); | |
77 } | |
78 } | |
79 if (useSpecialNormFunction()) | |
80 normStr = insertSpecialNWDMarks(normStr); | |
81 return normStr; | |
82 } | |
83 | |
84 private boolean useRegFunction() { | |
85 boolean useReg = false; | |
86 for (int i=0; i< normFunctions.length; i++) { | |
87 String function = normFunctions[i]; | |
88 if (function.equals("reg")) | |
89 return true; | |
90 } | |
91 return useReg; | |
92 } | |
93 | |
94 private boolean useNormFunction() { | |
95 boolean useNorm = false; | |
96 for (int i=0; i< normFunctions.length; i++) { | |
97 String function = normFunctions[i]; | |
98 if (function.equals("norm") || function.equals("specialNorm")) | |
99 return true; | |
100 } | |
101 return useNorm; | |
102 } | |
103 | |
104 private boolean useSpecialNormFunction() { | |
105 boolean useNorm = false; | |
106 for (int i=0; i< normFunctions.length; i++) { | |
107 String function = normFunctions[i]; | |
108 if (function.equals("specialNorm")) | |
109 return true; | |
110 } | |
111 return useNorm; | |
112 } | |
113 | |
114 private String normalize(String s, int mode) { | |
115 String inputStr = s; | |
116 StringReader strReader = new StringReader(inputStr + "\n"); | |
117 String retStr = ""; | |
118 String token = ""; | |
119 try { | |
120 if (Language.getInstance().isLatin(language)) { | |
121 MpdlNormalizerLexLA mpdlNormalizerLex = new MpdlNormalizerLexLA(strReader); | |
122 if (mode == DISPLAY) | |
123 mpdlNormalizerLex.yybegin(MpdlNormalizerLexLA.DISP); | |
124 else if (mode == DICTIONARY) | |
125 mpdlNormalizerLex.yybegin(MpdlNormalizerLexLA.DICT); | |
126 else if (mode == SEARCH) | |
127 mpdlNormalizerLex.yybegin(MpdlNormalizerLexLA.SEARCH); | |
128 while (token != null) { | |
129 token = mpdlNormalizerLex.yylex(); | |
130 if (token != null) | |
131 retStr += token; | |
132 } | |
133 } else if (Language.getInstance().isArabic(language)) { | |
134 MpdlNormalizerLexAR mpdlNormalizerLex = new MpdlNormalizerLexAR(strReader); | |
135 if (mode == DISPLAY) | |
136 mpdlNormalizerLex.yybegin(MpdlNormalizerLexAR.DISP); | |
137 else if (mode == DICTIONARY) | |
138 mpdlNormalizerLex.yybegin(MpdlNormalizerLexAR.DICT); | |
139 else if (mode == SEARCH) | |
140 mpdlNormalizerLex.yybegin(MpdlNormalizerLexAR.SEARCH); | |
141 while (token != null) { | |
142 token = mpdlNormalizerLex.yylex(); | |
143 if (token != null) | |
144 retStr += token; | |
145 } | |
146 } else if (Language.getInstance().isGerman(language)) { | |
147 MpdlNormalizerLexDE mpdlNormalizerLex = new MpdlNormalizerLexDE(strReader); | |
148 if (mode == DISPLAY) | |
149 mpdlNormalizerLex.yybegin(MpdlNormalizerLexDE.DISP); | |
150 else if (mode == DICTIONARY) | |
151 mpdlNormalizerLex.yybegin(MpdlNormalizerLexDE.CELEX); | |
152 else if (mode == SEARCH) | |
153 mpdlNormalizerLex.yybegin(MpdlNormalizerLexDE.SEARCH); | |
154 while (token != null) { | |
155 token = mpdlNormalizerLex.yylex(); | |
156 if (token != null) | |
157 retStr += token; | |
158 } | |
159 } else if (Language.getInstance().isGreek(language)) { | |
160 MpdlNormalizerLexEL mpdlNormalizerLex = new MpdlNormalizerLexEL(strReader); | |
161 if (mode == DISPLAY) | |
162 mpdlNormalizerLex.yybegin(MpdlNormalizerLexEL.DISP); | |
163 else if (mode == DICTIONARY) | |
164 mpdlNormalizerLex.yybegin(MpdlNormalizerLexEL.SIGMA); | |
165 else if (mode == SEARCH) | |
166 mpdlNormalizerLex.yybegin(MpdlNormalizerLexEL.SEARCH); | |
167 while (token != null) { | |
168 token = mpdlNormalizerLex.yylex(); | |
169 if (token != null) | |
170 retStr += token; | |
171 } | |
172 } else if (Language.getInstance().isEnglish(language)) { | |
173 MpdlNormalizerLexEN mpdlNormalizerLex = new MpdlNormalizerLexEN(strReader); | |
174 if (mode == DISPLAY) | |
175 mpdlNormalizerLex.yybegin(MpdlNormalizerLexEN.DISP); | |
176 else if (mode == DICTIONARY) | |
177 mpdlNormalizerLex.yybegin(MpdlNormalizerLexEN.DICT); | |
178 else if (mode == SEARCH) | |
179 mpdlNormalizerLex.yybegin(MpdlNormalizerLexEN.SEARCH); | |
180 while (token != null) { | |
181 token = mpdlNormalizerLex.yylex(); | |
182 if (token != null) | |
183 retStr += token; | |
184 } | |
185 } else if (Language.getInstance().isFrench(language)) { | |
186 MpdlNormalizerLexFR mpdlNormalizerLex = new MpdlNormalizerLexFR(strReader); | |
187 if (mode == DISPLAY) | |
188 mpdlNormalizerLex.yybegin(MpdlNormalizerLexFR.DISP); | |
189 else if (mode == DICTIONARY) | |
190 mpdlNormalizerLex.yybegin(MpdlNormalizerLexFR.DICT_ASCII); | |
191 else if (mode == SEARCH) | |
192 mpdlNormalizerLex.yybegin(MpdlNormalizerLexFR.SEARCH); | |
193 while (token != null) { | |
194 token = mpdlNormalizerLex.yylex(); | |
195 if (token != null) | |
196 retStr += token; | |
197 } | |
198 } else if (Language.getInstance().isItalian(language)) { | |
199 MpdlNormalizerLexIT mpdlNormalizerLex = new MpdlNormalizerLexIT(strReader); | |
200 if (mode == DISPLAY) | |
201 mpdlNormalizerLex.yybegin(MpdlNormalizerLexIT.DISP); | |
202 else if (mode == DICTIONARY) | |
203 mpdlNormalizerLex.yybegin(MpdlNormalizerLexIT.DICT); | |
204 else if (mode == SEARCH) | |
205 mpdlNormalizerLex.yybegin(MpdlNormalizerLexIT.SEARCH); | |
206 while (token != null) { | |
207 token = mpdlNormalizerLex.yylex(); | |
208 if (token != null) | |
209 retStr += token; | |
210 } | |
211 } else if (Language.getInstance().isDutch(language)) { | |
212 MpdlNormalizerLexNL mpdlNormalizerLex = new MpdlNormalizerLexNL(strReader); | |
213 if (mode == DISPLAY) | |
214 mpdlNormalizerLex.yybegin(MpdlNormalizerLexNL.DISP); | |
215 else if (mode == DICTIONARY) | |
216 mpdlNormalizerLex.yybegin(MpdlNormalizerLexNL.DICT); | |
217 else if (mode == SEARCH) | |
218 mpdlNormalizerLex.yybegin(MpdlNormalizerLexNL.SEARCH); | |
219 while (token != null) { | |
220 token = mpdlNormalizerLex.yylex(); | |
221 if (token != null) | |
222 retStr += token; | |
223 } | |
224 } else if (Language.getInstance().isChinese(language)) { | |
225 MpdlNormalizerLexZH mpdlNormalizerLex = new MpdlNormalizerLexZH(strReader); | |
226 if (mode == DISPLAY) | |
227 mpdlNormalizerLex.yybegin(MpdlNormalizerLexZH.DISP); | |
228 else if (mode == DICTIONARY) | |
229 mpdlNormalizerLex.yybegin(MpdlNormalizerLexZH.DICT); | |
230 else if (mode == SEARCH) | |
231 mpdlNormalizerLex.yybegin(MpdlNormalizerLexZH.SEARCH); | |
232 while (token != null) { | |
233 token = mpdlNormalizerLex.yylex(); | |
234 if (token != null) | |
235 retStr += token; | |
236 } | |
237 } else { | |
238 retStr = s; // return the string unchanged | |
239 } | |
240 } catch (IOException e ) { | |
241 // nothing cause IOException is not needed for a StringReader | |
242 } | |
243 return retStr; | |
244 } | |
245 | |
246 | |
247 // used only in XmlTokenizerContentHandler // TODO make it better | |
248 private String removeSpecialNWDMarks(String inputString) { | |
249 String COMPLEX_ELEMENT_NWD_MARK = new Character('\u2424').toString(); // not word delimiting element | |
250 String cleanedWord = inputString; | |
251 boolean startsWithNWDMark = cleanedWord.startsWith(COMPLEX_ELEMENT_NWD_MARK); | |
252 if (startsWithNWDMark) | |
253 cleanedWord = cleanedWord.substring(1); | |
254 int countNWDMarks = cleanedWord.length() - cleanedWord.replaceAll(COMPLEX_ELEMENT_NWD_MARK, "").length(); | |
255 if (countNWDMarks > 1) | |
256 cleanedWord = cleanedWord.replaceAll(COMPLEX_ELEMENT_NWD_MARK + "+", COMPLEX_ELEMENT_NWD_MARK); | |
257 // boolean notHyphenPlusNWD = cleanedWord.matches(".*[^-]+" + COMPLEX_ELEMENT_NWD_MARK + "+.*"); // e.g. "praebi ta" | |
258 // if (notHyphenPlusNWD) | |
259 // cleanedWord = cleanedWord.replaceAll("([^-]+)" + COMPLEX_ELEMENT_NWD_MARK + "+", "$1-" + COMPLEX_ELEMENT_NWD_MARK); // e.g. "praebi ta" is replaced by "praebi- ta" | |
260 cleanedWord = cleanedWord.replaceAll(COMPLEX_ELEMENT_NWD_MARK, " "); | |
261 return cleanedWord; | |
262 } | |
263 | |
264 private String insertSpecialNWDMarks(String inputString) { | |
265 String COMPLEX_ELEMENT_NWD_MARK = new Character('\u2424').toString(); // not word delimiting element | |
266 String retStr = inputString; | |
267 boolean startsWithNWDMark = retStr.startsWith(COMPLEX_ELEMENT_NWD_MARK); | |
268 int countNWDMarks = retStr.length() - retStr.replaceAll(COMPLEX_ELEMENT_NWD_MARK, "").length(); | |
269 retStr = retStr.replaceAll(" ", COMPLEX_ELEMENT_NWD_MARK); | |
270 // if (notHyphenPlusNWD) | |
271 // normalizedWordStr = normalizedWordStr.replaceAll("-" + COMPLEX_ELEMENT_NWD_MARK, COMPLEX_ELEMENT_NWD_MARK); // e.g. "praebi- ta" is replaced by "praebi ta" | |
272 if (countNWDMarks > 1) { | |
273 String nwdStr = ""; | |
274 for (int i=0; i<countNWDMarks; i++) | |
275 nwdStr += COMPLEX_ELEMENT_NWD_MARK; | |
276 retStr = retStr.replaceAll(COMPLEX_ELEMENT_NWD_MARK, nwdStr); | |
277 } | |
278 if (startsWithNWDMark) | |
279 retStr = COMPLEX_ELEMENT_NWD_MARK + retStr; | |
280 return retStr; | |
281 } | |
282 | |
283 /** | |
284 * Old code from Arboreal (Malcolm Hyman) | |
285 * Applies the normalization rules in <code>language</code> to | |
286 * <code>s</code>, with offset tracking.<p> | |
287 * | |
288 * <strong>WARNING:</strong> | |
289 * Arboreal will not work properly if a normalization substitution | |
290 * replaces a source character with more than two target characters! | |
291 * This is simply a BUG, and should be fixed. Fortunately, however, | |
292 * one does not often need such a replacement.<p> | |
293 * | |
294 * @param s source string | |
295 * @param offsets character offset table | |
296 * @return normalized string | |
297 */ | |
298 private String normalize4Lexica(String s, int[] offsets) { | |
299 this.offsets = offsets; | |
300 if (language.equals("la") || language.equals("lat")) { | |
301 StringBuffer buf = new StringBuffer(); | |
302 int n = 0; | |
303 for (int i = 0; i < s.length(); i++) { | |
304 char c = s.charAt(i); | |
305 String replace = new String(); | |
306 switch (c) { | |
307 case 'j': replace = "i"; break; | |
308 case 'v': replace = "u"; break; | |
309 /* | |
310 * Linguistic note: /u/ and /v/ are rarely phonemic | |
311 * in Latin, as in alui 's/he nourished' vs. | |
312 * alvi 'of a belly', volui 's/he wished' or 'it rolled' | |
313 * vs. volvi 'to be rolled', (in)seruit 's/he joined | |
314 * together' vs. (in)servit 's/he serves'. | |
315 */ | |
316 case 'q': | |
317 if ((i < s.length() - 1) && (s.charAt(i + 1) == ';')) | |
318 replace = "qu"; | |
319 else | |
320 replace = "q"; | |
321 break; | |
322 case ';': | |
323 if ((i > 0) && (s.charAt(i - 1) == 'q')) | |
324 replace = "e"; | |
325 else if ((i == 0) || ! Character.isLetter(s.charAt(i - 1))) | |
326 replace = ";"; | |
327 else | |
328 replace = ""; | |
329 break; | |
330 case '\u0300': replace = ""; break; // COMBINING GRAVE ACCENT | |
331 case '\u0301': replace = ""; break; // COMBINING ACCUTE ACCENT | |
332 case '\u0302': replace = ""; break; // COMBINING CIRCUMFLEX ACCENT | |
333 | |
334 case '\u00c0': replace = "A"; break; // LATIN CAPITAL LETTER A GRAVE | |
335 case '\u00c1': replace = "A"; break; // LATIN CAPITAL LETTER A ACUTE | |
336 case '\u00c2': replace = "A"; break; // LATIN CAPITAL LETTER A CIRCUMFLEX | |
337 case '\u00c4': replace = "A"; break; // LATIN CAPITAL LETTER A DIAERESIS | |
338 case '\u00c6': replace = "Ae"; break; // LATIN CAPITAL LETTER A E | |
339 case '\u00c7': replace = "C"; break; // LATIN CAPITAL LETTER C CEDILLA | |
340 case '\u00c8': replace = "E"; break; // LATIN CAPITAL LETTER E GRAVE | |
341 case '\u00c9': replace = "E"; break; // LATIN CAPITAL LETTER E ACUTE | |
342 case '\u00ca': replace = "E"; break; // LATIN CAPITAL LETTER E CIRCUMFLEX | |
343 case '\u00cb': replace = "E"; break; // LATIN CAPITAL LETTER E DIAERESIS | |
344 case '\u00cc': replace = "I"; break; // LATIN CAPITAL LETTER I GRAVE; | |
345 case '\u00cd': replace = "I"; break; // LATIN CAPITAL LETTER I ACUTE | |
346 case '\u00ce': replace = "I"; break; // LATIN CAPITAL LETTER I CIRCUMFLEX | |
347 case '\u00cf': replace = "I"; break; // LATIN CAPITAL LETTER I DIAERESIS | |
348 case '\u00d2': replace = "O"; break; // LATIN CAPITAL LETTER O GRAVE | |
349 case '\u00d3': replace = "O"; break; // LATIN CAPITAL LETTER O ACUTE | |
350 case '\u00d4': replace = "O"; break; // LATIN CAPITAL LETTER O CIRCUMFLEX | |
351 case '\u00d6': replace = "O"; break; // LATIN CAPITAL LETTER O DIAERESIS | |
352 case '\u00d9': replace = "U"; break; // LATIN CAPITAL LETTER U GRAVE | |
353 case '\u00da': replace = "U"; break; // LATIN CAPITAL LETTER U ACUTE | |
354 case '\u00db': replace = "U"; break; // LATIN CAPITAL LETTER U CIRCUMFLEX | |
355 case '\u00dc': replace = "U"; break; // LATIN CAPITAL LETTER U DIAERESIS | |
356 case '\u00e0': replace = "a"; break; // LATIN SMALL LETTER A GRAVE | |
357 case '\u00e1': replace = "a"; break; // LATIN SMALL LETTER A ACUTE | |
358 case '\u00e2': replace = "a"; break; // LATIN SMALL LETTER A CIRCUMFLEX | |
359 case '\u00e4': replace = "a"; break; // LATIN SMALL LETTER A DIAERESIS | |
360 case '\u00e6': replace = "ae"; break; // LATIN SMALL LETTER A E | |
361 case '\u00e7': replace = "c"; break; // LATIN SMALL LETTER C CEDILLA | |
362 case '\u00e8': replace = "e"; break; // LATIN SMALL LETTER E GRAVE | |
363 case '\u00e9': replace = "e"; break; // LATIN SMALL LETTER E ACUTE | |
364 case '\u00ea': replace = "e"; break; // LATIN SMALL LETTER E CIRCUMFLEX | |
365 case '\u00eb': replace = "e"; break; // LATIN SMALL LETTER E DIAERESIS | |
366 case '\u00ec': replace = "i"; break; // LATIN SMALL LETTER I GRAVE | |
367 case '\u00ed': replace = "i"; break; // LATIN SMALL LETTER I ACUTE | |
368 case '\u00ee': replace = "i"; break; // LATIN SMALL LETTER I CIRCUMFLEX | |
369 case '\u00ef': replace = "i"; break; // LATIN SMALL LETTER I DIAERESIS | |
370 case '\u00f2': replace = "o"; break; // LATIN SMALL LETTER O GRAVE | |
371 case '\u00f3': replace = "o"; break; // LATIN SMALL LETTER O ACUTE | |
372 case '\u00f4': replace = "o"; break; // LATIN SMALL LETTER O CIRCUMFLEX | |
373 case '\u00f6': replace = "o"; break; // LATIN SMALL LETTER O DIAERESIS | |
374 case '\u00f9': replace = "u"; break; // LATIN SMALL LETTER U GRAVE | |
375 case '\u00fa': replace = "u"; break; // LATIN SMALL LETTER U ACUTE | |
376 case '\u00fb': replace = "u"; break; // LATIN SMALL LETTER U CIRCUMFLEX | |
377 case '\u00fc': replace = "u"; break; // LATIN SMALL LETTER U DIAERESIS | |
378 case '\u0100': replace = "A"; break; // LATIN CAPITAL LETTER A MACRON | |
379 case '\u0101': replace = "a"; break; // LATIN SMALL LETTER A MACRON | |
380 case '\u0102': replace = "A"; break; // LATIN CAPITAL LETTER A BREVE | |
381 case '\u0103': replace = "a"; break; // LATIN SMALL LETTER A BREVE | |
382 case '\u0112': replace = "E"; break; // LATIN CAPITAL LETTER E MACRON | |
383 case '\u0113': replace = "e"; break; // LATIN SMALL LETTER E MACRON | |
384 case '\u0114': replace = "E"; break; // LATIN CAPITAL LETTER E BREVE | |
385 case '\u0115': replace = "e"; break; // LATIN SMALL LETTER E BREVE | |
386 case '\u0118': replace = "Ae"; break; // LATIN CAPITAL LETTER E OGONEK | |
387 case '\u0119': replace = "ae"; break; // LATIN SMALL LETTER E OGONEK | |
388 case '\u012a': replace = "I"; break; // LATIN CAPITAL LETTER I MACRON | |
389 case '\u012b': replace = "i"; break; // LATIN SMALL LETTER I MACRON | |
390 case '\u012c': replace = "I"; break; // LATIN CAPITAL LETTER I BREVE | |
391 case '\u012d': replace = "i"; break; // LATIN SMALL LETTER I BREVE | |
392 case '\u014c': replace = "O"; break; // LATIN CAPITAL LETTER O MACRON | |
393 case '\u014d': replace = "o"; break; // LATIN SMALL LETTER O MACRON | |
394 case '\u014e': replace = "O"; break; // LATIN CAPITAL LETTER O BREVE | |
395 case '\u014f': replace = "o"; break; // LATIN SMALL LETTER O BREVE | |
396 case '\u0152': replace = "Oe"; break; // LATIN CAPITAL LETTER O E | |
397 case '\u0153': replace = "oe"; break; // LATIN SMALL LETTER O E | |
398 case '\u016a': replace = "U"; break; // LATIN CAPITAL LETTER U MACRON | |
399 case '\u016b': replace = "u"; break; // LATIN SMALL LETTER U MACRON | |
400 case '\u016c': replace = "U"; break; // LATIN CAPITAL LETTER U BREVE | |
401 case '\u016d': replace = "u"; break; // LATIN SMALL LETTER U BREVE | |
402 case '\u017f': replace = "s"; break; // LATIN SMALL LETTER LONG S | |
403 case '\u00df': replace = "ss"; break; // LATIN SMALL LETTER SHARP S | |
404 case '\u00ad': break; // soft hyphen | |
405 // new in MPDL project by J. Willenborg | |
406 case '\u1e14': replace = "e"; break; // LATIN ... LETTER E WITH ... | |
407 case '\u1e15': replace = "e"; break; // LATIN ... LETTER E WITH ... | |
408 case '\u1e16': replace = "e"; break; // LATIN ... LETTER E WITH ... | |
409 case '\u1e17': replace = "e"; break; // LATIN ... LETTER E WITH ... | |
410 case '\u1e18': replace = "e"; break; // LATIN ... LETTER E WITH ... | |
411 case '\u1e19': replace = "e"; break; // LATIN ... LETTER E WITH ... | |
412 case '\u1e1a': replace = "e"; break; // LATIN ... LETTER E WITH ... | |
413 case '\u1e1b': replace = "e"; break; // LATIN ... LETTER E WITH ... | |
414 case '\u1e1c': replace = "e"; break; // LATIN ... LETTER E WITH ... | |
415 case '\u1e1d': replace = "e"; break; // LATIN ... LETTER E WITH ... | |
416 case '\u1eb8': replace = "e"; break; // LATIN ... LETTER E WITH ... | |
417 case '\u1eb9': replace = "e"; break; // LATIN ... LETTER E WITH ... | |
418 case '\u1eba': replace = "e"; break; // LATIN ... LETTER E WITH ... | |
419 case '\u1ebb': replace = "e"; break; // LATIN ... LETTER E WITH ... | |
420 case '\u1ebc': replace = "e"; break; // LATIN ... LETTER E WITH ... | |
421 case '\u1ebd': replace = "e"; break; // LATIN ... LETTER E WITH ... | |
422 case '\u1ebe': replace = "e"; break; // LATIN ... LETTER E WITH ... | |
423 case '\u1ebf': replace = "e"; break; // LATIN ... LETTER E WITH ... | |
424 case '\u1ec0': replace = "e"; break; // LATIN ... LETTER E WITH ... | |
425 case '\u1ec1': replace = "e"; break; // LATIN ... LETTER E WITH ... | |
426 case '\u1ec2': replace = "e"; break; // LATIN ... LETTER E WITH ... | |
427 case '\u1ec3': replace = "e"; break; // LATIN ... LETTER E WITH ... | |
428 case '\u1ec4': replace = "e"; break; // LATIN ... LETTER E WITH ... | |
429 case '\u1ec5': replace = "e"; break; // LATIN ... LETTER E WITH ... | |
430 case '\u1ec6': replace = "e"; break; // LATIN ... LETTER E WITH ... | |
431 case '\u1ec7': replace = "e"; break; // LATIN ... LETTER E WITH ... | |
432 // by Malcolm | |
433 case '\u2329': break; // BRA | |
434 case '\u232a': break; // KET | |
435 default: replace += c; break; | |
436 } | |
437 buf.append(replace); | |
438 // update offsets if replacement is a different length | |
439 if (offsets != null) { | |
440 int r = replace.length(); | |
441 if (r == 0) | |
442 this.offsets = arrayKill(this.offsets, i - n); | |
443 else if (r == 2) | |
444 this.offsets = arrayInsert(this.offsets, i - n + 1, this.offsets[i - n], r - 1); | |
445 n += 1 - r; | |
446 } | |
447 } | |
448 return buf.toString(); | |
449 } else if (language.equals("it")) { | |
450 // new Mpdl code: added by J. Willenborg: some of Malcolms code did not work without errors so it has to be taken away, also all latin stuff is imported | |
451 StringBuffer buf = new StringBuffer(); | |
452 int n = 0; | |
453 for (int i = 0; i < s.length(); i++) { | |
454 char c = s.charAt(i); | |
455 String replace = new String(); | |
456 switch (c) { | |
457 case '\u00c0': replace = "A"; break; // LATIN CAPITAL LETTER A GRAVE | |
458 case '\u00c1': replace = "A"; break; // LATIN CAPITAL LETTER A ACUTE | |
459 case '\u00c2': replace = "A"; break; // LATIN CAPITAL LETTER A CIRCUMFLEX | |
460 case '\u00c4': replace = "A"; break; // LATIN CAPITAL LETTER A DIAERESIS | |
461 case '\u00c6': replace = "Ae"; break; // LATIN CAPITAL LETTER A E | |
462 case '\u00c7': replace = "C"; break; // LATIN CAPITAL LETTER C CEDILLA | |
463 case '\u00c8': replace = "E"; break; // LATIN CAPITAL LETTER E GRAVE | |
464 case '\u00c9': replace = "E"; break; // LATIN CAPITAL LETTER E ACUTE | |
465 case '\u00ca': replace = "E"; break; // LATIN CAPITAL LETTER E CIRCUMFLEX | |
466 case '\u00cb': replace = "E"; break; // LATIN CAPITAL LETTER E DIAERESIS | |
467 case '\u00cc': replace = "I"; break; // LATIN CAPITAL LETTER I GRAVE; | |
468 case '\u00cd': replace = "I"; break; // LATIN CAPITAL LETTER I ACUTE | |
469 case '\u00ce': replace = "I"; break; // LATIN CAPITAL LETTER I CIRCUMFLEX | |
470 case '\u00cf': replace = "I"; break; // LATIN CAPITAL LETTER I DIAERESIS | |
471 case '\u00d2': replace = "O"; break; // LATIN CAPITAL LETTER O GRAVE | |
472 case '\u00d3': replace = "O"; break; // LATIN CAPITAL LETTER O ACUTE | |
473 case '\u00d4': replace = "O"; break; // LATIN CAPITAL LETTER O CIRCUMFLEX | |
474 case '\u00d6': replace = "O"; break; // LATIN CAPITAL LETTER O DIAERESIS | |
475 case '\u00d9': replace = "U"; break; // LATIN CAPITAL LETTER U GRAVE | |
476 case '\u00da': replace = "U"; break; // LATIN CAPITAL LETTER U ACUTE | |
477 case '\u00db': replace = "U"; break; // LATIN CAPITAL LETTER U CIRCUMFLEX | |
478 case '\u00dc': replace = "U"; break; // LATIN CAPITAL LETTER U DIAERESIS | |
479 case '\u00e0': replace = "a"; break; // LATIN SMALL LETTER A GRAVE | |
480 case '\u00e1': replace = "a"; break; // LATIN SMALL LETTER A ACUTE | |
481 case '\u00e2': replace = "a"; break; // LATIN SMALL LETTER A CIRCUMFLEX | |
482 case '\u00e4': replace = "a"; break; // LATIN SMALL LETTER A DIAERESIS | |
483 case '\u00e6': replace = "ae"; break; // LATIN SMALL LETTER A E | |
484 case '\u00e7': replace = "c"; break; // LATIN SMALL LETTER C CEDILLA | |
485 case '\u00e8': replace = "e"; break; // LATIN SMALL LETTER E GRAVE | |
486 case '\u00e9': replace = "e"; break; // LATIN SMALL LETTER E ACUTE | |
487 case '\u00ea': replace = "e"; break; // LATIN SMALL LETTER E CIRCUMFLEX | |
488 case '\u00eb': replace = "e"; break; // LATIN SMALL LETTER E DIAERESIS | |
489 case '\u00ec': replace = "i"; break; // LATIN SMALL LETTER I GRAVE | |
490 case '\u00ed': replace = "i"; break; // LATIN SMALL LETTER I ACUTE | |
491 case '\u00ee': replace = "i"; break; // LATIN SMALL LETTER I CIRCUMFLEX | |
492 case '\u00ef': replace = "i"; break; // LATIN SMALL LETTER I DIAERESIS | |
493 case '\u00f2': replace = "o"; break; // LATIN SMALL LETTER O GRAVE | |
494 case '\u00f3': replace = "o"; break; // LATIN SMALL LETTER O ACUTE | |
495 case '\u00f4': replace = "o"; break; // LATIN SMALL LETTER O CIRCUMFLEX | |
496 case '\u00f6': replace = "o"; break; // LATIN SMALL LETTER O DIAERESIS | |
497 case '\u00f9': replace = "u"; break; // LATIN SMALL LETTER U GRAVE | |
498 case '\u00fa': replace = "u"; break; // LATIN SMALL LETTER U ACUTE | |
499 case '\u00fb': replace = "u"; break; // LATIN SMALL LETTER U CIRCUMFLEX | |
500 case '\u00fc': replace = "u"; break; // LATIN SMALL LETTER U DIAERESIS | |
501 case '\u0100': replace = "A"; break; // LATIN CAPITAL LETTER A MACRON | |
502 case '\u0101': replace = "a"; break; // LATIN SMALL LETTER A MACRON | |
503 case '\u0102': replace = "A"; break; // LATIN CAPITAL LETTER A BREVE | |
504 case '\u0103': replace = "a"; break; // LATIN SMALL LETTER A BREVE | |
505 case '\u0112': replace = "E"; break; // LATIN CAPITAL LETTER E MACRON | |
506 case '\u0113': replace = "e"; break; // LATIN SMALL LETTER E MACRON | |
507 case '\u0114': replace = "E"; break; // LATIN CAPITAL LETTER E BREVE | |
508 case '\u0115': replace = "e"; break; // LATIN SMALL LETTER E BREVE | |
509 case '\u0118': replace = "Ae"; break; // LATIN CAPITAL LETTER E OGONEK | |
510 case '\u0119': replace = "ae"; break; // LATIN SMALL LETTER E OGONEK | |
511 case '\u012a': replace = "I"; break; // LATIN CAPITAL LETTER I MACRON | |
512 case '\u012b': replace = "i"; break; // LATIN SMALL LETTER I MACRON | |
513 case '\u012c': replace = "I"; break; // LATIN CAPITAL LETTER I BREVE | |
514 case '\u012d': replace = "i"; break; // LATIN SMALL LETTER I BREVE | |
515 case '\u014c': replace = "O"; break; // LATIN CAPITAL LETTER O MACRON | |
516 case '\u014d': replace = "o"; break; // LATIN SMALL LETTER O MACRON | |
517 case '\u014e': replace = "O"; break; // LATIN CAPITAL LETTER O BREVE | |
518 case '\u014f': replace = "o"; break; // LATIN SMALL LETTER O BREVE | |
519 case '\u0152': replace = "Oe"; break; // LATIN CAPITAL LETTER O E | |
520 case '\u0153': replace = "oe"; break; // LATIN SMALL LETTER O E | |
521 case '\u016a': replace = "U"; break; // LATIN CAPITAL LETTER U MACRON | |
522 case '\u016b': replace = "u"; break; // LATIN SMALL LETTER U MACRON | |
523 case '\u016c': replace = "U"; break; // LATIN CAPITAL LETTER U BREVE | |
524 case '\u016d': replace = "u"; break; // LATIN SMALL LETTER U BREVE | |
525 case '\u017f': replace = "s"; break; // LATIN SMALL LETTER LONG S | |
526 case '\u00df': replace = "ss"; break; // LATIN SMALL LETTER SHARP S | |
527 // new in MPDL project by J. Willenborg | |
528 case '\u1e8d': replace = "e"; break; // LATIN SMALL LETTER E WITH TILDE | |
529 default: replace += c; break; | |
530 } | |
531 buf.append(replace); | |
532 // update offsets if replacement is a different length | |
533 if (offsets != null) { | |
534 int r = replace.length(); | |
535 if (r == 0) this.offsets = arrayKill(this.offsets, i - n); | |
536 else if (r == 2) | |
537 this.offsets = arrayInsert(this.offsets, i - n + 1, this.offsets[i - n], r - 1); | |
538 n += 1 - r; | |
539 } | |
540 } | |
541 return buf.toString(); | |
542 // new Mpdl code: added by J. Willenborg: most of the latin replacements also in english | |
543 } else if (language.equals("en")) { | |
544 StringBuffer buf = new StringBuffer(); | |
545 int n = 0; | |
546 for (int i = 0; i < s.length(); i++) { | |
547 char c = s.charAt(i); | |
548 String replace = new String(); | |
549 switch (c) { | |
550 case '\u0300': replace = ""; break; // COMBINING GRAVE ACCENT | |
551 case '\u0301': replace = ""; break; // COMBINING ACCUTE ACCENT | |
552 case '\u0302': replace = ""; break; // COMBINING CIRCUMFLEX ACCENT | |
553 | |
554 case '\u00c0': replace = "A"; break; // LATIN CAPITAL LETTER A GRAVE | |
555 case '\u00c1': replace = "A"; break; // LATIN CAPITAL LETTER A ACUTE | |
556 case '\u00c2': replace = "A"; break; // LATIN CAPITAL LETTER A CIRCUMFLEX | |
557 case '\u00c4': replace = "A"; break; // LATIN CAPITAL LETTER A DIAERESIS | |
558 case '\u00c6': replace = "Ae"; break; // LATIN CAPITAL LETTER A E | |
559 case '\u00c7': replace = "C"; break; // LATIN CAPITAL LETTER C CEDILLA | |
560 case '\u00c8': replace = "E"; break; // LATIN CAPITAL LETTER E GRAVE | |
561 case '\u00c9': replace = "E"; break; // LATIN CAPITAL LETTER E ACUTE | |
562 case '\u00ca': replace = "E"; break; // LATIN CAPITAL LETTER E CIRCUMFLEX | |
563 case '\u00cb': replace = "E"; break; // LATIN CAPITAL LETTER E DIAERESIS | |
564 case '\u00cc': replace = "I"; break; // LATIN CAPITAL LETTER I GRAVE; | |
565 case '\u00cd': replace = "I"; break; // LATIN CAPITAL LETTER I ACUTE | |
566 case '\u00ce': replace = "I"; break; // LATIN CAPITAL LETTER I CIRCUMFLEX | |
567 case '\u00cf': replace = "I"; break; // LATIN CAPITAL LETTER I DIAERESIS | |
568 case '\u00d2': replace = "O"; break; // LATIN CAPITAL LETTER O GRAVE | |
569 case '\u00d3': replace = "O"; break; // LATIN CAPITAL LETTER O ACUTE | |
570 case '\u00d4': replace = "O"; break; // LATIN CAPITAL LETTER O CIRCUMFLEX | |
571 case '\u00d6': replace = "O"; break; // LATIN CAPITAL LETTER O DIAERESIS | |
572 case '\u00d9': replace = "U"; break; // LATIN CAPITAL LETTER U GRAVE | |
573 case '\u00da': replace = "U"; break; // LATIN CAPITAL LETTER U ACUTE | |
574 case '\u00db': replace = "U"; break; // LATIN CAPITAL LETTER U CIRCUMFLEX | |
575 case '\u00dc': replace = "U"; break; // LATIN CAPITAL LETTER U DIAERESIS | |
576 case '\u00e0': replace = "a"; break; // LATIN SMALL LETTER A GRAVE | |
577 case '\u00e1': replace = "a"; break; // LATIN SMALL LETTER A ACUTE | |
578 case '\u00e2': replace = "a"; break; // LATIN SMALL LETTER A CIRCUMFLEX | |
579 case '\u00e4': replace = "a"; break; // LATIN SMALL LETTER A DIAERESIS | |
580 case '\u00e6': replace = "ae"; break; // LATIN SMALL LETTER A E | |
581 case '\u00e7': replace = "c"; break; // LATIN SMALL LETTER C CEDILLA | |
582 case '\u00e8': replace = "e"; break; // LATIN SMALL LETTER E GRAVE | |
583 case '\u00e9': replace = "e"; break; // LATIN SMALL LETTER E ACUTE | |
584 case '\u00ea': replace = "e"; break; // LATIN SMALL LETTER E CIRCUMFLEX | |
585 case '\u00eb': replace = "e"; break; // LATIN SMALL LETTER E DIAERESIS | |
586 case '\u00ec': replace = "i"; break; // LATIN SMALL LETTER I GRAVE | |
587 case '\u00ed': replace = "i"; break; // LATIN SMALL LETTER I ACUTE | |
588 case '\u00ee': replace = "i"; break; // LATIN SMALL LETTER I CIRCUMFLEX | |
589 case '\u00ef': replace = "i"; break; // LATIN SMALL LETTER I DIAERESIS | |
590 case '\u00f2': replace = "o"; break; // LATIN SMALL LETTER O GRAVE | |
591 case '\u00f3': replace = "o"; break; // LATIN SMALL LETTER O ACUTE | |
592 case '\u00f4': replace = "o"; break; // LATIN SMALL LETTER O CIRCUMFLEX | |
593 case '\u00f6': replace = "o"; break; // LATIN SMALL LETTER O DIAERESIS | |
594 case '\u00f9': replace = "u"; break; // LATIN SMALL LETTER U GRAVE | |
595 case '\u00fa': replace = "u"; break; // LATIN SMALL LETTER U ACUTE | |
596 case '\u00fb': replace = "u"; break; // LATIN SMALL LETTER U CIRCUMFLEX | |
597 case '\u00fc': replace = "u"; break; // LATIN SMALL LETTER U DIAERESIS | |
598 case '\u0100': replace = "A"; break; // LATIN CAPITAL LETTER A MACRON | |
599 case '\u0101': replace = "a"; break; // LATIN SMALL LETTER A MACRON | |
600 case '\u0102': replace = "A"; break; // LATIN CAPITAL LETTER A BREVE | |
601 case '\u0103': replace = "a"; break; // LATIN SMALL LETTER A BREVE | |
602 case '\u0112': replace = "E"; break; // LATIN CAPITAL LETTER E MACRON | |
603 case '\u0113': replace = "e"; break; // LATIN SMALL LETTER E MACRON | |
604 case '\u0114': replace = "E"; break; // LATIN CAPITAL LETTER E BREVE | |
605 case '\u0115': replace = "e"; break; // LATIN SMALL LETTER E BREVE | |
606 case '\u0118': replace = "Ae"; break; // LATIN CAPITAL LETTER E OGONEK | |
607 case '\u0119': replace = "ae"; break; // LATIN SMALL LETTER E OGONEK | |
608 case '\u012a': replace = "I"; break; // LATIN CAPITAL LETTER I MACRON | |
609 case '\u012b': replace = "i"; break; // LATIN SMALL LETTER I MACRON | |
610 case '\u012c': replace = "I"; break; // LATIN CAPITAL LETTER I BREVE | |
611 case '\u012d': replace = "i"; break; // LATIN SMALL LETTER I BREVE | |
612 case '\u014c': replace = "O"; break; // LATIN CAPITAL LETTER O MACRON | |
613 case '\u014d': replace = "o"; break; // LATIN SMALL LETTER O MACRON | |
614 case '\u014e': replace = "O"; break; // LATIN CAPITAL LETTER O BREVE | |
615 case '\u014f': replace = "o"; break; // LATIN SMALL LETTER O BREVE | |
616 case '\u0152': replace = "Oe"; break; // LATIN CAPITAL LETTER O E | |
617 case '\u0153': replace = "oe"; break; // LATIN SMALL LETTER O E | |
618 case '\u016a': replace = "U"; break; // LATIN CAPITAL LETTER U MACRON | |
619 case '\u016b': replace = "u"; break; // LATIN SMALL LETTER U MACRON | |
620 case '\u016c': replace = "U"; break; // LATIN CAPITAL LETTER U BREVE | |
621 case '\u016d': replace = "u"; break; // LATIN SMALL LETTER U BREVE | |
622 case '\u017f': replace = "s"; break; // LATIN SMALL LETTER LONG S | |
623 case '\u00df': replace = "ss"; break; // LATIN SMALL LETTER SHARP S | |
624 // new in MPDL project by J. Willenborg | |
625 case '\u1e8d': replace = "e"; break; // LATIN SMALL LETTER E WITH TILDE | |
626 // by Malcolm | |
627 case '\u00ad': break; // soft hyphen | |
628 case '\u2329': break; // BRA | |
629 case '\u232a': break; // KET | |
630 default: replace += c; break; | |
631 } | |
632 buf.append(replace); | |
633 // update offsets if replacement is a different length | |
634 if (offsets != null) { | |
635 int r = replace.length(); | |
636 if (r == 0) | |
637 this.offsets = arrayKill(this.offsets, i - n); | |
638 else if (r == 2) | |
639 this.offsets = arrayInsert(this.offsets, i - n + 1, this.offsets[i - n], r - 1); | |
640 n += 1 - r; | |
641 } | |
642 } | |
643 return buf.toString(); | |
644 } else if (language.equals("fr")) { | |
645 // new Mpdl code: added by J. Willenborg: some of Malcolms code did not work without errors so it has to be taken away | |
646 StringBuffer buf = new StringBuffer(); | |
647 int n = 0; | |
648 for (int i = 0; i < s.length(); i++) { | |
649 char c = s.charAt(i); | |
650 String replace = new String(); | |
651 switch (c) { | |
652 case '\u00e6': replace = "ae"; break; // LATIN SMALL LETTER A E | |
653 case '\u017f': replace = "s"; break; // LATIN SMALL LETTER LONG S | |
654 case '\u00df': replace = "ss"; break; // LATIN SMALL LETTER SHARP S | |
655 case '\u00ad': break; // soft hyphen | |
656 case '-': break; | |
657 default: replace += c; break; | |
658 } | |
659 buf.append(replace); | |
660 // update offsets if replacement is a different length | |
661 if (offsets != null) { | |
662 int r = replace.length(); | |
663 if (r == 0) | |
664 this.offsets = arrayKill(this.offsets, i - n); | |
665 else if (r == 2) | |
666 this.offsets = arrayInsert(this.offsets, i - n + 1, this.offsets[i - n], r - 1); | |
667 n += 1 - r; | |
668 } | |
669 } | |
670 return buf.toString(); | |
671 } else if (language.equals("de")) { | |
672 StringBuffer buf = new StringBuffer(); | |
673 int n = 0; | |
674 for (int i = 0; i < s.length(); i++) { | |
675 char c = s.charAt(i); | |
676 String replace = new String(); | |
677 switch (c) { | |
678 case '\u00c4': replace = "Ae"; break; | |
679 case '\u00d6': replace = "Oe"; break; | |
680 case '\u00dc': replace = "Ue"; break; | |
681 case '\u00df': replace = "ss"; break; | |
682 case '\u00e4': replace = "ae"; break; | |
683 case '\u00f6': replace = "oe"; break; | |
684 case '\u00fc': replace = "ue"; break; | |
685 case '\u00ad': break; // soft hyphen | |
686 case '\u00e9': replace = "e"; break; | |
687 // new in MPDL project by J. Willenborg | |
688 case '\u017f': replace = "s"; break; // LATIN SMALL LETTER LONG S | |
689 // case '-': break; | |
690 default: replace += c; break; | |
691 } | |
692 buf.append(replace); | |
693 // update offsets if replacement is a different length | |
694 if (offsets != null) { | |
695 int r = replace.length(); | |
696 if (r == 0) | |
697 this.offsets = arrayKill(this.offsets, i - n); | |
698 else if (r == 2) | |
699 this.offsets = arrayInsert(this.offsets, i - n + 1, this.offsets[i - n], r - 1); | |
700 n += 1 - r; | |
701 } | |
702 } | |
703 return buf.toString(); | |
704 } else if (language.equals("zh")) { | |
705 StringBuffer buf = new StringBuffer(); | |
706 int n = 0; | |
707 for (int i = 0; i < s.length(); i++) { | |
708 char c = s.charAt(i); | |
709 String replace = new String(); | |
710 switch (c) { | |
711 case '\u00b9': replace = "1"; break; | |
712 case '\u00b2': replace = "2"; break; | |
713 case '\u00b3': replace = "3"; break; | |
714 case '\u2074': replace = "4"; break; | |
715 case '\u2075': replace = "5"; break; | |
716 // original by Malcolm Hyman: with the following replacements | |
717 // case '\u3000': replace = " "; break; | |
718 // case '\u3001': replace = ","; break; | |
719 // case '\u3002': replace = "."; break; | |
720 // case '\u200b': break; // BREAKS EVERYTHING! | |
721 default: replace += c; break; | |
722 } | |
723 buf.append(replace); | |
724 // update offsets if replacement is a different length | |
725 if (offsets != null) { | |
726 int r = replace.length(); | |
727 if (r == 0) | |
728 this.offsets = arrayKill(this.offsets, i - n); | |
729 else if (r == 2) | |
730 this.offsets = arrayInsert(this.offsets, i - n + 1, this.offsets[i - n], r - 1); | |
731 n += 1 - r; | |
732 } | |
733 } | |
734 return buf.toString(); | |
735 } else if (language.equals("akk") || | |
736 language.equals("qam") || | |
737 language.equals("qpc") || | |
738 language.equals("elx") || | |
739 language.equals("sux") || | |
740 language.equals("hit") || | |
741 language.equals("qhu") || | |
742 language.equals("peo") || | |
743 language.equals("uga") || | |
744 language.equals("ura") || | |
745 language.equals("qcu")) { | |
746 StringBuffer buf = new StringBuffer(); | |
747 int n = 0; | |
748 char last = '\u0000'; | |
749 for (int i = 0; i < s.length(); i++) { | |
750 char c = s.charAt(i); | |
751 c = Character.toLowerCase(c); | |
752 String replace = new String(); | |
753 switch (c) { | |
754 case '{': replace += "-"; break; | |
755 case '}': replace += "-"; break; | |
756 // These are from PSD::ATF::Unicode by Steve Tinney | |
757 case '\u0161': replace += "sz"; break; | |
758 case '\u1e63': replace += "s,"; break; | |
759 case '\u1e6d': replace += "t,"; break; | |
760 case '\u014b': replace += "j"; break; | |
761 case '\u015b': replace += "s'"; break; | |
762 case '\u2080': replace += "0"; break; | |
763 case '\u2081': replace += "1"; break; | |
764 case '\u2082': replace += "2"; break; | |
765 case '\u2083': replace += "3"; break; | |
766 case '\u2084': replace += "4"; break; | |
767 case '\u2085': replace += "5"; break; | |
768 case '\u2086': replace += "6"; break; | |
769 case '\u2087': replace += "7"; break; | |
770 case '\u2088': replace += "8"; break; | |
771 case '\u2089': replace += "9"; break; | |
772 | |
773 case 'c': // shin (except where used as modifier) | |
774 if ((i > 0) && ((last == '~') || (last == '@'))) | |
775 replace += "c"; | |
776 else replace += "sz"; | |
777 break; | |
778 default: replace += c; break; | |
779 } | |
780 // suppress grapheme boundary before or after word boundary | |
781 if (replace.equals("-")) { | |
782 if ((i + 1 == s.length()) || (s.charAt(i + 1) == ' ') || (i == 0) || (s.charAt(i - 1) == ' ')) | |
783 replace = ""; | |
784 } | |
785 last = c; | |
786 buf.append(replace); | |
787 // update offsets if replacement is a different length | |
788 if (offsets != null) { | |
789 int r = replace.length(); | |
790 if (r == 0) | |
791 this.offsets = arrayKill(this.offsets, i - n); | |
792 else if (r == 2) | |
793 this.offsets = arrayInsert(this.offsets, i - n + 1, this.offsets[i - n], r - 1); | |
794 n += 1 - r; | |
795 } | |
796 } | |
797 return buf.toString(); | |
798 } else if (language.equals("el") || language.equals("grc")) { | |
799 StringBuffer buf = new StringBuffer(); | |
800 int n = 0; | |
801 for (int i = 0; i < s.length(); i++) { | |
802 char c = s.charAt(i); | |
803 String replace = new String(); | |
804 switch (c) { | |
805 case '\u03c2': replace = "\u03c3"; break; // GREEK SMALL LETTER FINAL SIGMA | |
806 case '<': break; | |
807 case '>': break; | |
808 case '[': break; | |
809 case ']': break; | |
810 case '1': break; | |
811 case '2': break; | |
812 case '\u03ac': replace = "\u1f71"; break; | |
813 case '\u03ad': replace = "\u1f73"; break; | |
814 case '\u03ae': replace = "\u1f75"; break; | |
815 case '\u03af': replace = "\u1f77"; break; | |
816 case '\u03cc': replace = "\u1f79"; break; | |
817 case '\u03cd': replace = "\u1f7b"; break; | |
818 case '\u03ce': replace = "\u1f7d"; break; | |
819 case '-': break; // same treatment as soft hyphen | |
820 case '\u00ad': break; // soft hyphen | |
821 default: replace += c; break; | |
822 } | |
823 buf.append(replace); | |
824 // update offsets if replacement is a different length | |
825 if (offsets != null) { | |
826 int r = replace.length(); | |
827 if (r == 0) | |
828 this.offsets = arrayKill(this.offsets, i - n); | |
829 else if (r == 2) | |
830 this.offsets = arrayInsert(this.offsets, i - n + 1, this.offsets[i - n], r - 1); | |
831 n += 1 - r; | |
832 } | |
833 } | |
834 return buf.toString(); | |
835 } else if (language.equals("el_atonic")) { | |
836 StringBuffer buf = new StringBuffer(); | |
837 int n = 0; | |
838 for (int i = 0; i < s.length(); i++) { | |
839 char c = s.charAt(i); | |
840 String replace = new String(); | |
841 switch (c) { | |
842 case '\u03c2': replace = "\u03c3"; break; // GREEK SMALL LETTER FINAL SIGMA | |
843 // map characters with diacritics to their plain equivalent | |
844 // cf. <code>BetaCode.java</code> | |
845 case '\u03aa': replace = "\u0399"; break; | |
846 case '\u03ab': replace = "\u03a5"; break; | |
847 case '\u03ac': replace = "\u0381"; break; | |
848 case '\u03ad': replace = "\u0385"; break; | |
849 case '\u03ae': replace = "\u0387"; break; | |
850 case '\u03af': replace = "\u0389"; break; | |
851 case '\u03ca': replace = "\u03b9"; break; | |
852 case '\u03cb': replace = "\u03c5"; break; | |
853 case '\u03cc': replace = "\u03bf"; break; | |
854 case '\u03cd': replace = "\u03c5"; break; | |
855 case '\u03ce': replace = "\u03c9"; break; | |
856 case '\u1f00': replace = "\u03b1"; break; | |
857 case '\u1f01': replace = "\u03b1"; break; | |
858 case '\u1f02': replace = "\u03b1"; break; | |
859 case '\u1f03': replace = "\u03b1"; break; | |
860 case '\u1f04': replace = "\u03b1"; break; | |
861 case '\u1f05': replace = "\u03b1"; break; | |
862 case '\u1f06': replace = "\u03b1"; break; | |
863 case '\u1f07': replace = "\u03b1"; break; | |
864 case '\u1f08': replace = "\u0391"; break; | |
865 case '\u1f09': replace = "\u0391"; break; | |
866 case '\u1f0a': replace = "\u0391"; break; | |
867 case '\u1f0b': replace = "\u0391"; break; | |
868 case '\u1f0c': replace = "\u0391"; break; | |
869 case '\u1f0d': replace = "\u0391"; break; | |
870 case '\u1f0e': replace = "\u0391"; break; | |
871 case '\u1f0f': replace = "\u0391"; break; | |
872 case '\u1f10': replace = "\u03b5"; break; | |
873 case '\u1f11': replace = "\u03b5"; break; | |
874 case '\u1f12': replace = "\u03b5"; break; | |
875 case '\u1f13': replace = "\u03b5"; break; | |
876 case '\u1f14': replace = "\u03b5"; break; | |
877 case '\u1f15': replace = "\u03b5"; break; | |
878 case '\u1f18': replace = "\u0395"; break; | |
879 case '\u1f19': replace = "\u0395"; break; | |
880 case '\u1f1a': replace = "\u0395"; break; | |
881 case '\u1f1b': replace = "\u0395"; break; | |
882 case '\u1f1c': replace = "\u0395"; break; | |
883 case '\u1f1d': replace = "\u0395"; break; | |
884 case '\u1f20': replace = "\u03b7"; break; | |
885 case '\u1f21': replace = "\u03b7"; break; | |
886 case '\u1f22': replace = "\u03b7"; break; | |
887 case '\u1f23': replace = "\u03b7"; break; | |
888 case '\u1f24': replace = "\u03b7"; break; | |
889 case '\u1f25': replace = "\u03b7"; break; | |
890 case '\u1f26': replace = "\u03b7"; break; | |
891 case '\u1f27': replace = "\u03b7"; break; | |
892 case '\u1f28': replace = "\u0397"; break; | |
893 case '\u1f29': replace = "\u0397"; break; | |
894 case '\u1f2a': replace = "\u0397"; break; | |
895 case '\u1f2b': replace = "\u0397"; break; | |
896 case '\u1f2c': replace = "\u0397"; break; | |
897 case '\u1f2d': replace = "\u0397"; break; | |
898 case '\u1f2e': replace = "\u0397"; break; | |
899 case '\u1f2f': replace = "\u0397"; break; | |
900 case '\u1f30': replace = "\u03b9"; break; | |
901 case '\u1f31': replace = "\u03b9"; break; | |
902 case '\u1f32': replace = "\u03b9"; break; | |
903 case '\u1f33': replace = "\u03b9"; break; | |
904 case '\u1f34': replace = "\u03b9"; break; | |
905 case '\u1f35': replace = "\u03b9"; break; | |
906 case '\u1f36': replace = "\u03b9"; break; | |
907 case '\u1f37': replace = "\u03b9"; break; | |
908 case '\u1f38': replace = "\u0399"; break; | |
909 case '\u1f39': replace = "\u0399"; break; | |
910 case '\u1f3a': replace = "\u0399"; break; | |
911 case '\u1f3b': replace = "\u0399"; break; | |
912 case '\u1f3c': replace = "\u0399"; break; | |
913 case '\u1f3d': replace = "\u0399"; break; | |
914 case '\u1f3e': replace = "\u0399"; break; | |
915 case '\u1f3f': replace = "\u0399"; break; | |
916 case '\u1f40': replace = "\u03bf"; break; | |
917 case '\u1f41': replace = "\u03bf"; break; | |
918 case '\u1f42': replace = "\u03bf"; break; | |
919 case '\u1f43': replace = "\u03bf"; break; | |
920 case '\u1f44': replace = "\u03bf"; break; | |
921 case '\u1f45': replace = "\u03bf"; break; | |
922 case '\u1f48': replace = "\u039f"; break; | |
923 case '\u1f49': replace = "\u039f"; break; | |
924 case '\u1f4a': replace = "\u039f"; break; | |
925 case '\u1f4b': replace = "\u039f"; break; | |
926 case '\u1f4c': replace = "\u039f"; break; | |
927 case '\u1f4d': replace = "\u039f"; break; | |
928 case '\u1f50': replace = "\u03c5"; break; | |
929 case '\u1f51': replace = "\u03c5"; break; | |
930 case '\u1f52': replace = "\u03c5"; break; | |
931 case '\u1f53': replace = "\u03c5"; break; | |
932 case '\u1f54': replace = "\u03c5"; break; | |
933 case '\u1f55': replace = "\u03c5"; break; | |
934 case '\u1f56': replace = "\u03c5"; break; | |
935 case '\u1f57': replace = "\u03c5"; break; | |
936 case '\u1f58': replace = "\u03a5"; break; | |
937 case '\u1f59': replace = "\u03a5"; break; | |
938 case '\u1f5a': replace = "\u03a5"; break; | |
939 case '\u1f5b': replace = "\u03a5"; break; | |
940 case '\u1f5c': replace = "\u03a5"; break; | |
941 case '\u1f5d': replace = "\u03a5"; break; | |
942 case '\u1f5e': replace = "\u03a5"; break; | |
943 case '\u1f5f': replace = "\u03a5"; break; | |
944 case '\u1f60': replace = "\u03c9"; break; | |
945 case '\u1f61': replace = "\u03c9"; break; | |
946 case '\u1f62': replace = "\u03c9"; break; | |
947 case '\u1f63': replace = "\u03c9"; break; | |
948 case '\u1f64': replace = "\u03c9"; break; | |
949 case '\u1f65': replace = "\u03c9"; break; | |
950 case '\u1f66': replace = "\u03c9"; break; | |
951 case '\u1f67': replace = "\u03c9"; break; | |
952 case '\u1f68': replace = "\u03a9"; break; | |
953 case '\u1f69': replace = "\u03a9"; break; | |
954 case '\u1f6a': replace = "\u03a9"; break; | |
955 case '\u1f6b': replace = "\u03a9"; break; | |
956 case '\u1f6c': replace = "\u03a9"; break; | |
957 case '\u1f6d': replace = "\u03a9"; break; | |
958 case '\u1f6e': replace = "\u03a9"; break; | |
959 case '\u1f6f': replace = "\u03a9"; break; | |
960 case '\u1f70': replace = "\u03b1"; break; | |
961 case '\u1f71': replace = "\u03b1"; break; | |
962 case '\u1f72': replace = "\u03b5"; break; | |
963 case '\u1f73': replace = "\u03b5"; break; | |
964 case '\u1f74': replace = "\u03b7"; break; | |
965 case '\u1f75': replace = "\u03b7"; break; | |
966 case '\u1f76': replace = "\u03b9"; break; | |
967 case '\u1f77': replace = "\u03b9"; break; | |
968 case '\u1f78': replace = "\u03bf"; break; | |
969 case '\u1f79': replace = "\u03bf"; break; | |
970 case '\u1f7a': replace = "\u03c5"; break; | |
971 case '\u1f7b': replace = "\u03c5"; break; | |
972 case '\u1f7c': replace = "\u03c9"; break; | |
973 case '\u1f7d': replace = "\u03c9"; break; | |
974 case '\u1f80': replace = "\u03b1"; break; | |
975 case '\u1f81': replace = "\u03b1"; break; | |
976 case '\u1f82': replace = "\u03b1"; break; | |
977 case '\u1f83': replace = "\u03b1"; break; | |
978 case '\u1f84': replace = "\u03b1"; break; | |
979 case '\u1f85': replace = "\u03b1"; break; | |
980 case '\u1f86': replace = "\u03b1"; break; | |
981 case '\u1f87': replace = "\u03b1"; break; | |
982 case '\u1f88': replace = "\u0391"; break; | |
983 case '\u1f89': replace = "\u0391"; break; | |
984 case '\u1f8a': replace = "\u0391"; break; | |
985 case '\u1f8b': replace = "\u0391"; break; | |
986 case '\u1f8c': replace = "\u0391"; break; | |
987 case '\u1f8d': replace = "\u0391"; break; | |
988 case '\u1f8e': replace = "\u0391"; break; | |
989 case '\u1f8f': replace = "\u0391"; break; | |
990 case '\u1f90': replace = "\u03b7"; break; | |
991 case '\u1f91': replace = "\u03b7"; break; | |
992 case '\u1f92': replace = "\u03b7"; break; | |
993 case '\u1f93': replace = "\u03b7"; break; | |
994 case '\u1f94': replace = "\u03b7"; break; | |
995 case '\u1f95': replace = "\u03b7"; break; | |
996 case '\u1f96': replace = "\u03b7"; break; | |
997 case '\u1f97': replace = "\u03b7"; break; | |
998 case '\u1f98': replace = "\u0397"; break; | |
999 case '\u1f99': replace = "\u0397"; break; | |
1000 case '\u1f9a': replace = "\u0397"; break; | |
1001 case '\u1f9b': replace = "\u0397"; break; | |
1002 case '\u1f9c': replace = "\u0397"; break; | |
1003 case '\u1f9d': replace = "\u0397"; break; | |
1004 case '\u1f9e': replace = "\u0397"; break; | |
1005 case '\u1f9f': replace = "\u0397"; break; | |
1006 case '\u1fa0': replace = "\u03c9"; break; | |
1007 case '\u1fa1': replace = "\u03c9"; break; | |
1008 case '\u1fa2': replace = "\u03c9"; break; | |
1009 case '\u1fa3': replace = "\u03c9"; break; | |
1010 case '\u1fa4': replace = "\u03c9"; break; | |
1011 case '\u1fa5': replace = "\u03c9"; break; | |
1012 case '\u1fa6': replace = "\u03c9"; break; | |
1013 case '\u1fa7': replace = "\u03c9"; break; | |
1014 case '\u1fa8': replace = "\u03a9"; break; | |
1015 case '\u1fa9': replace = "\u03a9"; break; | |
1016 case '\u1faa': replace = "\u03a9"; break; | |
1017 case '\u1fab': replace = "\u03a9"; break; | |
1018 case '\u1fac': replace = "\u03a9"; break; | |
1019 case '\u1fad': replace = "\u03a9"; break; | |
1020 case '\u1fae': replace = "\u03a9"; break; | |
1021 case '\u1faf': replace = "\u03a9"; break; | |
1022 case '\u1fb2': replace = "\u03b1"; break; | |
1023 case '\u1fb3': replace = "\u03b1"; break; | |
1024 case '\u1fb4': replace = "\u03b1"; break; | |
1025 case '\u1fb6': replace = "\u03b1"; break; | |
1026 case '\u1fb7': replace = "\u03b1"; break; | |
1027 case '\u1fba': replace = "\u0391"; break; | |
1028 case '\u1fbb': replace = "\u0391"; break; | |
1029 case '\u1fbc': replace = "\u0391"; break; | |
1030 case '\u1fc2': replace = "\u03b7"; break; | |
1031 case '\u1fc3': replace = "\u03b7"; break; | |
1032 case '\u1fc4': replace = "\u03b7"; break; | |
1033 case '\u1fc6': replace = "\u03b7"; break; | |
1034 case '\u1fc7': replace = "\u03b7"; break; | |
1035 case '\u1fca': replace = "\u0397"; break; | |
1036 case '\u1fcb': replace = "\u0397"; break; | |
1037 case '\u1fcc': replace = "\u0397"; break; | |
1038 case '\u1fd2': replace = "\u03b9"; break; | |
1039 case '\u1fd3': replace = "\u03b9"; break; | |
1040 case '\u1fd6': replace = "\u03b9"; break; | |
1041 case '\u1fd7': replace = "\u03b9"; break; | |
1042 case '\u1fda': replace = "\u0399"; break; | |
1043 case '\u1fdb': replace = "\u039f"; break; | |
1044 case '\u1fe2': replace = "\u03c5"; break; | |
1045 case '\u1fe3': replace = "\u03c5"; break; | |
1046 case '\u1fe4': replace = "\u03c1"; break; | |
1047 case '\u1fe5': replace = "\u03c1"; break; | |
1048 case '\u1fe6': replace = "\u03c5"; break; | |
1049 case '\u1fe7': replace = "\u03c5"; break; | |
1050 case '\u1fea': replace = "\u03a5"; break; | |
1051 case '\u1feb': replace = "\u03a5"; break; | |
1052 case '\u1fec': replace = "\u03a1"; break; | |
1053 case '\u1ff2': replace = "\u03c9"; break; | |
1054 case '\u1ff3': replace = "\u03c9"; break; | |
1055 case '\u1ff4': replace = "\u03c9"; break; | |
1056 case '\u1ff6': replace = "\u03c9"; break; | |
1057 case '\u1ff7': replace = "\u03c9"; break; | |
1058 case '\u1ff8': replace = "\u039f"; break; | |
1059 case '\u1ff9': replace = "\u039f"; break; | |
1060 case '\u1ffa': replace = "\u03a9"; break; | |
1061 case '\u1ffb': replace = "\u03a9"; break; | |
1062 case '\u1ffc': replace = "\u03a9"; break; | |
1063 | |
1064 case '<': break; | |
1065 case '>': break; | |
1066 case '-': break; // same treatment as soft hyphen | |
1067 case '\u00ad': break; // soft hyphen | |
1068 default: replace += c; break; | |
1069 } | |
1070 buf.append(replace); | |
1071 // update offsets if replacement is a different length | |
1072 if (offsets != null) { | |
1073 int r = replace.length(); | |
1074 if (r == 0) | |
1075 this.offsets = arrayKill(this.offsets, i - n); | |
1076 else if (r == 2) | |
1077 this.offsets = arrayInsert(this.offsets, i - n + 1, this.offsets[i - n], r - 1); | |
1078 n += 1 - r; | |
1079 } | |
1080 } | |
1081 return buf.toString(); | |
1082 } else { // unknown or no language | |
1083 return s; | |
1084 } | |
1085 } | |
1086 | |
1087 /* | |
1088 // explicit words | |
1089 normStr = normStr.replaceAll("aliàs", "alias"); | |
1090 normStr = normStr.replaceAll("hîc", "hic"); | |
1091 normStr = normStr.replaceAll("quòd", "quod"); | |
1092 normStr = normStr.replaceAll("Quòd", "Quod"); | |
1093 normStr = normStr.replaceAll("QVòd", "Quod"); | |
1094 normStr = normStr.replaceAll("Cùmque", "Cumque"); | |
1095 normStr = normStr.replaceAll("aër", "aer"); | |
1096 // ij | |
1097 normStr = normStr.replaceAll("ij", "ii"); | |
1098 // qu/qv | |
1099 normStr = normStr.replaceAll("qv", "qu"); | |
1100 // normStr = normStr.replaceAll("qV", "qU"); | |
1101 normStr = normStr.replaceAll("Qv", "Qu"); | |
1102 normStr = normStr.replaceAll("QV", "QU"); | |
1103 // u/v | |
1104 String vowels = getVowels(); | |
1105 String consonants = getConsonants(); | |
1106 normStr = normStr.replaceAll("([" + vowels + "])([-]*)u([" + vowels +"])", "$1$2v$3"); // vowel + u + vowel --> vowel + v + vowel | |
1107 normStr = normStr.replaceAll("([" + vowels + "])([-]*)U([" + vowels +"])", "$1$2V$3"); // vowel + U + vowel --> vowel + V + vowel | |
1108 normStr = normStr.replaceAll("([" + consonants + "])([-]*)v([" + consonants +"])", "$1$2u$3"); // consonant + v + consonant --> consonant + u + consonant | |
1109 normStr = normStr.replaceAll("([" + consonants + "])([-]*)V([" + consonants +"])", "$1$2U$3"); // consonant + V + consonant --> consonant + U + consonant | |
1110 normStr = normStr.replaceAll("^v([" + consonants +"])", "u$1"); // v + consonant --> u + consonant | |
1111 normStr = normStr.replaceAll("^V([" + consonants +"])", "U$1"); // V + consonant --> U + consonant | |
1112 // end of word: diacritica | |
1113 normStr = normStr.replaceAll("à$", "a"); | |
1114 normStr = normStr.replaceAll("è$", "e"); | |
1115 normStr = normStr.replaceAll("ò$", "o"); | |
1116 normStr = normStr.replaceAll("àm$", "am"); | |
1117 normStr = normStr.replaceAll("ùm$", "um"); | |
1118 String normStrTmp = normStr; | |
1119 normStr = ""; | |
1120 for (int i = 0; i < normStrTmp.length(); i++) { | |
1121 char c = normStrTmp.charAt(i); | |
1122 String replace = ""; | |
1123 switch (c) { | |
1124 case 'ſ': replace = "s"; break; | |
1125 case 'ß': replace = "ss"; break; | |
1126 case 'æ': replace = "ae"; break; | |
1127 case 'Æ': replace = "AE"; break; | |
1128 case 'ę': replace = "ae"; break; | |
1129 case 'œ': replace = "oe"; break; | |
1130 default: replace += c; break; | |
1131 } | |
1132 normStr = normStr + replace; | |
1133 } | |
1134 | |
1135 | |
1136 private String getVowels() { | |
1137 String retStr = null; | |
1138 if (Language.getInstance().isItalian(language)) { | |
1139 retStr = "AEIOUaeiou" + | |
1140 "\u00c6\u00e6" + // AE ligatures | |
1141 "\u0152\u0153"; // OE ligatures | |
1142 } else if (Language.getInstance().isLatin(language)) { | |
1143 retStr = "AEIOUaeiouÆœęàèòù"; | |
1144 } | |
1145 return retStr; | |
1146 } | |
1147 | |
1148 private String getConsonants() { | |
1149 String retStr = null; | |
1150 if (Language.getInstance().isItalian(language)) { | |
1151 retStr = "BCDFGHKLMNPQRSTVWXZ" + | |
1152 "bcdfghklmnpqrstvwxz" + | |
1153 "ſß"; // long/sharp S | |
1154 } else if (Language.getInstance().isLatin(language)) { | |
1155 retStr = "BCDFGHKLMNPQRSTVWXZ" + | |
1156 "bcdfghklmnpqrstvwxz" + | |
1157 "ſß"; // long/sharp S | |
1158 } | |
1159 return retStr; | |
1160 } | |
1161 | |
1162 | |
1163 | |
1164 | |
1165 | |
1166 * | |
1167 * | |
1168 * | |
1169 * | |
1170 */ | |
1171 | |
1172 | |
1173 | |
1174 | |
1175 | |
1176 | |
1177 /** | |
1178 * Returns a copy of an integer array with the element at | |
1179 * <code>index</code> removed ("killed"). | |
1180 * | |
1181 * @param array integer array | |
1182 * @param index index of element to remove | |
1183 */ | |
1184 private int[] arrayKill(int[] array, int index) { | |
1185 int[] newArray = new int[array.length - 1]; | |
1186 System.arraycopy(array, 0, newArray, 0, index); | |
1187 System.arraycopy(array, index + 1, newArray, index, array.length - index - 1); | |
1188 return newArray; | |
1189 } | |
1190 | |
1191 /** | |
1192 * Returns a copy of an integer array with <code>count</code> elements | |
1193 * inserted at <code>index</code>. | |
1194 * | |
1195 * @param array integer array | |
1196 * @param index index to insert new elements | |
1197 * @param value value to insert into new slots | |
1198 * @param count number of new slots to insert | |
1199 */ | |
1200 private int[] arrayInsert(int[] array, int index, int value, int count) { | |
1201 int[] newArray = new int[array.length + count]; | |
1202 System.arraycopy(array, 0, newArray, 0, index); | |
1203 for (int i = 0; i < count; i++) newArray[index + i] = value; | |
1204 System.arraycopy(array, index, newArray, index + count, array.length - index); | |
1205 return newArray; | |
1206 } | |
1207 | |
1208 } |