Mercurial > hg > mpdl-group
annotate software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlNormalizer.java @ 6:2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 08 Feb 2011 14:54:09 +0100 |
parents | 408254cf2f1d |
children | 1ec29fdd0db8 |
rev | line source |
---|---|
0 | 1 package de.mpg.mpiwg.berlin.mpdl.lt.analyzer; |
2 | |
3 import java.io.IOException; | |
6
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
4 import java.io.StringReader; |
0 | 5 import java.util.ArrayList; |
6 | |
7 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; | |
6
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
8 import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang.MpdlNormalizerLexAll; |
0 | 9 import de.mpg.mpiwg.berlin.mpdl.lt.doc.regularization.Regularization; |
10 import de.mpg.mpiwg.berlin.mpdl.lt.doc.regularization.RegularizationManager; | |
11 import de.mpg.mpiwg.berlin.mpdl.lt.general.Language; | |
12 | |
13 public class MpdlNormalizer { | |
6
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
14 public static int MODE_4LEXICA = 1; // normalization for lexica etc. which have sometimes only ascii in it |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
15 public static int MODE_4HUMAN_READERS = 2; // normalization for human readers |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
16 private int normMode = MODE_4LEXICA; // Default |
0 | 17 private String[] normFunctionsToUse = {"reg", "norm"}; // default is to use all of these normalization functions |
18 private String language; | |
19 private int[] offsets; | |
20 | |
/**
 * Creates a normalizer that applies the given normalization functions.
 *
 * @param normFunctionsToUse names of the functions to apply ("reg" and/or "norm");
 *        if <code>null</code>, the default set (all functions) is kept
 * @param lang language designation; it is resolved to a language id through
 *        <code>Language.getInstance().getLanguageId(lang)</code>
 */
public MpdlNormalizer(String[] normFunctionsToUse, String lang) {
  // A null array would only fail later, when normalize() iterates over it,
  // so keep the field's default in that case.
  if (normFunctionsToUse != null) {
    this.normFunctionsToUse = normFunctionsToUse;
  }
  this.language = Language.getInstance().getLanguageId(lang);
}
26 | |
/**
 * Creates a normalizer for the given language that uses the default
 * normalization functions.
 *
 * @param language language identifier; it is stored exactly as given (it is
 *        not resolved through <code>Language</code> as in the two-argument
 *        constructor)
 */
public MpdlNormalizer(String language) {
  this.language = language;
}
30 | |
6
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
31 public void setNormMode(int normMode) { |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
32 this.normMode = normMode; |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
33 } |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
34 |
0 | 35 /** |
36 * Applies the normalization rules in <code>language</code> to | |
37 * <code>s</code>, without offset tracking. | |
38 * | |
39 * @param s source string | |
40 * @return normalized string | |
41 */ | |
42 public String normalize(String s) throws ApplicationException { | |
43 String normStr = s; | |
44 if (useRegFunction()) { | |
45 // try to regularize the string to the norm form over predefined regularizations | |
46 RegularizationManager regManager = RegularizationManager.getInstance(); | |
47 ArrayList<Regularization> regs = regManager.findRegsByOrig(language, s); | |
48 if (regs != null && regs.size() > 0) { | |
49 Regularization reg = regs.get(0); // only one: the first one | |
50 String regNormStr = reg.getNorm(); | |
51 normStr = regNormStr; | |
52 } | |
53 } | |
54 if (useNormFunction()) { | |
6
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
55 // normalize the string by string replacements |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
56 if (normMode == MODE_4LEXICA) |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
57 normStr = normalize4Lexica(normStr, null); |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
58 else if (normMode == MODE_4HUMAN_READERS) |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
59 normStr = normalize4HumanReaders(normStr); |
0 | 60 } |
61 return normStr; | |
62 } | |
63 | |
64 private boolean useRegFunction() { | |
65 boolean useReg = false; | |
66 for (int i=0; i< normFunctionsToUse.length; i++) { | |
67 String function = normFunctionsToUse[i]; | |
68 if (function.equals("reg")) | |
69 return true; | |
70 } | |
71 return useReg; | |
72 } | |
73 | |
74 private boolean useNormFunction() { | |
75 boolean useNorm = false; | |
76 for (int i=0; i< normFunctionsToUse.length; i++) { | |
77 String function = normFunctionsToUse[i]; | |
78 if (function.equals("norm")) | |
79 return true; | |
80 } | |
81 return useNorm; | |
82 } | |
83 | |
84 /** | |
85 * Applies the normalization rules in <code>language</code> to | |
86 * <code>s</code>, with offset tracking.<p> | |
87 * | |
88 * <strong>WARNING:</strong> | |
89 * Arboreal will not work properly if a normalization substitution | |
90 * replaces a source character with more than two target characters! | |
91 * This is simply a BUG, and should be fixed. Fortunately, however, | |
92 * one does not often need such a replacement.<p> | |
93 * | |
94 * @param s source string | |
95 * @param offsets character offset table | |
96 * @return normalized string | |
97 */ | |
6
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
98 private String normalize4Lexica(String s, int[] offsets) { |
0 | 99 this.offsets = offsets; |
100 if (language.equals("la") || language.equals("lat")) { | |
101 StringBuffer buf = new StringBuffer(); | |
102 int n = 0; | |
103 for (int i = 0; i < s.length(); i++) { | |
104 char c = s.charAt(i); | |
105 String replace = new String(); | |
106 switch (c) { | |
107 case 'j': replace = "i"; break; | |
108 case 'v': replace = "u"; break; | |
109 /* | |
110 * Linguistic note: /u/ and /v/ are rarely phonemic | |
111 * in Latin, as in alui 's/he nourished' vs. | |
112 * alvi 'of a belly', volui 's/he wished' or 'it rolled' | |
113 * vs. volvi 'to be rolled', (in)seruit 's/he joined | |
114 * together' vs. (in)servit 's/he serves'. | |
115 */ | |
116 case 'q': | |
117 if ((i < s.length() - 1) && (s.charAt(i + 1) == ';')) | |
118 replace = "qu"; | |
119 else | |
120 replace = "q"; | |
121 break; | |
122 case ';': | |
123 if ((i > 0) && (s.charAt(i - 1) == 'q')) | |
124 replace = "e"; | |
125 else if ((i == 0) || ! Character.isLetter(s.charAt(i - 1))) | |
126 replace = ";"; | |
127 else | |
128 replace = ""; | |
129 break; | |
130 case '\u0300': replace = ""; break; // COMBINING GRAVE ACCENT | |
131 case '\u0301': replace = ""; break; // COMBINING ACCUTE ACCENT | |
132 case '\u0302': replace = ""; break; // COMBINING CIRCUMFLEX ACCENT | |
133 | |
134 case '\u00c0': replace = "A"; break; // LATIN CAPITAL LETTER A GRAVE | |
135 case '\u00c1': replace = "A"; break; // LATIN CAPITAL LETTER A ACUTE | |
136 case '\u00c2': replace = "A"; break; // LATIN CAPITAL LETTER A CIRCUMFLEX | |
137 case '\u00c4': replace = "A"; break; // LATIN CAPITAL LETTER A DIAERESIS | |
138 case '\u00c6': replace = "Ae"; break; // LATIN CAPITAL LETTER A E | |
139 case '\u00c7': replace = "C"; break; // LATIN CAPITAL LETTER C CEDILLA | |
140 case '\u00c8': replace = "E"; break; // LATIN CAPITAL LETTER E GRAVE | |
141 case '\u00c9': replace = "E"; break; // LATIN CAPITAL LETTER E ACUTE | |
142 case '\u00ca': replace = "E"; break; // LATIN CAPITAL LETTER E CIRCUMFLEX | |
143 case '\u00cb': replace = "E"; break; // LATIN CAPITAL LETTER E DIAERESIS | |
144 case '\u00cc': replace = "I"; break; // LATIN CAPITAL LETTER I GRAVE; | |
145 case '\u00cd': replace = "I"; break; // LATIN CAPITAL LETTER I ACUTE | |
146 case '\u00ce': replace = "I"; break; // LATIN CAPITAL LETTER I CIRCUMFLEX | |
147 case '\u00cf': replace = "I"; break; // LATIN CAPITAL LETTER I DIAERESIS | |
148 case '\u00d2': replace = "O"; break; // LATIN CAPITAL LETTER O GRAVE | |
149 case '\u00d3': replace = "O"; break; // LATIN CAPITAL LETTER O ACUTE | |
150 case '\u00d4': replace = "O"; break; // LATIN CAPITAL LETTER O CIRCUMFLEX | |
151 case '\u00d6': replace = "O"; break; // LATIN CAPITAL LETTER O DIAERESIS | |
152 case '\u00d9': replace = "U"; break; // LATIN CAPITAL LETTER U GRAVE | |
153 case '\u00da': replace = "U"; break; // LATIN CAPITAL LETTER U ACUTE | |
154 case '\u00db': replace = "U"; break; // LATIN CAPITAL LETTER U CIRCUMFLEX | |
155 case '\u00dc': replace = "U"; break; // LATIN CAPITAL LETTER U DIAERESIS | |
156 case '\u00e0': replace = "a"; break; // LATIN SMALL LETTER A GRAVE | |
157 case '\u00e1': replace = "a"; break; // LATIN SMALL LETTER A ACUTE | |
158 case '\u00e2': replace = "a"; break; // LATIN SMALL LETTER A CIRCUMFLEX | |
159 case '\u00e4': replace = "a"; break; // LATIN SMALL LETTER A DIAERESIS | |
160 case '\u00e6': replace = "ae"; break; // LATIN SMALL LETTER A E | |
161 case '\u00e7': replace = "c"; break; // LATIN SMALL LETTER C CEDILLA | |
162 case '\u00e8': replace = "e"; break; // LATIN SMALL LETTER E GRAVE | |
163 case '\u00e9': replace = "e"; break; // LATIN SMALL LETTER E ACUTE | |
164 case '\u00ea': replace = "e"; break; // LATIN SMALL LETTER E CIRCUMFLEX | |
165 case '\u00eb': replace = "e"; break; // LATIN SMALL LETTER E DIAERESIS | |
166 case '\u00ec': replace = "i"; break; // LATIN SMALL LETTER I GRAVE | |
167 case '\u00ed': replace = "i"; break; // LATIN SMALL LETTER I ACUTE | |
168 case '\u00ee': replace = "i"; break; // LATIN SMALL LETTER I CIRCUMFLEX | |
169 case '\u00ef': replace = "i"; break; // LATIN SMALL LETTER I DIAERESIS | |
170 case '\u00f2': replace = "o"; break; // LATIN SMALL LETTER O GRAVE | |
171 case '\u00f3': replace = "o"; break; // LATIN SMALL LETTER O ACUTE | |
172 case '\u00f4': replace = "o"; break; // LATIN SMALL LETTER O CIRCUMFLEX | |
173 case '\u00f6': replace = "o"; break; // LATIN SMALL LETTER O DIAERESIS | |
174 case '\u00f9': replace = "u"; break; // LATIN SMALL LETTER U GRAVE | |
175 case '\u00fa': replace = "u"; break; // LATIN SMALL LETTER U ACUTE | |
176 case '\u00fb': replace = "u"; break; // LATIN SMALL LETTER U CIRCUMFLEX | |
177 case '\u00fc': replace = "u"; break; // LATIN SMALL LETTER U DIAERESIS | |
178 case '\u0100': replace = "A"; break; // LATIN CAPITAL LETTER A MACRON | |
179 case '\u0101': replace = "a"; break; // LATIN SMALL LETTER A MACRON | |
180 case '\u0102': replace = "A"; break; // LATIN CAPITAL LETTER A BREVE | |
181 case '\u0103': replace = "a"; break; // LATIN SMALL LETTER A BREVE | |
182 case '\u0112': replace = "E"; break; // LATIN CAPITAL LETTER E MACRON | |
183 case '\u0113': replace = "e"; break; // LATIN SMALL LETTER E MACRON | |
184 case '\u0114': replace = "E"; break; // LATIN CAPITAL LETTER E BREVE | |
185 case '\u0115': replace = "e"; break; // LATIN SMALL LETTER E BREVE | |
186 case '\u0118': replace = "Ae"; break; // LATIN CAPITAL LETTER E OGONEK | |
187 case '\u0119': replace = "ae"; break; // LATIN SMALL LETTER E OGONEK | |
188 case '\u012a': replace = "I"; break; // LATIN CAPITAL LETTER I MACRON | |
189 case '\u012b': replace = "i"; break; // LATIN SMALL LETTER I MACRON | |
190 case '\u012c': replace = "I"; break; // LATIN CAPITAL LETTER I BREVE | |
191 case '\u012d': replace = "i"; break; // LATIN SMALL LETTER I BREVE | |
192 case '\u014c': replace = "O"; break; // LATIN CAPITAL LETTER O MACRON | |
193 case '\u014d': replace = "o"; break; // LATIN SMALL LETTER O MACRON | |
194 case '\u014e': replace = "O"; break; // LATIN CAPITAL LETTER O BREVE | |
195 case '\u014f': replace = "o"; break; // LATIN SMALL LETTER O BREVE | |
196 case '\u0152': replace = "Oe"; break; // LATIN CAPITAL LETTER O E | |
197 case '\u0153': replace = "oe"; break; // LATIN SMALL LETTER O E | |
198 case '\u016a': replace = "U"; break; // LATIN CAPITAL LETTER U MACRON | |
199 case '\u016b': replace = "u"; break; // LATIN SMALL LETTER U MACRON | |
200 case '\u016c': replace = "U"; break; // LATIN CAPITAL LETTER U BREVE | |
201 case '\u016d': replace = "u"; break; // LATIN SMALL LETTER U BREVE | |
202 case '\u017f': replace = "s"; break; // LATIN SMALL LETTER LONG S | |
203 case '\u00df': replace = "ss"; break; // LATIN SMALL LETTER SHARP S | |
204 case '\u00ad': break; // soft hyphen | |
205 // new in MPDL project by J. Willenborg | |
206 case '\u1e14': replace = "e"; break; // LATIN ... LETTER E WITH ... | |
207 case '\u1e15': replace = "e"; break; // LATIN ... LETTER E WITH ... | |
208 case '\u1e16': replace = "e"; break; // LATIN ... LETTER E WITH ... | |
209 case '\u1e17': replace = "e"; break; // LATIN ... LETTER E WITH ... | |
210 case '\u1e18': replace = "e"; break; // LATIN ... LETTER E WITH ... | |
211 case '\u1e19': replace = "e"; break; // LATIN ... LETTER E WITH ... | |
212 case '\u1e1a': replace = "e"; break; // LATIN ... LETTER E WITH ... | |
213 case '\u1e1b': replace = "e"; break; // LATIN ... LETTER E WITH ... | |
214 case '\u1e1c': replace = "e"; break; // LATIN ... LETTER E WITH ... | |
215 case '\u1e1d': replace = "e"; break; // LATIN ... LETTER E WITH ... | |
216 case '\u1eb8': replace = "e"; break; // LATIN ... LETTER E WITH ... | |
217 case '\u1eb9': replace = "e"; break; // LATIN ... LETTER E WITH ... | |
218 case '\u1eba': replace = "e"; break; // LATIN ... LETTER E WITH ... | |
219 case '\u1ebb': replace = "e"; break; // LATIN ... LETTER E WITH ... | |
220 case '\u1ebc': replace = "e"; break; // LATIN ... LETTER E WITH ... | |
221 case '\u1ebd': replace = "e"; break; // LATIN ... LETTER E WITH ... | |
222 case '\u1ebe': replace = "e"; break; // LATIN ... LETTER E WITH ... | |
223 case '\u1ebf': replace = "e"; break; // LATIN ... LETTER E WITH ... | |
224 case '\u1ec0': replace = "e"; break; // LATIN ... LETTER E WITH ... | |
225 case '\u1ec1': replace = "e"; break; // LATIN ... LETTER E WITH ... | |
226 case '\u1ec2': replace = "e"; break; // LATIN ... LETTER E WITH ... | |
227 case '\u1ec3': replace = "e"; break; // LATIN ... LETTER E WITH ... | |
228 case '\u1ec4': replace = "e"; break; // LATIN ... LETTER E WITH ... | |
229 case '\u1ec5': replace = "e"; break; // LATIN ... LETTER E WITH ... | |
230 case '\u1ec6': replace = "e"; break; // LATIN ... LETTER E WITH ... | |
231 case '\u1ec7': replace = "e"; break; // LATIN ... LETTER E WITH ... | |
232 // by Malcolm | |
233 case '\u2329': break; // BRA | |
234 case '\u232a': break; // KET | |
235 default: replace += c; break; | |
236 } | |
237 buf.append(replace); | |
238 // update offsets if replacement is a different length | |
239 if (offsets != null) { | |
240 int r = replace.length(); | |
241 if (r == 0) | |
242 this.offsets = arrayKill(this.offsets, i - n); | |
243 else if (r == 2) | |
244 this.offsets = arrayInsert(this.offsets, i - n + 1, this.offsets[i - n], r - 1); | |
245 n += 1 - r; | |
246 } | |
247 } | |
248 return buf.toString(); | |
249 } else if (language.equals("it")) { | |
250 // new Mpdl code: added by J. Willenborg: some of Malcolms code did not work without errors so it has to be taken away, also all latin stuff is imported | |
251 StringBuffer buf = new StringBuffer(); | |
252 int n = 0; | |
253 for (int i = 0; i < s.length(); i++) { | |
254 char c = s.charAt(i); | |
255 String replace = new String(); | |
256 switch (c) { | |
257 case '\u00c0': replace = "A"; break; // LATIN CAPITAL LETTER A GRAVE | |
258 case '\u00c1': replace = "A"; break; // LATIN CAPITAL LETTER A ACUTE | |
259 case '\u00c2': replace = "A"; break; // LATIN CAPITAL LETTER A CIRCUMFLEX | |
260 case '\u00c4': replace = "A"; break; // LATIN CAPITAL LETTER A DIAERESIS | |
261 case '\u00c6': replace = "Ae"; break; // LATIN CAPITAL LETTER A E | |
262 case '\u00c7': replace = "C"; break; // LATIN CAPITAL LETTER C CEDILLA | |
263 case '\u00c8': replace = "E"; break; // LATIN CAPITAL LETTER E GRAVE | |
264 case '\u00c9': replace = "E"; break; // LATIN CAPITAL LETTER E ACUTE | |
265 case '\u00ca': replace = "E"; break; // LATIN CAPITAL LETTER E CIRCUMFLEX | |
266 case '\u00cb': replace = "E"; break; // LATIN CAPITAL LETTER E DIAERESIS | |
267 case '\u00cc': replace = "I"; break; // LATIN CAPITAL LETTER I GRAVE; | |
268 case '\u00cd': replace = "I"; break; // LATIN CAPITAL LETTER I ACUTE | |
269 case '\u00ce': replace = "I"; break; // LATIN CAPITAL LETTER I CIRCUMFLEX | |
270 case '\u00cf': replace = "I"; break; // LATIN CAPITAL LETTER I DIAERESIS | |
271 case '\u00d2': replace = "O"; break; // LATIN CAPITAL LETTER O GRAVE | |
272 case '\u00d3': replace = "O"; break; // LATIN CAPITAL LETTER O ACUTE | |
273 case '\u00d4': replace = "O"; break; // LATIN CAPITAL LETTER O CIRCUMFLEX | |
274 case '\u00d6': replace = "O"; break; // LATIN CAPITAL LETTER O DIAERESIS | |
275 case '\u00d9': replace = "U"; break; // LATIN CAPITAL LETTER U GRAVE | |
276 case '\u00da': replace = "U"; break; // LATIN CAPITAL LETTER U ACUTE | |
277 case '\u00db': replace = "U"; break; // LATIN CAPITAL LETTER U CIRCUMFLEX | |
278 case '\u00dc': replace = "U"; break; // LATIN CAPITAL LETTER U DIAERESIS | |
279 case '\u00e0': replace = "a"; break; // LATIN SMALL LETTER A GRAVE | |
280 case '\u00e1': replace = "a"; break; // LATIN SMALL LETTER A ACUTE | |
281 case '\u00e2': replace = "a"; break; // LATIN SMALL LETTER A CIRCUMFLEX | |
282 case '\u00e4': replace = "a"; break; // LATIN SMALL LETTER A DIAERESIS | |
283 case '\u00e6': replace = "ae"; break; // LATIN SMALL LETTER A E | |
284 case '\u00e7': replace = "c"; break; // LATIN SMALL LETTER C CEDILLA | |
285 case '\u00e8': replace = "e"; break; // LATIN SMALL LETTER E GRAVE | |
286 case '\u00e9': replace = "e"; break; // LATIN SMALL LETTER E ACUTE | |
287 case '\u00ea': replace = "e"; break; // LATIN SMALL LETTER E CIRCUMFLEX | |
288 case '\u00eb': replace = "e"; break; // LATIN SMALL LETTER E DIAERESIS | |
289 case '\u00ec': replace = "i"; break; // LATIN SMALL LETTER I GRAVE | |
290 case '\u00ed': replace = "i"; break; // LATIN SMALL LETTER I ACUTE | |
291 case '\u00ee': replace = "i"; break; // LATIN SMALL LETTER I CIRCUMFLEX | |
292 case '\u00ef': replace = "i"; break; // LATIN SMALL LETTER I DIAERESIS | |
293 case '\u00f2': replace = "o"; break; // LATIN SMALL LETTER O GRAVE | |
294 case '\u00f3': replace = "o"; break; // LATIN SMALL LETTER O ACUTE | |
295 case '\u00f4': replace = "o"; break; // LATIN SMALL LETTER O CIRCUMFLEX | |
296 case '\u00f6': replace = "o"; break; // LATIN SMALL LETTER O DIAERESIS | |
297 case '\u00f9': replace = "u"; break; // LATIN SMALL LETTER U GRAVE | |
298 case '\u00fa': replace = "u"; break; // LATIN SMALL LETTER U ACUTE | |
299 case '\u00fb': replace = "u"; break; // LATIN SMALL LETTER U CIRCUMFLEX | |
300 case '\u00fc': replace = "u"; break; // LATIN SMALL LETTER U DIAERESIS | |
301 case '\u0100': replace = "A"; break; // LATIN CAPITAL LETTER A MACRON | |
302 case '\u0101': replace = "a"; break; // LATIN SMALL LETTER A MACRON | |
303 case '\u0102': replace = "A"; break; // LATIN CAPITAL LETTER A BREVE | |
304 case '\u0103': replace = "a"; break; // LATIN SMALL LETTER A BREVE | |
305 case '\u0112': replace = "E"; break; // LATIN CAPITAL LETTER E MACRON | |
306 case '\u0113': replace = "e"; break; // LATIN SMALL LETTER E MACRON | |
307 case '\u0114': replace = "E"; break; // LATIN CAPITAL LETTER E BREVE | |
308 case '\u0115': replace = "e"; break; // LATIN SMALL LETTER E BREVE | |
309 case '\u0118': replace = "Ae"; break; // LATIN CAPITAL LETTER E OGONEK | |
310 case '\u0119': replace = "ae"; break; // LATIN SMALL LETTER E OGONEK | |
311 case '\u012a': replace = "I"; break; // LATIN CAPITAL LETTER I MACRON | |
312 case '\u012b': replace = "i"; break; // LATIN SMALL LETTER I MACRON | |
313 case '\u012c': replace = "I"; break; // LATIN CAPITAL LETTER I BREVE | |
314 case '\u012d': replace = "i"; break; // LATIN SMALL LETTER I BREVE | |
315 case '\u014c': replace = "O"; break; // LATIN CAPITAL LETTER O MACRON | |
316 case '\u014d': replace = "o"; break; // LATIN SMALL LETTER O MACRON | |
317 case '\u014e': replace = "O"; break; // LATIN CAPITAL LETTER O BREVE | |
318 case '\u014f': replace = "o"; break; // LATIN SMALL LETTER O BREVE | |
319 case '\u0152': replace = "Oe"; break; // LATIN CAPITAL LETTER O E | |
320 case '\u0153': replace = "oe"; break; // LATIN SMALL LETTER O E | |
321 case '\u016a': replace = "U"; break; // LATIN CAPITAL LETTER U MACRON | |
322 case '\u016b': replace = "u"; break; // LATIN SMALL LETTER U MACRON | |
323 case '\u016c': replace = "U"; break; // LATIN CAPITAL LETTER U BREVE | |
324 case '\u016d': replace = "u"; break; // LATIN SMALL LETTER U BREVE | |
325 case '\u017f': replace = "s"; break; // LATIN SMALL LETTER LONG S | |
326 case '\u00df': replace = "ss"; break; // LATIN SMALL LETTER SHARP S | |
327 // new in MPDL project by J. Willenborg | |
328 case '\u1e8d': replace = "e"; break; // LATIN SMALL LETTER E WITH TILDE | |
329 default: replace += c; break; | |
330 } | |
331 buf.append(replace); | |
332 // update offsets if replacement is a different length | |
333 if (offsets != null) { | |
334 int r = replace.length(); | |
335 if (r == 0) this.offsets = arrayKill(this.offsets, i - n); | |
336 else if (r == 2) | |
337 this.offsets = arrayInsert(this.offsets, i - n + 1, this.offsets[i - n], r - 1); | |
338 n += 1 - r; | |
339 } | |
340 } | |
341 return buf.toString(); | |
342 // new Mpdl code: added by J. Willenborg: most of the latin replacements also in english | |
343 } else if (language.equals("en")) { | |
344 StringBuffer buf = new StringBuffer(); | |
345 int n = 0; | |
346 for (int i = 0; i < s.length(); i++) { | |
347 char c = s.charAt(i); | |
348 String replace = new String(); | |
349 switch (c) { | |
350 case '\u0300': replace = ""; break; // COMBINING GRAVE ACCENT | |
351 case '\u0301': replace = ""; break; // COMBINING ACCUTE ACCENT | |
352 case '\u0302': replace = ""; break; // COMBINING CIRCUMFLEX ACCENT | |
353 | |
354 case '\u00c0': replace = "A"; break; // LATIN CAPITAL LETTER A GRAVE | |
355 case '\u00c1': replace = "A"; break; // LATIN CAPITAL LETTER A ACUTE | |
356 case '\u00c2': replace = "A"; break; // LATIN CAPITAL LETTER A CIRCUMFLEX | |
357 case '\u00c4': replace = "A"; break; // LATIN CAPITAL LETTER A DIAERESIS | |
358 case '\u00c6': replace = "Ae"; break; // LATIN CAPITAL LETTER A E | |
359 case '\u00c7': replace = "C"; break; // LATIN CAPITAL LETTER C CEDILLA | |
360 case '\u00c8': replace = "E"; break; // LATIN CAPITAL LETTER E GRAVE | |
361 case '\u00c9': replace = "E"; break; // LATIN CAPITAL LETTER E ACUTE | |
362 case '\u00ca': replace = "E"; break; // LATIN CAPITAL LETTER E CIRCUMFLEX | |
363 case '\u00cb': replace = "E"; break; // LATIN CAPITAL LETTER E DIAERESIS | |
364 case '\u00cc': replace = "I"; break; // LATIN CAPITAL LETTER I GRAVE; | |
365 case '\u00cd': replace = "I"; break; // LATIN CAPITAL LETTER I ACUTE | |
366 case '\u00ce': replace = "I"; break; // LATIN CAPITAL LETTER I CIRCUMFLEX | |
367 case '\u00cf': replace = "I"; break; // LATIN CAPITAL LETTER I DIAERESIS | |
368 case '\u00d2': replace = "O"; break; // LATIN CAPITAL LETTER O GRAVE | |
369 case '\u00d3': replace = "O"; break; // LATIN CAPITAL LETTER O ACUTE | |
370 case '\u00d4': replace = "O"; break; // LATIN CAPITAL LETTER O CIRCUMFLEX | |
371 case '\u00d6': replace = "O"; break; // LATIN CAPITAL LETTER O DIAERESIS | |
372 case '\u00d9': replace = "U"; break; // LATIN CAPITAL LETTER U GRAVE | |
373 case '\u00da': replace = "U"; break; // LATIN CAPITAL LETTER U ACUTE | |
374 case '\u00db': replace = "U"; break; // LATIN CAPITAL LETTER U CIRCUMFLEX | |
375 case '\u00dc': replace = "U"; break; // LATIN CAPITAL LETTER U DIAERESIS | |
376 case '\u00e0': replace = "a"; break; // LATIN SMALL LETTER A GRAVE | |
377 case '\u00e1': replace = "a"; break; // LATIN SMALL LETTER A ACUTE | |
378 case '\u00e2': replace = "a"; break; // LATIN SMALL LETTER A CIRCUMFLEX | |
379 case '\u00e4': replace = "a"; break; // LATIN SMALL LETTER A DIAERESIS | |
380 case '\u00e6': replace = "ae"; break; // LATIN SMALL LETTER A E | |
381 case '\u00e7': replace = "c"; break; // LATIN SMALL LETTER C CEDILLA | |
382 case '\u00e8': replace = "e"; break; // LATIN SMALL LETTER E GRAVE | |
383 case '\u00e9': replace = "e"; break; // LATIN SMALL LETTER E ACUTE | |
384 case '\u00ea': replace = "e"; break; // LATIN SMALL LETTER E CIRCUMFLEX | |
385 case '\u00eb': replace = "e"; break; // LATIN SMALL LETTER E DIAERESIS | |
386 case '\u00ec': replace = "i"; break; // LATIN SMALL LETTER I GRAVE | |
387 case '\u00ed': replace = "i"; break; // LATIN SMALL LETTER I ACUTE | |
388 case '\u00ee': replace = "i"; break; // LATIN SMALL LETTER I CIRCUMFLEX | |
389 case '\u00ef': replace = "i"; break; // LATIN SMALL LETTER I DIAERESIS | |
390 case '\u00f2': replace = "o"; break; // LATIN SMALL LETTER O GRAVE | |
391 case '\u00f3': replace = "o"; break; // LATIN SMALL LETTER O ACUTE | |
392 case '\u00f4': replace = "o"; break; // LATIN SMALL LETTER O CIRCUMFLEX | |
393 case '\u00f6': replace = "o"; break; // LATIN SMALL LETTER O DIAERESIS | |
394 case '\u00f9': replace = "u"; break; // LATIN SMALL LETTER U GRAVE | |
395 case '\u00fa': replace = "u"; break; // LATIN SMALL LETTER U ACUTE | |
396 case '\u00fb': replace = "u"; break; // LATIN SMALL LETTER U CIRCUMFLEX | |
397 case '\u00fc': replace = "u"; break; // LATIN SMALL LETTER U DIAERESIS | |
398 case '\u0100': replace = "A"; break; // LATIN CAPITAL LETTER A MACRON | |
399 case '\u0101': replace = "a"; break; // LATIN SMALL LETTER A MACRON | |
400 case '\u0102': replace = "A"; break; // LATIN CAPITAL LETTER A BREVE | |
401 case '\u0103': replace = "a"; break; // LATIN SMALL LETTER A BREVE | |
402 case '\u0112': replace = "E"; break; // LATIN CAPITAL LETTER E MACRON | |
403 case '\u0113': replace = "e"; break; // LATIN SMALL LETTER E MACRON | |
404 case '\u0114': replace = "E"; break; // LATIN CAPITAL LETTER E BREVE | |
405 case '\u0115': replace = "e"; break; // LATIN SMALL LETTER E BREVE | |
406 case '\u0118': replace = "Ae"; break; // LATIN CAPITAL LETTER E OGONEK | |
407 case '\u0119': replace = "ae"; break; // LATIN SMALL LETTER E OGONEK | |
408 case '\u012a': replace = "I"; break; // LATIN CAPITAL LETTER I MACRON | |
409 case '\u012b': replace = "i"; break; // LATIN SMALL LETTER I MACRON | |
410 case '\u012c': replace = "I"; break; // LATIN CAPITAL LETTER I BREVE | |
411 case '\u012d': replace = "i"; break; // LATIN SMALL LETTER I BREVE | |
412 case '\u014c': replace = "O"; break; // LATIN CAPITAL LETTER O MACRON | |
413 case '\u014d': replace = "o"; break; // LATIN SMALL LETTER O MACRON | |
414 case '\u014e': replace = "O"; break; // LATIN CAPITAL LETTER O BREVE | |
415 case '\u014f': replace = "o"; break; // LATIN SMALL LETTER O BREVE | |
416 case '\u0152': replace = "Oe"; break; // LATIN CAPITAL LETTER O E | |
417 case '\u0153': replace = "oe"; break; // LATIN SMALL LETTER O E | |
418 case '\u016a': replace = "U"; break; // LATIN CAPITAL LETTER U MACRON | |
419 case '\u016b': replace = "u"; break; // LATIN SMALL LETTER U MACRON | |
420 case '\u016c': replace = "U"; break; // LATIN CAPITAL LETTER U BREVE | |
421 case '\u016d': replace = "u"; break; // LATIN SMALL LETTER U BREVE | |
422 case '\u017f': replace = "s"; break; // LATIN SMALL LETTER LONG S | |
423 case '\u00df': replace = "ss"; break; // LATIN SMALL LETTER SHARP S | |
424 // new in MPDL project by J. Willenborg | |
425 case '\u1e8d': replace = "e"; break; // LATIN SMALL LETTER E WITH TILDE | |
426 // by Malcolm | |
427 case '\u00ad': break; // soft hyphen | |
428 case '\u2329': break; // BRA | |
429 case '\u232a': break; // KET | |
430 default: replace += c; break; | |
431 } | |
432 buf.append(replace); | |
433 // update offsets if replacement is a different length | |
434 if (offsets != null) { | |
435 int r = replace.length(); | |
436 if (r == 0) | |
437 this.offsets = arrayKill(this.offsets, i - n); | |
438 else if (r == 2) | |
439 this.offsets = arrayInsert(this.offsets, i - n + 1, this.offsets[i - n], r - 1); | |
440 n += 1 - r; | |
441 } | |
442 } | |
443 return buf.toString(); | |
444 } else if (language.equals("fr")) { | |
445 // new Mpdl code: added by J. Willenborg: some of Malcolms code did not work without errors so it has to be taken away | |
446 StringBuffer buf = new StringBuffer(); | |
447 int n = 0; | |
448 for (int i = 0; i < s.length(); i++) { | |
449 char c = s.charAt(i); | |
450 String replace = new String(); | |
451 switch (c) { | |
452 case '\u00e6': replace = "ae"; break; // LATIN SMALL LETTER A E | |
453 case '\u017f': replace = "s"; break; // LATIN SMALL LETTER LONG S | |
454 case '\u00df': replace = "ss"; break; // LATIN SMALL LETTER SHARP S | |
455 case '\u00ad': break; // soft hyphen | |
456 case '-': break; | |
457 default: replace += c; break; | |
458 } | |
459 buf.append(replace); | |
460 // update offsets if replacement is a different length | |
461 if (offsets != null) { | |
462 int r = replace.length(); | |
463 if (r == 0) | |
464 this.offsets = arrayKill(this.offsets, i - n); | |
465 else if (r == 2) | |
466 this.offsets = arrayInsert(this.offsets, i - n + 1, this.offsets[i - n], r - 1); | |
467 n += 1 - r; | |
468 } | |
469 } | |
470 return buf.toString(); | |
471 } else if (language.equals("de")) { | |
472 StringBuffer buf = new StringBuffer(); | |
473 int n = 0; | |
474 for (int i = 0; i < s.length(); i++) { | |
475 char c = s.charAt(i); | |
476 String replace = new String(); | |
477 switch (c) { | |
478 case '\u00c4': replace = "Ae"; break; | |
479 case '\u00d6': replace = "Oe"; break; | |
480 case '\u00dc': replace = "Ue"; break; | |
481 case '\u00df': replace = "ss"; break; | |
482 case '\u00e4': replace = "ae"; break; | |
483 case '\u00f6': replace = "oe"; break; | |
484 case '\u00fc': replace = "ue"; break; | |
6
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
485 case '\u00ad': break; // soft hyphen |
0 | 486 case '\u00e9': replace = "e"; break; |
6
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
487 // new in MPDL project by J. Willenborg |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
488 case '\u017f': replace = "s"; break; // LATIN SMALL LETTER LONG S |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
489 // case '-': break; |
0 | 490 default: replace += c; break; |
491 } | |
492 buf.append(replace); | |
493 // update offsets if replacement is a different length | |
494 if (offsets != null) { | |
495 int r = replace.length(); | |
496 if (r == 0) | |
497 this.offsets = arrayKill(this.offsets, i - n); | |
498 else if (r == 2) | |
499 this.offsets = arrayInsert(this.offsets, i - n + 1, this.offsets[i - n], r - 1); | |
500 n += 1 - r; | |
501 } | |
502 } | |
503 return buf.toString(); | |
504 } else if (language.equals("zh")) { | |
505 StringBuffer buf = new StringBuffer(); | |
506 int n = 0; | |
507 for (int i = 0; i < s.length(); i++) { | |
508 char c = s.charAt(i); | |
509 String replace = new String(); | |
510 switch (c) { | |
511 case '\u00b9': replace = "1"; break; | |
512 case '\u00b2': replace = "2"; break; | |
513 case '\u00b3': replace = "3"; break; | |
514 case '\u2074': replace = "4"; break; | |
515 case '\u2075': replace = "5"; break; | |
516 // original by Malcolm Hyman: with the following replacements // TODO uncomment these 3 lines | |
517 // case '\u3000': replace = " "; break; | |
518 // case '\u3001': replace = ","; break; | |
519 // case '\u3002': replace = "."; break; | |
520 // case '\u200b': break; // BREAKS EVERYTHING! | |
521 default: replace += c; break; | |
522 } | |
523 buf.append(replace); | |
524 // update offsets if replacement is a different length | |
525 if (offsets != null) { | |
526 int r = replace.length(); | |
527 if (r == 0) | |
528 this.offsets = arrayKill(this.offsets, i - n); | |
529 else if (r == 2) | |
530 this.offsets = arrayInsert(this.offsets, i - n + 1, this.offsets[i - n], r - 1); | |
531 n += 1 - r; | |
532 } | |
533 } | |
534 return buf.toString(); | |
535 } else if (language.equals("akk") || | |
536 language.equals("qam") || | |
537 language.equals("qpc") || | |
538 language.equals("elx") || | |
539 language.equals("sux") || | |
540 language.equals("hit") || | |
541 language.equals("qhu") || | |
542 language.equals("peo") || | |
543 language.equals("uga") || | |
544 language.equals("ura") || | |
545 language.equals("qcu")) { | |
546 StringBuffer buf = new StringBuffer(); | |
547 int n = 0; | |
548 char last = '\u0000'; | |
549 for (int i = 0; i < s.length(); i++) { | |
550 char c = s.charAt(i); | |
551 c = Character.toLowerCase(c); | |
552 String replace = new String(); | |
553 switch (c) { | |
554 case '{': replace += "-"; break; | |
555 case '}': replace += "-"; break; | |
556 // These are from PSD::ATF::Unicode by Steve Tinney | |
557 case '\u0161': replace += "sz"; break; | |
558 case '\u1e63': replace += "s,"; break; | |
559 case '\u1e6d': replace += "t,"; break; | |
560 case '\u014b': replace += "j"; break; | |
561 case '\u015b': replace += "s'"; break; | |
562 case '\u2080': replace += "0"; break; | |
563 case '\u2081': replace += "1"; break; | |
564 case '\u2082': replace += "2"; break; | |
565 case '\u2083': replace += "3"; break; | |
566 case '\u2084': replace += "4"; break; | |
567 case '\u2085': replace += "5"; break; | |
568 case '\u2086': replace += "6"; break; | |
569 case '\u2087': replace += "7"; break; | |
570 case '\u2088': replace += "8"; break; | |
571 case '\u2089': replace += "9"; break; | |
572 | |
573 case 'c': // shin (except where used as modifier) | |
574 if ((i > 0) && ((last == '~') || (last == '@'))) | |
575 replace += "c"; | |
576 else replace += "sz"; | |
577 break; | |
578 default: replace += c; break; | |
579 } | |
580 // suppress grapheme boundary before or after word boundary | |
581 if (replace.equals("-")) { | |
582 if ((i + 1 == s.length()) || (s.charAt(i + 1) == ' ') || (i == 0) || (s.charAt(i - 1) == ' ')) | |
583 replace = ""; | |
584 } | |
585 last = c; | |
586 buf.append(replace); | |
587 // update offsets if replacement is a different length | |
588 if (offsets != null) { | |
589 int r = replace.length(); | |
590 if (r == 0) | |
591 this.offsets = arrayKill(this.offsets, i - n); | |
592 else if (r == 2) | |
593 this.offsets = arrayInsert(this.offsets, i - n + 1, this.offsets[i - n], r - 1); | |
594 n += 1 - r; | |
595 } | |
596 } | |
597 return buf.toString(); | |
598 } else if (language.equals("el") || language.equals("grc")) { | |
599 StringBuffer buf = new StringBuffer(); | |
600 int n = 0; | |
601 for (int i = 0; i < s.length(); i++) { | |
602 char c = s.charAt(i); | |
603 String replace = new String(); | |
604 switch (c) { | |
605 case '\u03c2': replace = "\u03c3"; break; // GREEK SMALL LETTER FINAL SIGMA | |
606 case '<': break; | |
607 case '>': break; | |
608 case '[': break; | |
609 case ']': break; | |
610 case '1': break; | |
611 case '2': break; | |
612 case '\u03ac': replace = "\u1f71"; break; | |
613 case '\u03ad': replace = "\u1f73"; break; | |
614 case '\u03ae': replace = "\u1f75"; break; | |
615 case '\u03af': replace = "\u1f77"; break; | |
616 case '\u03cc': replace = "\u1f79"; break; | |
617 case '\u03cd': replace = "\u1f7b"; break; | |
618 case '\u03ce': replace = "\u1f7d"; break; | |
619 case '-': break; // same treatment as soft hyphen | |
620 case '\u00ad': break; // soft hyphen | |
621 default: replace += c; break; | |
622 } | |
623 buf.append(replace); | |
624 // update offsets if replacement is a different length | |
625 if (offsets != null) { | |
626 int r = replace.length(); | |
627 if (r == 0) | |
628 this.offsets = arrayKill(this.offsets, i - n); | |
629 else if (r == 2) | |
630 this.offsets = arrayInsert(this.offsets, i - n + 1, this.offsets[i - n], r - 1); | |
631 n += 1 - r; | |
632 } | |
633 } | |
634 return buf.toString(); | |
635 } else if (language.equals("el_atonic")) { | |
636 StringBuffer buf = new StringBuffer(); | |
637 int n = 0; | |
638 for (int i = 0; i < s.length(); i++) { | |
639 char c = s.charAt(i); | |
640 String replace = new String(); | |
641 switch (c) { | |
642 case '\u03c2': replace = "\u03c3"; break; // GREEK SMALL LETTER FINAL SIGMA | |
643 // map characters with diacritics to their plain equivalent | |
644 // cf. <code>BetaCode.java</code> | |
645 case '\u03aa': replace = "\u0399"; break; | |
646 case '\u03ab': replace = "\u03a5"; break; | |
647 case '\u03ac': replace = "\u0381"; break; | |
648 case '\u03ad': replace = "\u0385"; break; | |
649 case '\u03ae': replace = "\u0387"; break; | |
650 case '\u03af': replace = "\u0389"; break; | |
651 case '\u03ca': replace = "\u03b9"; break; | |
652 case '\u03cb': replace = "\u03c5"; break; | |
653 case '\u03cc': replace = "\u03bf"; break; | |
654 case '\u03cd': replace = "\u03c5"; break; | |
655 case '\u03ce': replace = "\u03c9"; break; | |
656 case '\u1f00': replace = "\u03b1"; break; | |
657 case '\u1f01': replace = "\u03b1"; break; | |
658 case '\u1f02': replace = "\u03b1"; break; | |
659 case '\u1f03': replace = "\u03b1"; break; | |
660 case '\u1f04': replace = "\u03b1"; break; | |
661 case '\u1f05': replace = "\u03b1"; break; | |
662 case '\u1f06': replace = "\u03b1"; break; | |
663 case '\u1f07': replace = "\u03b1"; break; | |
664 case '\u1f08': replace = "\u0391"; break; | |
665 case '\u1f09': replace = "\u0391"; break; | |
666 case '\u1f0a': replace = "\u0391"; break; | |
667 case '\u1f0b': replace = "\u0391"; break; | |
668 case '\u1f0c': replace = "\u0391"; break; | |
669 case '\u1f0d': replace = "\u0391"; break; | |
670 case '\u1f0e': replace = "\u0391"; break; | |
671 case '\u1f0f': replace = "\u0391"; break; | |
672 case '\u1f10': replace = "\u03b5"; break; | |
673 case '\u1f11': replace = "\u03b5"; break; | |
674 case '\u1f12': replace = "\u03b5"; break; | |
675 case '\u1f13': replace = "\u03b5"; break; | |
676 case '\u1f14': replace = "\u03b5"; break; | |
677 case '\u1f15': replace = "\u03b5"; break; | |
678 case '\u1f18': replace = "\u0395"; break; | |
679 case '\u1f19': replace = "\u0395"; break; | |
680 case '\u1f1a': replace = "\u0395"; break; | |
681 case '\u1f1b': replace = "\u0395"; break; | |
682 case '\u1f1c': replace = "\u0395"; break; | |
683 case '\u1f1d': replace = "\u0395"; break; | |
684 case '\u1f20': replace = "\u03b7"; break; | |
685 case '\u1f21': replace = "\u03b7"; break; | |
686 case '\u1f22': replace = "\u03b7"; break; | |
687 case '\u1f23': replace = "\u03b7"; break; | |
688 case '\u1f24': replace = "\u03b7"; break; | |
689 case '\u1f25': replace = "\u03b7"; break; | |
690 case '\u1f26': replace = "\u03b7"; break; | |
691 case '\u1f27': replace = "\u03b7"; break; | |
692 case '\u1f28': replace = "\u0397"; break; | |
693 case '\u1f29': replace = "\u0397"; break; | |
694 case '\u1f2a': replace = "\u0397"; break; | |
695 case '\u1f2b': replace = "\u0397"; break; | |
696 case '\u1f2c': replace = "\u0397"; break; | |
697 case '\u1f2d': replace = "\u0397"; break; | |
698 case '\u1f2e': replace = "\u0397"; break; | |
699 case '\u1f2f': replace = "\u0397"; break; | |
700 case '\u1f30': replace = "\u03b9"; break; | |
701 case '\u1f31': replace = "\u03b9"; break; | |
702 case '\u1f32': replace = "\u03b9"; break; | |
703 case '\u1f33': replace = "\u03b9"; break; | |
704 case '\u1f34': replace = "\u03b9"; break; | |
705 case '\u1f35': replace = "\u03b9"; break; | |
706 case '\u1f36': replace = "\u03b9"; break; | |
707 case '\u1f37': replace = "\u03b9"; break; | |
708 case '\u1f38': replace = "\u0399"; break; | |
709 case '\u1f39': replace = "\u0399"; break; | |
710 case '\u1f3a': replace = "\u0399"; break; | |
711 case '\u1f3b': replace = "\u0399"; break; | |
712 case '\u1f3c': replace = "\u0399"; break; | |
713 case '\u1f3d': replace = "\u0399"; break; | |
714 case '\u1f3e': replace = "\u0399"; break; | |
715 case '\u1f3f': replace = "\u0399"; break; | |
716 case '\u1f40': replace = "\u03bf"; break; | |
717 case '\u1f41': replace = "\u03bf"; break; | |
718 case '\u1f42': replace = "\u03bf"; break; | |
719 case '\u1f43': replace = "\u03bf"; break; | |
720 case '\u1f44': replace = "\u03bf"; break; | |
721 case '\u1f45': replace = "\u03bf"; break; | |
722 case '\u1f48': replace = "\u039f"; break; | |
723 case '\u1f49': replace = "\u039f"; break; | |
724 case '\u1f4a': replace = "\u039f"; break; | |
725 case '\u1f4b': replace = "\u039f"; break; | |
726 case '\u1f4c': replace = "\u039f"; break; | |
727 case '\u1f4d': replace = "\u039f"; break; | |
728 case '\u1f50': replace = "\u03c5"; break; | |
729 case '\u1f51': replace = "\u03c5"; break; | |
730 case '\u1f52': replace = "\u03c5"; break; | |
731 case '\u1f53': replace = "\u03c5"; break; | |
732 case '\u1f54': replace = "\u03c5"; break; | |
733 case '\u1f55': replace = "\u03c5"; break; | |
734 case '\u1f56': replace = "\u03c5"; break; | |
735 case '\u1f57': replace = "\u03c5"; break; | |
736 case '\u1f58': replace = "\u03a5"; break; | |
737 case '\u1f59': replace = "\u03a5"; break; | |
738 case '\u1f5a': replace = "\u03a5"; break; | |
739 case '\u1f5b': replace = "\u03a5"; break; | |
740 case '\u1f5c': replace = "\u03a5"; break; | |
741 case '\u1f5d': replace = "\u03a5"; break; | |
742 case '\u1f5e': replace = "\u03a5"; break; | |
743 case '\u1f5f': replace = "\u03a5"; break; | |
744 case '\u1f60': replace = "\u03c9"; break; | |
745 case '\u1f61': replace = "\u03c9"; break; | |
746 case '\u1f62': replace = "\u03c9"; break; | |
747 case '\u1f63': replace = "\u03c9"; break; | |
748 case '\u1f64': replace = "\u03c9"; break; | |
749 case '\u1f65': replace = "\u03c9"; break; | |
750 case '\u1f66': replace = "\u03c9"; break; | |
751 case '\u1f67': replace = "\u03c9"; break; | |
752 case '\u1f68': replace = "\u03a9"; break; | |
753 case '\u1f69': replace = "\u03a9"; break; | |
754 case '\u1f6a': replace = "\u03a9"; break; | |
755 case '\u1f6b': replace = "\u03a9"; break; | |
756 case '\u1f6c': replace = "\u03a9"; break; | |
757 case '\u1f6d': replace = "\u03a9"; break; | |
758 case '\u1f6e': replace = "\u03a9"; break; | |
759 case '\u1f6f': replace = "\u03a9"; break; | |
760 case '\u1f70': replace = "\u03b1"; break; | |
761 case '\u1f71': replace = "\u03b1"; break; | |
762 case '\u1f72': replace = "\u03b5"; break; | |
763 case '\u1f73': replace = "\u03b5"; break; | |
764 case '\u1f74': replace = "\u03b7"; break; | |
765 case '\u1f75': replace = "\u03b7"; break; | |
766 case '\u1f76': replace = "\u03b9"; break; | |
767 case '\u1f77': replace = "\u03b9"; break; | |
768 case '\u1f78': replace = "\u03bf"; break; | |
769 case '\u1f79': replace = "\u03bf"; break; | |
770 case '\u1f7a': replace = "\u03c5"; break; | |
771 case '\u1f7b': replace = "\u03c5"; break; | |
772 case '\u1f7c': replace = "\u03c9"; break; | |
773 case '\u1f7d': replace = "\u03c9"; break; | |
774 case '\u1f80': replace = "\u03b1"; break; | |
775 case '\u1f81': replace = "\u03b1"; break; | |
776 case '\u1f82': replace = "\u03b1"; break; | |
777 case '\u1f83': replace = "\u03b1"; break; | |
778 case '\u1f84': replace = "\u03b1"; break; | |
779 case '\u1f85': replace = "\u03b1"; break; | |
780 case '\u1f86': replace = "\u03b1"; break; | |
781 case '\u1f87': replace = "\u03b1"; break; | |
782 case '\u1f88': replace = "\u0391"; break; | |
783 case '\u1f89': replace = "\u0391"; break; | |
784 case '\u1f8a': replace = "\u0391"; break; | |
785 case '\u1f8b': replace = "\u0391"; break; | |
786 case '\u1f8c': replace = "\u0391"; break; | |
787 case '\u1f8d': replace = "\u0391"; break; | |
788 case '\u1f8e': replace = "\u0391"; break; | |
789 case '\u1f8f': replace = "\u0391"; break; | |
790 case '\u1f90': replace = "\u03b7"; break; | |
791 case '\u1f91': replace = "\u03b7"; break; | |
792 case '\u1f92': replace = "\u03b7"; break; | |
793 case '\u1f93': replace = "\u03b7"; break; | |
794 case '\u1f94': replace = "\u03b7"; break; | |
795 case '\u1f95': replace = "\u03b7"; break; | |
796 case '\u1f96': replace = "\u03b7"; break; | |
797 case '\u1f97': replace = "\u03b7"; break; | |
798 case '\u1f98': replace = "\u0397"; break; | |
799 case '\u1f99': replace = "\u0397"; break; | |
800 case '\u1f9a': replace = "\u0397"; break; | |
801 case '\u1f9b': replace = "\u0397"; break; | |
802 case '\u1f9c': replace = "\u0397"; break; | |
803 case '\u1f9d': replace = "\u0397"; break; | |
804 case '\u1f9e': replace = "\u0397"; break; | |
805 case '\u1f9f': replace = "\u0397"; break; | |
806 case '\u1fa0': replace = "\u03c9"; break; | |
807 case '\u1fa1': replace = "\u03c9"; break; | |
808 case '\u1fa2': replace = "\u03c9"; break; | |
809 case '\u1fa3': replace = "\u03c9"; break; | |
810 case '\u1fa4': replace = "\u03c9"; break; | |
811 case '\u1fa5': replace = "\u03c9"; break; | |
812 case '\u1fa6': replace = "\u03c9"; break; | |
813 case '\u1fa7': replace = "\u03c9"; break; | |
814 case '\u1fa8': replace = "\u03a9"; break; | |
815 case '\u1fa9': replace = "\u03a9"; break; | |
816 case '\u1faa': replace = "\u03a9"; break; | |
817 case '\u1fab': replace = "\u03a9"; break; | |
818 case '\u1fac': replace = "\u03a9"; break; | |
819 case '\u1fad': replace = "\u03a9"; break; | |
820 case '\u1fae': replace = "\u03a9"; break; | |
821 case '\u1faf': replace = "\u03a9"; break; | |
822 case '\u1fb2': replace = "\u03b1"; break; | |
823 case '\u1fb3': replace = "\u03b1"; break; | |
824 case '\u1fb4': replace = "\u03b1"; break; | |
825 case '\u1fb6': replace = "\u03b1"; break; | |
826 case '\u1fb7': replace = "\u03b1"; break; | |
827 case '\u1fba': replace = "\u0391"; break; | |
828 case '\u1fbb': replace = "\u0391"; break; | |
829 case '\u1fbc': replace = "\u0391"; break; | |
830 case '\u1fc2': replace = "\u03b7"; break; | |
831 case '\u1fc3': replace = "\u03b7"; break; | |
832 case '\u1fc4': replace = "\u03b7"; break; | |
833 case '\u1fc6': replace = "\u03b7"; break; | |
834 case '\u1fc7': replace = "\u03b7"; break; | |
835 case '\u1fca': replace = "\u0397"; break; | |
836 case '\u1fcb': replace = "\u0397"; break; | |
837 case '\u1fcc': replace = "\u0397"; break; | |
838 case '\u1fd2': replace = "\u03b9"; break; | |
839 case '\u1fd3': replace = "\u03b9"; break; | |
840 case '\u1fd6': replace = "\u03b9"; break; | |
841 case '\u1fd7': replace = "\u03b9"; break; | |
842 case '\u1fda': replace = "\u0399"; break; | |
843 case '\u1fdb': replace = "\u039f"; break; | |
844 case '\u1fe2': replace = "\u03c5"; break; | |
845 case '\u1fe3': replace = "\u03c5"; break; | |
846 case '\u1fe4': replace = "\u03c1"; break; | |
847 case '\u1fe5': replace = "\u03c1"; break; | |
848 case '\u1fe6': replace = "\u03c5"; break; | |
849 case '\u1fe7': replace = "\u03c5"; break; | |
850 case '\u1fea': replace = "\u03a5"; break; | |
851 case '\u1feb': replace = "\u03a5"; break; | |
852 case '\u1fec': replace = "\u03a1"; break; | |
853 case '\u1ff2': replace = "\u03c9"; break; | |
854 case '\u1ff3': replace = "\u03c9"; break; | |
855 case '\u1ff4': replace = "\u03c9"; break; | |
856 case '\u1ff6': replace = "\u03c9"; break; | |
857 case '\u1ff7': replace = "\u03c9"; break; | |
858 case '\u1ff8': replace = "\u039f"; break; | |
859 case '\u1ff9': replace = "\u039f"; break; | |
860 case '\u1ffa': replace = "\u03a9"; break; | |
861 case '\u1ffb': replace = "\u03a9"; break; | |
862 case '\u1ffc': replace = "\u03a9"; break; | |
863 | |
864 case '<': break; | |
865 case '>': break; | |
866 case '-': break; // same treatment as soft hyphen | |
867 case '\u00ad': break; // soft hyphen | |
868 default: replace += c; break; | |
869 } | |
870 buf.append(replace); | |
871 // update offsets if replacement is a different length | |
872 if (offsets != null) { | |
873 int r = replace.length(); | |
874 if (r == 0) | |
875 this.offsets = arrayKill(this.offsets, i - n); | |
876 else if (r == 2) | |
877 this.offsets = arrayInsert(this.offsets, i - n + 1, this.offsets[i - n], r - 1); | |
878 n += 1 - r; | |
879 } | |
880 } | |
881 return buf.toString(); | |
882 } else { // unknown or no language | |
883 return s; | |
884 } | |
885 } | |
886 | |
887 public String deNormalizeToRegExpr(String s) { | |
888 // TODO all characters in all languages | |
889 if (language.equals("la") || language.equals("lat")) { | |
890 StringBuffer buf = new StringBuffer(); | |
891 if (s.indexOf("ae") != -1) { | |
892 String str1 = s; | |
893 str1 = str1.replaceAll("ae", "\u0119"); | |
894 String str2 = s; | |
895 str2 = str2.replaceAll("ae", "\u00c6"); | |
896 String str3 = s; | |
897 str3 = str3.replaceAll("ae", "\u00e6"); | |
898 buf.append(str1 + "|" + str2 + "|" + str3 + "|"); | |
899 } | |
900 if (s.indexOf("oe") != -1) { | |
901 String str1 = s; | |
902 str1 = str1.replaceAll("oe", "\u0152"); | |
903 String str2 = s; | |
904 str2 = str2.replaceAll("oe", "\u0153"); | |
905 buf.append(str1 + "|" + str2 + "|"); | |
906 } | |
907 if (s.indexOf("ss") != -1) { | |
908 String str1 = s; | |
909 str1 = str1.replaceAll("ss", "\u00df"); | |
910 buf.append(str1 + "|"); | |
911 } | |
912 boolean beginWord = true; | |
913 for (int i = 0; i < s.length(); i++) { | |
914 char c = s.charAt(i); | |
915 if (! beginWord) | |
916 c = Character.toLowerCase(c); | |
917 beginWord = Character.isWhitespace(c); | |
918 String replace = new String(); | |
919 switch (c) { | |
920 case 'a': replace = "[a\u00c0\u00c1\u00c2\u00c4\u00e0\u00e1\u00e2\u00e4]"; break; | |
921 case 'c': replace = "[c\u00c7\u00e7]"; break; | |
922 case 'e': replace = "[e\u00c8\u00c9\u00ca\u00cb\u00e8\u00e9\u00ea\u00eb\u0113\u0115\u1ebd]"; break; | |
923 case 'i': replace = "[ij\u00cc\u00cd\u00ce\u00cf\u00ec\u00ed\u00ee\u00ef\u012a\u012b\u012c\u012d]"; break; | |
924 case 'o': replace = "[o\u00d2\u00d3\u00d4\u00d6\u00f2\u00f3\u00f4\u00f6\u014c\u014d\u014e\u014f]"; break; | |
925 case 'u': replace = "[uv\u00d9\u00da\u00db\u00dc\u00f9\u00fa\u00fb\u00fc\u016a\u016b\u016c\u016d]"; break; | |
926 case 's': replace = "[s\u017f]"; break; | |
927 default: replace += c; break; | |
928 } | |
929 buf.append(replace); | |
930 } | |
931 return buf.toString(); | |
932 } else if (language.equals("en")) { | |
933 StringBuffer buf = new StringBuffer(); | |
934 if (s.indexOf("ae") != -1) { | |
935 String str1 = s; | |
936 str1 = str1.replaceAll("ae", "\u0119"); | |
937 String str2 = s; | |
938 str2 = str2.replaceAll("ae", "\u00c6"); | |
939 String str3 = s; | |
940 str3 = str3.replaceAll("ae", "\u00e6"); | |
941 buf.append(str1 + "|" + str2 + "|" + str3 + "|"); | |
942 } | |
943 if (s.indexOf("oe") != -1) { | |
944 String str1 = s; | |
945 str1 = str1.replaceAll("oe", "\u0152"); | |
946 String str2 = s; | |
947 str2 = str2.replaceAll("oe", "\u0153"); | |
948 buf.append(str1 + "|" + str2 + "|"); | |
949 } | |
950 if (s.indexOf("ss") != -1) { | |
951 String str1 = s; | |
952 str1 = str1.replaceAll("ss", "\u00df"); | |
953 buf.append(str1 + "|"); | |
954 } | |
955 boolean beginWord = true; | |
956 for (int i = 0; i < s.length(); i++) { | |
957 char c = s.charAt(i); | |
958 if (! beginWord) | |
959 c = Character.toLowerCase(c); | |
960 beginWord = Character.isWhitespace(c); | |
961 String replace = new String(); | |
962 switch (c) { | |
963 case 'a': replace = "[a\u00c0\u00c1\u00c2\u00c4\u00e0\u00e1\u00e2\u00e4]"; break; | |
964 case 'c': replace = "[c\u00c7\u00e7]"; break; | |
965 case 'e': replace = "[e\u00c8\u00c9\u00ca\u00cb\u00e8\u00e9\u00ea\u00eb\u0113\u0115\u1e8d]"; break; | |
966 case 'i': replace = "[i\u00cc\u00cd\u00ce\u00cf\u00ec\u00ed\u00ee\u00ef\u012a\u012b\u012c\u012d]"; break; | |
967 case 'o': replace = "[o\u00d2\u00d3\u00d4\u00d6\u00f2\u00f3\u00f4\u00f6\u014c\u014d\u014e\u014f]‚"; break; | |
968 case 'u': replace = "[u\u00d9\u00da\u00db\u00dc\u00f9\u00fa\u00fb\u00fc\u016a\u016b\u016c\u016d]"; break; | |
969 case 's': replace = "[s\u017f]"; break; | |
970 default: replace += c; break; | |
971 } | |
972 buf.append(replace); | |
973 } | |
974 return buf.toString(); | |
975 } else if (language.equals("de")) { | |
976 StringBuffer buf = new StringBuffer(); | |
977 if (s.indexOf("ss") != -1) { | |
978 String str1 = s; | |
979 str1 = str1.replaceAll("ss", "\u00df"); | |
980 buf.append(str1 + "|"); | |
981 } | |
982 if (s.indexOf("ae") != -1) { | |
983 String str1 = s; | |
984 str1 = str1.replaceAll("ae", "\u00e4"); | |
985 buf.append(str1 + "|"); | |
986 } | |
987 if (s.indexOf("oe") != -1) { | |
988 String str1 = s; | |
989 str1 = str1.replaceAll("oe", "\u00f6"); | |
990 buf.append(str1 + "|"); | |
991 } | |
992 if (s.indexOf("ue") != -1) { | |
993 String str1 = s; | |
994 str1 = str1.replaceAll("ue", "\u00fc"); | |
995 buf.append(str1 + "|"); | |
996 } | |
997 boolean beginWord = true; | |
998 for (int i = 0; i < s.length(); i++) { | |
999 char c = s.charAt(i); | |
1000 if (! beginWord) | |
1001 c = Character.toLowerCase(c); | |
1002 beginWord = Character.isWhitespace(c); | |
1003 String replace = new String(); | |
1004 switch (c) { | |
1005 case 'e': replace = "[e\u00e9]"; break; | |
1006 default: replace += c; break; | |
1007 } | |
1008 buf.append(replace); | |
1009 } | |
1010 return buf.toString(); | |
1011 } else { // unknown or no language | |
1012 return s; | |
1013 } | |
1014 } | |
6
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1015 |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1016 private String normalize4HumanReaders(String s) { |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1017 String normStr = s; |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1018 StringReader strReader = new StringReader(normStr + "\n"); |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1019 MpdlNormalizerLexAll mpdlNormalizerLexAll = new MpdlNormalizerLexAll(strReader); |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1020 if (Language.getInstance().isLatin(language)) { |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1021 mpdlNormalizerLexAll.yybegin(MpdlNormalizerLexAll.LA); |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1022 } else if (Language.getInstance().isChinese(language)) { |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1023 mpdlNormalizerLexAll.yybegin(MpdlNormalizerLexAll.ZH); |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1024 } else { |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1025 // TODO normalization for all languages |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1026 return normalize4Lexica(s, null); // old function |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1027 } |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1028 String retStr = ""; |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1029 String token = ""; |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1030 while (token != null) { |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1031 try { |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1032 token = mpdlNormalizerLexAll.yylex(); |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1033 if (token != null) |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1034 retStr += token; |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1035 } catch (IOException e ) { |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1036 // nothing cause IOException is not needed for a StringReader |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1037 } |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1038 } |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1039 normStr = retStr; |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1040 return normStr; |
0 | 1041 } |
1042 | |
6
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1043 /* |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1044 // explicit words |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1045 normStr = normStr.replaceAll("aliàs", "alias"); |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1046 normStr = normStr.replaceAll("hîc", "hic"); |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1047 normStr = normStr.replaceAll("quòd", "quod"); |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1048 normStr = normStr.replaceAll("Quòd", "Quod"); |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1049 normStr = normStr.replaceAll("QVòd", "Quod"); |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1050 normStr = normStr.replaceAll("Cùmque", "Cumque"); |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1051 normStr = normStr.replaceAll("aër", "aer"); |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1052 // ij |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1053 normStr = normStr.replaceAll("ij", "ii"); |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1054 // qu/qv |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1055 normStr = normStr.replaceAll("qv", "qu"); |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1056 // normStr = normStr.replaceAll("qV", "qU"); |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1057 normStr = normStr.replaceAll("Qv", "Qu"); |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1058 normStr = normStr.replaceAll("QV", "QU"); |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1059 // u/v |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1060 String vowels = getVowels(); |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1061 String consonants = getConsonants(); |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1062 normStr = normStr.replaceAll("([" + vowels + "])([-]*)u([" + vowels +"])", "$1$2v$3"); // vowel + u + vowel --> vowel + v + vowel |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1063 normStr = normStr.replaceAll("([" + vowels + "])([-]*)U([" + vowels +"])", "$1$2V$3"); // vowel + U + vowel --> vowel + V + vowel |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1064 normStr = normStr.replaceAll("([" + consonants + "])([-]*)v([" + consonants +"])", "$1$2u$3"); // consonant + v + consonant --> consonant + u + consonant |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1065 normStr = normStr.replaceAll("([" + consonants + "])([-]*)V([" + consonants +"])", "$1$2U$3"); // consonant + V + consonant --> consonant + U + consonant |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1066 normStr = normStr.replaceAll("^v([" + consonants +"])", "u$1"); // v + consonant --> u + consonant |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1067 normStr = normStr.replaceAll("^V([" + consonants +"])", "U$1"); // V + consonant --> U + consonant |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1068 // end of word: diacritica |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1069 normStr = normStr.replaceAll("à$", "a"); |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1070 normStr = normStr.replaceAll("è$", "e"); |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1071 normStr = normStr.replaceAll("ò$", "o"); |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1072 normStr = normStr.replaceAll("àm$", "am"); |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1073 normStr = normStr.replaceAll("ùm$", "um"); |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1074 String normStrTmp = normStr; |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1075 normStr = ""; |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1076 for (int i = 0; i < normStrTmp.length(); i++) { |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1077 char c = normStrTmp.charAt(i); |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1078 String replace = ""; |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1079 switch (c) { |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1080 case 'ſ': replace = "s"; break; |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1081 case 'ß': replace = "ss"; break; |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1082 case 'æ': replace = "ae"; break; |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1083 case 'Æ': replace = "AE"; break; |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1084 case 'ę': replace = "ae"; break; |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1085 case 'œ': replace = "oe"; break; |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1086 default: replace += c; break; |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1087 } |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1088 normStr = normStr + replace; |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1089 } |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1090 |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1091 |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1092 private String getVowels() { |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1093 String retStr = null; |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1094 if (Language.getInstance().isItalian(language)) { |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1095 retStr = "AEIOUaeiou" + |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1096 "\u00c6\u00e6" + // AE ligatures |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1097 "\u0152\u0153"; // OE ligatures |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1098 } else if (Language.getInstance().isLatin(language)) { |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1099 retStr = "AEIOUaeiouÆœęàèòù"; |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1100 } |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1101 // TODO all languages |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1102 return retStr; |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1103 } |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1104 |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1105 private String getConsonants() { |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1106 String retStr = null; |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1107 if (Language.getInstance().isItalian(language)) { |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1108 retStr = "BCDFGHKLMNPQRSTVWXZ" + |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1109 "bcdfghklmnpqrstvwxz" + |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1110 "ſß"; // long/sharp S |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1111 } else if (Language.getInstance().isLatin(language)) { |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1112 retStr = "BCDFGHKLMNPQRSTVWXZ" + |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1113 "bcdfghklmnpqrstvwxz" + |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1114 "ſß"; // long/sharp S |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1115 } |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1116 // TODO all languages |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1117 return retStr; |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1118 } |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1119 |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1120 |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1121 |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1122 |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1123 |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1124 * |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1125 * |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1126 * |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1127 * |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1128 */ |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1129 |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1130 |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1131 |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1132 |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1133 |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1134 |
0 | 1135 /** |
1136 * Returns a copy of an integer array with the element at | |
1137 * <code>index</code> removed ("killed"). | |
1138 * | |
1139 * @param array integer array | |
1140 * @param index index of element to remove | |
1141 */ | |
6
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1142 private int[] arrayKill(int[] array, int index) { |
0 | 1143 int[] newArray = new int[array.length - 1]; |
1144 System.arraycopy(array, 0, newArray, 0, index); | |
1145 System.arraycopy(array, index + 1, newArray, index, array.length - index - 1); | |
1146 return newArray; | |
1147 } | |
1148 | |
1149 /** | |
1150 * Returns a copy of an integer array with <code>count</code> elements | |
1151 * inserted at <code>index</code>. | |
1152 * | |
1153 * @param array integer array | |
1154 * @param index index to insert new elements | |
1155 * @param value value to insert into new slots | |
1156 * @param count number of new slots to insert | |
1157 */ | |
6
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
1158 private int[] arrayInsert(int[] array, int index, int value, int count) { |
0 | 1159 int[] newArray = new int[array.length + count]; |
1160 System.arraycopy(array, 0, newArray, 0, index); | |
1161 for (int i = 0; i < count; i++) newArray[index + i] = value; | |
1162 System.arraycopy(array, index, newArray, index + count, array.length - index); | |
1163 return newArray; | |
1164 } | |
1165 | |
1166 } |