Mercurial > hg > mpdl-group
comparison software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/DutchStemmer.java @ 0:408254cf2f1d
Erstellung
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Wed, 24 Nov 2010 17:24:23 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:408254cf2f1d |
---|---|
1 package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang; | |
2 | |
3 /** | |
4 * Licensed to the Apache Software Foundation (ASF) under one or more | |
5 * contributor license agreements. See the NOTICE file distributed with | |
6 * this work for additional information regarding copyright ownership. | |
7 * The ASF licenses this file to You under the Apache License, Version 2.0 | |
8 * (the "License"); you may not use this file except in compliance with | |
9 * the License. You may obtain a copy of the License at | |
10 * | |
11 * http://www.apache.org/licenses/LICENSE-2.0 | |
12 * | |
13 * Unless required by applicable law or agreed to in writing, software | |
14 * distributed under the License is distributed on an "AS IS" BASIS, | |
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
16 * See the License for the specific language governing permissions and | |
17 * limitations under the License. | |
18 */ | |
19 | |
import java.util.Locale;
import java.util.Map;
21 | |
/**
 * A stemmer for Dutch words. The algorithm is an implementation of the
 * <a href="http://snowball.tartarus.org/algorithms/dutch/stemmer.html">dutch stemming</a>
 * algorithm in Martin Porter's snowball project.
 *
 * <p>An instance keeps its working buffer and region markers in fields that are
 * overwritten by every {@link #stem(String)} call, so a single instance must not
 * be used by several threads at the same time.
 *
 * <p>Accented characters are written as Unicode escapes in this file so that
 * compilation does not depend on the source-file encoding.
 *
 * @author Edwin de Jonge (ejne at cbs.nl)
 */
public class DutchStemmer {

  /** The two spellings of the "en" suffix, longest first. */
  private static final String[] EN_ENDINGS = {"ene", "en"};

  /** Buffer for the term while it is being stemmed. */
  private final StringBuilder sb = new StringBuilder();

  /** Whether the most recent run of {@link #step2} removed a trailing {@code e}. */
  private boolean removedE;

  /** Optional dictionary of predefined stems; {@code null} when none was set. */
  private Map<?, ?> stemDict;

  /** Start index of region R1 (never less than 3). */
  private int r1;

  /** Start index of region R2. */
  private int r2;

  /**
   * Stems the given term to a unique <tt>discriminator</tt>.
   *
   * <p>The term is lower-cased first (locale-independently). It is returned
   * as is, in lower case, when it is empty or contains any non-letter. When a
   * stem dictionary is set and contains the lower-cased term, the dictionary
   * value is returned if it is a {@code String}; otherwise {@code null} is
   * returned.
   *
   * @param term The term that should be stemmed.
   * @return Discriminator for <tt>term</tt>, or {@code null} when the stem
   *         dictionary maps the term to a value that is not a {@code String}.
   */
  public String stem(String term) {
    term = term.toLowerCase(Locale.ROOT);
    // Nothing to stem in an empty term; the steps below index into the buffer.
    if (term.isEmpty() || !isStemmable(term)) {
      return term;
    }
    if (stemDict != null && stemDict.containsKey(term)) {
      Object entry = stemDict.get(term);
      return (entry instanceof String) ? (String) entry : null;
    }

    // Reset the buffer.
    sb.setLength(0);
    sb.append(term);
    // Stemming starts here...
    substitute(sb);
    storeYandI(sb);
    r1 = Math.max(3, getRIndex(sb, 0));
    step1(sb);
    step2(sb);
    r2 = getRIndex(sb, r1);
    step3a(sb);
    step3b(sb);
    step4(sb);
    reStoreYandI(sb);
    return sb.toString();
  }

  /**
   * Removes a trailing "ene" or "en" when it lies in R1 and is preceded by a
   * valid en-ending, then undoubles the new ending.
   *
   * @param sb String being stemmed
   * @return true if a suffix was removed
   */
  private boolean enEnding(StringBuilder sb) {
    String s = sb.toString();
    for (String end : EN_ENDINGS) {
      int index = s.length() - end.length();
      if (s.endsWith(end) && index >= r1 && isValidEnEnding(sb, index - 1)) {
        sb.delete(index, index + end.length());
        unDouble(sb, index);
        return true;
      }
    }
    return false;
  }

  /**
   * Handles the suffixes "heden", "ene"/"en", "se" and "s" in R1.
   *
   * @param sb String being stemmed
   */
  private void step1(StringBuilder sb) {
    if (r1 >= sb.length()) {
      return;
    }

    String s = sb.toString();

    if (s.endsWith("heden")) {
      int index = s.length() - 5;
      // Only the suffix itself is rewritten, and only when it lies wholly in R1.
      if (index >= r1) {
        sb.replace(index, s.length(), "heid");
      }
      return;
    }

    if (enEnding(sb)) {
      return;
    }

    int index;
    if (s.endsWith("se")
        && (index = s.length() - 2) >= r1
        && isValidSEnding(sb, index - 1)) {
      sb.delete(index, index + 2);
      return;
    }
    if (s.endsWith("s")
        && (index = s.length() - 1) >= r1
        && isValidSEnding(sb, index - 1)) {
      sb.delete(index, index + 1);
    }
  }

  /**
   * Delete suffix e if in R1 and preceded by a non-vowel, and then undouble
   * the ending. Records in {@code removedE} whether the e was removed.
   *
   * @param sb String being stemmed
   */
  private void step2(StringBuilder sb) {
    removedE = false;
    if (r1 >= sb.length()) {
      return;
    }
    int index = sb.length() - 1;
    if (index >= r1
        && sb.charAt(index) == 'e'
        && !isVowel(sb.charAt(index - 1))) {
      sb.delete(index, index + 1);
      unDouble(sb);
      removedE = true;
    }
  }

  /**
   * Delete "heid" if in R2 and not preceded by c, then treat a preceding
   * "en" as in step 1.
   *
   * @param sb String being stemmed
   */
  private void step3a(StringBuilder sb) {
    if (r2 >= sb.length()) {
      return;
    }
    String s = sb.toString();
    int index = s.length() - 4;
    if (s.endsWith("heid") && index >= r2 && sb.charAt(index - 1) != 'c') {
      sb.delete(index, index + 4); // remove heid
      enEnding(sb);
    }
  }

  /**
   * <p>A d-suffix, or derivational suffix, enables a new word,
   * often with a different grammatical category, or with a different
   * sense, to be built from another word. Whether a d-suffix can be
   * attached is discovered not from the rules of grammar, but by
   * referring to a dictionary. So in English, ness can be added to
   * certain adjectives to form corresponding nouns (littleness,
   * kindness, foolishness ...) but not to all adjectives
   * (not for example, to big, cruel, wise ...) d-suffixes can be
   * used to change meaning, often in rather exotic ways.</p>
   * Remove "ing", "end", "ig", "lijk", "baar" and "bar"
   *
   * @param sb String being stemmed
   */
  private void step3b(StringBuilder sb) {
    if (r2 >= sb.length()) {
      return;
    }
    String s = sb.toString();
    int index;

    if ((s.endsWith("end") || s.endsWith("ing"))
        && (index = s.length() - 3) >= r2) {
      sb.delete(index, index + 3);
      if (sb.charAt(index - 2) == 'i' && sb.charAt(index - 1) == 'g') {
        // A preceding "ig" is removed too when in R2 and not preceded by e.
        if (sb.charAt(index - 3) != 'e' && index - 2 >= r2) {
          index -= 2;
          sb.delete(index, index + 2);
        }
      } else {
        unDouble(sb, index);
      }
      return;
    }
    if (s.endsWith("ig") && (index = s.length() - 2) >= r2) {
      if (sb.charAt(index - 1) != 'e') {
        sb.delete(index, index + 2);
      }
      return;
    }
    if (s.endsWith("lijk") && (index = s.length() - 4) >= r2) {
      sb.delete(index, index + 4);
      step2(sb);
      return;
    }
    if (s.endsWith("baar") && (index = s.length() - 4) >= r2) {
      sb.delete(index, index + 4);
      return;
    }
    if (s.endsWith("bar") && (index = s.length() - 3) >= r2) {
      // "bar" is only removed when step 2 actually removed an e.
      if (removedE) {
        sb.delete(index, index + 3);
      }
    }
  }

  /**
   * undouble vowel
   * If the words ends CVD, where C is a non-vowel, D is a non-vowel other
   * than I, and V is double a, e, o or u, remove one of the vowels from V
   * (for example, maan -> man, brood -> brod).
   *
   * @param sb String being stemmed
   */
  private void step4(StringBuilder sb) {
    int len = sb.length();
    if (len < 4) {
      return;
    }
    char c = sb.charAt(len - 4);
    char v1 = sb.charAt(len - 3);
    char v2 = sb.charAt(len - 2);
    char d = sb.charAt(len - 1);
    if (v1 == v2
        && "aeou".indexOf(v1) >= 0
        && d != 'I'
        && !isVowel(d)
        && !isVowel(c)) {
      sb.deleteCharAt(len - 2);
    }
  }

  /**
   * Checks if a term could be stemmed.
   *
   * @return true if, and only if, the given term consists only of letters.
   */
  private boolean isStemmable(String term) {
    for (int i = 0; i < term.length(); i++) {
      if (!Character.isLetter(term.charAt(i))) {
        return false;
      }
    }
    return true;
  }

  /**
   * Replaces a, e, i, o and u carrying a diaeresis or an acute accent with
   * the plain vowel.
   */
  private void substitute(StringBuilder buffer) {
    for (int i = 0; i < buffer.length(); i++) {
      switch (buffer.charAt(i)) {
        case '\u00e4': // a with diaeresis
        case '\u00e1': // a with acute
          buffer.setCharAt(i, 'a');
          break;
        case '\u00eb': // e with diaeresis
        case '\u00e9': // e with acute
          buffer.setCharAt(i, 'e');
          break;
        case '\u00fc': // u with diaeresis
        case '\u00fa': // u with acute
          buffer.setCharAt(i, 'u');
          break;
        case '\u00ef': // i with diaeresis
        case '\u00ed': // i with acute
          buffer.setCharAt(i, 'i');
          break;
        case '\u00f6': // o with diaeresis
        case '\u00f3': // o with acute
          buffer.setCharAt(i, 'o');
          break;
        default:
          break;
      }
    }
  }

  /**
   * A valid s-ending is a non-vowel other than j.
   *
   * @param index position of the character preceding the suffix
   */
  private boolean isValidSEnding(StringBuilder sb, int index) {
    char c = sb.charAt(index);
    return !isVowel(c) && c != 'j';
  }

  /**
   * A valid en-ending is a non-vowel that is not the final m of "gem".
   *
   * @param index position of the character preceding the suffix; callers
   *              guarantee it is at least 2
   */
  private boolean isValidEnEnding(StringBuilder sb, int index) {
    char c = sb.charAt(index);
    if (isVowel(c)) {
      return false;
    }
    // ends with "gem"?
    return !(c == 'm' && sb.charAt(index - 2) == 'g' && sb.charAt(index - 1) == 'e');
  }

  /** Undoubles the ending of the whole buffer. */
  private void unDouble(StringBuilder sb) {
    unDouble(sb, sb.length());
  }

  /**
   * If the text before {@code endIndex} ends in kk, tt, dd, nn, mm or ff,
   * removes one of the two letters.
   */
  private void unDouble(StringBuilder sb, int endIndex) {
    String s = sb.substring(0, endIndex);
    if (s.endsWith("kk") || s.endsWith("tt") || s.endsWith("dd")
        || s.endsWith("nn") || s.endsWith("mm") || s.endsWith("ff")) {
      sb.delete(endIndex - 1, endIndex);
    }
  }

  /**
   * Returns the index just after the first non-vowel that is preceded by a
   * vowel, searching from {@code start} (a start of 0 is treated as 1). When
   * there is no such position the result lies past the end of the buffer.
   */
  private int getRIndex(StringBuilder sb, int start) {
    int i = (start == 0) ? 1 : start;
    for (; i < sb.length(); i++) {
      // first non-vowel preceded by a vowel
      if (!isVowel(sb.charAt(i)) && isVowel(sb.charAt(i - 1))) {
        return i + 1;
      }
    }
    return i + 1;
  }

  /**
   * Marks y and i as upper case where they must not be treated as vowels:
   * an initial y, a y after a vowel, and an i between two vowels.
   * The buffer must not be empty.
   */
  private void storeYandI(StringBuilder sb) {
    if (sb.charAt(0) == 'y') {
      sb.setCharAt(0, 'Y');
    }

    int last = sb.length() - 1;

    for (int i = 1; i < last; i++) {
      char c = sb.charAt(i);
      if (c == 'i') {
        if (isVowel(sb.charAt(i - 1)) && isVowel(sb.charAt(i + 1))) {
          sb.setCharAt(i, 'I');
        }
      } else if (c == 'y') {
        if (isVowel(sb.charAt(i - 1))) {
          sb.setCharAt(i, 'Y');
        }
      }
    }
    if (last > 0 && sb.charAt(last) == 'y' && isVowel(sb.charAt(last - 1))) {
      sb.setCharAt(last, 'Y');
    }
  }

  /** Turns the upper-case markers set by {@link #storeYandI} back into lower case. */
  private void reStoreYandI(StringBuilder sb) {
    for (int i = 0; i < sb.length(); i++) {
      char c = sb.charAt(i);
      if (c == 'I') {
        sb.setCharAt(i, 'i');
      } else if (c == 'Y') {
        sb.setCharAt(i, 'y');
      }
    }
  }

  /** Vowels are a, e, i, o, u, y and e with grave accent. */
  private boolean isVowel(char c) {
    switch (c) {
      case 'e':
      case 'a':
      case 'o':
      case 'i':
      case 'u':
      case 'y':
      case '\u00e8': // e with grave
        return true;
      default:
        return false;
    }
  }

  /**
   * Sets the dictionary consulted by {@link #stem(String)} before the
   * algorithm runs. Keys are looked up with the lower-cased term; a value
   * that is not a {@code String} makes {@code stem} return {@code null}.
   *
   * @param dict the dictionary, or {@code null} for none
   */
  void setStemDictionary(Map<?, ?> dict) {
    stemDict = dict;
  }

}