comparison software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/DutchStemmer.java @ 0:408254cf2f1d

Erstellung
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Wed, 24 Nov 2010 17:24:23 +0100
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:408254cf2f1d
1 package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang;
2
3 /**
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
18 */
19
import java.util.Locale;
import java.util.Map;
21
/**
 * A stemmer for Dutch words. The algorithm is an implementation of the
 * <a href="http://snowball.tartarus.org/algorithms/dutch/stemmer.html">Dutch stemming</a>
 * algorithm in Martin Porter's Snowball project.
 *
 * <p>Each call to {@link #stem(String)} uses the instance fields below as scratch
 * state without any synchronization, so a single instance must not be shared
 * between threads that stem concurrently.</p>
 *
 * @author Edwin de Jonge (ejne at cbs.nl)
 */
public class DutchStemmer {

  /** Suffixes tried by {@link #enEnding(StringBuilder)}, longest first. */
  private static final String[] EN_ENDINGS = {"ene", "en"};

  /** Buffer for the term while it is being stemmed. */
  private final StringBuilder sb = new StringBuilder();

  /** Whether the most recent run of step 2 actually removed a trailing "e". */
  private boolean _removedE;

  /** Optional exception dictionary (term to stem); may be {@code null}. */
  private Map<?, ?> _stemDict;

  /** Start index of region R1 (never less than 3). */
  private int _R1;

  /** Start index of region R2. */
  private int _R2;

  /**
   * Stems the given term to a unique <tt>discriminator</tt>.
   *
   * <p>The term is lower-cased first. Terms containing any non-letter character,
   * and the empty term, are returned (lower-cased) without stemming. If a stem
   * dictionary has been set and contains the lower-cased term, the dictionary
   * value is returned when it is a {@code String}; otherwise {@code null} is
   * returned.</p>
   *
   * @param term The term that should be stemmed; must not be {@code null}.
   * @return Discriminator for <tt>term</tt>, or {@code null} if the stem
   *         dictionary maps the term to something that is not a {@code String}
   */
  public String stem(String term) {
    // Locale.ROOT keeps lower-casing independent of the JVM default locale
    // (e.g. Turkish would map 'I' to a dotless i the rules below do not know).
    term = term.toLowerCase(Locale.ROOT);
    if (!isStemmable(term)) {
      return term;
    }
    if (_stemDict != null && _stemDict.containsKey(term)) {
      Object mapped = _stemDict.get(term);
      return (mapped instanceof String) ? (String) mapped : null;
    }
    // Nothing to stem; also protects the charAt(0) access in storeYandI.
    if (term.length() == 0) {
      return term;
    }

    // Reset the buffer.
    sb.setLength(0);
    sb.append(term);

    // Stemming starts here...
    substitute(sb);
    storeYandI(sb);
    // R1 is adjusted so that the region before it holds at least 3 letters.
    _R1 = Math.max(3, getRIndex(sb, 0));
    step1(sb);
    step2(sb);
    _R2 = getRIndex(sb, _R1);
    step3a(sb);
    step3b(sb);
    step4(sb);
    reStoreYandI(sb);
    return sb.toString();
  }

  /**
   * Removes a trailing "ene" or "en" if it lies in R1 and is preceded by a
   * valid en-ending, then undoubles the new ending.
   *
   * @param sb String being stemmed
   * @return true if a suffix was removed
   */
  private boolean enEnding(StringBuilder sb) {
    String s = sb.toString();
    for (int i = 0; i < EN_ENDINGS.length; i++) {
      String end = EN_ENDINGS[i];
      int index = s.length() - end.length();
      if (s.endsWith(end) && index >= _R1 && isValidEnEnding(sb, index - 1)) {
        sb.delete(index, index + end.length());
        unDouble(sb, index);
        return true;
      }
    }
    return false;
  }

  /**
   * Step 1: replace "heden" by "heid", or remove "ene"/"en", or remove
   * "se"/"s", each only when the suffix lies in R1 and its precondition holds.
   *
   * @param sb String being stemmed
   */
  private void step1(StringBuilder sb) {
    if (_R1 >= sb.length()) {
      return;
    }

    String s = sb.toString();
    int index;

    if (s.endsWith("heden")) {
      // Only the suffix itself is rewritten, and only when it is inside R1.
      index = s.length() - 5;
      if (index >= _R1) {
        sb.replace(index, s.length(), "heid");
      }
      return;
    }

    if (enEnding(sb)) {
      return;
    }

    if (s.endsWith("se")
        && (index = s.length() - 2) >= _R1
        && isValidSEnding(sb, index - 1)) {
      sb.delete(index, index + 2);
      return;
    }
    if (s.endsWith("s")
        && (index = s.length() - 1) >= _R1
        && isValidSEnding(sb, index - 1)) {
      sb.delete(index, index + 1);
    }
  }

  /**
   * Step 2: delete suffix "e" if in R1 and preceded by a non-vowel, and then
   * undouble the ending. Records in {@code _removedE} whether the "e" was removed.
   *
   * @param sb String being stemmed
   */
  private void step2(StringBuilder sb) {
    _removedE = false;
    if (_R1 >= sb.length()) {
      return;
    }
    int index = sb.length() - 1;
    if (index >= _R1
        && sb.charAt(index) == 'e'
        && !isVowel(sb.charAt(index - 1))) {
      sb.delete(index, index + 1);
      unDouble(sb);
      _removedE = true;
    }
  }

  /**
   * Step 3a: delete "heid" if in R2 and not preceded by "c", then try to
   * remove an "en"/"ene" ending.
   *
   * @param sb String being stemmed
   */
  private void step3a(StringBuilder sb) {
    if (_R2 >= sb.length()) {
      return;
    }
    String s = sb.toString();
    int index = s.length() - 4;
    if (s.endsWith("heid") && index >= _R2 && sb.charAt(index - 1) != 'c') {
      sb.delete(index, index + 4); // remove heid
      enEnding(sb);
    }
  }

  /**
   * Step 3b: remove d-suffixes.
   *
   * <p>A d-suffix, or derivational suffix, enables a new word,
   * often with a different grammatical category, or with a different
   * sense, to be built from another word. Whether a d-suffix can be
   * attached is discovered not from the rules of grammar, but by
   * referring to a dictionary. So in English, ness can be added to
   * certain adjectives to form corresponding nouns (littleness,
   * kindness, foolishness ...) but not to all adjectives
   * (not for example, to big, cruel, wise ...) d-suffixes can be
   * used to change meaning, often in rather exotic ways.</p>
   * Remove "ing", "end", "ig", "lijk", "baar" and "bar"
   *
   * @param sb String being stemmed
   */
  private void step3b(StringBuilder sb) {
    if (_R2 >= sb.length()) {
      return;
    }
    String s = sb.toString();
    int index;

    if ((s.endsWith("end") || s.endsWith("ing"))
        && (index = s.length() - 3) >= _R2) {
      sb.delete(index, index + 3);
      if (sb.charAt(index - 2) == 'i' && sb.charAt(index - 1) == 'g') {
        // A remaining "ig" goes too, if it is in R2 and not preceded by "e".
        if (sb.charAt(index - 3) != 'e' && index - 2 >= _R2) {
          index -= 2;
          sb.delete(index, index + 2);
        }
      } else {
        unDouble(sb, index);
      }
      return;
    }
    if (s.endsWith("ig") && (index = s.length() - 2) >= _R2) {
      if (sb.charAt(index - 1) != 'e') {
        sb.delete(index, index + 2);
      }
      return;
    }
    if (s.endsWith("lijk") && (index = s.length() - 4) >= _R2) {
      sb.delete(index, index + 4);
      step2(sb);
      return;
    }
    if (s.endsWith("baar") && (index = s.length() - 4) >= _R2) {
      sb.delete(index, index + 4);
      return;
    }
    if (s.endsWith("bar") && (index = s.length() - 3) >= _R2) {
      // "bar" is only removed when step 2 actually removed an "e".
      if (_removedE) {
        sb.delete(index, index + 3);
      }
    }
  }

  /**
   * Step 4: undouble vowel.
   * If the word ends CVD, where C is a non-vowel, D is a non-vowel other than I,
   * and V is a doubled vowel other than i, remove one of the vowels from V
   * (for example, maan -> man, brood -> brod).
   *
   * @param sb String being stemmed
   */
  private void step4(StringBuilder sb) {
    int length = sb.length();
    if (length < 4) {
      return;
    }
    char c = sb.charAt(length - 4);
    char v1 = sb.charAt(length - 3);
    char v2 = sb.charAt(length - 2);
    char d = sb.charAt(length - 1);
    if (v1 == v2
        && d != 'I'
        && v1 != 'i'
        && isVowel(v1)
        && !isVowel(d)
        && !isVowel(c)) {
      sb.delete(length - 2, length - 1);
    }
  }

  /**
   * Checks if a term could be stemmed.
   *
   * @return true if, and only if, the given term consists only of letters
   *         (which is trivially the case for the empty term).
   */
  private boolean isStemmable(String term) {
    for (int c = 0; c < term.length(); c++) {
      if (!Character.isLetter(term.charAt(c))) {
        return false;
      }
    }
    return true;
  }

  /**
   * Replaces a, e, i, o, u carrying a diaeresis or an acute accent by the
   * plain vowel. The characters are written as Unicode escapes so that the
   * source does not depend on the compiler's file encoding.
   */
  private void substitute(StringBuilder buffer) {
    for (int i = 0; i < buffer.length(); i++) {
      switch (buffer.charAt(i)) {
        case '\u00e4': // a with diaeresis (U+00E4)
        case '\u00e1': // a with acute (U+00E1)
          buffer.setCharAt(i, 'a');
          break;
        case '\u00eb': // e with diaeresis (U+00EB)
        case '\u00e9': // e with acute (U+00E9)
          buffer.setCharAt(i, 'e');
          break;
        case '\u00fc': // u with diaeresis (U+00FC)
        case '\u00fa': // u with acute (U+00FA)
          buffer.setCharAt(i, 'u');
          break;
        case '\u00ef': // i with diaeresis (U+00EF)
        case '\u00ed': // i with acute (U+00ED)
          buffer.setCharAt(i, 'i');
          break;
        case '\u00f6': // o with diaeresis (U+00F6)
        case '\u00f3': // o with acute (U+00F3)
          buffer.setCharAt(i, 'o');
          break;
        default:
          break;
      }
    }
  }

  /**
   * A valid s-ending is a non-vowel other than "j".
   *
   * @param sb String being stemmed
   * @param index position of the character preceding the candidate suffix
   */
  private boolean isValidSEnding(StringBuilder sb, int index) {
    char c = sb.charAt(index);
    return !(isVowel(c) || c == 'j');
  }

  /**
   * A valid en-ending is a non-vowel that is not the final "m" of "gem".
   *
   * @param sb String being stemmed
   * @param index position of the character preceding the candidate suffix;
   *        callers pass a value of at least 2
   */
  private boolean isValidEnEnding(StringBuilder sb, int index) {
    char c = sb.charAt(index);
    if (isVowel(c)) {
      return false;
    }
    // ends with "gem"?
    if (c == 'm' && sb.charAt(index - 2) == 'g' && sb.charAt(index - 1) == 'e') {
      return false;
    }
    return true;
  }

  /** Undoubles the ending of the whole buffer. */
  private void unDouble(StringBuilder sb) {
    unDouble(sb, sb.length());
  }

  /**
   * Removes the character before {@code endIndex} if the text up to
   * {@code endIndex} ends in kk, tt, dd, nn, mm or ff.
   */
  private void unDouble(StringBuilder sb, int endIndex) {
    String s = sb.substring(0, endIndex);
    if (s.endsWith("kk") || s.endsWith("tt") || s.endsWith("dd")
        || s.endsWith("nn") || s.endsWith("mm") || s.endsWith("ff")) {
      sb.delete(endIndex - 1, endIndex);
    }
  }

  /**
   * Finds the index just after the first non-vowel that follows a vowel,
   * scanning from {@code start} (a start of 0 is treated as 1).
   *
   * @return that index, or one past the position where the scan stopped when
   *         no such non-vowel exists
   */
  private int getRIndex(StringBuilder sb, int start) {
    if (start == 0) {
      start = 1;
    }
    int i = start;
    for (; i < sb.length(); i++) {
      // first non-vowel preceded by a vowel
      if (!isVowel(sb.charAt(i)) && isVowel(sb.charAt(i - 1))) {
        return i + 1;
      }
    }
    return i + 1;
  }

  /**
   * Puts an initial y, a y after a vowel, and an i between vowels into upper
   * case so that the later steps do not treat them as vowels.
   */
  private void storeYandI(StringBuilder sb) {
    if (sb.length() == 0) {
      return;
    }
    if (sb.charAt(0) == 'y') {
      sb.setCharAt(0, 'Y');
    }

    int last = sb.length() - 1;

    for (int i = 1; i < last; i++) {
      switch (sb.charAt(i)) {
        case 'i':
          if (isVowel(sb.charAt(i - 1)) && isVowel(sb.charAt(i + 1))) {
            sb.setCharAt(i, 'I');
          }
          break;
        case 'y':
          if (isVowel(sb.charAt(i - 1))) {
            sb.setCharAt(i, 'Y');
          }
          break;
        default:
          break;
      }
    }
    if (last > 0 && sb.charAt(last) == 'y' && isVowel(sb.charAt(last - 1))) {
      sb.setCharAt(last, 'Y');
    }
  }

  /** Turns the markers I and Y written by {@link #storeYandI} back into lower case. */
  private void reStoreYandI(StringBuilder sb) {
    for (int i = 0; i < sb.length(); i++) {
      char c = sb.charAt(i);
      if (c == 'I') {
        sb.setCharAt(i, 'i');
      } else if (c == 'Y') {
        sb.setCharAt(i, 'y');
      }
    }
  }

  /** Returns true for a, e, i, o, u, y and e with grave accent (U+00E8). */
  private boolean isVowel(char c) {
    switch (c) {
      case 'e':
      case 'a':
      case 'o':
      case 'i':
      case 'u':
      case 'y':
      case '\u00e8':
        return true;
      default:
        return false;
    }
  }

  /**
   * Sets the exception dictionary consulted by {@link #stem(String)} before
   * the algorithm runs.
   *
   * @param dict map from lower-cased term to its stem, or {@code null} for none
   */
  void setStemDictionary(Map<?, ?> dict) {
    _stemDict = dict;
  }

}