Mercurial > hg > mpdl-group
comparison software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/analysis/lang/RussianStemmer.java @ 0:408254cf2f1d
Erstellung
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Wed, 24 Nov 2010 17:24:23 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:408254cf2f1d |
---|---|
1 package de.mpg.mpiwg.berlin.mpdl.donatus.analysis.lang; | |
2 | |
3 /** | |
4 * Licensed to the Apache Software Foundation (ASF) under one or more | |
5 * contributor license agreements. See the NOTICE file distributed with | |
6 * this work for additional information regarding copyright ownership. | |
7 * The ASF licenses this file to You under the Apache License, Version 2.0 | |
8 * (the "License"); you may not use this file except in compliance with | |
9 * the License. You may obtain a copy of the License at | |
10 * | |
11 * http://www.apache.org/licenses/LICENSE-2.0 | |
12 * | |
13 * Unless required by applicable law or agreed to in writing, software | |
14 * distributed under the License is distributed on an "AS IS" BASIS, | |
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
16 * See the License for the specific language governing permissions and | |
17 * limitations under the License. | |
18 */ | |
19 | |
/**
 * Russian stemming algorithm implementation (see http://snowball.sourceforge.net for detailed description).
 *
 * <p>The stemmer works on a caller-supplied charset: an array that maps the letter
 * indexes declared below (0 = A ... 31 = IA) to the actual characters used in the
 * input. The charset must therefore be set (via the constructor or
 * {@link #setCharset(char[])}) and contain at least 32 entries before
 * {@link #stem(String)} is called; otherwise the array accesses in this class fail.
 *
 * <p>Instances keep the region positions of the word being stemmed in fields, so a
 * single instance must not be used by several threads at the same time.
 *
 * @version $Id: RussianStemmer.java 564236 2007-08-09 15:21:19Z gsingers $
 */
public class RussianStemmer
{
    // maps the letter indexes below to the characters of the input encoding
    private char[] charset;

    // positions of RV, R1 and R2 respectively; 0 means "region not found"
    // (R1 is recorded by markPositions() but not consulted by any stemming step)
    private int RV, R1, R2;

    // letters (currently unused letters are commented out)
    private final static char A = 0;
    //private final static char B = 1;
    private final static char V = 2;
    private final static char G = 3;
    //private final static char D = 4;
    private final static char E = 5;
    //private final static char ZH = 6;
    //private final static char Z = 7;
    private final static char I = 8;
    private final static char I_ = 9;
    //private final static char K = 10;
    private final static char L = 11;
    private final static char M = 12;
    private final static char N = 13;
    private final static char O = 14;
    //private final static char P = 15;
    //private final static char R = 16;
    private final static char S = 17;
    private final static char T = 18;
    private final static char U = 19;
    //private final static char F = 20;
    private final static char X = 21;
    //private final static char TS = 22;
    //private final static char CH = 23;
    private final static char SH = 24;
    private final static char SHCH = 25;
    //private final static char HARD = 26;
    private final static char Y = 27;
    private final static char SOFT = 28;
    private final static char AE = 29;
    private final static char IU = 30;
    private final static char IA = 31;

    // stem definitions
    //
    // NOTE: findEnding() scans every ending table from its LAST entry to its first and
    // returns the first hit, so longer endings must come after the shorter endings
    // they contain. Keep that ordering when editing the tables.
    private static final char[] vowels = { A, E, I, O, U, Y, AE, IU, IA };

    private static final char[][] perfectiveGerundEndings1 = {
        { V },
        { V, SH, I },
        { V, SH, I, S, SOFT }
    };

    private static final char[][] perfectiveGerund1Predecessors = {
        { A },
        { IA }
    };

    private static final char[][] perfectiveGerundEndings2 = {
        { I, V },
        { Y, V },
        { I, V, SH, I },
        { Y, V, SH, I },
        { I, V, SH, I, S, SOFT },
        { Y, V, SH, I, S, SOFT }
    };

    private static final char[][] adjectiveEndings = {
        { E, E },
        { I, E },
        { Y, E },
        { O, E },
        { E, I_ },
        { I, I_ },
        { Y, I_ },
        { O, I_ },
        { E, M },
        { I, M },
        { Y, M },
        { O, M },
        { I, X },
        { Y, X },
        { U, IU },
        { IU, IU },
        { A, IA },
        { IA, IA },
        { O, IU },
        { E, IU },
        { I, M, I },
        { Y, M, I },
        { E, G, O },
        { O, G, O },
        { E, M, U },
        { O, M, U }
    };

    private static final char[][] participleEndings1 = {
        { SHCH },
        { E, M },
        { N, N },
        { V, SH },
        { IU, SHCH }
    };

    private static final char[][] participleEndings2 = {
        { I, V, SH },
        { Y, V, SH },
        { U, IU, SHCH }
    };

    private static final char[][] participle1Predecessors = {
        { A },
        { IA }
    };

    private static final char[][] reflexiveEndings = {
        { S, IA },
        { S, SOFT }
    };

    private static final char[][] verbEndings1 = {
        { I_ },
        { L },
        { N },
        { L, O },
        { N, O },
        { E, T },
        { IU, T },
        { L, A },
        { N, A },
        { L, I },
        { E, M },
        { N, Y },
        { E, T, E },
        { I_, T, E },
        { T, SOFT },
        { E, SH, SOFT },
        { N, N, O }
    };

    private static final char[][] verbEndings2 = {
        { IU },
        { U, IU },
        { E, N },
        { E, I_ },
        { IA, T },
        { U, I_ },
        { I, L },
        { Y, L },
        { I, M },
        { Y, M },
        { I, T },
        { Y, T },
        { I, L, A },
        { Y, L, A },
        { E, N, A },
        { I, T, E },
        { I, L, I },
        { Y, L, I },
        { I, L, O },
        { Y, L, O },
        { E, N, O },
        { U, E, T },
        { U, IU, T },
        { E, N, Y },
        { I, T, SOFT },
        { Y, T, SOFT },
        { I, SH, SOFT },
        { E, I_, T, E },
        { U, I_, T, E }
    };

    private static final char[][] verb1Predecessors = {
        { A },
        { IA }
    };

    private static final char[][] nounEndings = {
        { A },
        { U },
        { I_ },
        { O },
        // was a second { U }; the Snowball noun-ending list has IU here instead
        { IU },
        { E },
        { Y },
        { I },
        { SOFT },
        { IA },
        { E, V },
        { O, V },
        { I, E },
        { SOFT, E },
        { IA, X },
        { I, IU },
        { E, I },
        { I, I },
        { E, I_ },
        { O, I_ },
        { E, M },
        { A, M },
        { O, M },
        { A, X },
        { SOFT, IU },
        { I, IA },
        { SOFT, IA },
        { I, I_ },
        { IA, M },
        { IA, M, I },
        { A, M, I },
        { I, E, I_ },
        { I, IA, M },
        { I, E, M },
        { I, IA, X },
        { I, IA, M, I }
    };

    private static final char[][] superlativeEndings = {
        { E, I_, SH },
        { E, I_, SH, E }
    };

    private static final char[][] derivationalEndings = {
        { O, S, T },
        { O, S, T, SOFT }
    };

    private static final char[][] doubleN = {
        { N, N }
    };

    /**
     * Creates a stemmer without a charset. {@link #setCharset(char[])} must be
     * called before {@link #stem(String)}.
     */
    public RussianStemmer()
    {
        super();
    }

    /**
     * Creates a stemmer that uses the given charset.
     *
     * @param charset maps the letter indexes of this class (0..31) to the characters
     *                of the input; the array is stored as-is, not copied
     */
    public RussianStemmer(char[] charset)
    {
        super();
        this.charset = charset;
    }

    /**
     * Adjectival ending is an adjective ending,
     * optionally preceded by participle ending.
     *
     * @param stemmingZone the RV part of the word; modified in place
     * @return true if an adjective ending was removed
     */
    private boolean adjectival(StringBuffer stemmingZone)
    {
        // look for adjective ending in a stemming zone
        if (!findAndRemoveEnding(stemmingZone, adjectiveEndings))
        {
            return false;
        }
        // An adjective ending was found: additionally strip a participle ending
        // if there is one. The outcome of that attempt does not affect the result.
        if (!findAndRemoveEnding(stemmingZone, participleEndings1, participle1Predecessors))
        {
            findAndRemoveEnding(stemmingZone, participleEndings2);
        }
        return true;
    }

    /**
     * Removes a derivational ending, provided it lies completely inside R2.
     *
     * @param stemmingZone the RV part of the word; modified in place
     * @return true if an ending was removed
     */
    private boolean derivational(StringBuffer stemmingZone)
    {
        int endingLength = findEnding(stemmingZone, derivationalEndings);
        if (endingLength == 0)
        {
            // no derivational ending found
            return false;
        }
        if (R2 == 0)
        {
            // markPositions() found no R2 region, so the ending cannot be inside it.
            // Without this guard R2 - RV is negative and the test below always passes.
            return false;
        }
        // Ensure that the ending locates in R2. Zone indexes are relative to RV,
        // hence the R2 - RV conversion.
        int endingStart = stemmingZone.length() - endingLength;
        if (R2 - RV > endingStart)
        {
            return false;
        }
        stemmingZone.setLength(endingStart);
        return true;
    }

    /**
     * Finds ending among given ending class and returns the length of ending found(0, if not found).
     * The zone is examined as if it ended at {@code startIndex} (inclusive).
     * The class is scanned from its last entry to its first; the first hit wins.
     *
     * @param stemmingZone   text to examine
     * @param startIndex     index of the last character to take into account; may be -1
     * @param theEndingClass candidate endings, as letter indexes into the charset
     */
    private int findEnding(StringBuffer stemmingZone, int startIndex, char[][] theEndingClass)
    {
        for (int i = theEndingClass.length - 1; i >= 0; i--)
        {
            char[] theEnding = theEndingClass[i];
            // skip endings that are longer than the text available up to startIndex
            if (startIndex < theEnding.length - 1)
            {
                continue;
            }
            if (endsWithAt(stemmingZone, startIndex, theEnding))
            {
                return theEnding.length;
            }
        }
        return 0;
    }

    /**
     * Same as {@link #findEnding(StringBuffer, int, char[][])}, looking at the
     * very end of the zone.
     */
    private int findEnding(StringBuffer stemmingZone, char[][] theEndingClass)
    {
        return findEnding(stemmingZone, stemmingZone.length() - 1, theEndingClass);
    }

    /**
     * Tells whether the characters of the zone that end at {@code lastIndex}
     * (inclusive) spell the given ending. The caller guarantees that the ending
     * is not longer than {@code lastIndex + 1}.
     */
    private boolean endsWithAt(StringBuffer stemmingZone, int lastIndex, char[] theEnding)
    {
        int zoneIndex = lastIndex;
        for (int j = theEnding.length - 1; j >= 0; j--)
        {
            if (stemmingZone.charAt(zoneIndex--) != charset[theEnding[j]])
            {
                return false;
            }
        }
        return true;
    }

    /**
     * Finds the ending among the given class of endings and removes it from stemming zone.
     *
     * @return true if an ending was found and cut off
     */
    private boolean findAndRemoveEnding(StringBuffer stemmingZone, char[][] theEndingClass)
    {
        int endingLength = findEnding(stemmingZone, theEndingClass);
        if (endingLength == 0)
        {
            // not found
            return false;
        }
        // cut the ending found
        stemmingZone.setLength(stemmingZone.length() - endingLength);
        return true;
    }

    /**
     * Finds the ending among the given class of endings, then checks if this ending was
     * preceded by any of given predecessors, and if so, removes it from stemming zone.
     * Only the ending is removed; the predecessor stays in the zone.
     *
     * @return true if an ending with a valid predecessor was found and cut off
     */
    private boolean findAndRemoveEnding(StringBuffer stemmingZone,
        char[][] theEndingClass, char[][] thePredecessors)
    {
        int endingLength = findEnding(stemmingZone, theEndingClass);
        if (endingLength == 0)
        {
            // not found
            return false;
        }
        // look for a predecessor directly in front of the ending
        int predecessorLength =
            findEnding(stemmingZone,
                stemmingZone.length() - endingLength - 1,
                thePredecessors);
        if (predecessorLength == 0)
        {
            return false;
        }
        // cut the ending found
        stemmingZone.setLength(stemmingZone.length() - endingLength);
        return true;
    }

    /**
     * Marks positions of RV, R1 and R2 in a given word.
     * A position is left at 0 when the corresponding region would be empty;
     * once one region is empty the following ones are left at 0 as well.
     */
    private void markPositions(String word)
    {
        RV = 0;
        R1 = 0;
        R2 = 0;
        final int lastIndex = word.length() - 1;

        // find RV: starts right after the first vowel
        int i = skipWhile(word, 0, false);
        if (lastIndex < ++i)
        {
            return; // RV zone is empty
        }
        RV = i;

        // find R1: starts right after the first non-vowel that follows a vowel
        i = skipWhile(word, i, true);
        if (lastIndex < ++i)
        {
            return; // R1 zone is empty
        }
        R1 = i;

        // find R2: the same rule as for R1, applied from R1 onwards
        i = skipWhile(word, i, false);
        if (lastIndex < ++i)
        {
            return; // R2 zone is empty
        }
        i = skipWhile(word, i, true);
        if (lastIndex < ++i)
        {
            return; // R2 zone is empty
        }
        R2 = i;
    }

    /**
     * Advances from {@code from} while the characters are vowels
     * ({@code vowel == true}) or non-vowels ({@code vowel == false}).
     *
     * @return index of the first character that breaks the run, or the word length
     */
    private int skipWhile(String word, int from, boolean vowel)
    {
        int i = from;
        while (word.length() > i && isVowel(word.charAt(i)) == vowel)
        {
            i++;
        }
        return i;
    }

    /**
     * Checks if character is a vowel.
     *
     * @param letter char
     * @return boolean
     */
    private boolean isVowel(char letter)
    {
        for (int i = 0; i < vowels.length; i++)
        {
            if (letter == charset[vowels[i]])
            {
                return true;
            }
        }
        return false;
    }

    /**
     * Noun endings.
     *
     * @param stemmingZone the RV part of the word; modified in place
     * @return true if an ending was removed
     */
    private boolean noun(StringBuffer stemmingZone)
    {
        return findAndRemoveEnding(stemmingZone, nounEndings);
    }

    /**
     * Perfective gerund endings.
     *
     * @param stemmingZone the RV part of the word; modified in place
     * @return true if an ending was removed
     */
    private boolean perfectiveGerund(StringBuffer stemmingZone)
    {
        return findAndRemoveEnding(
            stemmingZone,
            perfectiveGerundEndings1,
            perfectiveGerund1Predecessors)
            || findAndRemoveEnding(stemmingZone, perfectiveGerundEndings2);
    }

    /**
     * Reflexive endings.
     *
     * @param stemmingZone the RV part of the word; modified in place
     * @return true if an ending was removed
     */
    private boolean reflexive(StringBuffer stemmingZone)
    {
        return findAndRemoveEnding(stemmingZone, reflexiveEndings);
    }

    /**
     * Removes the last character of the zone if it is the given letter.
     *
     * @param stemmingZone the RV part of the word; modified in place
     * @param letter       letter index into the charset
     * @return true if a character was removed
     */
    private boolean removeTrailing(StringBuffer stemmingZone, char letter)
    {
        int length = stemmingZone.length();
        if (length > 0 && stemmingZone.charAt(length - 1) == charset[letter])
        {
            stemmingZone.setLength(length - 1);
            return true;
        }
        return false;
    }

    /**
     * Removes a trailing letter I from the zone.
     *
     * @param stemmingZone the RV part of the word; modified in place
     * @return true if the letter was removed
     */
    private boolean removeI(StringBuffer stemmingZone)
    {
        return removeTrailing(stemmingZone, I);
    }

    /**
     * Removes a trailing soft sign from the zone.
     *
     * @param stemmingZone the RV part of the word; modified in place
     * @return true if the soft sign was removed
     */
    private boolean removeSoft(StringBuffer stemmingZone)
    {
        return removeTrailing(stemmingZone, SOFT);
    }

    /**
     * Replaces the charset used to interpret the input.
     *
     * @param newCharset maps the letter indexes of this class (0..31) to the
     *                   characters of the input; the array is stored as-is, not copied
     */
    public void setCharset(char[] newCharset)
    {
        charset = newCharset;
    }

    /**
     * Finds the stem for given Russian word.
     * The word is compared against the charset character by character, so it must
     * already be in the letter case the charset uses.
     *
     * @param input the word to stem
     * @return the stem, or the input itself when no RV region could be found
     */
    public String stem(String input)
    {
        markPositions(input);
        if (RV == 0)
        {
            return input; //RV wasn't detected, nothing to stem
        }
        // stemming goes on in RV
        StringBuffer stemmingZone = new StringBuffer(input.substring(RV));

        // Step 1: a perfective gerund ending ends the step; otherwise drop a
        // reflexive ending and then the first of adjectival / verb / noun that applies
        if (!perfectiveGerund(stemmingZone))
        {
            reflexive(stemmingZone);
            if (!adjectival(stemmingZone) && !verb(stemmingZone))
            {
                noun(stemmingZone);
            }
        }
        // Step 2
        removeI(stemmingZone);
        // Step 3
        derivational(stemmingZone);
        // Step 4
        superlative(stemmingZone);
        undoubleN(stemmingZone);
        removeSoft(stemmingZone);
        // return result
        return input.substring(0, RV) + stemmingZone.toString();
    }

    /**
     * Superlative endings.
     *
     * @param stemmingZone the RV part of the word; modified in place
     * @return true if an ending was removed
     */
    private boolean superlative(StringBuffer stemmingZone)
    {
        return findAndRemoveEnding(stemmingZone, superlativeEndings);
    }

    /**
     * Undoubles N: if the zone ends with two N letters, one of them is removed.
     *
     * @param stemmingZone the RV part of the word; modified in place
     * @return true if a letter was removed
     */
    private boolean undoubleN(StringBuffer stemmingZone)
    {
        if (findEnding(stemmingZone, doubleN) == 0)
        {
            return false;
        }
        stemmingZone.setLength(stemmingZone.length() - 1);
        return true;
    }

    /**
     * Verb endings.
     *
     * @param stemmingZone the RV part of the word; modified in place
     * @return true if an ending was removed
     */
    private boolean verb(StringBuffer stemmingZone)
    {
        return findAndRemoveEnding(
            stemmingZone,
            verbEndings1,
            verb1Predecessors)
            || findAndRemoveEnding(stemmingZone, verbEndings2);
    }

    /**
     * Static method for stemming with different charsets.
     * Uses a fresh stemmer instance for every call.
     *
     * @param theWord the word to stem
     * @param charset maps the letter indexes of this class (0..31) to the characters of the input
     * @return the stem of the word
     */
    public static String stem(String theWord, char[] charset)
    {
        RussianStemmer stemmer = new RussianStemmer();
        stemmer.setCharset(charset);
        return stemmer.stem(theWord);
    }
}