comparison software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/analysis/lang/RussianStemmer.java @ 0:408254cf2f1d

Erstellung
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Wed, 24 Nov 2010 17:24:23 +0100
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:408254cf2f1d
1 package de.mpg.mpiwg.berlin.mpdl.donatus.analysis.lang;
2
3 /**
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
18 */
19
/**
 * Russian stemming algorithm implementation (see http://snowball.sourceforge.net for a
 * detailed description of the algorithm).
 *
 * <p>The stemmer never refers to Russian characters directly. Every letter is an index
 * (the private letter constants below, 0 = A ... 31 = IA) into a caller-supplied
 * {@code charset} table, so the same ending tables work for any encoding of the alphabet.
 * A charset must be supplied (constructor or {@link #setCharset(char[])}) before
 * {@link #stem(String)} is called; otherwise the lookup fails with a
 * {@code NullPointerException}.
 *
 * <p>The region markers of the word currently being stemmed are kept in instance fields,
 * so a single instance must not be used by several threads at the same time.
 *
 * @version $Id: RussianStemmer.java 564236 2007-08-09 15:21:19Z gsingers $
 */
public class RussianStemmer
{
    // maps a letter index (see constants below) to the actual character
    private char[] charset;

    // start offsets of the RV, R1 and R2 regions inside the current word;
    // 0 means that the region was not found (is empty)
    private int RV, R1, R2;

    // letters (currently unused letters are commented out)
    private static final char A = 0;
    //private static final char B = 1;
    private static final char V = 2;
    private static final char G = 3;
    //private static final char D = 4;
    private static final char E = 5;
    //private static final char ZH = 6;
    //private static final char Z = 7;
    private static final char I = 8;
    private static final char I_ = 9;
    //private static final char K = 10;
    private static final char L = 11;
    private static final char M = 12;
    private static final char N = 13;
    private static final char O = 14;
    //private static final char P = 15;
    //private static final char R = 16;
    private static final char S = 17;
    private static final char T = 18;
    private static final char U = 19;
    //private static final char F = 20;
    private static final char X = 21;
    //private static final char TS = 22;
    //private static final char CH = 23;
    private static final char SH = 24;
    private static final char SHCH = 25;
    //private static final char HARD = 26;
    private static final char Y = 27;
    private static final char SOFT = 28;
    private static final char AE = 29;
    private static final char IU = 30;
    private static final char IA = 31;

    // stem definitions
    private static final char[] vowels = { A, E, I, O, U, Y, AE, IU, IA };

    private static final char[][] perfectiveGerundEndings1 = {
        { V },
        { V, SH, I },
        { V, SH, I, S, SOFT }
    };

    private static final char[][] perfectiveGerund1Predessors = {
        { A },
        { IA }
    };

    private static final char[][] perfectiveGerundEndings2 = {
        { I, V },
        { Y, V },
        { I, V, SH, I },
        { Y, V, SH, I },
        { I, V, SH, I, S, SOFT },
        { Y, V, SH, I, S, SOFT }
    };

    private static final char[][] adjectiveEndings = {
        { E, E },
        { I, E },
        { Y, E },
        { O, E },
        { E, I_ },
        { I, I_ },
        { Y, I_ },
        { O, I_ },
        { E, M },
        { I, M },
        { Y, M },
        { O, M },
        { I, X },
        { Y, X },
        { U, IU },
        { IU, IU },
        { A, IA },
        { IA, IA },
        { O, IU },
        { E, IU },
        { I, M, I },
        { Y, M, I },
        { E, G, O },
        { O, G, O },
        { E, M, U },
        { O, M, U }
    };

    private static final char[][] participleEndings1 = {
        { SHCH },
        { E, M },
        { N, N },
        { V, SH },
        { IU, SHCH }
    };

    private static final char[][] participleEndings2 = {
        { I, V, SH },
        { Y, V, SH },
        { U, IU, SHCH }
    };

    private static final char[][] participle1Predessors = {
        { A },
        { IA }
    };

    private static final char[][] reflexiveEndings = {
        { S, IA },
        { S, SOFT }
    };

    private static final char[][] verbEndings1 = {
        { I_ },
        { L },
        { N },
        { L, O },
        { N, O },
        { E, T },
        { IU, T },
        { L, A },
        { N, A },
        { L, I },
        { E, M },
        { N, Y },
        { E, T, E },
        { I_, T, E },
        { T, SOFT },
        { E, SH, SOFT },
        { N, N, O }
    };

    private static final char[][] verbEndings2 = {
        { IU },
        { U, IU },
        { E, N },
        { E, I_ },
        { IA, T },
        { U, I_ },
        { I, L },
        { Y, L },
        { I, M },
        { Y, M },
        { I, T },
        { Y, T },
        { I, L, A },
        { Y, L, A },
        { E, N, A },
        { I, T, E },
        { I, L, I },
        { Y, L, I },
        { I, L, O },
        { Y, L, O },
        { E, N, O },
        { U, E, T },
        { U, IU, T },
        { E, N, Y },
        { I, T, SOFT },
        { Y, T, SOFT },
        { I, SH, SOFT },
        { E, I_, T, E },
        { U, I_, T, E }
    };

    private static final char[][] verb1Predessors = {
        { A },
        { IA }
    };

    // (a second, redundant { U } entry of the original table has been dropped)
    private static final char[][] nounEndings = {
        { A },
        { U },
        { I_ },
        { O },
        { E },
        { Y },
        { I },
        { SOFT },
        { IA },
        { E, V },
        { O, V },
        { I, E },
        { SOFT, E },
        { IA, X },
        { I, IU },
        { E, I },
        { I, I },
        { E, I_ },
        { O, I_ },
        { E, M },
        { A, M },
        { O, M },
        { A, X },
        { SOFT, IU },
        { I, IA },
        { SOFT, IA },
        { I, I_ },
        { IA, M },
        { IA, M, I },
        { A, M, I },
        { I, E, I_ },
        { I, IA, M },
        { I, E, M },
        { I, IA, X },
        { I, IA, M, I }
    };

    private static final char[][] superlativeEndings = {
        { E, I_, SH },
        { E, I_, SH, E }
    };

    private static final char[][] derivationalEndings = {
        { O, S, T },
        { O, S, T, SOFT }
    };

    // used by undoubleN(); kept as a constant so it is not rebuilt on every call
    private static final char[][] doubleNEndings = {
        { N, N }
    };

    /**
     * Creates a stemmer without a charset. {@link #setCharset(char[])} must be called
     * before the first call to {@link #stem(String)}.
     */
    public RussianStemmer()
    {
    }

    /**
     * Creates a stemmer that uses the given charset.
     *
     * @param charset table mapping the letter indexes used by this class to characters
     */
    public RussianStemmer(char[] charset)
    {
        this.charset = charset;
    }

    /**
     * Adjectival ending is an adjective ending,
     * optionally preceded by participle ending.
     *
     * @param stemmingZone the part of the word being stemmed; shortened in place
     * @return true if an adjective ending was found and removed
     */
    private boolean adjectival(StringBuilder stemmingZone)
    {
        // look for adjective ending in a stemming zone
        if (!findAndRemoveEnding(stemmingZone, adjectiveEndings))
        {
            return false;
        }
        // if adjective ending was found, try for participle ending; the outcome does
        // not matter, only the removal itself
        if (!findAndRemoveEnding(stemmingZone, participleEndings1, participle1Predessors))
        {
            findAndRemoveEnding(stemmingZone, participleEndings2);
        }
        return true;
    }

    /**
     * Removes a derivational ending, provided that the whole ending lies in R2.
     *
     * @param stemmingZone the part of the word being stemmed; shortened in place
     * @return true if a derivational ending was found inside R2 and removed
     */
    private boolean derivational(StringBuilder stemmingZone)
    {
        int endingLength = findEnding(stemmingZone, derivationalEndings);
        if (endingLength == 0)
        {
            // no derivational ending found
            return false;
        }
        if (R2 == 0)
        {
            // R2 was not detected, i.e. it is empty, so no ending can lie inside it.
            // Without this guard "R2 - RV" is negative and the test below always passes.
            return false;
        }
        // The stemming zone starts at RV, so R2 begins at offset (R2 - RV) inside it.
        int endingStart = stemmingZone.length() - endingLength;
        if (R2 - RV > endingStart)
        {
            // the ending starts before R2
            return false;
        }
        stemmingZone.setLength(endingStart);
        return true;
    }

    /**
     * Finds ending among given ending class and returns the length of ending found
     * (0, if not found). The class is scanned from its last entry to its first and the
     * first entry that matches wins.
     *
     * @param stemmingZone   text to inspect
     * @param startIndex     index of the last character that may belong to the ending
     * @param theEndingClass candidate endings, expressed as letter indexes
     * @return length of the matching ending, or 0 if none matches
     */
    private int findEnding(StringBuilder stemmingZone, int startIndex, char[][] theEndingClass)
    {
        for (int i = theEndingClass.length - 1; i >= 0; i--)
        {
            char[] theEnding = theEndingClass[i];
            // skip endings that do not fit into stemmingZone[0..startIndex]
            if (startIndex < theEnding.length - 1)
            {
                continue;
            }
            boolean match = true;
            int stemmingIndex = startIndex;
            // compare right to left
            for (int j = theEnding.length - 1; j >= 0; j--)
            {
                if (stemmingZone.charAt(stemmingIndex--) != charset[theEnding[j]])
                {
                    match = false;
                    break;
                }
            }
            if (match)
            {
                return theEnding.length;
            }
        }
        return 0;
    }

    /**
     * Same as {@link #findEnding(StringBuilder, int, char[][])}, anchored at the last
     * character of the stemming zone.
     */
    private int findEnding(StringBuilder stemmingZone, char[][] theEndingClass)
    {
        return findEnding(stemmingZone, stemmingZone.length() - 1, theEndingClass);
    }

    /**
     * Finds the ending among the given class of endings and removes it from stemming zone.
     *
     * @return true if an ending was found and cut off
     */
    private boolean findAndRemoveEnding(StringBuilder stemmingZone, char[][] theEndingClass)
    {
        int endingLength = findEnding(stemmingZone, theEndingClass);
        if (endingLength == 0)
        {
            // not found
            return false;
        }
        // cut the ending found
        stemmingZone.setLength(stemmingZone.length() - endingLength);
        return true;
    }

    /**
     * Finds the ending among the given class of endings, then checks if this ending was
     * preceded by any of given predessors, and if so, removes it from stemming zone.
     * Only the ending is removed; the predecessor stays in place.
     *
     * @return true if an ending with a valid predecessor was found and cut off
     */
    private boolean findAndRemoveEnding(StringBuilder stemmingZone,
        char[][] theEndingClass, char[][] thePredessors)
    {
        int endingLength = findEnding(stemmingZone, theEndingClass);
        if (endingLength == 0)
        {
            // not found
            return false;
        }
        // look for a predecessor that ends right before the ending
        int predessorLength =
            findEnding(stemmingZone,
                stemmingZone.length() - endingLength - 1,
                thePredessors);
        if (predessorLength == 0)
        {
            return false;
        }
        // cut the ending found
        stemmingZone.setLength(stemmingZone.length() - endingLength);
        return true;
    }

    /**
     * Marks positions of RV, R1 and R2 in a given word. All three are reset to 0 first;
     * a marker that stays 0 means that the region was not found.
     *
     * @param word the word to analyse
     */
    private void markPositions(String word)
    {
        RV = 0;
        R1 = 0;
        R2 = 0;
        int lastIndex = word.length() - 1;
        int i = 0;
        // find RV: starts right after the first vowel
        while (word.length() > i && !isVowel(word.charAt(i)))
        {
            i++;
        }
        if (lastIndex < ++i)
        {
            return; // RV zone is empty
        }
        RV = i;
        // find R1: skip the vowel run, then one more character
        while (word.length() > i && isVowel(word.charAt(i)))
        {
            i++;
        }
        if (lastIndex < ++i)
        {
            return; // R1 zone is empty
        }
        R1 = i;
        // find R2: same search again, started from R1
        while (word.length() > i && !isVowel(word.charAt(i)))
        {
            i++;
        }
        if (lastIndex < ++i)
        {
            return; // R2 zone is empty
        }
        while (word.length() > i && isVowel(word.charAt(i)))
        {
            i++;
        }
        if (lastIndex < ++i)
        {
            return; // R2 zone is empty
        }
        R2 = i;
    }

    /**
     * Checks if character is a vowel.
     *
     * @param letter character to test
     * @return true if the charset maps one of the vowel indexes to this character
     */
    private boolean isVowel(char letter)
    {
        for (int i = 0; i < vowels.length; i++)
        {
            if (letter == charset[vowels[i]])
            {
                return true;
            }
        }
        return false;
    }

    /**
     * Noun endings.
     *
     * @param stemmingZone the part of the word being stemmed; shortened in place
     * @return true if a noun ending was removed
     */
    private boolean noun(StringBuilder stemmingZone)
    {
        return findAndRemoveEnding(stemmingZone, nounEndings);
    }

    /**
     * Perfective gerund endings.
     *
     * @param stemmingZone the part of the word being stemmed; shortened in place
     * @return true if a perfective gerund ending was removed
     */
    private boolean perfectiveGerund(StringBuilder stemmingZone)
    {
        return findAndRemoveEnding(
            stemmingZone,
            perfectiveGerundEndings1,
            perfectiveGerund1Predessors)
            || findAndRemoveEnding(stemmingZone, perfectiveGerundEndings2);
    }

    /**
     * Reflexive endings.
     *
     * @param stemmingZone the part of the word being stemmed; shortened in place
     * @return true if a reflexive ending was removed
     */
    private boolean reflexive(StringBuilder stemmingZone)
    {
        return findAndRemoveEnding(stemmingZone, reflexiveEndings);
    }

    /**
     * Removes a trailing letter I from the stemming zone.
     *
     * @param stemmingZone the part of the word being stemmed; shortened in place
     * @return true if the last character was the letter I and has been removed
     */
    private boolean removeI(StringBuilder stemmingZone)
    {
        int last = stemmingZone.length() - 1;
        if (last >= 0 && stemmingZone.charAt(last) == charset[I])
        {
            stemmingZone.setLength(last);
            return true;
        }
        return false;
    }

    /**
     * Removes a trailing soft sign from the stemming zone.
     *
     * @param stemmingZone the part of the word being stemmed; shortened in place
     * @return true if the last character was the soft sign and has been removed
     */
    private boolean removeSoft(StringBuilder stemmingZone)
    {
        int last = stemmingZone.length() - 1;
        if (last >= 0 && stemmingZone.charAt(last) == charset[SOFT])
        {
            stemmingZone.setLength(last);
            return true;
        }
        return false;
    }

    /**
     * Replaces the charset used to translate letter indexes into characters.
     *
     * @param newCharset table mapping the letter indexes used by this class to characters
     */
    public void setCharset(char[] newCharset)
    {
        charset = newCharset;
    }

    /**
     * Finds the stem for given Russian word.
     *
     * @param input the word to stem
     * @return the stem; the input itself if no RV region could be found in it
     */
    public String stem(String input)
    {
        markPositions(input);
        if (RV == 0)
        {
            return input; // RV wasn't detected, nothing to stem
        }
        // stemming goes on in RV
        StringBuilder stemmingZone = new StringBuilder(input.substring(RV));
        // Step 1: perfective gerund, or else an optional reflexive ending followed by
        // the first of adjectival / verb / noun that applies
        if (!perfectiveGerund(stemmingZone))
        {
            reflexive(stemmingZone);
            if (!adjectival(stemmingZone) && !verb(stemmingZone))
            {
                noun(stemmingZone);
            }
        }
        // Step 2
        removeI(stemmingZone);
        // Step 3
        derivational(stemmingZone);
        // Step 4
        superlative(stemmingZone);
        undoubleN(stemmingZone);
        removeSoft(stemmingZone);
        // return result: untouched prefix before RV plus what is left of the zone
        return input.substring(0, RV) + stemmingZone.toString();
    }

    /**
     * Superlative endings.
     *
     * @param stemmingZone the part of the word being stemmed; shortened in place
     * @return true if a superlative ending was removed
     */
    private boolean superlative(StringBuilder stemmingZone)
    {
        return findAndRemoveEnding(stemmingZone, superlativeEndings);
    }

    /**
     * Undoubles N: if the stemming zone ends with a double N, one of them is removed.
     *
     * @param stemmingZone the part of the word being stemmed; shortened in place
     * @return true if the zone ended with a double N
     */
    private boolean undoubleN(StringBuilder stemmingZone)
    {
        if (findEnding(stemmingZone, doubleNEndings) == 0)
        {
            return false;
        }
        // drop only one of the two letters
        stemmingZone.setLength(stemmingZone.length() - 1);
        return true;
    }

    /**
     * Verb endings.
     *
     * @param stemmingZone the part of the word being stemmed; shortened in place
     * @return true if a verb ending was removed
     */
    private boolean verb(StringBuilder stemmingZone)
    {
        return findAndRemoveEnding(
            stemmingZone,
            verbEndings1,
            verb1Predessors)
            || findAndRemoveEnding(stemmingZone, verbEndings2);
    }

    /**
     * Static method for stemming with different charsets. A fresh stemmer is created for
     * every call.
     *
     * @param theWord the word to stem
     * @param charset table mapping the letter indexes used by this class to characters
     * @return the stem of the word
     */
    public static String stem(String theWord, char[] charset)
    {
        RussianStemmer stemmer = new RussianStemmer(charset);
        return stemmer.stem(theWord);
    }
}