Mercurial > hg > mpdl-group
diff software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/analysis/lang/RussianStemmer.java @ 0:408254cf2f1d
Erstellung
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Wed, 24 Nov 2010 17:24:23 +0100 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/analysis/lang/RussianStemmer.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,630 @@ +package de.mpg.mpiwg.berlin.mpdl.donatus.analysis.lang; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Russian stemming algorithm implementation (see http://snowball.sourceforge.net for detailed description). + * + * + * @version $Id: RussianStemmer.java 564236 2007-08-09 15:21:19Z gsingers $ + */ +public class RussianStemmer +{ + private char[] charset; + + // positions of RV, R1 and R2 respectively + private int RV, R1, R2; + + // letters (currently unused letters are commented out) + private final static char A = 0; + //private final static char B = 1; + private final static char V = 2; + private final static char G = 3; + //private final static char D = 4; + private final static char E = 5; + //private final static char ZH = 6; + //private final static char Z = 7; + private final static char I = 8; + private final static char I_ = 9; + //private final static char K = 10; + private final static char L = 11; + private final static char M = 12; + private final static char N = 13; + private final static char O = 14; + //private final static char P = 15; + //private final static char R = 16; + private final static char S = 17; + private final static char T = 18; + private final static char U = 19; + //private final static char F = 20; + private final static char X = 21; + //private final static char TS = 22; + //private final static char CH = 23; + private final static char SH = 24; + private final static char SHCH = 25; + //private final static char HARD = 26; + private final static char Y = 27; + private final static char SOFT = 28; + private final static char AE = 29; + private final static char IU = 30; + private final static char IA = 31; + + // stem definitions + private static char[] vowels = { A, E, I, O, U, Y, AE, IU, IA }; + + private static char[][] perfectiveGerundEndings1 = { + { V }, + { V, SH, I }, + { V, SH, I, S, SOFT } + }; + + private static char[][] perfectiveGerund1Predessors = { + { A }, + { IA } + }; + + private static char[][] perfectiveGerundEndings2 = { { I, V }, { + Y, V }, { + I, V, SH, I }, { + Y, V, SH, I }, { + I, V, SH, I, S, SOFT }, { + Y, V, SH, I, S, SOFT } + }; + + private static char[][] adjectiveEndings = { + { E, E }, + { I, E }, + { Y, E }, + { O, E }, + { E, I_ }, + { I, I_ }, + { Y, I_ }, + { O, I_ }, + { E, M }, + { I, M }, + { Y, M }, + { O, M }, + { I, X }, + { Y, X }, + { U, IU }, + { IU, IU }, + { A, IA }, + { IA, IA }, + { O, IU }, + { E, IU }, + { I, M, I }, + { Y, M, I }, + { E, G, O }, + { O, G, O }, + { E, M, U }, + {O, M, U } + }; + + private static char[][] participleEndings1 = { + { SHCH }, + { E, M }, + { N, N }, + { V, SH }, + { IU, SHCH } + }; + + private static char[][] participleEndings2 = { + { I, V, SH }, + { Y, V, SH }, + { U, IU, SHCH } + }; + + private static char[][] participle1Predessors = { + { A }, + { IA } + }; + + private static char[][] reflexiveEndings = { + { S, IA }, + { S, SOFT } + }; + + private static char[][] verbEndings1 = { + { I_ }, + { L }, + { N }, + { L, O }, + { N, O }, + { E, T }, + { IU, T }, + { L, A }, + { N, A }, + { L, I }, + { E, M }, + { N, Y }, + { E, T, E }, + { I_, T, E }, + { T, SOFT }, + { E, SH, SOFT }, + { N, N, O } + }; + + private static char[][] verbEndings2 = { + { IU }, + { U, IU }, + { E, N }, + { E, I_ }, + { IA, T }, + { U, I_ }, + { I, L }, + { Y, L }, + { I, M }, + { Y, M }, + { I, T }, + { Y, T }, + { I, L, A }, + { Y, L, A }, + { E, N, A }, + { I, T, E }, + { I, L, I }, + { Y, L, I }, + { I, L, O }, + { Y, L, O }, + { E, N, O }, + { U, E, T }, + { U, IU, T }, + { E, N, Y }, + { I, T, SOFT }, + { Y, T, SOFT }, + { I, SH, SOFT }, + { E, I_, T, E }, + { U, I_, T, E } + }; + + private static char[][] verb1Predessors = { + { A }, + { IA } + }; + + private static char[][] nounEndings = { + { A }, + { U }, + { I_ }, + { O }, + { U }, + { E }, + { Y }, + { I }, + { SOFT }, + { IA }, + { E, V }, + { O, V }, + { I, E }, + { SOFT, E }, + { IA, X }, + { I, IU }, + { E, I }, + { I, I }, + { E, I_ }, + { O, I_ }, + { E, M }, + { A, M }, + { O, M }, + { A, X }, + { SOFT, IU }, + { I, IA }, + { SOFT, IA }, + { I, I_ }, + { IA, M }, + { IA, M, I }, + { A, M, I }, + { I, E, I_ }, + { I, IA, M }, + { I, E, M }, + { I, IA, X }, + { I, IA, M, I } + }; + + private static char[][] superlativeEndings = { + { E, I_, SH }, + { E, I_, SH, E } + }; + + private static char[][] derivationalEndings = { + { O, S, T }, + { O, S, T, SOFT } + }; + + /** + * RussianStemmer constructor comment. + */ + public RussianStemmer() + { + super(); + } + + /** + * RussianStemmer constructor comment. + */ + public RussianStemmer(char[] charset) + { + super(); + this.charset = charset; + } + + /** + * Adjectival ending is an adjective ending, + * optionally preceded by participle ending. + * Creation date: (17/03/2002 12:14:58 AM) + * @param stemmingZone java.lang.StringBuffer + */ + private boolean adjectival(StringBuffer stemmingZone) + { + // look for adjective ending in a stemming zone + if (!findAndRemoveEnding(stemmingZone, adjectiveEndings)) + return false; + // if adjective ending was found, try for participle ending. + // variable r is unused, we are just interested in the side effect of + // findAndRemoveEnding(): + boolean r = + findAndRemoveEnding(stemmingZone, participleEndings1, participle1Predessors) + || + findAndRemoveEnding(stemmingZone, participleEndings2); + return true; + } + + /** + * Derivational endings + * Creation date: (17/03/2002 12:14:58 AM) + * @param stemmingZone java.lang.StringBuffer + */ + private boolean derivational(StringBuffer stemmingZone) + { + int endingLength = findEnding(stemmingZone, derivationalEndings); + if (endingLength == 0) + // no derivational ending found + return false; + else + { + // Ensure that the ending locates in R2 + if (R2 - RV <= stemmingZone.length() - endingLength) + { + stemmingZone.setLength(stemmingZone.length() - endingLength); + return true; + } + else + { + return false; + } + } + } + + /** + * Finds ending among given ending class and returns the length of ending found(0, if not found). + * Creation date: (17/03/2002 8:18:34 PM) + */ + private int findEnding(StringBuffer stemmingZone, int startIndex, char[][] theEndingClass) + { + boolean match = false; + for (int i = theEndingClass.length - 1; i >= 0; i--) + { + char[] theEnding = theEndingClass[i]; + // check if the ending is bigger than stemming zone + if (startIndex < theEnding.length - 1) + { + match = false; + continue; + } + match = true; + int stemmingIndex = startIndex; + for (int j = theEnding.length - 1; j >= 0; j--) + { + if (stemmingZone.charAt(stemmingIndex--) != charset[theEnding[j]]) + { + match = false; + break; + } + } + // check if ending was found + if (match) + { + return theEndingClass[i].length; // cut ending + } + } + return 0; + } + + private int findEnding(StringBuffer stemmingZone, char[][] theEndingClass) + { + return findEnding(stemmingZone, stemmingZone.length() - 1, theEndingClass); + } + + /** + * Finds the ending among the given class of endings and removes it from stemming zone. + * Creation date: (17/03/2002 8:18:34 PM) + */ + private boolean findAndRemoveEnding(StringBuffer stemmingZone, char[][] theEndingClass) + { + int endingLength = findEnding(stemmingZone, theEndingClass); + if (endingLength == 0) + // not found + return false; + else { + stemmingZone.setLength(stemmingZone.length() - endingLength); + // cut the ending found + return true; + } + } + + /** + * Finds the ending among the given class of endings, then checks if this ending was + * preceded by any of given predessors, and if so, removes it from stemming zone. + * Creation date: (17/03/2002 8:18:34 PM) + */ + private boolean findAndRemoveEnding(StringBuffer stemmingZone, + char[][] theEndingClass, char[][] thePredessors) + { + int endingLength = findEnding(stemmingZone, theEndingClass); + if (endingLength == 0) + // not found + return false; + else + { + int predessorLength = + findEnding(stemmingZone, + stemmingZone.length() - endingLength - 1, + thePredessors); + if (predessorLength == 0) + return false; + else { + stemmingZone.setLength(stemmingZone.length() - endingLength); + // cut the ending found + return true; + } + } + + } + + /** + * Marks positions of RV, R1 and R2 in a given word. + * Creation date: (16/03/2002 3:40:11 PM) + */ + private void markPositions(String word) + { + RV = 0; + R1 = 0; + R2 = 0; + int i = 0; + // find RV + while (word.length() > i && !isVowel(word.charAt(i))) + { + i++; + } + if (word.length() - 1 < ++i) + return; // RV zone is empty + RV = i; + // find R1 + while (word.length() > i && isVowel(word.charAt(i))) + { + i++; + } + if (word.length() - 1 < ++i) + return; // R1 zone is empty + R1 = i; + // find R2 + while (word.length() > i && !isVowel(word.charAt(i))) + { + i++; + } + if (word.length() - 1 < ++i) + return; // R2 zone is empty + while (word.length() > i && isVowel(word.charAt(i))) + { + i++; + } + if (word.length() - 1 < ++i) + return; // R2 zone is empty + R2 = i; + } + + /** + * Checks if character is a vowel.. + * Creation date: (16/03/2002 10:47:03 PM) + * @return boolean + * @param letter char + */ + private boolean isVowel(char letter) + { + for (int i = 0; i < vowels.length; i++) + { + if (letter == charset[vowels[i]]) + return true; + } + return false; + } + + /** + * Noun endings. + * Creation date: (17/03/2002 12:14:58 AM) + * @param stemmingZone java.lang.StringBuffer + */ + private boolean noun(StringBuffer stemmingZone) + { + return findAndRemoveEnding(stemmingZone, nounEndings); + } + + /** + * Perfective gerund endings. + * Creation date: (17/03/2002 12:14:58 AM) + * @param stemmingZone java.lang.StringBuffer + */ + private boolean perfectiveGerund(StringBuffer stemmingZone) + { + return findAndRemoveEnding( + stemmingZone, + perfectiveGerundEndings1, + perfectiveGerund1Predessors) + || findAndRemoveEnding(stemmingZone, perfectiveGerundEndings2); + } + + /** + * Reflexive endings. + * Creation date: (17/03/2002 12:14:58 AM) + * @param stemmingZone java.lang.StringBuffer + */ + private boolean reflexive(StringBuffer stemmingZone) + { + return findAndRemoveEnding(stemmingZone, reflexiveEndings); + } + + /** + * Insert the method's description here. + * Creation date: (17/03/2002 12:14:58 AM) + * @param stemmingZone java.lang.StringBuffer + */ + private boolean removeI(StringBuffer stemmingZone) + { + if (stemmingZone.length() > 0 + && stemmingZone.charAt(stemmingZone.length() - 1) == charset[I]) + { + stemmingZone.setLength(stemmingZone.length() - 1); + return true; + } + else + { + return false; + } + } + + /** + * Insert the method's description here. + * Creation date: (17/03/2002 12:14:58 AM) + * @param stemmingZone java.lang.StringBuffer + */ + private boolean removeSoft(StringBuffer stemmingZone) + { + if (stemmingZone.length() > 0 + && stemmingZone.charAt(stemmingZone.length() - 1) == charset[SOFT]) + { + stemmingZone.setLength(stemmingZone.length() - 1); + return true; + } + else + { + return false; + } + } + + /** + * Insert the method's description here. + * Creation date: (16/03/2002 10:58:42 PM) + * @param newCharset char[] + */ + public void setCharset(char[] newCharset) + { + charset = newCharset; + } + + /** + * Finds the stem for given Russian word. + * Creation date: (16/03/2002 3:36:48 PM) + * @return java.lang.String + * @param input java.lang.String + */ + public String stem(String input) + { + markPositions(input); + if (RV == 0) + return input; //RV wasn't detected, nothing to stem + StringBuffer stemmingZone = new StringBuffer(input.substring(RV)); + // stemming goes on in RV + // Step 1 + + if (!perfectiveGerund(stemmingZone)) + { + reflexive(stemmingZone); + // variable r is unused, we are just interested in the flow that gets + // created by logical expression: apply adjectival(); if that fails, + // apply verb() etc + boolean r = + adjectival(stemmingZone) + || verb(stemmingZone) + || noun(stemmingZone); + } + // Step 2 + removeI(stemmingZone); + // Step 3 + derivational(stemmingZone); + // Step 4 + superlative(stemmingZone); + undoubleN(stemmingZone); + removeSoft(stemmingZone); + // return result + return input.substring(0, RV) + stemmingZone.toString(); + } + + /** + * Superlative endings. + * Creation date: (17/03/2002 12:14:58 AM) + * @param stemmingZone java.lang.StringBuffer + */ + private boolean superlative(StringBuffer stemmingZone) + { + return findAndRemoveEnding(stemmingZone, superlativeEndings); + } + + /** + * Undoubles N. + * Creation date: (17/03/2002 12:14:58 AM) + * @param stemmingZone java.lang.StringBuffer + */ + private boolean undoubleN(StringBuffer stemmingZone) + { + char[][] doubleN = { + { N, N } + }; + if (findEnding(stemmingZone, doubleN) != 0) + { + stemmingZone.setLength(stemmingZone.length() - 1); + return true; + } + else + { + return false; + } + } + + /** + * Verb endings. + * Creation date: (17/03/2002 12:14:58 AM) + * @param stemmingZone java.lang.StringBuffer + */ + private boolean verb(StringBuffer stemmingZone) + { + return findAndRemoveEnding( + stemmingZone, + verbEndings1, + verb1Predessors) + || findAndRemoveEnding(stemmingZone, verbEndings2); + } + + /** + * Static method for stemming with different charsets + */ + public static String stem(String theWord, char[] charset) + { + RussianStemmer stemmer = new RussianStemmer(); + stemmer.setCharset(charset); + return stemmer.stem(theWord); + } +}