Mercurial > hg > mpdl-group
comparison software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/analysis/lang/GermanStemmer.java @ 0:408254cf2f1d
Erstellung
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Wed, 24 Nov 2010 17:24:23 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:408254cf2f1d |
---|---|
1 package de.mpg.mpiwg.berlin.mpdl.donatus.analysis.lang; | |
2 // This file is encoded in UTF-8 | |
3 | |
4 /** | |
5 * Licensed to the Apache Software Foundation (ASF) under one or more | |
6 * contributor license agreements. See the NOTICE file distributed with | |
7 * this work for additional information regarding copyright ownership. | |
8 * The ASF licenses this file to You under the Apache License, Version 2.0 | |
9 * (the "License"); you may not use this file except in compliance with | |
10 * the License. You may obtain a copy of the License at | |
11 * | |
12 * http://www.apache.org/licenses/LICENSE-2.0 | |
13 * | |
14 * Unless required by applicable law or agreed to in writing, software | |
15 * distributed under the License is distributed on an "AS IS" BASIS, | |
16 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
17 * See the License for the specific language governing permissions and | |
18 * limitations under the License. | |
19 */ | |
20 | |
/**
 * A stemmer for German words. The algorithm is based on the report
 * "A Fast and Simple Stemming Algorithm for German Words" by J&ouml;rg
 * Caumanns (joerg.caumanns at isst.fhg.de).
 *
 * <p>The working buffer and the substitution counter are instance fields that are
 * overwritten by every call to {@link #stem(String)}, so one instance must not be used
 * by several threads at the same time.
 *
 * @version $Id: GermanStemmer.java 564236 2007-08-09 15:21:19Z gsingers $
 */
public class GermanStemmer
{
  // Non-ASCII characters are written as Unicode escapes so that the class compiles
  // identically regardless of the source encoding passed to the compiler.
  private static final char A_UMLAUT = '\u00e4';
  private static final char O_UMLAUT = '\u00f6';
  private static final char U_UMLAUT = '\u00fc';
  private static final char SHARP_S = '\u00df';

  // Single-character tokens that temporarily stand in for common character groups.
  // They cannot clash with input characters because only all-letter terms are stemmed.
  private static final char MASK_DOUBLE = '*';   // second char of a doubled letter
  private static final char MASK_SCH = '$';      // "sch"
  private static final char MASK_CH = '\u00a7';  // "ch" (section sign)
  private static final char MASK_EI = '%';       // "ei"
  private static final char MASK_IE = '&';       // "ie"
  private static final char MASK_IG = '#';       // "ig"
  private static final char MASK_ST = '!';       // "st"

  /**
   * Buffer for the terms while stemming them.
   */
  private StringBuffer sb = new StringBuffer();

  /**
   * Length compensation collected by {@code substitute()}: the number of characters
   * removed by masking character groups, plus one for every sharp s that was expanded
   * to "ss" (kept as in the original algorithm). {@code strip()} adds it to the buffer
   * length when checking its minimum-length restrictions.
   */
  private int substCount = 0;

  /**
   * Stems the given term to a unique {@code discriminator}.
   *
   * <p>The term is lower-cased first. Terms that contain any non-letter character are
   * returned lower-cased but otherwise unchanged; the empty string is returned as is.
   *
   * @param term The term that should be stemmed; must not be {@code null}.
   * @return Discriminator for {@code term}
   */
  public String stem(String term) {
    // Lower-case with a fixed locale: with the JVM default locale a Turkish or Azeri
    // environment would map 'I' to a dotless i and defeat the "ie"/"ig"/"ei" masks.
    term = term.toLowerCase(java.util.Locale.GERMAN);
    if (!isStemmable(term)) {
      return term;
    }
    // Reset the shared buffer and load the term.
    sb.setLength(0);
    sb.append(term);
    // Stemming starts here...
    substitute(sb);
    strip(sb);
    optimize(sb);
    resubstitute(sb);
    removeParticleDenotion(sb);
    return sb.toString();
  }

  /**
   * Checks if a term could be stemmed.
   *
   * @return true if, and only if, the given term consists of letters only
   *         (which includes the empty term).
   */
  private boolean isStemmable(String term) {
    for (int c = 0; c < term.length(); c++) {
      if (!Character.isLetter(term.charAt(c))) {
        return false;
      }
    }
    return true;
  }

  /**
   * Suffix stripping (stemming) on the current term. The stripping is reduced
   * to the seven "base" suffixes "e", "s", "n", "t", "em", "er" and "nd",
   * from which all regular suffixes are built. The simplification causes
   * some overstemming, and way more irregular stems, but still provides unique
   * discriminators in most of those cases.
   * The algorithm is context free, except for the length restrictions.
   */
  private void strip(StringBuffer buffer) {
    boolean doMore = true;
    // Never shorten a buffer of three characters or fewer.
    while (doMore && buffer.length() > 3) {
      // Length the term would have without the masking done by substitute().
      int effectiveLength = buffer.length() + substCount;
      char last = buffer.charAt(buffer.length() - 1);
      if (effectiveLength > 5 && endsWith(buffer, "nd")) {
        buffer.setLength(buffer.length() - 2);
      } else if (effectiveLength > 4 && endsWith(buffer, "em")) {
        buffer.setLength(buffer.length() - 2);
      } else if (effectiveLength > 4 && endsWith(buffer, "er")) {
        buffer.setLength(buffer.length() - 2);
      } else if (last == 'e' || last == 's' || last == 'n' || last == 't') {
        // "t" occurs only as suffix of verbs.
        buffer.setLength(buffer.length() - 1);
      } else {
        doMore = false;
      }
    }
  }

  /**
   * Tells whether the buffer ends with the given suffix.
   */
  private static boolean endsWith(StringBuffer buffer, String suffix) {
    int start = buffer.length() - suffix.length();
    return start >= 0 && buffer.substring(start).equals(suffix);
  }

  /**
   * Does some optimizations on the term. These optimizations are contextual.
   */
  private void optimize(StringBuffer buffer) {
    // Additional step for female plurals of professions and inhabitants:
    // "erin*" is the masked form of "erinn" left over after stripping.
    if (buffer.length() > 5 && endsWith(buffer, "erin" + MASK_DOUBLE)) {
      buffer.deleteCharAt(buffer.length() - 1);
      strip(buffer);
    }
    // Additional step for irregular plural nouns like "Matrizen -> Matrix".
    // The length guard keeps an empty term from indexing position -1.
    if (buffer.length() > 0 && buffer.charAt(buffer.length() - 1) == 'z') {
      buffer.setCharAt(buffer.length() - 1, 'x');
    }
  }

  /**
   * Removes a particle denotion ("ge") from a term: the first occurrence of
   * "gege" is shortened to "ge".
   */
  private void removeParticleDenotion(StringBuffer buffer) {
    if (buffer.length() > 4) {
      for (int c = 0; c < buffer.length() - 3; c++) {
        if (buffer.substring(c, c + 4).equals("gege")) {
          buffer.delete(c, c + 2);
          return;
        }
      }
    }
  }

  /**
   * Do some substitutions for the term to reduce overstemming:
   *
   * <ul>
   * <li>Substitute umlauts with their corresponding vowel (&auml;, &ouml;, &uuml;
   *     become a, o, u); the sharp s (&szlig;) is substituted by "ss".</li>
   * <li>Substitute the second char of a pair of equal characters with an asterisk.</li>
   * <li>Substitute the common character combinations sch, ch, ei, ie, ig and st
   *     with one token character each.</li>
   * </ul>
   */
  private void substitute(StringBuffer buffer) {
    substCount = 0;
    for (int c = 0; c < buffer.length(); c++) {
      char current = buffer.charAt(c);
      // Replace the second char of a pair of equal characters with an asterisk.
      if (c > 0 && current == buffer.charAt(c - 1)) {
        buffer.setCharAt(c, MASK_DOUBLE);
      } else if (current == A_UMLAUT) {
        buffer.setCharAt(c, 'a');
      } else if (current == O_UMLAUT) {
        buffer.setCharAt(c, 'o');
      } else if (current == U_UMLAUT) {
        buffer.setCharAt(c, 'u');
      } else if (current == SHARP_S) {
        // Handled here (not in the pair step below) so that a sharp s at the very
        // end of a word is replaced as well.
        buffer.setCharAt(c, 's');
        buffer.insert(c + 1, 's');
        substCount++;
      }
      // Mask character combinations; needs at least one character to the right.
      // The characters are re-read because the step above may have changed them.
      if (c < buffer.length() - 1) {
        char first = buffer.charAt(c);
        char second = buffer.charAt(c + 1);
        if (c < buffer.length() - 2 && first == 's' && second == 'c'
            && buffer.charAt(c + 2) == 'h') {
          buffer.setCharAt(c, MASK_SCH);
          buffer.delete(c + 1, c + 3);
          // Two characters were removed; add them to the running count.
          substCount += 2;
        } else if (first == 'c' && second == 'h') {
          maskPair(buffer, c, MASK_CH);
        } else if (first == 'e' && second == 'i') {
          maskPair(buffer, c, MASK_EI);
        } else if (first == 'i' && second == 'e') {
          maskPair(buffer, c, MASK_IE);
        } else if (first == 'i' && second == 'g') {
          maskPair(buffer, c, MASK_IG);
        } else if (first == 's' && second == 't') {
          maskPair(buffer, c, MASK_ST);
        }
      }
    }
  }

  /**
   * Replaces the two characters starting at {@code index} by {@code token} and
   * records the removed character in {@code substCount}.
   */
  private void maskPair(StringBuffer buffer, int index, char token) {
    buffer.setCharAt(index, token);
    buffer.deleteCharAt(index + 1);
    substCount++;
  }

  /**
   * Undoes the changes made by substitute(), that is the masking of character pairs
   * and character combinations. Umlauts remain as their corresponding vowel and
   * the sharp s remains as "ss".
   */
  private void resubstitute(StringBuffer buffer) {
    for (int c = 0; c < buffer.length(); c++) {
      char current = buffer.charAt(c);
      if (current == MASK_DOUBLE) {
        // substitute() only ever places the asterisk at index 1 or later.
        buffer.setCharAt(c, buffer.charAt(c - 1));
      } else if (current == MASK_SCH) {
        buffer.setCharAt(c, 's');
        buffer.insert(c + 1, "ch");
      } else if (current == MASK_CH) {
        buffer.setCharAt(c, 'c');
        buffer.insert(c + 1, 'h');
      } else if (current == MASK_EI) {
        buffer.setCharAt(c, 'e');
        buffer.insert(c + 1, 'i');
      } else if (current == MASK_IE) {
        buffer.setCharAt(c, 'i');
        buffer.insert(c + 1, 'e');
      } else if (current == MASK_IG) {
        buffer.setCharAt(c, 'i');
        buffer.insert(c + 1, 'g');
      } else if (current == MASK_ST) {
        buffer.setCharAt(c, 's');
        buffer.insert(c + 1, 't');
      }
    }
  }

}