Mercurial > hg > mpdl-group
comparison software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/FrenchStemmer.java @ 0:408254cf2f1d
Erstellung
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Wed, 24 Nov 2010 17:24:23 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:408254cf2f1d |
---|---|
1 package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang; | |
2 | |
3 /** | |
4 * Licensed to the Apache Software Foundation (ASF) under one or more | |
5 * contributor license agreements. See the NOTICE file distributed with | |
6 * this work for additional information regarding copyright ownership. | |
7 * The ASF licenses this file to You under the Apache License, Version 2.0 | |
8 * (the "License"); you may not use this file except in compliance with | |
9 * the License. You may obtain a copy of the License at | |
10 * | |
11 * http://www.apache.org/licenses/LICENSE-2.0 | |
12 * | |
13 * Unless required by applicable law or agreed to in writing, software | |
14 * distributed under the License is distributed on an "AS IS" BASIS, | |
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
16 * See the License for the specific language governing permissions and | |
17 * limitations under the License. | |
18 */ | |
19 | |
20 /** | |
21 * A stemmer for French words. The algorithm is based on the work of | |
22 * Dr Martin Porter on his snowball project<br> | |
23 * refer to http://snowball.sourceforge.net/french/stemmer.html<br> | |
24 * (French stemming algorithm) for details | |
25 * | |
26 * @author Patrick Talbot | |
27 */ | |
28 | |
29 public class FrenchStemmer { | |
30 | |
31 /** | |
32 * Buffer for the terms while stemming them. | |
33 */ | |
34 private StringBuffer sb = new StringBuffer(); | |
35 | |
36 /** | |
37 * A temporary buffer, used to reconstruct R2 | |
38 */ | |
39 private StringBuffer tb = new StringBuffer(); | |
40 | |
41 /** | |
42 * Region R0 is equal to the whole buffer | |
43 */ | |
44 private String R0; | |
45 | |
46 /** | |
47 * Region RV | |
48 * "If the word begins with two vowels, RV is the region after the third letter, | |
49 * otherwise the region after the first vowel not at the beginning of the word, | |
50 * or the end of the word if these positions cannot be found." | |
51 */ | |
52 private String RV; | |
53 | |
54 /** | |
55 * Region R1 | |
56 * "R1 is the region after the first non-vowel following a vowel | |
57 * or is the null region at the end of the word if there is no such non-vowel" | |
58 */ | |
59 private String R1; | |
60 | |
61 /** | |
62 * Region R2 | |
63 * "R2 is the region after the first non-vowel in R1 following a vowel | |
64 * or is the null region at the end of the word if there is no such non-vowel" | |
65 */ | |
66 private String R2; | |
67 | |
68 | |
69 /** | |
70 * Set to true if we need to perform step 2 | |
71 */ | |
72 private boolean suite; | |
73 | |
74 /** | |
75 * Set to true if the buffer was modified | |
76 */ | |
77 private boolean modified; | |
78 | |
79 | |
80 /** | |
81 * Stemms the given term to a unique <tt>discriminator</tt>. | |
82 * | |
83 * @param term java.langString The term that should be stemmed | |
84 * @return java.lang.String Discriminator for <tt>term</tt> | |
85 */ | |
86 public String stem( String term ) { | |
87 if ( !isStemmable( term ) ) { | |
88 return term; | |
89 } | |
90 | |
91 // Use lowercase for medium stemming. | |
92 term = term.toLowerCase(); | |
93 | |
94 // Reset the StringBuffer. | |
95 sb.delete( 0, sb.length() ); | |
96 sb.insert( 0, term ); | |
97 | |
98 // reset the booleans | |
99 modified = false; | |
100 suite = false; | |
101 | |
102 sb = treatVowels( sb ); | |
103 | |
104 setStrings(); | |
105 | |
106 step1(); | |
107 | |
108 if (!modified || suite) | |
109 { | |
110 if (RV != null) | |
111 { | |
112 suite = step2a(); | |
113 if (!suite) | |
114 step2b(); | |
115 } | |
116 } | |
117 | |
118 if (modified || suite) | |
119 step3(); | |
120 else | |
121 step4(); | |
122 | |
123 step5(); | |
124 | |
125 step6(); | |
126 | |
127 return sb.toString(); | |
128 } | |
129 | |
130 /** | |
131 * Sets the search region Strings<br> | |
132 * it needs to be done each time the buffer was modified | |
133 */ | |
134 private void setStrings() { | |
135 // set the strings | |
136 R0 = sb.toString(); | |
137 RV = retrieveRV( sb ); | |
138 R1 = retrieveR( sb ); | |
139 if ( R1 != null ) | |
140 { | |
141 tb.delete( 0, tb.length() ); | |
142 tb.insert( 0, R1 ); | |
143 R2 = retrieveR( tb ); | |
144 } | |
145 else | |
146 R2 = null; | |
147 } | |
148 | |
149 /** | |
150 * First step of the Porter Algorithmn<br> | |
151 * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation | |
152 */ | |
153 private void step1( ) { | |
154 String[] suffix = { "ances", "iqUes", "ismes", "ables", "istes", "ance", "iqUe", "isme", "able", "iste" }; | |
155 deleteFrom( R2, suffix ); | |
156 | |
157 replaceFrom( R2, new String[] { "logies", "logie" }, "log" ); | |
158 replaceFrom( R2, new String[] { "usions", "utions", "usion", "ution" }, "u" ); | |
159 replaceFrom( R2, new String[] { "ences", "ence" }, "ent" ); | |
160 | |
161 String[] search = { "atrices", "ateurs", "ations", "atrice", "ateur", "ation"}; | |
162 deleteButSuffixFromElseReplace( R2, search, "ic", true, R0, "iqU" ); | |
163 | |
164 deleteButSuffixFromElseReplace( R2, new String[] { "ements", "ement" }, "eus", false, R0, "eux" ); | |
165 deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "ativ", false ); | |
166 deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "iv", false ); | |
167 deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "abl", false ); | |
168 deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "iqU", false ); | |
169 | |
170 deleteFromIfTestVowelBeforeIn( R1, new String[] { "issements", "issement" }, false, R0 ); | |
171 deleteFrom( RV, new String[] { "ements", "ement" } ); | |
172 | |
173 deleteButSuffixFromElseReplace( R2, new String[] { "ités", "ité" }, "abil", false, R0, "abl" ); | |
174 deleteButSuffixFromElseReplace( R2, new String[] { "ités", "ité" }, "ic", false, R0, "iqU" ); | |
175 deleteButSuffixFrom( R2, new String[] { "ités", "ité" }, "iv", true ); | |
176 | |
177 String[] autre = { "ifs", "ives", "if", "ive" }; | |
178 deleteButSuffixFromElseReplace( R2, autre, "icat", false, R0, "iqU" ); | |
179 deleteButSuffixFromElseReplace( R2, autre, "at", true, R2, "iqU" ); | |
180 | |
181 replaceFrom( R0, new String[] { "eaux" }, "eau" ); | |
182 | |
183 replaceFrom( R1, new String[] { "aux" }, "al" ); | |
184 | |
185 deleteButSuffixFromElseReplace( R2, new String[] { "euses", "euse" }, "", true, R1, "eux" ); | |
186 | |
187 deleteFrom( R2, new String[] { "eux" } ); | |
188 | |
189 // if one of the next steps is performed, we will need to perform step2a | |
190 boolean temp = false; | |
191 temp = replaceFrom( RV, new String[] { "amment" }, "ant" ); | |
192 if (temp == true) | |
193 suite = true; | |
194 temp = replaceFrom( RV, new String[] { "emment" }, "ent" ); | |
195 if (temp == true) | |
196 suite = true; | |
197 temp = deleteFromIfTestVowelBeforeIn( RV, new String[] { "ments", "ment" }, true, RV ); | |
198 if (temp == true) | |
199 suite = true; | |
200 | |
201 } | |
202 | |
203 /** | |
204 * Second step (A) of the Porter Algorithmn<br> | |
205 * Will be performed if nothing changed from the first step | |
206 * or changed were done in the amment, emment, ments or ment suffixes<br> | |
207 * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation | |
208 * | |
209 * @return boolean - true if something changed in the StringBuffer | |
210 */ | |
211 private boolean step2a() { | |
212 String[] search = { "îmes", "îtes", "iraIent", "irait", "irais", "irai", "iras", "ira", | |
213 "irent", "iriez", "irez", "irions", "irons", "iront", | |
214 "issaIent", "issais", "issantes", "issante", "issants", "issant", | |
215 "issait", "issais", "issions", "issons", "issiez", "issez", "issent", | |
216 "isses", "isse", "ir", "is", "ît", "it", "ies", "ie", "i" }; | |
217 return deleteFromIfTestVowelBeforeIn( RV, search, false, RV ); | |
218 } | |
219 | |
220 /** | |
221 * Second step (B) of the Porter Algorithmn<br> | |
222 * Will be performed if step 2 A was performed unsuccessfully<br> | |
223 * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation | |
224 */ | |
225 private void step2b() { | |
226 String[] suffix = { "eraIent", "erais", "erait", "erai", "eras", "erions", "eriez", | |
227 "erons", "eront","erez", "èrent", "era", "ées", "iez", | |
228 "ée", "és", "er", "ez", "é" }; | |
229 deleteFrom( RV, suffix ); | |
230 | |
231 String[] search = { "assions", "assiez", "assent", "asses", "asse", "aIent", | |
232 "antes", "aIent", "Aient", "ante", "âmes", "âtes", "ants", "ant", | |
233 "ait", "aît", "ais", "Ait", "Aît", "Ais", "ât", "as", "ai", "Ai", "a" }; | |
234 deleteButSuffixFrom( RV, search, "e", true ); | |
235 | |
236 deleteFrom( R2, new String[] { "ions" } ); | |
237 } | |
238 | |
239 /** | |
240 * Third step of the Porter Algorithmn<br> | |
241 * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation | |
242 */ | |
243 private void step3() { | |
244 if (sb.length()>0) | |
245 { | |
246 char ch = sb.charAt( sb.length()-1 ); | |
247 if (ch == 'Y') | |
248 { | |
249 sb.setCharAt( sb.length()-1, 'i' ); | |
250 setStrings(); | |
251 } | |
252 else if (ch == 'ç') | |
253 { | |
254 sb.setCharAt( sb.length()-1, 'c' ); | |
255 setStrings(); | |
256 } | |
257 } | |
258 } | |
259 | |
260 /** | |
261 * Fourth step of the Porter Algorithmn<br> | |
262 * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation | |
263 */ | |
264 private void step4() { | |
265 if (sb.length() > 1) | |
266 { | |
267 char ch = sb.charAt( sb.length()-1 ); | |
268 if (ch == 's') | |
269 { | |
270 char b = sb.charAt( sb.length()-2 ); | |
271 if (b != 'a' && b != 'i' && b != 'o' && b != 'u' && b != 'è' && b != 's') | |
272 { | |
273 sb.delete( sb.length() - 1, sb.length()); | |
274 setStrings(); | |
275 } | |
276 } | |
277 } | |
278 boolean found = deleteFromIfPrecededIn( R2, new String[] { "ion" }, RV, "s" ); | |
279 if (!found) | |
280 found = deleteFromIfPrecededIn( R2, new String[] { "ion" }, RV, "t" ); | |
281 | |
282 replaceFrom( RV, new String[] { "Ière", "ière", "Ier", "ier" }, "i" ); | |
283 deleteFrom( RV, new String[] { "e" } ); | |
284 deleteFromIfPrecededIn( RV, new String[] { "ë" }, R0, "gu" ); | |
285 } | |
286 | |
287 /** | |
288 * Fifth step of the Porter Algorithmn<br> | |
289 * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation | |
290 */ | |
291 private void step5() { | |
292 if (R0 != null) | |
293 { | |
294 if (R0.endsWith("enn") || R0.endsWith("onn") || R0.endsWith("ett") || R0.endsWith("ell") || R0.endsWith("eill")) | |
295 { | |
296 sb.delete( sb.length() - 1, sb.length() ); | |
297 setStrings(); | |
298 } | |
299 } | |
300 } | |
301 | |
302 /** | |
303 * Sixth (and last!) step of the Porter Algorithmn<br> | |
304 * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation | |
305 */ | |
306 private void step6() { | |
307 if (R0!=null && R0.length()>0) | |
308 { | |
309 boolean seenVowel = false; | |
310 boolean seenConson = false; | |
311 int pos = -1; | |
312 for (int i = R0.length()-1; i > -1; i--) | |
313 { | |
314 char ch = R0.charAt(i); | |
315 if (isVowel(ch)) | |
316 { | |
317 if (!seenVowel) | |
318 { | |
319 if (ch == 'é' || ch == 'è') | |
320 { | |
321 pos = i; | |
322 break; | |
323 } | |
324 } | |
325 seenVowel = true; | |
326 } | |
327 else | |
328 { | |
329 if (seenVowel) | |
330 break; | |
331 else | |
332 seenConson = true; | |
333 } | |
334 } | |
335 if (pos > -1 && seenConson && !seenVowel) | |
336 sb.setCharAt(pos, 'e'); | |
337 } | |
338 } | |
339 | |
340 /** | |
341 * Delete a suffix searched in zone "source" if zone "from" contains prefix + search string | |
342 * | |
343 * @param source java.lang.String - the primary source zone for search | |
344 * @param search java.lang.String[] - the strings to search for suppression | |
345 * @param from java.lang.String - the secondary source zone for search | |
346 * @param prefix java.lang.String - the prefix to add to the search string to test | |
347 * @return boolean - true if modified | |
348 */ | |
349 private boolean deleteFromIfPrecededIn( String source, String[] search, String from, String prefix ) { | |
350 boolean found = false; | |
351 if (source!=null ) | |
352 { | |
353 for (int i = 0; i < search.length; i++) { | |
354 if ( source.endsWith( search[i] )) | |
355 { | |
356 if (from!=null && from.endsWith( prefix + search[i] )) | |
357 { | |
358 sb.delete( sb.length() - search[i].length(), sb.length()); | |
359 found = true; | |
360 setStrings(); | |
361 break; | |
362 } | |
363 } | |
364 } | |
365 } | |
366 return found; | |
367 } | |
368 | |
369 /** | |
370 * Delete a suffix searched in zone "source" if the preceding letter is (or isn't) a vowel | |
371 * | |
372 * @param source java.lang.String - the primary source zone for search | |
373 * @param search java.lang.String[] - the strings to search for suppression | |
374 * @param vowel boolean - true if we need a vowel before the search string | |
375 * @param from java.lang.String - the secondary source zone for search (where vowel could be) | |
376 * @return boolean - true if modified | |
377 */ | |
378 private boolean deleteFromIfTestVowelBeforeIn( String source, String[] search, boolean vowel, String from ) { | |
379 boolean found = false; | |
380 if (source!=null && from!=null) | |
381 { | |
382 for (int i = 0; i < search.length; i++) { | |
383 if ( source.endsWith( search[i] )) | |
384 { | |
385 if ((search[i].length() + 1) <= from.length()) | |
386 { | |
387 boolean test = isVowel(sb.charAt(sb.length()-(search[i].length()+1))); | |
388 if (test == vowel) | |
389 { | |
390 sb.delete( sb.length() - search[i].length(), sb.length()); | |
391 modified = true; | |
392 found = true; | |
393 setStrings(); | |
394 break; | |
395 } | |
396 } | |
397 } | |
398 } | |
399 } | |
400 return found; | |
401 } | |
402 | |
403 /** | |
404 * Delete a suffix searched in zone "source" if preceded by the prefix | |
405 * | |
406 * @param source java.lang.String - the primary source zone for search | |
407 * @param search java.lang.String[] - the strings to search for suppression | |
408 * @param prefix java.lang.String - the prefix to add to the search string to test | |
409 * @param without boolean - true if it will be deleted even without prefix found | |
410 */ | |
411 private void deleteButSuffixFrom( String source, String[] search, String prefix, boolean without ) { | |
412 if (source!=null) | |
413 { | |
414 for (int i = 0; i < search.length; i++) { | |
415 if ( source.endsWith( prefix + search[i] )) | |
416 { | |
417 sb.delete( sb.length() - (prefix.length() + search[i].length()), sb.length() ); | |
418 modified = true; | |
419 setStrings(); | |
420 break; | |
421 } | |
422 else if ( without && source.endsWith( search[i] )) | |
423 { | |
424 sb.delete( sb.length() - search[i].length(), sb.length() ); | |
425 modified = true; | |
426 setStrings(); | |
427 break; | |
428 } | |
429 } | |
430 } | |
431 } | |
432 | |
433 /** | |
434 * Delete a suffix searched in zone "source" if preceded by prefix<br> | |
435 * or replace it with the replace string if preceded by the prefix in the zone "from"<br> | |
436 * or delete the suffix if specified | |
437 * | |
438 * @param source java.lang.String - the primary source zone for search | |
439 * @param search java.lang.String[] - the strings to search for suppression | |
440 * @param prefix java.lang.String - the prefix to add to the search string to test | |
441 * @param without boolean - true if it will be deleted even without prefix found | |
442 */ | |
443 private void deleteButSuffixFromElseReplace( String source, String[] search, String prefix, boolean without, String from, String replace ) { | |
444 if (source!=null) | |
445 { | |
446 for (int i = 0; i < search.length; i++) { | |
447 if ( source.endsWith( prefix + search[i] )) | |
448 { | |
449 sb.delete( sb.length() - (prefix.length() + search[i].length()), sb.length() ); | |
450 modified = true; | |
451 setStrings(); | |
452 break; | |
453 } | |
454 else if ( from!=null && from.endsWith( prefix + search[i] )) | |
455 { | |
456 sb.replace( sb.length() - (prefix.length() + search[i].length()), sb.length(), replace ); | |
457 modified = true; | |
458 setStrings(); | |
459 break; | |
460 } | |
461 else if ( without && source.endsWith( search[i] )) | |
462 { | |
463 sb.delete( sb.length() - search[i].length(), sb.length() ); | |
464 modified = true; | |
465 setStrings(); | |
466 break; | |
467 } | |
468 } | |
469 } | |
470 } | |
471 | |
472 /** | |
473 * Replace a search string with another within the source zone | |
474 * | |
475 * @param source java.lang.String - the source zone for search | |
476 * @param search java.lang.String[] - the strings to search for replacement | |
477 * @param replace java.lang.String - the replacement string | |
478 */ | |
479 private boolean replaceFrom( String source, String[] search, String replace ) { | |
480 boolean found = false; | |
481 if (source!=null) | |
482 { | |
483 for (int i = 0; i < search.length; i++) { | |
484 if ( source.endsWith( search[i] )) | |
485 { | |
486 sb.replace( sb.length() - search[i].length(), sb.length(), replace ); | |
487 modified = true; | |
488 found = true; | |
489 setStrings(); | |
490 break; | |
491 } | |
492 } | |
493 } | |
494 return found; | |
495 } | |
496 | |
497 /** | |
498 * Delete a search string within the source zone | |
499 * | |
500 * @param source the source zone for search | |
501 * @param suffix the strings to search for suppression | |
502 */ | |
503 private void deleteFrom(String source, String[] suffix ) { | |
504 if (source!=null) | |
505 { | |
506 for (int i = 0; i < suffix.length; i++) { | |
507 if (source.endsWith( suffix[i] )) | |
508 { | |
509 sb.delete( sb.length() - suffix[i].length(), sb.length()); | |
510 modified = true; | |
511 setStrings(); | |
512 break; | |
513 } | |
514 } | |
515 } | |
516 } | |
517 | |
518 /** | |
519 * Test if a char is a french vowel, including accentuated ones | |
520 * | |
521 * @param ch the char to test | |
522 * @return boolean - true if the char is a vowel | |
523 */ | |
524 private boolean isVowel(char ch) { | |
525 switch (ch) | |
526 { | |
527 case 'a': | |
528 case 'e': | |
529 case 'i': | |
530 case 'o': | |
531 case 'u': | |
532 case 'y': | |
533 case 'â': | |
534 case 'à': | |
535 case 'ë': | |
536 case 'é': | |
537 case 'ê': | |
538 case 'è': | |
539 case 'ï': | |
540 case 'î': | |
541 case 'ô': | |
542 case 'ü': | |
543 case 'ù': | |
544 case 'û': | |
545 return true; | |
546 default: | |
547 return false; | |
548 } | |
549 } | |
550 | |
551 /** | |
552 * Retrieve the "R zone" (1 or 2 depending on the buffer) and return the corresponding string<br> | |
553 * "R is the region after the first non-vowel following a vowel | |
554 * or is the null region at the end of the word if there is no such non-vowel"<br> | |
555 * @param buffer java.lang.StringBuffer - the in buffer | |
556 * @return java.lang.String - the resulting string | |
557 */ | |
558 private String retrieveR( StringBuffer buffer ) { | |
559 int len = buffer.length(); | |
560 int pos = -1; | |
561 for (int c = 0; c < len; c++) { | |
562 if (isVowel( buffer.charAt( c ))) | |
563 { | |
564 pos = c; | |
565 break; | |
566 } | |
567 } | |
568 if (pos > -1) | |
569 { | |
570 int consonne = -1; | |
571 for (int c = pos; c < len; c++) { | |
572 if (!isVowel(buffer.charAt( c ))) | |
573 { | |
574 consonne = c; | |
575 break; | |
576 } | |
577 } | |
578 if (consonne > -1 && (consonne+1) < len) | |
579 return buffer.substring( consonne+1, len ); | |
580 else | |
581 return null; | |
582 } | |
583 else | |
584 return null; | |
585 } | |
586 | |
587 /** | |
588 * Retrieve the "RV zone" from a buffer an return the corresponding string<br> | |
589 * "If the word begins with two vowels, RV is the region after the third letter, | |
590 * otherwise the region after the first vowel not at the beginning of the word, | |
591 * or the end of the word if these positions cannot be found."<br> | |
592 * @param buffer java.lang.StringBuffer - the in buffer | |
593 * @return java.lang.String - the resulting string | |
594 */ | |
595 private String retrieveRV( StringBuffer buffer ) { | |
596 int len = buffer.length(); | |
597 if ( buffer.length() > 3) | |
598 { | |
599 if ( isVowel(buffer.charAt( 0 )) && isVowel(buffer.charAt( 1 ))) { | |
600 return buffer.substring(3,len); | |
601 } | |
602 else | |
603 { | |
604 int pos = 0; | |
605 for (int c = 1; c < len; c++) { | |
606 if (isVowel( buffer.charAt( c ))) | |
607 { | |
608 pos = c; | |
609 break; | |
610 } | |
611 } | |
612 if ( pos+1 < len ) | |
613 return buffer.substring( pos+1, len ); | |
614 else | |
615 return null; | |
616 } | |
617 } | |
618 else | |
619 return null; | |
620 } | |
621 | |
622 | |
623 | |
624 /** | |
625 * Turns u and i preceded AND followed by a vowel to UpperCase<br> | |
626 * Turns y preceded OR followed by a vowel to UpperCase<br> | |
627 * Turns u preceded by q to UpperCase<br> | |
628 * | |
629 * @param buffer java.util.StringBuffer - the buffer to treat | |
630 * @return java.util.StringBuffer - the treated buffer | |
631 */ | |
632 private StringBuffer treatVowels( StringBuffer buffer ) { | |
633 for ( int c = 0; c < buffer.length(); c++ ) { | |
634 char ch = buffer.charAt( c ); | |
635 | |
636 if (c == 0) // first char | |
637 { | |
638 if (buffer.length()>1) | |
639 { | |
640 if (ch == 'y' && isVowel(buffer.charAt( c + 1 ))) | |
641 buffer.setCharAt( c, 'Y' ); | |
642 } | |
643 } | |
644 else if (c == buffer.length()-1) // last char | |
645 { | |
646 if (ch == 'u' && buffer.charAt( c - 1 ) == 'q') | |
647 buffer.setCharAt( c, 'U' ); | |
648 if (ch == 'y' && isVowel(buffer.charAt( c - 1 ))) | |
649 buffer.setCharAt( c, 'Y' ); | |
650 } | |
651 else // other cases | |
652 { | |
653 if (ch == 'u') | |
654 { | |
655 if (buffer.charAt( c - 1) == 'q') | |
656 buffer.setCharAt( c, 'U' ); | |
657 else if (isVowel(buffer.charAt( c - 1 )) && isVowel(buffer.charAt( c + 1 ))) | |
658 buffer.setCharAt( c, 'U' ); | |
659 } | |
660 if (ch == 'i') | |
661 { | |
662 if (isVowel(buffer.charAt( c - 1 )) && isVowel(buffer.charAt( c + 1 ))) | |
663 buffer.setCharAt( c, 'I' ); | |
664 } | |
665 if (ch == 'y') | |
666 { | |
667 if (isVowel(buffer.charAt( c - 1 )) || isVowel(buffer.charAt( c + 1 ))) | |
668 buffer.setCharAt( c, 'Y' ); | |
669 } | |
670 } | |
671 } | |
672 | |
673 return buffer; | |
674 } | |
675 | |
676 /** | |
677 * Checks a term if it can be processed correctly. | |
678 * | |
679 * @return boolean - true if, and only if, the given term consists in letters. | |
680 */ | |
681 private boolean isStemmable( String term ) { | |
682 boolean upper = false; | |
683 int first = -1; | |
684 for ( int c = 0; c < term.length(); c++ ) { | |
685 // Discard terms that contain non-letter characters. | |
686 if ( !Character.isLetter( term.charAt( c ) ) ) { | |
687 return false; | |
688 } | |
689 // Discard terms that contain multiple uppercase letters. | |
690 if ( Character.isUpperCase( term.charAt( c ) ) ) { | |
691 if ( upper ) { | |
692 return false; | |
693 } | |
694 // First encountered uppercase letter, set flag and save | |
695 // position. | |
696 else { | |
697 first = c; | |
698 upper = true; | |
699 } | |
700 } | |
701 } | |
702 // Discard the term if it contains a single uppercase letter that | |
703 // is not starting the term. | |
704 if ( first > 0 ) { | |
705 return false; | |
706 } | |
707 return true; | |
708 } | |
709 } |