comparison software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/analysis/lang/FrenchStemmer.java @ 0:408254cf2f1d

Erstellung
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Wed, 24 Nov 2010 17:24:23 +0100
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:408254cf2f1d
1 package de.mpg.mpiwg.berlin.mpdl.donatus.analysis.lang;
2
3 /**
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
18 */
19
20 /**
21 * A stemmer for French words. The algorithm is based on the work of
22 * Dr Martin Porter on his snowball project<br>
23 * refer to http://snowball.sourceforge.net/french/stemmer.html<br>
24 * (French stemming algorithm) for details
25 *
26 * @author Patrick Talbot
27 */
28
29 public class FrenchStemmer {
30
31 /**
32 * Buffer for the terms while stemming them.
33 */
34 private StringBuffer sb = new StringBuffer();
35
36 /**
37 * A temporary buffer, used to reconstruct R2
38 */
39 private StringBuffer tb = new StringBuffer();
40
41 /**
42 * Region R0 is equal to the whole buffer
43 */
44 private String R0;
45
46 /**
47 * Region RV
48 * "If the word begins with two vowels, RV is the region after the third letter,
49 * otherwise the region after the first vowel not at the beginning of the word,
50 * or the end of the word if these positions cannot be found."
51 */
52 private String RV;
53
54 /**
55 * Region R1
56 * "R1 is the region after the first non-vowel following a vowel
57 * or is the null region at the end of the word if there is no such non-vowel"
58 */
59 private String R1;
60
61 /**
62 * Region R2
63 * "R2 is the region after the first non-vowel in R1 following a vowel
64 * or is the null region at the end of the word if there is no such non-vowel"
65 */
66 private String R2;
67
68
69 /**
70 * Set to true if we need to perform step 2
71 */
72 private boolean suite;
73
74 /**
75 * Set to true if the buffer was modified
76 */
77 private boolean modified;
78
79
80 /**
81 * Stemms the given term to a unique <tt>discriminator</tt>.
82 *
83 * @param term java.langString The term that should be stemmed
84 * @return java.lang.String Discriminator for <tt>term</tt>
85 */
86 public String stem( String term ) {
87 if ( !isStemmable( term ) ) {
88 return term;
89 }
90
91 // Use lowercase for medium stemming.
92 term = term.toLowerCase();
93
94 // Reset the StringBuffer.
95 sb.delete( 0, sb.length() );
96 sb.insert( 0, term );
97
98 // reset the booleans
99 modified = false;
100 suite = false;
101
102 sb = treatVowels( sb );
103
104 setStrings();
105
106 step1();
107
108 if (!modified || suite)
109 {
110 if (RV != null)
111 {
112 suite = step2a();
113 if (!suite)
114 step2b();
115 }
116 }
117
118 if (modified || suite)
119 step3();
120 else
121 step4();
122
123 step5();
124
125 step6();
126
127 return sb.toString();
128 }
129
130 /**
131 * Sets the search region Strings<br>
132 * it needs to be done each time the buffer was modified
133 */
134 private void setStrings() {
135 // set the strings
136 R0 = sb.toString();
137 RV = retrieveRV( sb );
138 R1 = retrieveR( sb );
139 if ( R1 != null )
140 {
141 tb.delete( 0, tb.length() );
142 tb.insert( 0, R1 );
143 R2 = retrieveR( tb );
144 }
145 else
146 R2 = null;
147 }
148
149 /**
150 * First step of the Porter Algorithmn<br>
151 * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
152 */
153 private void step1( ) {
154 String[] suffix = { "ances", "iqUes", "ismes", "ables", "istes", "ance", "iqUe", "isme", "able", "iste" };
155 deleteFrom( R2, suffix );
156
157 replaceFrom( R2, new String[] { "logies", "logie" }, "log" );
158 replaceFrom( R2, new String[] { "usions", "utions", "usion", "ution" }, "u" );
159 replaceFrom( R2, new String[] { "ences", "ence" }, "ent" );
160
161 String[] search = { "atrices", "ateurs", "ations", "atrice", "ateur", "ation"};
162 deleteButSuffixFromElseReplace( R2, search, "ic", true, R0, "iqU" );
163
164 deleteButSuffixFromElseReplace( R2, new String[] { "ements", "ement" }, "eus", false, R0, "eux" );
165 deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "ativ", false );
166 deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "iv", false );
167 deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "abl", false );
168 deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "iqU", false );
169
170 deleteFromIfTestVowelBeforeIn( R1, new String[] { "issements", "issement" }, false, R0 );
171 deleteFrom( RV, new String[] { "ements", "ement" } );
172
173 deleteButSuffixFromElseReplace( R2, new String[] { "ités", "ité" }, "abil", false, R0, "abl" );
174 deleteButSuffixFromElseReplace( R2, new String[] { "ités", "ité" }, "ic", false, R0, "iqU" );
175 deleteButSuffixFrom( R2, new String[] { "ités", "ité" }, "iv", true );
176
177 String[] autre = { "ifs", "ives", "if", "ive" };
178 deleteButSuffixFromElseReplace( R2, autre, "icat", false, R0, "iqU" );
179 deleteButSuffixFromElseReplace( R2, autre, "at", true, R2, "iqU" );
180
181 replaceFrom( R0, new String[] { "eaux" }, "eau" );
182
183 replaceFrom( R1, new String[] { "aux" }, "al" );
184
185 deleteButSuffixFromElseReplace( R2, new String[] { "euses", "euse" }, "", true, R1, "eux" );
186
187 deleteFrom( R2, new String[] { "eux" } );
188
189 // if one of the next steps is performed, we will need to perform step2a
190 boolean temp = false;
191 temp = replaceFrom( RV, new String[] { "amment" }, "ant" );
192 if (temp == true)
193 suite = true;
194 temp = replaceFrom( RV, new String[] { "emment" }, "ent" );
195 if (temp == true)
196 suite = true;
197 temp = deleteFromIfTestVowelBeforeIn( RV, new String[] { "ments", "ment" }, true, RV );
198 if (temp == true)
199 suite = true;
200
201 }
202
203 /**
204 * Second step (A) of the Porter Algorithmn<br>
205 * Will be performed if nothing changed from the first step
206 * or changed were done in the amment, emment, ments or ment suffixes<br>
207 * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
208 *
209 * @return boolean - true if something changed in the StringBuffer
210 */
211 private boolean step2a() {
212 String[] search = { "îmes", "îtes", "iraIent", "irait", "irais", "irai", "iras", "ira",
213 "irent", "iriez", "irez", "irions", "irons", "iront",
214 "issaIent", "issais", "issantes", "issante", "issants", "issant",
215 "issait", "issais", "issions", "issons", "issiez", "issez", "issent",
216 "isses", "isse", "ir", "is", "ît", "it", "ies", "ie", "i" };
217 return deleteFromIfTestVowelBeforeIn( RV, search, false, RV );
218 }
219
220 /**
221 * Second step (B) of the Porter Algorithmn<br>
222 * Will be performed if step 2 A was performed unsuccessfully<br>
223 * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
224 */
225 private void step2b() {
226 String[] suffix = { "eraIent", "erais", "erait", "erai", "eras", "erions", "eriez",
227 "erons", "eront","erez", "èrent", "era", "ées", "iez",
228 "ée", "és", "er", "ez", "é" };
229 deleteFrom( RV, suffix );
230
231 String[] search = { "assions", "assiez", "assent", "asses", "asse", "aIent",
232 "antes", "aIent", "Aient", "ante", "âmes", "âtes", "ants", "ant",
233 "ait", "aît", "ais", "Ait", "Aît", "Ais", "ât", "as", "ai", "Ai", "a" };
234 deleteButSuffixFrom( RV, search, "e", true );
235
236 deleteFrom( R2, new String[] { "ions" } );
237 }
238
239 /**
240 * Third step of the Porter Algorithmn<br>
241 * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
242 */
243 private void step3() {
244 if (sb.length()>0)
245 {
246 char ch = sb.charAt( sb.length()-1 );
247 if (ch == 'Y')
248 {
249 sb.setCharAt( sb.length()-1, 'i' );
250 setStrings();
251 }
252 else if (ch == 'ç')
253 {
254 sb.setCharAt( sb.length()-1, 'c' );
255 setStrings();
256 }
257 }
258 }
259
260 /**
261 * Fourth step of the Porter Algorithmn<br>
262 * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
263 */
264 private void step4() {
265 if (sb.length() > 1)
266 {
267 char ch = sb.charAt( sb.length()-1 );
268 if (ch == 's')
269 {
270 char b = sb.charAt( sb.length()-2 );
271 if (b != 'a' && b != 'i' && b != 'o' && b != 'u' && b != 'è' && b != 's')
272 {
273 sb.delete( sb.length() - 1, sb.length());
274 setStrings();
275 }
276 }
277 }
278 boolean found = deleteFromIfPrecededIn( R2, new String[] { "ion" }, RV, "s" );
279 if (!found)
280 found = deleteFromIfPrecededIn( R2, new String[] { "ion" }, RV, "t" );
281
282 replaceFrom( RV, new String[] { "Ière", "ière", "Ier", "ier" }, "i" );
283 deleteFrom( RV, new String[] { "e" } );
284 deleteFromIfPrecededIn( RV, new String[] { "ë" }, R0, "gu" );
285 }
286
287 /**
288 * Fifth step of the Porter Algorithmn<br>
289 * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
290 */
291 private void step5() {
292 if (R0 != null)
293 {
294 if (R0.endsWith("enn") || R0.endsWith("onn") || R0.endsWith("ett") || R0.endsWith("ell") || R0.endsWith("eill"))
295 {
296 sb.delete( sb.length() - 1, sb.length() );
297 setStrings();
298 }
299 }
300 }
301
302 /**
303 * Sixth (and last!) step of the Porter Algorithmn<br>
304 * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
305 */
306 private void step6() {
307 if (R0!=null && R0.length()>0)
308 {
309 boolean seenVowel = false;
310 boolean seenConson = false;
311 int pos = -1;
312 for (int i = R0.length()-1; i > -1; i--)
313 {
314 char ch = R0.charAt(i);
315 if (isVowel(ch))
316 {
317 if (!seenVowel)
318 {
319 if (ch == 'é' || ch == 'è')
320 {
321 pos = i;
322 break;
323 }
324 }
325 seenVowel = true;
326 }
327 else
328 {
329 if (seenVowel)
330 break;
331 else
332 seenConson = true;
333 }
334 }
335 if (pos > -1 && seenConson && !seenVowel)
336 sb.setCharAt(pos, 'e');
337 }
338 }
339
340 /**
341 * Delete a suffix searched in zone "source" if zone "from" contains prefix + search string
342 *
343 * @param source java.lang.String - the primary source zone for search
344 * @param search java.lang.String[] - the strings to search for suppression
345 * @param from java.lang.String - the secondary source zone for search
346 * @param prefix java.lang.String - the prefix to add to the search string to test
347 * @return boolean - true if modified
348 */
349 private boolean deleteFromIfPrecededIn( String source, String[] search, String from, String prefix ) {
350 boolean found = false;
351 if (source!=null )
352 {
353 for (int i = 0; i < search.length; i++) {
354 if ( source.endsWith( search[i] ))
355 {
356 if (from!=null && from.endsWith( prefix + search[i] ))
357 {
358 sb.delete( sb.length() - search[i].length(), sb.length());
359 found = true;
360 setStrings();
361 break;
362 }
363 }
364 }
365 }
366 return found;
367 }
368
369 /**
370 * Delete a suffix searched in zone "source" if the preceding letter is (or isn't) a vowel
371 *
372 * @param source java.lang.String - the primary source zone for search
373 * @param search java.lang.String[] - the strings to search for suppression
374 * @param vowel boolean - true if we need a vowel before the search string
375 * @param from java.lang.String - the secondary source zone for search (where vowel could be)
376 * @return boolean - true if modified
377 */
378 private boolean deleteFromIfTestVowelBeforeIn( String source, String[] search, boolean vowel, String from ) {
379 boolean found = false;
380 if (source!=null && from!=null)
381 {
382 for (int i = 0; i < search.length; i++) {
383 if ( source.endsWith( search[i] ))
384 {
385 if ((search[i].length() + 1) <= from.length())
386 {
387 boolean test = isVowel(sb.charAt(sb.length()-(search[i].length()+1)));
388 if (test == vowel)
389 {
390 sb.delete( sb.length() - search[i].length(), sb.length());
391 modified = true;
392 found = true;
393 setStrings();
394 break;
395 }
396 }
397 }
398 }
399 }
400 return found;
401 }
402
403 /**
404 * Delete a suffix searched in zone "source" if preceded by the prefix
405 *
406 * @param source java.lang.String - the primary source zone for search
407 * @param search java.lang.String[] - the strings to search for suppression
408 * @param prefix java.lang.String - the prefix to add to the search string to test
409 * @param without boolean - true if it will be deleted even without prefix found
410 */
411 private void deleteButSuffixFrom( String source, String[] search, String prefix, boolean without ) {
412 if (source!=null)
413 {
414 for (int i = 0; i < search.length; i++) {
415 if ( source.endsWith( prefix + search[i] ))
416 {
417 sb.delete( sb.length() - (prefix.length() + search[i].length()), sb.length() );
418 modified = true;
419 setStrings();
420 break;
421 }
422 else if ( without && source.endsWith( search[i] ))
423 {
424 sb.delete( sb.length() - search[i].length(), sb.length() );
425 modified = true;
426 setStrings();
427 break;
428 }
429 }
430 }
431 }
432
433 /**
434 * Delete a suffix searched in zone "source" if preceded by prefix<br>
435 * or replace it with the replace string if preceded by the prefix in the zone "from"<br>
436 * or delete the suffix if specified
437 *
438 * @param source java.lang.String - the primary source zone for search
439 * @param search java.lang.String[] - the strings to search for suppression
440 * @param prefix java.lang.String - the prefix to add to the search string to test
441 * @param without boolean - true if it will be deleted even without prefix found
442 */
443 private void deleteButSuffixFromElseReplace( String source, String[] search, String prefix, boolean without, String from, String replace ) {
444 if (source!=null)
445 {
446 for (int i = 0; i < search.length; i++) {
447 if ( source.endsWith( prefix + search[i] ))
448 {
449 sb.delete( sb.length() - (prefix.length() + search[i].length()), sb.length() );
450 modified = true;
451 setStrings();
452 break;
453 }
454 else if ( from!=null && from.endsWith( prefix + search[i] ))
455 {
456 sb.replace( sb.length() - (prefix.length() + search[i].length()), sb.length(), replace );
457 modified = true;
458 setStrings();
459 break;
460 }
461 else if ( without && source.endsWith( search[i] ))
462 {
463 sb.delete( sb.length() - search[i].length(), sb.length() );
464 modified = true;
465 setStrings();
466 break;
467 }
468 }
469 }
470 }
471
472 /**
473 * Replace a search string with another within the source zone
474 *
475 * @param source java.lang.String - the source zone for search
476 * @param search java.lang.String[] - the strings to search for replacement
477 * @param replace java.lang.String - the replacement string
478 */
479 private boolean replaceFrom( String source, String[] search, String replace ) {
480 boolean found = false;
481 if (source!=null)
482 {
483 for (int i = 0; i < search.length; i++) {
484 if ( source.endsWith( search[i] ))
485 {
486 sb.replace( sb.length() - search[i].length(), sb.length(), replace );
487 modified = true;
488 found = true;
489 setStrings();
490 break;
491 }
492 }
493 }
494 return found;
495 }
496
497 /**
498 * Delete a search string within the source zone
499 *
500 * @param source the source zone for search
501 * @param suffix the strings to search for suppression
502 */
503 private void deleteFrom(String source, String[] suffix ) {
504 if (source!=null)
505 {
506 for (int i = 0; i < suffix.length; i++) {
507 if (source.endsWith( suffix[i] ))
508 {
509 sb.delete( sb.length() - suffix[i].length(), sb.length());
510 modified = true;
511 setStrings();
512 break;
513 }
514 }
515 }
516 }
517
518 /**
519 * Test if a char is a french vowel, including accentuated ones
520 *
521 * @param ch the char to test
522 * @return boolean - true if the char is a vowel
523 */
524 private boolean isVowel(char ch) {
525 switch (ch)
526 {
527 case 'a':
528 case 'e':
529 case 'i':
530 case 'o':
531 case 'u':
532 case 'y':
533 case 'â':
534 case 'à':
535 case 'ë':
536 case 'é':
537 case 'ê':
538 case 'è':
539 case 'ï':
540 case 'î':
541 case 'ô':
542 case 'ü':
543 case 'ù':
544 case 'û':
545 return true;
546 default:
547 return false;
548 }
549 }
550
551 /**
552 * Retrieve the "R zone" (1 or 2 depending on the buffer) and return the corresponding string<br>
553 * "R is the region after the first non-vowel following a vowel
554 * or is the null region at the end of the word if there is no such non-vowel"<br>
555 * @param buffer java.lang.StringBuffer - the in buffer
556 * @return java.lang.String - the resulting string
557 */
558 private String retrieveR( StringBuffer buffer ) {
559 int len = buffer.length();
560 int pos = -1;
561 for (int c = 0; c < len; c++) {
562 if (isVowel( buffer.charAt( c )))
563 {
564 pos = c;
565 break;
566 }
567 }
568 if (pos > -1)
569 {
570 int consonne = -1;
571 for (int c = pos; c < len; c++) {
572 if (!isVowel(buffer.charAt( c )))
573 {
574 consonne = c;
575 break;
576 }
577 }
578 if (consonne > -1 && (consonne+1) < len)
579 return buffer.substring( consonne+1, len );
580 else
581 return null;
582 }
583 else
584 return null;
585 }
586
587 /**
588 * Retrieve the "RV zone" from a buffer an return the corresponding string<br>
589 * "If the word begins with two vowels, RV is the region after the third letter,
590 * otherwise the region after the first vowel not at the beginning of the word,
591 * or the end of the word if these positions cannot be found."<br>
592 * @param buffer java.lang.StringBuffer - the in buffer
593 * @return java.lang.String - the resulting string
594 */
595 private String retrieveRV( StringBuffer buffer ) {
596 int len = buffer.length();
597 if ( buffer.length() > 3)
598 {
599 if ( isVowel(buffer.charAt( 0 )) && isVowel(buffer.charAt( 1 ))) {
600 return buffer.substring(3,len);
601 }
602 else
603 {
604 int pos = 0;
605 for (int c = 1; c < len; c++) {
606 if (isVowel( buffer.charAt( c )))
607 {
608 pos = c;
609 break;
610 }
611 }
612 if ( pos+1 < len )
613 return buffer.substring( pos+1, len );
614 else
615 return null;
616 }
617 }
618 else
619 return null;
620 }
621
622
623
624 /**
625 * Turns u and i preceded AND followed by a vowel to UpperCase<br>
626 * Turns y preceded OR followed by a vowel to UpperCase<br>
627 * Turns u preceded by q to UpperCase<br>
628 *
629 * @param buffer java.util.StringBuffer - the buffer to treat
630 * @return java.util.StringBuffer - the treated buffer
631 */
632 private StringBuffer treatVowels( StringBuffer buffer ) {
633 for ( int c = 0; c < buffer.length(); c++ ) {
634 char ch = buffer.charAt( c );
635
636 if (c == 0) // first char
637 {
638 if (buffer.length()>1)
639 {
640 if (ch == 'y' && isVowel(buffer.charAt( c + 1 )))
641 buffer.setCharAt( c, 'Y' );
642 }
643 }
644 else if (c == buffer.length()-1) // last char
645 {
646 if (ch == 'u' && buffer.charAt( c - 1 ) == 'q')
647 buffer.setCharAt( c, 'U' );
648 if (ch == 'y' && isVowel(buffer.charAt( c - 1 )))
649 buffer.setCharAt( c, 'Y' );
650 }
651 else // other cases
652 {
653 if (ch == 'u')
654 {
655 if (buffer.charAt( c - 1) == 'q')
656 buffer.setCharAt( c, 'U' );
657 else if (isVowel(buffer.charAt( c - 1 )) && isVowel(buffer.charAt( c + 1 )))
658 buffer.setCharAt( c, 'U' );
659 }
660 if (ch == 'i')
661 {
662 if (isVowel(buffer.charAt( c - 1 )) && isVowel(buffer.charAt( c + 1 )))
663 buffer.setCharAt( c, 'I' );
664 }
665 if (ch == 'y')
666 {
667 if (isVowel(buffer.charAt( c - 1 )) || isVowel(buffer.charAt( c + 1 )))
668 buffer.setCharAt( c, 'Y' );
669 }
670 }
671 }
672
673 return buffer;
674 }
675
676 /**
677 * Checks a term if it can be processed correctly.
678 *
679 * @return boolean - true if, and only if, the given term consists in letters.
680 */
681 private boolean isStemmable( String term ) {
682 boolean upper = false;
683 int first = -1;
684 for ( int c = 0; c < term.length(); c++ ) {
685 // Discard terms that contain non-letter characters.
686 if ( !Character.isLetter( term.charAt( c ) ) ) {
687 return false;
688 }
689 // Discard terms that contain multiple uppercase letters.
690 if ( Character.isUpperCase( term.charAt( c ) ) ) {
691 if ( upper ) {
692 return false;
693 }
694 // First encountered uppercase letter, set flag and save
695 // position.
696 else {
697 first = c;
698 upper = true;
699 }
700 }
701 }
702 // Discard the term if it contains a single uppercase letter that
703 // is not starting the term.
704 if ( first > 0 ) {
705 return false;
706 }
707 return true;
708 }
709 }