Mercurial > hg > mpdl-group

diff software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/analysis/lang/GermanStemmer.java @ 0:408254cf2f1d
Erstellung
author: Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date: Wed, 24 Nov 2010 17:24:23 +0100
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/analysis/lang/GermanStemmer.java	Wed Nov 24 17:24:23 2010 +0100
@@ -0,0 +1,267 @@
+package de.mpg.mpiwg.berlin.mpdl.donatus.analysis.lang;
+// This file is encoded in UTF-8
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A stemmer for German words. The algorithm is based on the report
+ * "A Fast and Simple Stemming Algorithm for German Words" by J&ouml;rg
+ * Caumanns (joerg.caumanns at isst.fhg.de).
+ *
+ *
+ * @version   $Id: GermanStemmer.java 564236 2007-08-09 15:21:19Z gsingers $
+ */
+public class GermanStemmer
+{
+    /**
+     * Buffer for the terms while stemming them.
+     */
+    private StringBuffer sb = new StringBuffer();
+
+    /**
+     * Amount of characters that are removed with <tt>substitute()</tt> while stemming.
+     */
+    private int substCount = 0;
+
+    /**
+     * Stemms the given term to an unique <tt>discriminator</tt>.
+     *
+     * @param term  The term that should be stemmed.
+     * @return      Discriminator for <tt>term</tt>
+     */
+    public String stem( String term )
+    {
+      // Use lowercase for medium stemming.
+      term = term.toLowerCase();
+      if ( !isStemmable( term ) )
+        return term;
+      // Reset the StringBuffer.
+      sb.delete( 0, sb.length() );
+      sb.insert( 0, term );
+      // Stemming starts here...
+      substitute( sb );
+      strip( sb );
+      optimize( sb );
+      resubstitute( sb );
+      removeParticleDenotion( sb );
+      return sb.toString();
+    }
+
+    /**
+     * Checks if a term could be stemmed.
+     *
+     * @return  true if, and only if, the given term consists in letters.
+     */
+    private boolean isStemmable( String term )
+    {
+      for ( int c = 0; c < term.length(); c++ ) {
+        if ( !Character.isLetter( term.charAt( c ) ) )
+          return false;
+      }
+      return true;
+    }
+
+    /**
+     * suffix stripping (stemming) on the current term. The stripping is reduced
+     * to the seven "base" suffixes "e", "s", "n", "t", "em", "er" and * "nd",
+     * from which all regular suffixes are build of. The simplification causes
+     * some overstemming, and way more irregular stems, but still provides unique.
+     * discriminators in the most of those cases.
+     * The algorithm is context free, except of the length restrictions.
+     */
+    private void strip( StringBuffer buffer )
+    {
+      boolean doMore = true;
+      while ( doMore && buffer.length() > 3 ) {
+        if ( ( buffer.length() + substCount > 5 ) &&
+          buffer.substring( buffer.length() - 2, buffer.length() ).equals( "nd" ) )
+        {
+          buffer.delete( buffer.length() - 2, buffer.length() );
+        }
+        else if ( ( buffer.length() + substCount > 4 ) &&
+          buffer.substring( buffer.length() - 2, buffer.length() ).equals( "em" ) ) {
+            buffer.delete( buffer.length() - 2, buffer.length() );
+        }
+        else if ( ( buffer.length() + substCount > 4 ) &&
+          buffer.substring( buffer.length() - 2, buffer.length() ).equals( "er" ) ) {
+            buffer.delete( buffer.length() - 2, buffer.length() );
+        }
+        else if ( buffer.charAt( buffer.length() - 1 ) == 'e' ) {
+          buffer.deleteCharAt( buffer.length() - 1 );
+        }
+        else if ( buffer.charAt( buffer.length() - 1 ) == 's' ) {
+          buffer.deleteCharAt( buffer.length() - 1 );
+        }
+        else if ( buffer.charAt( buffer.length() - 1 ) == 'n' ) {
+          buffer.deleteCharAt( buffer.length() - 1 );
+        }
+        // "t" occurs only as suffix of verbs.
+        else if ( buffer.charAt( buffer.length() - 1 ) == 't' ) {
+          buffer.deleteCharAt( buffer.length() - 1 );
+        }
+        else {
+          doMore = false;
+        }
+      }
+    }
+
+    /**
+     * Does some optimizations on the term. This optimisations are
+     * contextual.
+     */
+    private void optimize( StringBuffer buffer )
+    {
+      // Additional step for female plurals of professions and inhabitants.
+      if ( buffer.length() > 5 && buffer.substring( buffer.length() - 5, buffer.length() ).equals( "erin*" ) ) {
+        buffer.deleteCharAt( buffer.length() -1 );
+        strip( buffer );
+      }
+      // Additional step for irregular plural nouns like "Matrizen -> Matrix".
+      if ( buffer.charAt( buffer.length() - 1 ) == ( 'z' ) ) {
+        buffer.setCharAt( buffer.length() - 1, 'x' );
+      }
+    }
+
+    /**
+     * Removes a particle denotion ("ge") from a term.
+     */
+    private void removeParticleDenotion( StringBuffer buffer )
+    {
+      if ( buffer.length() > 4 ) {
+        for ( int c = 0; c < buffer.length() - 3; c++ ) {
+          if ( buffer.substring( c, c + 4 ).equals( "gege" ) ) {
+            buffer.delete( c, c + 2 );
+            return;
+          }
+        }
+      }
+    }
+
+    /**
+     * Do some substitutions for the term to reduce overstemming:
+     *
+     * - Substitute Umlauts with their corresponding vowel: äöü -> aou,
+     *   "ß" is substituted by "ss"
+     * - Substitute a second char of a pair of equal characters with
+     *   an asterisk: ?? -> ?*
+     * - Substitute some common character combinations with a token:
+     *   sch/ch/ei/ie/ig/st -> $/§/%/&/#/!
+     */
+    private void substitute( StringBuffer buffer )
+    {
+      substCount = 0;
+      for ( int c = 0; c < buffer.length(); c++ ) {
+        // Replace the second char of a pair of the equal characters with an asterisk
+        if ( c > 0 && buffer.charAt( c ) == buffer.charAt ( c - 1 )  ) {
+          buffer.setCharAt( c, '*' );
+        }
+        // Substitute Umlauts.
+        else if ( buffer.charAt( c ) == 'ä' ) {
+          buffer.setCharAt( c, 'a' );
+        }
+        else if ( buffer.charAt( c ) == 'ö' ) {
+          buffer.setCharAt( c, 'o' );
+        }
+        else if ( buffer.charAt( c ) == 'ü' ) {
+          buffer.setCharAt( c, 'u' );
+        }
+        // Fix bug so that 'ß' at the end of a word is replaced.
+        else if ( buffer.charAt( c ) == 'ß' ) {
+            buffer.setCharAt( c, 's' );
+            buffer.insert( c + 1, 's' );
+            substCount++;
+        }
+        // Take care that at least one character is left left side from the current one
+        if ( c < buffer.length() - 1 ) {
+          // Masking several common character combinations with an token
+          if ( ( c < buffer.length() - 2 ) && buffer.charAt( c ) == 's' &&
+            buffer.charAt( c + 1 ) == 'c' && buffer.charAt( c + 2 ) == 'h' )
+          {
+            buffer.setCharAt( c, '$' );
+            buffer.delete( c + 1, c + 3 );
+            substCount =+ 2;
+          }
+          else if ( buffer.charAt( c ) == 'c' && buffer.charAt( c + 1 ) == 'h' ) {
+            buffer.setCharAt( c, '§' );
+            buffer.deleteCharAt( c + 1 );
+            substCount++;
+          }
+          else if ( buffer.charAt( c ) == 'e' && buffer.charAt( c + 1 ) == 'i' ) {
+            buffer.setCharAt( c, '%' );
+            buffer.deleteCharAt( c + 1 );
+            substCount++;
+          }
+          else if ( buffer.charAt( c ) == 'i' && buffer.charAt( c + 1 ) == 'e' ) {
+            buffer.setCharAt( c, '&' );
+            buffer.deleteCharAt( c + 1 );
+            substCount++;
+          }
+          else if ( buffer.charAt( c ) == 'i' && buffer.charAt( c + 1 ) == 'g' ) {
+            buffer.setCharAt( c, '#' );
+            buffer.deleteCharAt( c + 1 );
+            substCount++;
+          }
+          else if ( buffer.charAt( c ) == 's' && buffer.charAt( c + 1 ) == 't' ) {
+            buffer.setCharAt( c, '!' );
+            buffer.deleteCharAt( c + 1 );
+            substCount++;
+          }
+        }
+      }
+    }
+
+    /**
+     * Undoes the changes made by substitute(). That are character pairs and
+     * character combinations. Umlauts will remain as their corresponding vowel,
+     * as "ß" remains as "ss".
+     */
+    private void resubstitute( StringBuffer buffer )
+    {
+      for ( int c = 0; c < buffer.length(); c++ ) {
+        if ( buffer.charAt( c ) == '*' ) {
+          char x = buffer.charAt( c - 1 );
+          buffer.setCharAt( c, x );
+        }
+        else if ( buffer.charAt( c ) == '$' ) {
+          buffer.setCharAt( c, 's' );
+          buffer.insert( c + 1, new char[]{'c', 'h'}, 0, 2 );
+        }
+        else if ( buffer.charAt( c ) == '§' ) {
+          buffer.setCharAt( c, 'c' );
+          buffer.insert( c + 1, 'h' );
+        }
+        else if ( buffer.charAt( c ) == '%' ) {
+          buffer.setCharAt( c, 'e' );
+          buffer.insert( c + 1, 'i' );
+        }
+        else if ( buffer.charAt( c ) == '&' ) {
+          buffer.setCharAt( c, 'i' );
+          buffer.insert( c + 1, 'e' );
+        }
+        else if ( buffer.charAt( c ) == '#' ) {
+          buffer.setCharAt( c, 'i' );
+          buffer.insert( c + 1, 'g' );
+        }
+        else if ( buffer.charAt( c ) == '!' ) {
+          buffer.setCharAt( c, 's' );
+          buffer.insert( c + 1, 't' );
+        }
+      }
+    }
+    
+}
author	Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date	Wed, 24 Nov 2010 17:24:23 +0100
parents
children