diff software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlStemFilter.java @ 0:408254cf2f1d

Erstellung
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Wed, 24 Nov 2010 17:24:23 +0100
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlStemFilter.java	Wed Nov 24 17:24:23 2010 +0100
@@ -0,0 +1,96 @@
+package de.mpg.mpiwg.berlin.mpdl.lt.analyzer;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+
+import java.io.IOException;
+import java.util.Set;
+
+/**
+ * Token filter that replaces each term of the wrapped stream by the stem
+ * computed by an {@link MpdlStemmer}. Terms contained in the optional
+ * exclusion set are passed through unchanged.
+ */
+public final class MpdlStemFilter extends TokenFilter {
+  /** Analyzer this filter was created for; may be null. Only stored here. */
+  private MpdlMorphAnalyzer analyzer;
+  /** Stemmer applied to every term that is not excluded; set by the constructor. */
+  private MpdlStemmer stemmer = null;
+  /** Terms that must not be stemmed; null means nothing is excluded. */
+  private Set exclusionSet = null;
+
+  /**
+   * Creates a filter over the given stream using a fresh {@link MpdlStemmer}
+   * on which no language has been set.
+   *
+   * @param in the token stream to read from
+   */
+  public MpdlStemFilter(TokenStream in) {
+    super(in);
+    stemmer = new MpdlStemmer();
+  }
+
+  /**
+   * Creates a filter whose stemmer is configured with the analyzer's language.
+   *
+   * @param analyzer supplies the language passed to the stemmer; must not be null
+   * @param in the token stream to read from
+   * @param exclusionSet terms to leave unstemmed, or null for none
+   */
+  public MpdlStemFilter(MpdlMorphAnalyzer analyzer, TokenStream in, Set exclusionSet) {
+    this(in);
+    this.analyzer = analyzer;
+    this.exclusionSet = exclusionSet;
+    this.stemmer.setLanguage(analyzer.getLanguage());
+  }
+
+  /**
+   * Returns the next token of the wrapped stream with its term stemmed.
+   *
+   * @return the original token when it is excluded or stemming leaves the term
+   *         unchanged, a new token carrying the stem otherwise, or null at the
+   *         end of the stream
+   * @throws IOException if reading from the wrapped stream fails
+   */
+  public final Token next() throws IOException {
+    Token token = input.next();
+    if (token == null) {
+      return null;
+    }
+    String term = token.termText();
+    if (exclusionSet != null && exclusionSet.contains(term)) {
+      return token;
+    }
+    String stem = stemmer.stem(term);
+    // If not stemmed (or no stem was produced), don't create a new token.
+    if (stem == null || stem.equals(term)) {
+      return token;
+    }
+    Token stemmed = new Token(stem, token.startOffset(), token.endOffset(), token.type());
+    // Keep the position increment of the original token: gaps left by upstream
+    // filters would otherwise be reset to the default of 1 for stemmed terms.
+    stemmed.setPositionIncrement(token.getPositionIncrement());
+    return stemmed;
+  }
+
+  /**
+   * Replaces the stemmer used by this filter; a null argument is ignored.
+   *
+   * @param stemmer the new stemmer, or null to keep the current one
+   */
+  public void setStemmer(MpdlStemmer stemmer) {
+    if (stemmer != null) {
+      this.stemmer = stemmer;
+    }
+  }
+
+  /**
+   * Sets the terms that are excluded from stemming.
+   *
+   * @param exclusionSet terms to leave unstemmed, or null for none
+   */
+  public void setExclusionSet(Set exclusionSet) {
+    this.exclusionSet = exclusionSet;
+  }
+}