view software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlStemFilter.java @ 0:408254cf2f1d

Erstellung
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Wed, 24 Nov 2010 17:24:23 +0100
parents
children
line wrap: on
line source

package de.mpg.mpiwg.berlin.mpdl.lt.analyzer;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;

import java.io.IOException;
import java.util.Set;

/**
 * Token filter that replaces the term text of each incoming token with the
 * stem produced by an {@link MpdlStemmer}.
 *
 * <p>Tokens whose term text is contained in the optional exclusion set are
 * passed through untouched, as are tokens whose stem equals their term text.
 */
public final class MpdlStemFilter extends TokenFilter {
  // Stored by the three-argument constructor; not read anywhere in this class.
  private MpdlMorphAnalyzer analyzer;
  private MpdlStemmer stemmer = null;
  // Terms that must never be stemmed; null means "no exclusions".
  private Set exclusionSet = null;

  /**
   * Creates a filter over {@code in} using a freshly constructed
   * {@link MpdlStemmer} and no exclusion set.
   *
   * @param in the token stream to read from
   */
  public MpdlStemFilter(TokenStream in) {
    super(in);
    stemmer = new MpdlStemmer();
  }

  /**
   * Creates a filter over {@code in} whose stemmer is configured with the
   * language reported by {@code analyzer}.
   *
   * @param analyzer     source of the stemmer language; must not be null,
   *                     since {@code getLanguage()} is called on it
   * @param in           the token stream to read from
   * @param exclusionSet terms to leave unstemmed, or null for none
   */
  public MpdlStemFilter(MpdlMorphAnalyzer analyzer, TokenStream in, Set exclusionSet) {
    this(in);
    this.analyzer = analyzer;
    this.exclusionSet = exclusionSet;
    this.stemmer.setLanguage(analyzer.getLanguage());
  }

  /**
   * Returns the next token of the underlying stream, stemmed where applicable.
   *
   * @return the original token when it is excluded or its stem equals its
   *         term text; a new token carrying the stem otherwise; {@code null}
   *         when the underlying stream has no more tokens
   * @throws IOException if the underlying stream fails
   */
  public final Token next() throws IOException {
    Token token = input.next();
    if (token == null) {
      return null;
    }

    String term = token.termText();
    if (exclusionSet != null && exclusionSet.contains(term)) {
      return token;
    }

    String stem = stemmer.stem(term);
    // If not stemmed, don't waste time creating a new token. A null stem is
    // treated the same way so that it cannot trigger a NullPointerException
    // (NOTE(review): whether MpdlStemmer.stem can return null is not visible here).
    if (stem == null || stem.equals(term)) {
      return token;
    }

    Token stemmed = new Token(stem, token.startOffset(), token.endOffset(), token.type());
    // A freshly built Token starts with the default position increment; carry
    // over the source token's value so gaps recorded by upstream filters
    // are not lost when the term gets replaced by its stem.
    stemmed.setPositionIncrement(token.getPositionIncrement());
    return stemmed;
  }

  /**
   * Replaces the stemmer used by this filter.
   *
   * @param stemmer the new stemmer; a null argument is ignored and the
   *                current stemmer is kept
   */
  public void setStemmer(MpdlStemmer stemmer) {
    if ( stemmer != null ) {
      this.stemmer = stemmer;
    }
  }

  /**
   * Replaces the set of terms that are exempt from stemming.
   *
   * @param exclusionSet the new exclusion set, or null to disable exclusions
   */
  public void setExclusionSet(Set exclusionSet) {
    this.exclusionSet = exclusionSet;
  }
}