Mercurial > hg > mpdl-group
diff software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlStemFilter.java @ 0:408254cf2f1d
Erstellung
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Wed, 24 Nov 2010 17:24:23 +0100 |
parents | |
children |
line wrap: on
line diff
package de.mpg.mpiwg.berlin.mpdl.lt.analyzer;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;

import java.io.IOException;
import java.util.Set;

/**
 * Token filter that replaces the term text of each incoming token with the
 * stem produced by an {@link MpdlStemmer}.
 *
 * <p>Tokens whose term text is contained in the optional exclusion set are
 * passed through unchanged, as are tokens whose stem equals their term text.
 */
public final class MpdlStemFilter extends TokenFilter {
  /** Analyzer handed to the three-argument constructor; only its language is read here. */
  private MpdlMorphAnalyzer analyzer;
  /** Stemmer applied to every non-excluded token; never null after construction. */
  private MpdlStemmer stemmer = null;
  /** Term texts that must not be stemmed; {@code null} means "no exclusions". */
  private Set exclusionSet = null;

  /**
   * Creates a filter over {@code in} with a freshly constructed {@link MpdlStemmer}
   * and no exclusion set.
   *
   * @param in the token stream to read tokens from
   */
  public MpdlStemFilter(TokenStream in) {
    super(in);
    stemmer = new MpdlStemmer();
  }

  /**
   * Creates a filter over {@code in} whose stemmer is configured with the
   * language reported by {@code analyzer}.
   *
   * @param analyzer     analyzer whose {@code getLanguage()} value is passed to the stemmer;
   *                     must not be {@code null}
   * @param in           the token stream to read tokens from
   * @param exclusionSet term texts to pass through unstemmed, or {@code null} for none
   */
  public MpdlStemFilter(MpdlMorphAnalyzer analyzer, TokenStream in, Set exclusionSet) {
    this(in);
    this.analyzer = analyzer;
    this.exclusionSet = exclusionSet;
    this.stemmer.setLanguage(analyzer.getLanguage());
  }

  /**
   * Returns the next token of the wrapped stream, stemmed where applicable.
   *
   * @return the next token, or {@code null} when the wrapped stream is exhausted
   * @throws IOException if the wrapped stream fails to deliver a token
   */
  public final Token next() throws IOException {
    Token current = input.next();
    if (current == null) {
      return null;
    }

    String term = current.termText();
    if (exclusionSet != null && exclusionSet.contains(term)) {
      return current;
    }

    String stem = stemmer.stem(term);
    // If not stemmed, don't waste the time creating a new token. The null
    // check keeps a stemmer that yields no result from causing an NPE here.
    if (stem == null || stem.equals(term)) {
      return current;
    }

    Token stemmed = new Token(stem, current.startOffset(), current.endOffset(), current.type());
    // The four-argument constructor does not take a position increment, so
    // carry over the one set on the original token (e.g. by an upstream
    // filter that removed tokens); otherwise it would be lost.
    stemmed.setPositionIncrement(current.getPositionIncrement());
    return stemmed;
  }

  /**
   * Replaces the stemmer used by this filter.
   *
   * @param stemmer the new stemmer; a {@code null} argument is ignored and the
   *                current stemmer is kept
   */
  public void setStemmer(MpdlStemmer stemmer) {
    if (stemmer != null) {
      this.stemmer = stemmer;
    }
  }

  /**
   * Sets the term texts that are passed through without stemming.
   *
   * @param exclusionSet the exclusion set, or {@code null} to disable exclusions
   */
  public void setExclusionSet(Set exclusionSet) {
    this.exclusionSet = exclusionSet;
  }
}