Mercurial > hg > mpdl-group
diff software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/analysis/MpdlStandardAnalyzer.java @ 0:408254cf2f1d
Erstellung
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Wed, 24 Nov 2010 17:24:23 +0100 |
parents | |
children |
line wrap: on
line diff
package de.mpg.mpiwg.berlin.mpdl.analysis;

import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;

import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.util.Set;

/**
 * A variant of Lucene's StandardAnalyzer that is case <em>sensitive</em>:
 * unlike StandardAnalyzer, no {@link LowerCaseFilter} is applied in
 * {@link #tokenStream} or {@link #reusableTokenStream}, so tokens keep
 * their original case. (The original header comment said "case
 * insensitive", which contradicts the code — omitting LowerCaseFilter
 * preserves case.)
 *
 * <p>The analysis chain is: {@link StandardTokenizer} -&gt;
 * {@link StandardFilter} -&gt; {@link StopFilter}.</p>
 */
public class MpdlStandardAnalyzer extends Analyzer {

  /** Default maximum allowed token length. */
  public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;

  /** An array containing some common English words that are usually not
      useful for searching. */
  public static final String[] STOP_WORDS = StopAnalyzer.ENGLISH_STOP_WORDS;

  // Raw type retained deliberately: the pre-generics Lucene API
  // (StopFilter.makeStopSet, WordlistLoader.getWordSet) and existing
  // callers pass raw Sets; changing the signature would break them.
  private Set stopSet;

  // Tokens longer than this are discarded by the tokenizer; takes effect
  // the next time tokenStream or reusableTokenStream is called.
  private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;

  /** Builds an analyzer with the default stop words ({@link #STOP_WORDS}). */
  public MpdlStandardAnalyzer() {
    this(STOP_WORDS);
  }

  /** Builds an analyzer with the given stop words.
   * @param stopWords set of words to filter out of the token stream
   */
  public MpdlStandardAnalyzer(Set stopWords) {
    stopSet = stopWords;
  }

  /** Builds an analyzer with the given stop words.
   * @param stopWords words to filter out of the token stream
   */
  public MpdlStandardAnalyzer(String[] stopWords) {
    stopSet = StopFilter.makeStopSet(stopWords);
  }

  /** Builds an analyzer with the stop words from the given file.
   * @param stopwords file with one stop word per line
   * @throws IOException if the file cannot be read
   * @see WordlistLoader#getWordSet(File)
   */
  public MpdlStandardAnalyzer(File stopwords) throws IOException {
    stopSet = WordlistLoader.getWordSet(stopwords);
  }

  /** Builds an analyzer with the stop words from the given reader.
   * @param stopwords reader supplying one stop word per line
   * @throws IOException if the reader cannot be read
   * @see WordlistLoader#getWordSet(Reader)
   */
  public MpdlStandardAnalyzer(Reader stopwords) throws IOException {
    stopSet = WordlistLoader.getWordSet(stopwords);
  }

  /** Constructs a {@link StandardTokenizer} filtered by a
   * {@link StandardFilter} and a {@link StopFilter} — deliberately
   * <em>without</em> a {@link LowerCaseFilter}, so matching stays
   * case sensitive.
   * @param fieldName ignored; the same chain is built for every field
   * @param reader source of the text to tokenize
   * @return the filtered token stream
   */
  public TokenStream tokenStream(String fieldName, Reader reader) {
    StandardTokenizer tokenStream = new StandardTokenizer(reader);
    tokenStream.setMaxTokenLength(maxTokenLength);
    TokenStream result = new StandardFilter(tokenStream);
    result = new StopFilter(result, stopSet);
    return result;
  }

  /** Holder for the per-thread reusable tokenizer/filter pair used by
   * {@link #reusableTokenStream}. */
  private static final class SavedStreams {
    StandardTokenizer tokenStream;
    TokenStream filteredTokenStream;
  }

  /**
   * Set maximum allowed token length. If a token is seen
   * that exceeds this length then it is discarded. This
   * setting only takes effect the next time tokenStream or
   * reusableTokenStream is called.
   * @param length the new maximum token length
   */
  public void setMaxTokenLength(int length) {
    maxTokenLength = length;
  }

  /**
   * @return the current maximum allowed token length
   * @see #setMaxTokenLength
   */
  public int getMaxTokenLength() {
    return maxTokenLength;
  }

  /** Returns a per-thread reusable token stream (same chain as
   * {@link #tokenStream}: StandardTokenizer -&gt; StandardFilter -&gt;
   * StopFilter, no lower-casing). The chain is built once and cached via
   * {@link #getPreviousTokenStream}; later calls only reset the tokenizer
   * on the new reader.
   * @param fieldName ignored; the same chain is reused for every field
   * @param reader source of the text to tokenize
   * @return the cached, reset token stream
   * @throws IOException if resetting the tokenizer fails
   */
  public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
    if (streams == null) {
      streams = new SavedStreams();
      setPreviousTokenStream(streams);
      streams.tokenStream = new StandardTokenizer(reader);
      streams.filteredTokenStream = new StandardFilter(streams.tokenStream);
      streams.filteredTokenStream = new StopFilter(streams.filteredTokenStream, stopSet);
    } else {
      streams.tokenStream.reset(reader);
    }
    streams.tokenStream.setMaxTokenLength(maxTokenLength);
    return streams.filteredTokenStream;
  }
}