Mercurial > hg > mpdl-group
view software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlStandardAnalyzer.java @ 0:408254cf2f1d
Erstellung
author: Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date: Wed, 24 Nov 2010 17:24:23 +0100
parents: (none)
children: (none)
line wrap: on
line source
package de.mpg.mpiwg.berlin.mpdl.lt.analyzer;

import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;

import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.util.Set;

/**
 * A StandardAnalyzer variant that keeps token case intact: unlike Lucene's
 * stock analyzer, no {@link LowerCaseFilter} is applied in
 * {@link #tokenStream(String, Reader)} or
 * {@link #reusableTokenStream(String, Reader)}, so searches remain
 * case sensitive.
 */
public class MpdlStandardAnalyzer extends Analyzer {

  /** An array containing some common English words that are usually not useful for searching. */
  public static final String[] STOP_WORDS = StopAnalyzer.ENGLISH_STOP_WORDS;

  /** Default maximum allowed token length. */
  public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;

  // Stop words removed by the StopFilter stage of both analysis chains.
  private Set stopSet;

  // Tokens longer than this are discarded by the StandardTokenizer.
  private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;

  /** Builds an analyzer with the default stop words ({@link #STOP_WORDS}). */
  public MpdlStandardAnalyzer() {
    this(STOP_WORDS);
  }

  /** Builds an analyzer with the given stop word set. */
  public MpdlStandardAnalyzer(Set stopWords) {
    stopSet = stopWords;
  }

  /** Builds an analyzer with the given stop word array. */
  public MpdlStandardAnalyzer(String[] stopWords) {
    stopSet = StopFilter.makeStopSet(stopWords);
  }

  /**
   * Builds an analyzer with the stop words read from the given file.
   *
   * @see WordlistLoader#getWordSet(File)
   */
  public MpdlStandardAnalyzer(File stopwords) throws IOException {
    stopSet = WordlistLoader.getWordSet(stopwords);
  }

  /**
   * Builds an analyzer with the stop words read from the given reader.
   *
   * @see WordlistLoader#getWordSet(Reader)
   */
  public MpdlStandardAnalyzer(Reader stopwords) throws IOException {
    stopSet = WordlistLoader.getWordSet(stopwords);
  }

  /**
   * Constructs a {@link StandardTokenizer} filtered by a {@link StandardFilter}
   * and a {@link StopFilter} — deliberately NOT by a {@link LowerCaseFilter},
   * which is the whole point of this class.
   */
  public TokenStream tokenStream(String fieldName, Reader reader) {
    StandardTokenizer tokenizer = new StandardTokenizer(reader);
    tokenizer.setMaxTokenLength(maxTokenLength);
    return new StopFilter(new StandardFilter(tokenizer), stopSet);
  }

  /** Per-thread cache entry used by {@link #reusableTokenStream(String, Reader)}. */
  private static final class SavedStreams {
    StandardTokenizer tokenStream;
    TokenStream filteredTokenStream;
  }

  /**
   * Set maximum allowed token length. If a token is seen that exceeds this
   * length then it is discarded. This setting only takes effect the next time
   * tokenStream or reusableTokenStream is called.
   */
  public void setMaxTokenLength(int length) {
    maxTokenLength = length;
  }

  /** @see #setMaxTokenLength */
  public int getMaxTokenLength() {
    return maxTokenLength;
  }

  /**
   * Case-preserving variant of the reusable analysis chain: identical filter
   * stack to {@link #tokenStream(String, Reader)}, but the tokenizer and
   * filters are cached per thread and re-pointed at each new reader.
   */
  public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
    SavedStreams saved = (SavedStreams) getPreviousTokenStream();
    if (saved == null) {
      // First call on this thread: build the chain once and cache it.
      saved = new SavedStreams();
      setPreviousTokenStream(saved);
      saved.tokenStream = new StandardTokenizer(reader);
      saved.filteredTokenStream = new StandardFilter(saved.tokenStream);
      saved.filteredTokenStream = new StopFilter(saved.filteredTokenStream, stopSet);
    } else {
      // Subsequent calls: reuse the cached chain with the new input.
      saved.tokenStream.reset(reader);
    }
    // Applied on every call so a later setMaxTokenLength takes effect.
    saved.tokenStream.setMaxTokenLength(maxTokenLength);
    return saved.filteredTokenStream;
  }
}