comparison software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlStandardAnalyzer.java @ 0:408254cf2f1d

Erstellung (German: "creation" — initial revision of the file)
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Wed, 24 Nov 2010 17:24:23 +0100
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:408254cf2f1d
1 package de.mpg.mpiwg.berlin.mpdl.lt.analyzer;
2
3 import org.apache.lucene.analysis.*;
4 import org.apache.lucene.analysis.standard.StandardFilter;
5 import org.apache.lucene.analysis.standard.StandardTokenizer;
6
7 import java.io.File;
8 import java.io.IOException;
9 import java.io.Reader;
10 import java.util.Set;
11
12 /**
13 * StandardAnalyzer which is case insensitive (no LowerCaseFilter in method tokenStream
14 * and reusableTokenStream)
15 *
16 */
17 public class MpdlStandardAnalyzer extends Analyzer {
18 private Set stopSet;
19 /** An array containing some common English words that are usually not
20 useful for searching. */
21 public static final String[] STOP_WORDS = StopAnalyzer.ENGLISH_STOP_WORDS;
22
23 /** Builds an analyzer with the default stop words ({@link #STOP_WORDS}). */
24 public MpdlStandardAnalyzer() {
25 this(STOP_WORDS);
26 }
27
28 /** Builds an analyzer with the given stop words. */
29 public MpdlStandardAnalyzer(Set stopWords) {
30 stopSet = stopWords;
31 }
32
33 /** Builds an analyzer with the given stop words. */
34 public MpdlStandardAnalyzer(String[] stopWords) {
35 stopSet = StopFilter.makeStopSet(stopWords);
36 }
37
38 /** Builds an analyzer with the stop words from the given file.
39 * @see WordlistLoader#getWordSet(File)
40 */
41 public MpdlStandardAnalyzer(File stopwords) throws IOException {
42 stopSet = WordlistLoader.getWordSet(stopwords);
43 }
44
45 /** Builds an analyzer with the stop words from the given reader.
46 * @see WordlistLoader#getWordSet(Reader)
47 */
48 public MpdlStandardAnalyzer(Reader stopwords) throws IOException {
49 stopSet = WordlistLoader.getWordSet(stopwords);
50 }
51
52 /** Constructs a {@link StandardTokenizer} filtered by a {@link
53 StandardFilter}, not a {@link LowerCaseFilter} and a {@link StopFilter}. */
54 public TokenStream tokenStream(String fieldName, Reader reader) {
55 StandardTokenizer tokenStream = new StandardTokenizer(reader);
56 tokenStream.setMaxTokenLength(maxTokenLength);
57 TokenStream result = new StandardFilter(tokenStream);
58 result = new StopFilter(result, stopSet);
59 return result;
60 }
61
62 private static final class SavedStreams {
63 StandardTokenizer tokenStream;
64 TokenStream filteredTokenStream;
65 }
66
67 /** Default maximum allowed token length */
68 public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;
69
70 private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
71
72 /**
73 * Set maximum allowed token length. If a token is seen
74 * that exceeds this length then it is discarded. This
75 * setting only takes effect the next time tokenStream or
76 * reusableTokenStream is called.
77 */
78 public void setMaxTokenLength(int length) {
79 maxTokenLength = length;
80 }
81
82 /**
83 * @see #setMaxTokenLength
84 */
85 public int getMaxTokenLength() {
86 return maxTokenLength;
87 }
88
89 public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
90 SavedStreams streams = (SavedStreams) getPreviousTokenStream();
91 if (streams == null) {
92 streams = new SavedStreams();
93 setPreviousTokenStream(streams);
94 streams.tokenStream = new StandardTokenizer(reader);
95 streams.filteredTokenStream = new StandardFilter(streams.tokenStream);
96 streams.filteredTokenStream = new StopFilter(streams.filteredTokenStream, stopSet);
97 } else {
98 streams.tokenStream.reset(reader);
99 }
100 streams.tokenStream.setMaxTokenLength(maxTokenLength);
101 return streams.filteredTokenStream;
102 }
103 }