0
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
1 package de.mpg.mpiwg.berlin.mpdl.lt.analyzer;
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
2
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
3 import org.apache.lucene.analysis.*;
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
4 import org.apache.lucene.analysis.standard.StandardFilter;
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
5 import org.apache.lucene.analysis.standard.StandardTokenizer;
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
6
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
7 import java.io.File;
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
8 import java.io.IOException;
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
9 import java.io.Reader;
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
10 import java.util.Set;
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
11
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
12 /**
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
13 * StandardAnalyzer which is case insensitive (no LowerCaseFilter in method tokenStream
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
14 * and reusableTokenStream)
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
15 *
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
16 */
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
17 public class MpdlStandardAnalyzer extends Analyzer {
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
18 private Set stopSet;
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
19 /** An array containing some common English words that are usually not
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
20 useful for searching. */
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
21 public static final String[] STOP_WORDS = StopAnalyzer.ENGLISH_STOP_WORDS;
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
22
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
23 /** Builds an analyzer with the default stop words ({@link #STOP_WORDS}). */
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
24 public MpdlStandardAnalyzer() {
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
25 this(STOP_WORDS);
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
26 }
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
27
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
28 /** Builds an analyzer with the given stop words. */
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
29 public MpdlStandardAnalyzer(Set stopWords) {
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
30 stopSet = stopWords;
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
31 }
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
32
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
33 /** Builds an analyzer with the given stop words. */
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
34 public MpdlStandardAnalyzer(String[] stopWords) {
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
35 stopSet = StopFilter.makeStopSet(stopWords);
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
36 }
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
37
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
38 /** Builds an analyzer with the stop words from the given file.
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
39 * @see WordlistLoader#getWordSet(File)
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
40 */
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
41 public MpdlStandardAnalyzer(File stopwords) throws IOException {
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
42 stopSet = WordlistLoader.getWordSet(stopwords);
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
43 }
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
44
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
45 /** Builds an analyzer with the stop words from the given reader.
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
46 * @see WordlistLoader#getWordSet(Reader)
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
47 */
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
48 public MpdlStandardAnalyzer(Reader stopwords) throws IOException {
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
49 stopSet = WordlistLoader.getWordSet(stopwords);
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
50 }
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
51
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
52 /** Constructs a {@link StandardTokenizer} filtered by a {@link
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
53 StandardFilter}, not a {@link LowerCaseFilter} and a {@link StopFilter}. */
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
54 public TokenStream tokenStream(String fieldName, Reader reader) {
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
55 StandardTokenizer tokenStream = new StandardTokenizer(reader);
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
56 tokenStream.setMaxTokenLength(maxTokenLength);
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
57 TokenStream result = new StandardFilter(tokenStream);
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
58 result = new StopFilter(result, stopSet);
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
59 return result;
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
60 }
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
61
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
62 private static final class SavedStreams {
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
63 StandardTokenizer tokenStream;
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
64 TokenStream filteredTokenStream;
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
65 }
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
66
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
67 /** Default maximum allowed token length */
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
68 public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
69
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
70 private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
71
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
72 /**
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
73 * Set maximum allowed token length. If a token is seen
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
74 * that exceeds this length then it is discarded. This
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
75 * setting only takes effect the next time tokenStream or
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
76 * reusableTokenStream is called.
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
77 */
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
78 public void setMaxTokenLength(int length) {
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
79 maxTokenLength = length;
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
80 }
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
81
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
82 /**
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
83 * @see #setMaxTokenLength
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
84 */
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
85 public int getMaxTokenLength() {
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
86 return maxTokenLength;
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
87 }
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
88
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
89 public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
90 SavedStreams streams = (SavedStreams) getPreviousTokenStream();
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
91 if (streams == null) {
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
92 streams = new SavedStreams();
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
93 setPreviousTokenStream(streams);
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
94 streams.tokenStream = new StandardTokenizer(reader);
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
95 streams.filteredTokenStream = new StandardFilter(streams.tokenStream);
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
96 streams.filteredTokenStream = new StopFilter(streams.filteredTokenStream, stopSet);
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
97 } else {
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
98 streams.tokenStream.reset(reader);
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
99 }
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
100 streams.tokenStream.setMaxTokenLength(maxTokenLength);
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
101 return streams.filteredTokenStream;
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
102 }
|
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff
changeset
|
103 } |