annotate software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlStandardAnalyzer.java @ 0:408254cf2f1d

Erstellung
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Wed, 24 Nov 2010 17:24:23 +0100
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
1 package de.mpg.mpiwg.berlin.mpdl.lt.analyzer;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
2
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
3 import org.apache.lucene.analysis.*;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
4 import org.apache.lucene.analysis.standard.StandardFilter;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
5 import org.apache.lucene.analysis.standard.StandardTokenizer;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
6
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
7 import java.io.File;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
8 import java.io.IOException;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
9 import java.io.Reader;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
10 import java.util.Set;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
11
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
12 /**
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
13 * StandardAnalyzer which is case insensitive (no LowerCaseFilter in method tokenStream
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
14 * and reusableTokenStream)
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
15 *
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
16 */
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
17 public class MpdlStandardAnalyzer extends Analyzer {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
18 private Set stopSet;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
19 /** An array containing some common English words that are usually not
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
20 useful for searching. */
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
21 public static final String[] STOP_WORDS = StopAnalyzer.ENGLISH_STOP_WORDS;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
22
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
23 /** Builds an analyzer with the default stop words ({@link #STOP_WORDS}). */
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
24 public MpdlStandardAnalyzer() {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
25 this(STOP_WORDS);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
26 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
27
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
28 /** Builds an analyzer with the given stop words. */
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
29 public MpdlStandardAnalyzer(Set stopWords) {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
30 stopSet = stopWords;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
31 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
32
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
33 /** Builds an analyzer with the given stop words. */
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
34 public MpdlStandardAnalyzer(String[] stopWords) {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
35 stopSet = StopFilter.makeStopSet(stopWords);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
36 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
37
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
38 /** Builds an analyzer with the stop words from the given file.
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
39 * @see WordlistLoader#getWordSet(File)
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
40 */
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
41 public MpdlStandardAnalyzer(File stopwords) throws IOException {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
42 stopSet = WordlistLoader.getWordSet(stopwords);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
43 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
44
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
45 /** Builds an analyzer with the stop words from the given reader.
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
46 * @see WordlistLoader#getWordSet(Reader)
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
47 */
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
48 public MpdlStandardAnalyzer(Reader stopwords) throws IOException {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
49 stopSet = WordlistLoader.getWordSet(stopwords);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
50 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
51
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
52 /** Constructs a {@link StandardTokenizer} filtered by a {@link
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
53 StandardFilter}, not a {@link LowerCaseFilter} and a {@link StopFilter}. */
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
54 public TokenStream tokenStream(String fieldName, Reader reader) {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
55 StandardTokenizer tokenStream = new StandardTokenizer(reader);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
56 tokenStream.setMaxTokenLength(maxTokenLength);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
57 TokenStream result = new StandardFilter(tokenStream);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
58 result = new StopFilter(result, stopSet);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
59 return result;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
60 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
61
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
62 private static final class SavedStreams {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
63 StandardTokenizer tokenStream;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
64 TokenStream filteredTokenStream;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
65 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
66
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
67 /** Default maximum allowed token length */
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
68 public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
69
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
70 private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
71
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
72 /**
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
73 * Set maximum allowed token length. If a token is seen
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
74 * that exceeds this length then it is discarded. This
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
75 * setting only takes effect the next time tokenStream or
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
76 * reusableTokenStream is called.
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
77 */
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
78 public void setMaxTokenLength(int length) {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
79 maxTokenLength = length;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
80 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
81
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
82 /**
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
83 * @see #setMaxTokenLength
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
84 */
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
85 public int getMaxTokenLength() {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
86 return maxTokenLength;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
87 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
88
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
89 public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
90 SavedStreams streams = (SavedStreams) getPreviousTokenStream();
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
91 if (streams == null) {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
92 streams = new SavedStreams();
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
93 setPreviousTokenStream(streams);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
94 streams.tokenStream = new StandardTokenizer(reader);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
95 streams.filteredTokenStream = new StandardFilter(streams.tokenStream);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
96 streams.filteredTokenStream = new StopFilter(streams.filteredTokenStream, stopSet);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
97 } else {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
98 streams.tokenStream.reset(reader);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
99 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
100 streams.tokenStream.setMaxTokenLength(maxTokenLength);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
101 return streams.filteredTokenStream;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
102 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
103 }