Mercurial > hg > mpdl-group
comparison software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlStandardAnalyzer.java @ 0:408254cf2f1d
Erstellung
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Wed, 24 Nov 2010 17:24:23 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:408254cf2f1d |
---|---|
1 package de.mpg.mpiwg.berlin.mpdl.lt.analyzer; | |
2 | |
3 import org.apache.lucene.analysis.*; | |
4 import org.apache.lucene.analysis.standard.StandardFilter; | |
5 import org.apache.lucene.analysis.standard.StandardTokenizer; | |
6 | |
7 import java.io.File; | |
8 import java.io.IOException; | |
9 import java.io.Reader; | |
10 import java.util.Set; | |
11 | |
12 /** | |
13 * StandardAnalyzer which is case insensitive (no LowerCaseFilter in method tokenStream | |
14 * and reusableTokenStream) | |
15 * | |
16 */ | |
17 public class MpdlStandardAnalyzer extends Analyzer { | |
18 private Set stopSet; | |
19 /** An array containing some common English words that are usually not | |
20 useful for searching. */ | |
21 public static final String[] STOP_WORDS = StopAnalyzer.ENGLISH_STOP_WORDS; | |
22 | |
23 /** Builds an analyzer with the default stop words ({@link #STOP_WORDS}). */ | |
24 public MpdlStandardAnalyzer() { | |
25 this(STOP_WORDS); | |
26 } | |
27 | |
28 /** Builds an analyzer with the given stop words. */ | |
29 public MpdlStandardAnalyzer(Set stopWords) { | |
30 stopSet = stopWords; | |
31 } | |
32 | |
33 /** Builds an analyzer with the given stop words. */ | |
34 public MpdlStandardAnalyzer(String[] stopWords) { | |
35 stopSet = StopFilter.makeStopSet(stopWords); | |
36 } | |
37 | |
38 /** Builds an analyzer with the stop words from the given file. | |
39 * @see WordlistLoader#getWordSet(File) | |
40 */ | |
41 public MpdlStandardAnalyzer(File stopwords) throws IOException { | |
42 stopSet = WordlistLoader.getWordSet(stopwords); | |
43 } | |
44 | |
45 /** Builds an analyzer with the stop words from the given reader. | |
46 * @see WordlistLoader#getWordSet(Reader) | |
47 */ | |
48 public MpdlStandardAnalyzer(Reader stopwords) throws IOException { | |
49 stopSet = WordlistLoader.getWordSet(stopwords); | |
50 } | |
51 | |
52 /** Constructs a {@link StandardTokenizer} filtered by a {@link | |
53 StandardFilter}, not a {@link LowerCaseFilter} and a {@link StopFilter}. */ | |
54 public TokenStream tokenStream(String fieldName, Reader reader) { | |
55 StandardTokenizer tokenStream = new StandardTokenizer(reader); | |
56 tokenStream.setMaxTokenLength(maxTokenLength); | |
57 TokenStream result = new StandardFilter(tokenStream); | |
58 result = new StopFilter(result, stopSet); | |
59 return result; | |
60 } | |
61 | |
62 private static final class SavedStreams { | |
63 StandardTokenizer tokenStream; | |
64 TokenStream filteredTokenStream; | |
65 } | |
66 | |
67 /** Default maximum allowed token length */ | |
68 public static final int DEFAULT_MAX_TOKEN_LENGTH = 255; | |
69 | |
70 private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH; | |
71 | |
72 /** | |
73 * Set maximum allowed token length. If a token is seen | |
74 * that exceeds this length then it is discarded. This | |
75 * setting only takes effect the next time tokenStream or | |
76 * reusableTokenStream is called. | |
77 */ | |
78 public void setMaxTokenLength(int length) { | |
79 maxTokenLength = length; | |
80 } | |
81 | |
82 /** | |
83 * @see #setMaxTokenLength | |
84 */ | |
85 public int getMaxTokenLength() { | |
86 return maxTokenLength; | |
87 } | |
88 | |
89 public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException { | |
90 SavedStreams streams = (SavedStreams) getPreviousTokenStream(); | |
91 if (streams == null) { | |
92 streams = new SavedStreams(); | |
93 setPreviousTokenStream(streams); | |
94 streams.tokenStream = new StandardTokenizer(reader); | |
95 streams.filteredTokenStream = new StandardFilter(streams.tokenStream); | |
96 streams.filteredTokenStream = new StopFilter(streams.filteredTokenStream, stopSet); | |
97 } else { | |
98 streams.tokenStream.reset(reader); | |
99 } | |
100 streams.tokenStream.setMaxTokenLength(maxTokenLength); | |
101 return streams.filteredTokenStream; | |
102 } | |
103 } |