annotate src/main/java/lia/analysis/positional/PositionalPorterStopAnalyzer.java @ 0:fcb8807fbd84

Fist commit!
author "jurzua <jurzua@mpiwg-berlin.mpg.de>"
date Tue, 10 Mar 2015 15:15:30 +0100
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
fcb8807fbd84 Fist commit!
"jurzua <jurzua@mpiwg-berlin.mpg.de>"
parents:
diff changeset
1 /*
fcb8807fbd84 Fist commit!
"jurzua <jurzua@mpiwg-berlin.mpg.de>"
parents:
diff changeset
2 Copyright (C) 2005-2012, by the President and Fellows of Harvard College.
fcb8807fbd84 Fist commit!
"jurzua <jurzua@mpiwg-berlin.mpg.de>"
parents:
diff changeset
3
fcb8807fbd84 Fist commit!
"jurzua <jurzua@mpiwg-berlin.mpg.de>"
parents:
diff changeset
4 Licensed under the Apache License, Version 2.0 (the "License");
fcb8807fbd84 Fist commit!
"jurzua <jurzua@mpiwg-berlin.mpg.de>"
parents:
diff changeset
5 you may not use this file except in compliance with the License.
fcb8807fbd84 Fist commit!
"jurzua <jurzua@mpiwg-berlin.mpg.de>"
parents:
diff changeset
6 You may obtain a copy of the License at
fcb8807fbd84 Fist commit!
"jurzua <jurzua@mpiwg-berlin.mpg.de>"
parents:
diff changeset
7
fcb8807fbd84 Fist commit!
"jurzua <jurzua@mpiwg-berlin.mpg.de>"
parents:
diff changeset
8 http://www.apache.org/licenses/LICENSE-2.0
fcb8807fbd84 Fist commit!
"jurzua <jurzua@mpiwg-berlin.mpg.de>"
parents:
diff changeset
9
fcb8807fbd84 Fist commit!
"jurzua <jurzua@mpiwg-berlin.mpg.de>"
parents:
diff changeset
10 Unless required by applicable law or agreed to in writing, software
fcb8807fbd84 Fist commit!
"jurzua <jurzua@mpiwg-berlin.mpg.de>"
parents:
diff changeset
11 distributed under the License is distributed on an "AS IS" BASIS,
fcb8807fbd84 Fist commit!
"jurzua <jurzua@mpiwg-berlin.mpg.de>"
parents:
diff changeset
12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
fcb8807fbd84 Fist commit!
"jurzua <jurzua@mpiwg-berlin.mpg.de>"
parents:
diff changeset
13 See the License for the specific language governing permissions and
fcb8807fbd84 Fist commit!
"jurzua <jurzua@mpiwg-berlin.mpg.de>"
parents:
diff changeset
14 limitations under the License.
fcb8807fbd84 Fist commit!
"jurzua <jurzua@mpiwg-berlin.mpg.de>"
parents:
diff changeset
15
fcb8807fbd84 Fist commit!
"jurzua <jurzua@mpiwg-berlin.mpg.de>"
parents:
diff changeset
16 Dataverse Network - A web application to share, preserve and analyze research data.
fcb8807fbd84 Fist commit!
"jurzua <jurzua@mpiwg-berlin.mpg.de>"
parents:
diff changeset
17 Developed at the Institute for Quantitative Social Science, Harvard University.
fcb8807fbd84 Fist commit!
"jurzua <jurzua@mpiwg-berlin.mpg.de>"
parents:
diff changeset
18 Version 3.0.
fcb8807fbd84 Fist commit!
"jurzua <jurzua@mpiwg-berlin.mpg.de>"
parents:
diff changeset
19 */
fcb8807fbd84 Fist commit!
"jurzua <jurzua@mpiwg-berlin.mpg.de>"
parents:
diff changeset
20 package lia.analysis.positional;
fcb8807fbd84 Fist commit!
"jurzua <jurzua@mpiwg-berlin.mpg.de>"
parents:
diff changeset
21
fcb8807fbd84 Fist commit!
"jurzua <jurzua@mpiwg-berlin.mpg.de>"
parents:
diff changeset
22 import org.apache.lucene.analysis.Analyzer;
fcb8807fbd84 Fist commit!
"jurzua <jurzua@mpiwg-berlin.mpg.de>"
parents:
diff changeset
23 import org.apache.lucene.analysis.LowerCaseTokenizer;
fcb8807fbd84 Fist commit!
"jurzua <jurzua@mpiwg-berlin.mpg.de>"
parents:
diff changeset
24 import org.apache.lucene.analysis.PorterStemFilter;
fcb8807fbd84 Fist commit!
"jurzua <jurzua@mpiwg-berlin.mpg.de>"
parents:
diff changeset
25 import org.apache.lucene.analysis.StopAnalyzer;
fcb8807fbd84 Fist commit!
"jurzua <jurzua@mpiwg-berlin.mpg.de>"
parents:
diff changeset
26 import org.apache.lucene.analysis.StopFilter;
fcb8807fbd84 Fist commit!
"jurzua <jurzua@mpiwg-berlin.mpg.de>"
parents:
diff changeset
27 import org.apache.lucene.analysis.TokenStream;
fcb8807fbd84 Fist commit!
"jurzua <jurzua@mpiwg-berlin.mpg.de>"
parents:
diff changeset
28
fcb8807fbd84 Fist commit!
"jurzua <jurzua@mpiwg-berlin.mpg.de>"
parents:
diff changeset
29 import java.io.Reader;
fcb8807fbd84 Fist commit!
"jurzua <jurzua@mpiwg-berlin.mpg.de>"
parents:
diff changeset
30 import java.util.Set;
fcb8807fbd84 Fist commit!
"jurzua <jurzua@mpiwg-berlin.mpg.de>"
parents:
diff changeset
31
fcb8807fbd84 Fist commit!
"jurzua <jurzua@mpiwg-berlin.mpg.de>"
parents:
diff changeset
32 public class PositionalPorterStopAnalyzer extends Analyzer {
fcb8807fbd84 Fist commit!
"jurzua <jurzua@mpiwg-berlin.mpg.de>"
parents:
diff changeset
33 private Set stopWords;
fcb8807fbd84 Fist commit!
"jurzua <jurzua@mpiwg-berlin.mpg.de>"
parents:
diff changeset
34
fcb8807fbd84 Fist commit!
"jurzua <jurzua@mpiwg-berlin.mpg.de>"
parents:
diff changeset
35 public PositionalPorterStopAnalyzer() {
fcb8807fbd84 Fist commit!
"jurzua <jurzua@mpiwg-berlin.mpg.de>"
parents:
diff changeset
36 this(StopAnalyzer.ENGLISH_STOP_WORDS_SET);
fcb8807fbd84 Fist commit!
"jurzua <jurzua@mpiwg-berlin.mpg.de>"
parents:
diff changeset
37 }
fcb8807fbd84 Fist commit!
"jurzua <jurzua@mpiwg-berlin.mpg.de>"
parents:
diff changeset
38
fcb8807fbd84 Fist commit!
"jurzua <jurzua@mpiwg-berlin.mpg.de>"
parents:
diff changeset
39 public PositionalPorterStopAnalyzer(Set stopWords) {
fcb8807fbd84 Fist commit!
"jurzua <jurzua@mpiwg-berlin.mpg.de>"
parents:
diff changeset
40 this.stopWords = stopWords;
fcb8807fbd84 Fist commit!
"jurzua <jurzua@mpiwg-berlin.mpg.de>"
parents:
diff changeset
41 }
fcb8807fbd84 Fist commit!
"jurzua <jurzua@mpiwg-berlin.mpg.de>"
parents:
diff changeset
42
fcb8807fbd84 Fist commit!
"jurzua <jurzua@mpiwg-berlin.mpg.de>"
parents:
diff changeset
43 public TokenStream tokenStream(String fieldName, Reader reader) {
fcb8807fbd84 Fist commit!
"jurzua <jurzua@mpiwg-berlin.mpg.de>"
parents:
diff changeset
44 StopFilter stopFilter = new StopFilter(true,
fcb8807fbd84 Fist commit!
"jurzua <jurzua@mpiwg-berlin.mpg.de>"
parents:
diff changeset
45 new LowerCaseTokenizer(reader),
fcb8807fbd84 Fist commit!
"jurzua <jurzua@mpiwg-berlin.mpg.de>"
parents:
diff changeset
46 stopWords);
fcb8807fbd84 Fist commit!
"jurzua <jurzua@mpiwg-berlin.mpg.de>"
parents:
diff changeset
47 stopFilter.setEnablePositionIncrements(true);
fcb8807fbd84 Fist commit!
"jurzua <jurzua@mpiwg-berlin.mpg.de>"
parents:
diff changeset
48 return new PorterStemFilter(stopFilter);
fcb8807fbd84 Fist commit!
"jurzua <jurzua@mpiwg-berlin.mpg.de>"
parents:
diff changeset
49 }
fcb8807fbd84 Fist commit!
"jurzua <jurzua@mpiwg-berlin.mpg.de>"
parents:
diff changeset
50 }