Mercurial > hg > LGDataverses
diff src/main/java/lia/analysis/positional/PositionalPorterStopAnalyzer.java @ 0:fcb8807fbd84
First commit!
| author | "jurzua <jurzua@mpiwg-berlin.mpg.de>" |
|---|---|
| date | Tue, 10 Mar 2015 15:15:30 +0100 |
| parents | |
| children |
line wrap: on
line diff
/*
   Copyright (C) 2005-2012, by the President and Fellows of Harvard College.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

         http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.

   Dataverse Network - A web application to share, preserve and analyze research data.
   Developed at the Institute for Quantitative Social Science, Harvard University.
   Version 3.0.
*/
package lia.analysis.positional;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseTokenizer;
import org.apache.lucene.analysis.PorterStemFilter;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;

import java.io.Reader;
import java.util.Set;

/**
 * Analyzer whose token stream is a {@link LowerCaseTokenizer} wrapped in a
 * {@link StopFilter} (with position increments enabled) and then in a
 * {@link PorterStemFilter}.
 *
 * <p>The stop-word set is fixed at construction time; every call to
 * {@link #tokenStream(String, Reader)} builds a fresh filter chain over the
 * supplied reader.
 */
public class PositionalPorterStopAnalyzer extends Analyzer {

    /** Words handed to the {@link StopFilter}; assigned once in the constructor. */
    private final Set<?> stopWords;

    /**
     * Creates an analyzer that uses
     * {@link StopAnalyzer#ENGLISH_STOP_WORDS_SET} as its stop-word set.
     */
    public PositionalPorterStopAnalyzer() {
        this(StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    }

    /**
     * Creates an analyzer that uses the given stop-word set.
     *
     * @param stopWords the set passed unchanged to each {@link StopFilter}
     *                  this analyzer creates; it is stored by reference, not
     *                  copied
     */
    public PositionalPorterStopAnalyzer(Set<?> stopWords) {
        this.stopWords = stopWords;
    }

    /**
     * Builds the analysis chain for one field value.
     *
     * @param fieldName name of the field being analyzed; not used by this
     *                  implementation
     * @param reader    source of the text to tokenize
     * @return a {@link PorterStemFilter} over the stop-filtered, lower-cased
     *         tokens read from {@code reader}
     */
    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
        StopFilter stopFilter = new StopFilter(true,
                                               new LowerCaseTokenizer(reader),
                                               stopWords);
        // Kept explicit, as in the original code, even though `true` is also
        // passed to the constructor above.
        stopFilter.setEnablePositionIncrements(true);
        return new PorterStemFilter(stopFilter);
    }
}
