diff src/main/java/lia/analysis/positional/PositionalPorterStopAnalyzer.java @ 0:fcb8807fbd84

First commit!
author "jurzua <jurzua@mpiwg-berlin.mpg.de>"
date Tue, 10 Mar 2015 15:15:30 +0100
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/main/java/lia/analysis/positional/PositionalPorterStopAnalyzer.java	Tue Mar 10 15:15:30 2015 +0100
@@ -0,0 +1,71 @@
+/*
+   Copyright (C) 2005-2012, by the President and Fellows of Harvard College.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+         http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+   Dataverse Network - A web application to share, preserve and analyze research data.
+   Developed at the Institute for Quantitative Social Science, Harvard University.
+   Version 3.0.
+*/
+package lia.analysis.positional;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.LowerCaseTokenizer;
+import org.apache.lucene.analysis.PorterStemFilter;
+import org.apache.lucene.analysis.StopAnalyzer;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.TokenStream;
+
+import java.io.Reader;
+import java.util.Set;
+
+/**
+ * Analyzer that tokenizes with {@link LowerCaseTokenizer}, removes stop words
+ * with a {@link StopFilter} that has position increments enabled, and then
+ * applies {@link PorterStemFilter}.
+ */
+public class PositionalPorterStopAnalyzer extends Analyzer {
+  /** Words handed to the {@link StopFilter}; assigned once at construction. */
+  private final Set<?> stopWords;
+
+  /** Creates an analyzer using {@link StopAnalyzer#ENGLISH_STOP_WORDS_SET}. */
+  public PositionalPorterStopAnalyzer() {
+    this(StopAnalyzer.ENGLISH_STOP_WORDS_SET);
+  }
+
+  /**
+   * Creates an analyzer using a caller-supplied stop-word set.
+   *
+   * @param stopWords words to remove; stored by reference, not copied
+   */
+  public PositionalPorterStopAnalyzer(Set<?> stopWords) {
+    this.stopWords = stopWords;
+  }
+
+  /**
+   * Builds the chain LowerCaseTokenizer -> StopFilter -> PorterStemFilter.
+   *
+   * @param fieldName field being analyzed; not used by this implementation
+   * @param reader source of the text to tokenize
+   * @return the token stream produced by the final {@link PorterStemFilter}
+   */
+  @Override
+  public TokenStream tokenStream(String fieldName, Reader reader) {
+    // Position increments are requested twice: through the constructor flag
+    // and again through the explicit setter.
+    StopFilter stopFilter = new StopFilter(true,
+                                           new LowerCaseTokenizer(reader),
+                                           stopWords);
+    stopFilter.setEnablePositionIncrements(true);
+    return new PorterStemFilter(stopFilter);
+  }
+}