diff software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizerAnalyzer.java @ 0:408254cf2f1d

Erstellung
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Wed, 24 Nov 2010 17:24:23 +0100
parents
children 2396a569e446
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizerAnalyzer.java	Wed Nov 24 17:24:23 2010 +0100
@@ -0,0 +1,90 @@
+package de.mpg.mpiwg.berlin.mpdl.lt.analyzer;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.ArrayList;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+
+import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
+import de.mpg.mpiwg.berlin.mpdl.general.MpdlConstants;
+
+/**
+ * Lucene analyzer that tokenizes text with {@link MpdlTokenizer} and then passes
+ * the tokens through {@link MpdlFilter} and Lucene's {@link LowerCaseFilter}.
+ */
+public class MpdlTokenizerAnalyzer extends Analyzer {
+  /** Language handed to the tokenizer; both constructors overwrite this default. */
+  protected String language = MpdlConstants.DEFAULT_LANGUAGE;
+  /** Normalizer handed to the tokenizer. */
+  protected MpdlNormalizer normalizer = null;
+
+  /**
+   * Creates an analyzer that uses a default normalizer for the given language.
+   *
+   * @param language language passed to the normalizer and the tokenizer
+   */
+  public MpdlTokenizerAnalyzer(String language) {
+    this.language = language;
+    this.normalizer = new MpdlNormalizer(language);  // default normalizer
+  }
+
+  /**
+   * Creates an analyzer that uses the supplied normalizer.
+   *
+   * @param normalizer normalizer passed to the tokenizer
+   * @param language language passed to the tokenizer
+   */
+  public MpdlTokenizerAnalyzer(MpdlNormalizer normalizer, String language) {
+    this.language = language;
+    this.normalizer = normalizer;
+  }
+
+  /**
+   * Builds the analysis chain for the given reader.
+   *
+   * @param fieldName name of the field being analyzed; not used by this analyzer
+   * @param reader source of the text to analyze
+   * @return token stream: tokenizer, then MpdlFilter, then lower casing
+   */
+  @Override
+  public TokenStream tokenStream(String fieldName, Reader reader) {
+    return createTokenStream(reader);
+  }
+
+  /**
+   * Tokenizes the given string and collects all resulting tokens.
+   *
+   * @param inputString text to tokenize; must not be null
+   * @return the tokens in stream order; empty if the stream yields no token
+   * @throws ApplicationException if reading from or closing the token stream fails
+   */
+  public ArrayList<Token> getToken(String inputString) throws ApplicationException {
+    ArrayList<Token> tokens = new ArrayList<Token>();
+    try {
+      TokenStream stream = createTokenStream(new StringReader(inputString));
+      try {
+        for (Token t = stream.next(); t != null; t = stream.next()) {
+          tokens.add(t);
+        }
+      } finally {
+        stream.close();  // release the stream even when next() fails
+      }
+    } catch (IOException e) {
+      throw new ApplicationException(e);
+    }
+    return tokens;
+  }
+
+  /** Single definition of the filter chain shared by tokenStream and getToken. */
+  private TokenStream createTokenStream(Reader reader) {
+    TokenStream result = new MpdlTokenizer(reader, language, normalizer);
+    result = new MpdlFilter(result);  // filter to remove the hyphen in a token etc.
+    result = new LowerCaseFilter(result);
+    return result;
+  }
+}