view software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizerAnalyzer.java @ 6:2396a569e446

new functions: externalObjects, normalizer, Unicode2Betacode
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 08 Feb 2011 14:54:09 +0100
parents 408254cf2f1d
children 257f67be5c00
line wrap: on
line source

package de.mpg.mpiwg.berlin.mpdl.lt.analyzer;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;

import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
import de.mpg.mpiwg.berlin.mpdl.general.MpdlConstants;

/**
 * Lucene {@link Analyzer} that tokenizes text with an {@link MpdlTokenizer}
 * and then passes the tokens through an {@link MpdlFilter} and a
 * {@link LowerCaseFilter}.
 *
 * <p>The tokenizer is configured with the language and the
 * {@link MpdlNormalizer} held by this analyzer. The same filter chain is used
 * both for {@link #tokenStream(String, Reader)} and for
 * {@link #getToken(String)}.
 */
public class MpdlTokenizerAnalyzer extends Analyzer {
  /** Language handed to every tokenizer created by this analyzer. */
  protected String language = MpdlConstants.DEFAULT_LANGUAGE;
  /** Normalizer handed to every tokenizer created by this analyzer. */
  protected MpdlNormalizer normalizer = null;
  private boolean regWithoutSemicolon = false;  // hack: in some cases there are words with a semicolon, then the normalization should be without semicolon

  /**
   * Creates an analyzer for the given language using a default normalizer
   * constructed for that language.
   *
   * @param language language passed to the normalizer and to the tokenizer
   */
  public MpdlTokenizerAnalyzer(String language) {
    this.language = language;
    this.normalizer = new MpdlNormalizer(language);  // default normalizer
  }

  /**
   * Creates an analyzer for the given language using a caller supplied
   * normalizer.
   *
   * @param normalizer normalizer passed to the tokenizer
   * @param language language passed to the tokenizer
   */
  public MpdlTokenizerAnalyzer(MpdlNormalizer normalizer, String language) {
    this.language = language;
    this.normalizer = normalizer;
  }

  /**
   * Sets the flag that is forwarded to every tokenizer created afterwards.
   *
   * @param regWithoutSemicolon value forwarded to
   *        {@code MpdlTokenizer.setRegWithoutSemicolon}
   */
  public void setRegWithoutSemicolon(boolean regWithoutSemicolon) {
    this.regWithoutSemicolon = regWithoutSemicolon;
  }

  /**
   * @return the flag that is currently forwarded to newly created tokenizers
   */
  public boolean isRegWithoutSemicolon() {
    return regWithoutSemicolon;
  }

  /**
   * Builds the token stream for the given reader.
   *
   * @param fieldName name of the field being analyzed; not used by this
   *        implementation
   * @param reader source of the text to tokenize
   * @return the tokenizer wrapped in the MPDL filter and the lower case filter
   */
  public TokenStream tokenStream(String fieldName, Reader reader) {
    return createFilterChain(reader);
  }

  /**
   * Tokenizes the given string and collects all resulting tokens.
   *
   * @param inputString text to tokenize; {@code null} yields an empty list
   * @return the tokens in the order the token stream delivered them, never
   *         {@code null}
   * @throws ApplicationException if reading from the token stream or closing
   *         it fails with an {@link IOException}
   */
  public ArrayList<Token> getToken(String inputString) throws ApplicationException {
    ArrayList<Token> token = new ArrayList<Token>();
    if (inputString == null) {
      // a StringReader cannot be built from null; there is nothing to tokenize
      return token;
    }
    try {
      // deliberately not routed through tokenStream(), so that a subclass
      // overriding tokenStream() does not change the result of this method
      TokenStream result = createFilterChain(new StringReader(inputString));
      try {
        for (Token t = result.next(); t != null; t = result.next()) {
          token.add(t);
        }
      } finally {
        // release the stream once it is drained, also when next() failed
        result.close();
      }
    } catch (IOException e) {
      throw new ApplicationException(e);
    }
    return token;
  }

  /**
   * Creates the tokenizer for the reader and wraps it in the filters shared by
   * {@link #tokenStream(String, Reader)} and {@link #getToken(String)}.
   */
  private TokenStream createFilterChain(Reader reader) {
    MpdlTokenizer tmpTokenizer = new MpdlTokenizer(reader, language, normalizer);
    tmpTokenizer.setRegWithoutSemicolon(regWithoutSemicolon); // hack: feel free to remove it later
    TokenStream result = tmpTokenizer;
    result = new MpdlFilter(result);  // filter to remove the hyphen in a token etc.
    result = new LowerCaseFilter(result);
    return result;
  }

}