view software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizer.java @ 0:408254cf2f1d

Erstellung
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Wed, 24 Nov 2010 17:24:23 +0100
parents
children 2396a569e446
line wrap: on
line source

package de.mpg.mpiwg.berlin.mpdl.lt.analyzer;

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;

import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;

/**
 * Tokenizer that splits character streams on a fixed set of punctuation
 * delimiters, optionally running each emitted token through an
 * {@link MpdlNormalizer}. Tokens are sequences of adjacent characters for
 * which {@link #isTokenChar(char)} is true; delimiter characters mark token
 * boundaries and are never part of a token.
 */
public class MpdlTokenizer extends Tokenizer {
  private static final int MAX_WORD_LEN = 255;
  private static final int IO_BUFFER_SIZE = 1024;
  /**
   * Every character in this string ends a token. Tab and newline are
   * deliberately absent so that words continued on the next line are not
   * split apart.
   */
  private static final String DELIMITER_CHARS = " .,!?;:()[]<>&+\"'";
  private String language;  // TODO make the tokenizer language dependent
  private int offset = 0, bufferIndex = 0, dataLen = 0;
  private char[] buffer = new char[MAX_WORD_LEN];
  private char[] ioBuffer = new char[IO_BUFFER_SIZE];
  private MpdlNormalizer normalizer;

  /**
   * Creates a tokenizer without token normalization.
   *
   * @param input the character stream to tokenize
   * @param language ISO language code (currently unused, kept for later use)
   */
  public MpdlTokenizer(Reader input, String language) {
    super(input);
    this.language = language;
  }

  /**
   * Creates a tokenizer that normalizes each token after it is assembled.
   *
   * @param input the character stream to tokenize
   * @param language ISO language code (currently unused, kept for later use)
   * @param normalizer applied to the full token text in {@link #next()}
   */
  public MpdlTokenizer(Reader input, String language, MpdlNormalizer normalizer) {
    super(input);
    this.language = language;
    this.normalizer = normalizer;
  }

  /** Returns true iff a character should be included in a token.  This
   * tokenizer generates as tokens adjacent sequences of characters which
   * satisfy this predicate.  Characters for which this is false are used to
   * define token boundaries and are not included in tokens. */
  protected boolean isTokenChar(char c) {
    return DELIMITER_CHARS.indexOf(c) < 0;
  }

  /** Called on each token character to normalize it before it is added to the
   * token.  The default implementation does nothing.  Subclasses may use this
   * to, e.g., lowercase tokens. */
  protected char normalize(char c) {
    return c;
  }

  /**
   * Returns the next token in the stream, or null at end of stream.
   *
   * @return the next {@link Token}, with start/end offsets into the input;
   *         null once the underlying reader is exhausted
   * @throws IOException on read failure, or wrapping an
   *         {@link ApplicationException} thrown by the normalizer
   */
  public final Token next() throws IOException {
    int tokenLength = 0;
    int tokenStart = offset;
    while (true) {
      offset++;
      // Refill the I/O buffer whenever it is exhausted.
      if (bufferIndex >= dataLen) {
        dataLen = input.read(ioBuffer);
        bufferIndex = 0;
      }
      if (dataLen == -1) {
        // End of stream: flush a pending token, otherwise signal EOS.
        if (tokenLength == 0)
          return null;
        break;
      }
      char c = ioBuffer[bufferIndex++];
      if (!isTokenChar(c)) {
        if (tokenLength > 0)
          break;      // delimiter terminates the token under construction
        continue;     // skip leading delimiters before a token starts
      }
      if (tokenLength == 0)
        tokenStart = offset - 1;        // remember where this token begins
      buffer[tokenLength++] = normalize(c);
      if (tokenLength == MAX_WORD_LEN)
        break;                          // token buffer full: emit what we have
    }
    Token newToken = new Token(tokenStart, tokenStart + tokenLength);
    newToken.setTermBuffer(buffer, 0, tokenLength);
    if (normalizer != null) {
      // Normalize the assembled token text as a whole string.
      String tokenText = new String(newToken.termBuffer(), 0, newToken.termLength());
      try {
        String normalizedText = normalizer.normalize(tokenText);
        newToken.setTermBuffer(normalizedText.toCharArray(), 0, normalizedText.length());
      } catch (ApplicationException e) {
        throw new IOException(e);
      }
    }
    return newToken;
  }
}