Mercurial > hg > mpdl-group

package de.mpg.mpiwg.berlin.mpdl.lt.analyzer;

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;

import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;

/**
 * Tokenizer which splits its input at whitespace and punctuation characters
 * and, if a {@link MpdlNormalizer} was supplied, replaces the text of every
 * token by its normalized form.
 *
 * <p>Two special cases are handled:
 * <ul>
 *   <li>For the language {@code "zh"} the input is tokenized by
 *       {@link #nextChinese()} instead (one token per character of type
 *       {@link Character#OTHER_LETTER}).</li>
 *   <li>The character U+2424 marks a position (a word delimiting xml
 *       element) at which a word must not be broken: blanks, tabs and
 *       newlines which directly follow this marker are kept inside the
 *       current token. The marker itself is removed from the token text.</li>
 * </ul>
 *
 * <p>The tokenizer keeps its reading state in instance fields; nothing in
 * this class synchronizes access to them.
 */
public class MpdlTokenizer extends Tokenizer {
  private static final int MAX_WORD_LEN = 255;
  private static final int IO_BUFFER_SIZE = 1024;
  /** Marker character (U+2424, the unicode symbol for newline) after which whitespace does not delimit words. */
  private static final char SPECIAL_NOT_WORD_DELIM_CHAR = '\u2424';
  /** The marker character as a string, used to strip it from the token text. */
  private static final String SPECIAL_NOT_WORD_DELIM_SYMBOL = String.valueOf(SPECIAL_NOT_WORD_DELIM_CHAR);

  private boolean regWithoutSemicolon = false;  // hack: in some cases there are words with a semicolon, then the normalization should be without semicolon
  /** True while whitespace following a marker character is to be kept inside the current token. */
  private boolean isInNotWordDelimMode = false;
  /** offset: characters consumed so far; bufferIndex: read position in ioBuffer; dataLen: result of the last read into ioBuffer. */
  private int offset = 0, bufferIndex = 0, dataLen = 0;
  /** Collects the characters of the token which is currently built. */
  private final char[] buffer = new char[MAX_WORD_LEN];
  /** Read-ahead buffer which is filled from the input reader. */
  private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
  /** Optional normalizer applied to every token text; may be null. */
  private final MpdlNormalizer normalizer;
  /** Language of the input; "zh" switches to the chinese tokenizer; may be null. */
  private final String language;

  /**
   * Creates a tokenizer which does not normalize its tokens.
   *
   * @param input    reader which delivers the text to be tokenized
   * @param language language of the text; {@code "zh"} selects the chinese tokenizer
   */
  public MpdlTokenizer(Reader input, String language) {
    this(input, language, null);
  }

  /**
   * Creates a tokenizer which normalizes every token with the given normalizer.
   *
   * @param input      reader which delivers the text to be tokenized
   * @param language   language of the text; {@code "zh"} selects the chinese tokenizer
   * @param normalizer normalizer applied to every token text, or {@code null} for none
   */
  public MpdlTokenizer(Reader input, String language, MpdlNormalizer normalizer) {
    super(input);
    this.language = language;
    this.normalizer = normalizer;
  }

  /**
   * Switches the semicolon hack on or off.
   *
   * @param regWithoutSemicolon if true, a semicolon is treated as a token character
   */
  public void setRegWithoutSemicolon(boolean regWithoutSemicolon) {
    this.regWithoutSemicolon = regWithoutSemicolon;
  }

  /**
   * @return true if a semicolon is currently treated as a token character
   */
  public boolean isRegWithoutSemicolon() {
    return regWithoutSemicolon;
  }

  /** Returns true iff a character should be included in a token.  This
   * tokenizer generates as tokens adjacent sequences of characters which
   * satisfy this predicate.  Characters for which this is false are used to
   * define token boundaries and are not included in tokens.
   *
   * <p>Blank, tab, newline and the listed punctuation characters delimit
   * tokens; every other character (including the U+2424 marker and
   * {@code '\r'}, which is not listed) is a token character. */
  protected boolean isTokenChar(char c) {
    if (isRegWithoutSemicolon() && c == ';') {  // hack: special case for regularization and normalization; feel free to remove it later
      return true;
    }
    switch (c) {
      case ' ':
      case '.':
      case ',':
      case '!':
      case '?':
      case ';':
      case ':':
      case '(':
      case ')':
      case '[':
      case ']':
      case '<':
      case '>':
      case '&':
      case '+':
      case '"':
      case '„':
      case '“':
      case '«':
      case '»':
      case '\'':
      case '\t':  // tabs delimit words (unless they follow the U+2424 marker, see isTokenCharInNotWordDelimMode)
      case '\n':  // newlines delimit words (unless they follow the U+2424 marker, see isTokenCharInNotWordDelimMode)
        return false;
      default:
        return true;
    }
  }

  /**
   * Returns true iff the tokenizer currently is in "not word delimiting"
   * mode and the character is a blank, a tab or a newline. In this case the
   * character is kept inside the token instead of ending it.
   */
  protected boolean isTokenCharInNotWordDelimMode(char c) {
    if (!isInNotWordDelimMode) {
      return false;
    }
    return c == ' ' || c == '\t' || c == '\n';
  }

  /**
   * Returns true iff the character is the U+2424 marker (unicode symbol for
   * newline) which switches the tokenizer into "not word delimiting" mode.
   */
  protected boolean isSpecialNotWordDelimSymbol(char c) {
    return c == SPECIAL_NOT_WORD_DELIM_CHAR;
  }


  /** Called on each token character to normalize it before it is added to the
   * token.  The default implementation does nothing.  Subclasses may use this
   * to, e.g., lowercase tokens. */
  protected char normalize(char c) {
    return c;
  }

  /**
   * Returns the next token in the stream, or null at EOS.
   *
   * <p>The offsets of the token refer to the consumed input, so they include
   * marker characters which are removed from the token text afterwards.
   * NOTE(review): a token which consists only of marker characters is
   * returned with an empty text; confirm that callers can deal with that.
   *
   * @throws IOException if reading the input fails or the normalizer reports
   *         an error (the normalizer's exception is set as cause)
   */
  public final Token next() throws IOException {
    if ("zh".equals(language)) {
      return nextChinese();
    }
    int tokenLength = 0;
    int tokenStart = offset;
    while (true) {
      final int read = readNextChar();
      if (read == -1) {
        if (tokenLength > 0) {
          break;                              // return the token collected so far
        }
        return null;
      }
      final char c = (char) read;
      final boolean tokenChar = isTokenChar(c);
      if (isSpecialNotWordDelimSymbol(c)) {
        isInNotWordDelimMode = true;          // whitespace after the marker does not end the token
      } else if (isInNotWordDelimMode && tokenChar) {
        isInNotWordDelimMode = false;         // first regular character after the marker ends the mode
      }
      if (tokenChar || isTokenCharInNotWordDelimMode(c)) {  // if it's a token char
        if (tokenLength == 0) {               // start of token
          tokenStart = offset - 1;
        }
        buffer[tokenLength++] = normalize(c); // buffer it, normalized
        if (tokenLength == MAX_WORD_LEN) {    // buffer is full: cut the token here
          break;
        }
      } else if (tokenLength > 0) {           // at delimiter w/ chars
        break;                                // return 'em
      }
    }
    isInNotWordDelimMode = false;
    Token newToken = new Token(tokenStart, tokenStart + tokenLength);
    newToken.setTermBuffer(buffer, 0, tokenLength);
    removeSpecialSymbols(newToken);  // remove some special symbols in token (e.g. symbol for word delimiting xml elements)
    applyNormalizer(newToken);
    return newToken;
  }

  /**
   * Consumes one character of the input and refills the read-ahead buffer
   * when it is exhausted. The offset is advanced on every call, also when
   * the end of the input is reached.
   *
   * @return the character read, or -1 at the end of the input
   */
  private int readNextChar() throws IOException {
    offset++;
    if (bufferIndex >= dataLen) {
      dataLen = input.read(ioBuffer);
      bufferIndex = 0;
    }
    if (dataLen == -1) {
      return -1;
    }
    return ioBuffer[bufferIndex++];
  }

  /**
   * Replaces the text of the token by its normalized form if a normalizer is set.
   *
   * @throws IOException if the normalizer fails; its exception is set as cause
   */
  private void applyNormalizer(Token token) throws IOException {
    if (normalizer == null) {
      return;
    }
    String tokenText = new String(token.termBuffer(), 0, token.termLength());
    try {
      String normalizedTokenText = normalizer.normalize(tokenText);
      token.setTermBuffer(normalizedTokenText.toCharArray(), 0, normalizedTokenText.length());
    } catch (ApplicationException e) {
      throw new IOException(e);
    }
  }

  /**
   * Removes every marker character (a symbol which marks word delimiting xml
   * elements) from the text of the token.
   *
   * @return the token which was passed in
   */
  private Token removeSpecialSymbols(Token token) {
    String tokenText = new String(token.termBuffer(), 0, token.termLength());
    if (tokenText.indexOf(SPECIAL_NOT_WORD_DELIM_CHAR) < 0) {
      return token;  // nothing to remove: leave the term as it is
    }
    // literal replacement: the marker is plain text, not a regular expression
    String newTokenText = tokenText.replace(SPECIAL_NOT_WORD_DELIM_SYMBOL, "");
    token.setTermBuffer(newTokenText.toCharArray(), 0, newTokenText.length());
    return token;
  }


  /*
   * chinese Tokenizer: taken from org.apache.lucene.analysis.cn.ChineseTokenizer
   *
   */
  /** Number of characters collected for the current chinese token. */
  private int length;
  /** Start offset of the current chinese token. */
  private int start;

  /** Appends the lowercased character to the current chinese token. */
  private final void push(char c) {
    if (length == 0) {
      start = offset - 1;                         // start of token
    }
    buffer[length++] = Character.toLowerCase(c);  // buffer it
  }

  /** Returns the collected characters as token, or null if nothing was collected. */
  private final Token flush() {
    if (length > 0) {
      return new Token(new String(buffer, 0, length), start, start + length);
    }
    return null;
  }

  /**
   * Returns the next token of a chinese text, or null at EOS. Sequences of
   * decimal digits, lowercase and uppercase letters form one (lowercased)
   * token; every character of type {@link Character#OTHER_LETTER} is a token
   * of its own; all other characters delimit tokens.
   *
   * @throws IOException if reading the input fails
   */
  public final Token nextChinese() throws IOException {
    length = 0;
    start = offset;
    while (true) {
      final int read = readNextChar();
      if (read == -1) {
        return flush();
      }
      final char c = (char) read;
      switch (Character.getType(c)) {
        case Character.DECIMAL_DIGIT_NUMBER:
        case Character.LOWERCASE_LETTER:
        case Character.UPPERCASE_LETTER:
          push(c);
          if (length == MAX_WORD_LEN) {
            return flush();
          }
          break;
        case Character.OTHER_LETTER:
          if (length > 0) {
            // a token is pending: un-read this character and return the pending token first
            bufferIndex--;
            offset--;
            return flush();
          }
          push(c);
          return flush();
        default:
          if (length > 0) {
            return flush();
          }
          break;
      }
    }
  }
}
author	Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date	Tue, 08 Feb 2011 14:54:09 +0100
parents	408254cf2f1d
children	5df60f24e997