software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/Tokenizer.java @ 19:4a3641ae14d2

Creation
author: Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date: Wed, 09 Nov 2011 15:32:05 +0100
children: e845310098ba

package de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize;

import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;

import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.CharacterUtils;
import org.apache.lucene.util.CharacterUtils.CharacterBuffer;
import org.apache.lucene.util.Version;

import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
import de.mpg.mpiwg.berlin.mpdl.lt.general.Language;
import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.Normalizer;

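/**
 * Tokenizer that splits text at whitespace and punctuation (see isTokenChar())
 * and normalizes each token through {@link Normalizer}. The scanning loop is
 * adapted from the Lucene 3.4 CharTokenizer; Chinese input is delegated to a
 * Lucene StandardTokenizer.
 *
 * Minimal usage sketch (the exact Token accessors are not part of this file
 * and are therefore not shown):
 *
 * <pre>{@code
 * Tokenizer tokenizer = new Tokenizer(new java.io.StringReader("some input text"));
 * tokenizer.setLanguage("eng");                      // resolved via Language.getLanguageId()
 * tokenizer.setNormFunctions(new String[] {"norm"}); // same as the default
 * ArrayList<Token> tokens = tokenizer.getTokens();
 * }</pre>
 */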
public class Tokenizer extends org.apache.lucene.analysis.Tokenizer {
  // variables copied from Lucene 3.4 CharTokenizer
  private int offset = 0, bufferIndex = 0, dataLen = 0, finalOffset = 0;
  private static final int MAX_WORD_LEN = 4096;  // old value was 255
  private static final int IO_BUFFER_SIZE = 4096;
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  private final CharacterUtils charUtils = CharacterUtils.getInstance(Version.LUCENE_34);
  private final CharacterBuffer ioBuffer = CharacterUtils.newCharacterBuffer(IO_BUFFER_SIZE);
  // application variables
  private String language = "eng";  // default: English
  private String[] normFunctions = {"norm"};  // default: use the "norm" function
  private Normalizer normalizer;
  
  public Tokenizer(Reader input) {
    super(input);
  }

  public Tokenizer(AttributeSource source, Reader input) {
    super(source, input);
  }

  public void setLanguage(String lang) {
    this.language = Language.getInstance().getLanguageId(lang);
  }

  public void setNormFunctions(String[] normFunctions) {
    this.normFunctions = normFunctions;
  }

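  /**
   * Tokenizes the whole input Reader and returns all tokens with their
   * character offsets and normalized surface forms. Chinese input is routed
   * to getTokensByChineseTokenizer(); every other language goes through the
   * incrementToken() loop below.
   */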
  public ArrayList<Token> getTokens() throws ApplicationException {
    if (Language.getInstance().isChinese(language)) {
      return getTokensByChineseTokenizer(input, normFunctions);
    }
    ArrayList<Token> tokens = new ArrayList<Token>();
    try {
      reset(input);  // also (re)initializes the normalizer for the current language
      CharTermAttribute charTermAttribute = getAttribute(CharTermAttribute.class);
      OffsetAttribute offsetAttribute = getAttribute(OffsetAttribute.class);
      while (incrementToken()) {
        String term = charTermAttribute.toString();
        int start = offsetAttribute.startOffset();
        int end = offsetAttribute.endOffset();
        String normedTerm = normalizer.normalize(term);
        Token token = new Token(start, end, normedTerm);
        tokens.add(token);
      }
      end();   // records the final offset
      close(); // releases the underlying reader
    } catch (IOException e) {
      throw new ApplicationException(e);
    }
    return tokens;
  }

  /** Returns true iff a character should be included in a token. */
  protected boolean isTokenChar(int codepoint) {
    // switch on the full code point (no cast to char), so a supplementary
    // code point cannot be truncated into a false match with a separator below
    switch (codepoint) {
      // whitespace and punctuation end the current token
      case ' ': case '.': case ',': case '!': case '?': case ';': case ':':
      case '(': case ')': case '[': case ']': case '{': case '}':
      case '<': case '>': case '/': case '=': case '&': case '+': case '#':
      case '"': case '\'':
      // the next four characters are most likely mis-encoded typographic
      // punctuation (e.g. quotation marks); kept verbatim to preserve behavior
      case 'ã': case 'Ò': case 'Ç': case 'È':
      case '\t':  // tab ends a token
      case '\n':  // newline ends a token
      case '\u2425':  // special char for marking xml elements
        return false;
      default:
        return true;
    }
  }
  
  /** Called on each token character to normalize it before it is added to the
   * token. The default implementation does nothing. Subclasses may use this
   * to, e.g., lowercase tokens. Note that incrementToken() calls the code
   * point (int) variant, not the char variant. */
  protected char normalize(char c) {
    return c;
  }

  protected int normalize(int c) {
    return c;
  }
  
  /*
   * Code copied from Lucene 3.4 CharTokenizer.incrementToken()
   * @see org.apache.lucene.analysis.TokenStream#incrementToken()
   */
  @Override
  public boolean incrementToken() throws IOException {
    clearAttributes();  
    int length = 0;
    int start = -1; // this variable is always initialized
    char[] buffer = termAtt.buffer();
    while (true) {
      if (bufferIndex >= dataLen) {
        offset += dataLen;
        if (!charUtils.fill(ioBuffer, input)) { // read supplementary char aware with CharacterUtils
          dataLen = 0; // so next offset += dataLen won't decrement offset
          if (length > 0) {
            break;
          } else {
            finalOffset = correctOffset(offset);
            return false;
          }
        }
        dataLen = ioBuffer.getLength();
        bufferIndex = 0;
      }
      // use CharacterUtils here to support < 3.1 UTF-16 code unit behavior if the char based methods are gone
      int c = charUtils.codePointAt(ioBuffer.getBuffer(), bufferIndex);
      bufferIndex += Character.charCount(c);
      if (isTokenChar(c)) { // if it's a token char
        if (length == 0) { // start of token
          start = offset + bufferIndex - 1;
        } else if (length >= buffer.length-1) { // check if a supplementary could run out of bounds
          buffer = termAtt.resizeBuffer(2 + length); // make sure a supplementary fits in the buffer
        }
        length += Character.toChars(normalize(c), buffer, length); // buffer it, normalized
        if (length >= MAX_WORD_LEN) // buffer overflow! make sure to check for >= surrogate pair could break == test
          break;
      } else if (length > 0) // at non-Letter w/ chars
        break; // return 'em
    }
    termAtt.setLength(length);
    offsetAtt.setOffset(correctOffset(start), finalOffset = correctOffset(start + length));
    return true;
  }

  /*
   * Code copied from Lucene 3.4 CharTokenizer.end()
   * @see org.apache.lucene.analysis.TokenStream#end()
   */
  @Override
  public final void end() {
    // set final offset
    offsetAtt.setOffset(finalOffset, finalOffset);
  }

  /*
   * Code copied from Lucene 3.4 CharTokenizer.reset()
   * @see org.apache.lucene.analysis.Tokenizer#reset(java.io.Reader)
   */
  @Override
  public void reset(Reader input) throws IOException {
    super.reset(input);
    bufferIndex = 0;
    offset = 0;
    dataLen = 0;
    finalOffset = 0;
    ioBuffer.reset(); // make sure to reset the IO buffer!!
    this.normalizer = new Normalizer(normFunctions, language);  // (re)build the normalizer for the configured language
  }

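  /**
   * Tokenizes Chinese input with a Lucene StandardTokenizer, presumably
   * because the whitespace/punctuation based loop above cannot segment
   * Chinese text. reset(input) is still called on this tokenizer first,
   * since that is what instantiates the Normalizer.
   */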
  private ArrayList<Token> getTokensByChineseTokenizer(Reader input, String[] normFunctions) throws ApplicationException {
    StandardTokenizer chineseTokenizer = new StandardTokenizer(Version.LUCENE_34, input);  // StandardTokenizer is recommended over the deprecated ChineseTokenizer
    ArrayList<Token> tokens = new ArrayList<Token>();
    try {
      reset(input);  // initializes the normalizer for the current language
      chineseTokenizer.reset(input);
      CharTermAttribute charTermAttribute = chineseTokenizer.getAttribute(CharTermAttribute.class);
      OffsetAttribute offsetAttribute = chineseTokenizer.getAttribute(OffsetAttribute.class);
      while (chineseTokenizer.incrementToken()) {
        String term = charTermAttribute.toString();
        String normedTerm = normalizer.normalize(term);
        int start = offsetAttribute.startOffset();
        int end = offsetAttribute.endOffset();
        Token token = new Token(start, end, normedTerm);
        tokens.add(token);
      }
      chineseTokenizer.end();   // records the final offset
      chineseTokenizer.close(); // releases the underlying reader
      end();
      close();
    } catch (IOException e) {
      throw new ApplicationException(e);
    }
    return tokens;
  }

}