view software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlFilter.java @ 0:408254cf2f1d

Erstellung
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Wed, 24 Nov 2010 17:24:23 +0100
parents
children
line wrap: on
line source

package de.mpg.mpiwg.berlin.mpdl.lt.analyzer;

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;

/**
 * Token filter that cleans up the term text of each token delivered by the
 * wrapped stream:
 * <ul>
 *   <li>every hyphen ('-'), newline ('\n') and tab ('\t') character is removed
 *       from the term;</li>
 *   <li>a trailing "'s" or "'S" is cut off afterwards (e.g. "author's" becomes
 *       "author"), while an apostrophe elsewhere in the term (e.g. "l'ogni")
 *       is left untouched.</li>
 * </ul>
 * The term buffer of the token is modified in place; no new token is created.
 */
public class MpdlFilter extends TokenFilter {

  /**
   * Creates a filter on top of the given token stream.
   *
   * @param in the token stream whose tokens are cleaned up
   */
  public MpdlFilter(TokenStream in) {
    super(in);
  }

  /**
   * Returns the next cleaned-up token of the wrapped stream.
   *
   * @return the filtered token, or null if the wrapped stream returned null
   * @throws IOException if the wrapped stream fails
   */
  public Token next() throws IOException {
    return getNext(null);
  }

  /**
   * Returns the next cleaned-up token of the wrapped stream, handing the given
   * token to the wrapped stream for reuse.
   *
   * @param reusableToken token passed on to the wrapped stream; if null the
   *        wrapped stream's no-argument next() is used instead
   * @return the filtered token, or null if the wrapped stream returned null
   * @throws IOException if the wrapped stream fails
   */
  public Token next(Token reusableToken) throws IOException {
    return getNext(reusableToken);
  }

  /**
   * Fetches the next token from the wrapped stream and filters its term.
   *
   * @param reusableToken token to hand to the wrapped stream, or null
   * @return the filtered token, or null if the wrapped stream returned null
   * @throws IOException if the wrapped stream fails
   */
  private Token getNext(Token reusableToken) throws IOException {
    Token nextToken;
    if (reusableToken == null) {
      nextToken = input.next();
    } else {
      nextToken = input.next(reusableToken);
    }
    if (nextToken == null) {
      return null;
    }
    char[] termBuffer = nextToken.termBuffer();
    int termBufferLength = nextToken.termLength();
    int newTermBufferLength = 0;
    // compact the buffer in place, dropping hyphens, newlines and tabs
    for (int i = 0; i < termBufferLength; i++) {
      char c = termBuffer[i];
      if (c != '-' && c != '\n' && c != '\t') {
        termBuffer[newTermBufferLength++] = c;
      }
    }
    // The suffix test must look at the compacted text only: the positions
    // from newTermBufferLength up to termBufferLength still hold stale
    // characters of the unfiltered term and would hide a trailing "'s".
    if (endsWithApostropheS(termBuffer, newTermBufferLength)) {
      newTermBufferLength -= 2;
    }
    nextToken.setTermLength(newTermBufferLength);
    return nextToken;
  }

  /**
   * Tells whether the first length characters of the buffer end with
   * "'s" or "'S".
   */
  private static boolean endsWithApostropheS(char[] buffer, int length) {
    if (length < 2) {
      return false;
    }
    char last = buffer[length - 1];
    return buffer[length - 2] == '\'' && (last == 's' || last == 'S');
  }
}