Mercurial > hg > mpdl-group
view software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlFilter.java @ 0:408254cf2f1d
Erstellung
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Wed, 24 Nov 2010 17:24:23 +0100 |
parents | |
children |
line wrap: on
line source
package de.mpg.mpiwg.berlin.mpdl.lt.analyzer;

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;

/**
 * Token filter that cleans up the term text of every token coming from the
 * wrapped stream:
 * <ul>
 *   <li>hyphens, newline characters and tab characters are removed from the term;</li>
 *   <li>a trailing {@code 's} or {@code 'S} is cut off (e.g. {@code author's}
 *       becomes {@code author}); apostrophes elsewhere in the term are kept
 *       (e.g. {@code l'ogni} is left as it is).</li>
 * </ul>
 * The token's term buffer is modified in place; only its term length is reset.
 */
public class MpdlFilter extends TokenFilter {

  /**
   * Creates a filter reading its tokens from the given stream.
   *
   * @param in the token stream to be filtered
   */
  public MpdlFilter(TokenStream in) {
    super(in);
  }

  /**
   * Returns the next cleaned token of the wrapped stream.
   *
   * @return the next token, or {@code null} if the wrapped stream returned {@code null}
   * @throws IOException if the wrapped stream fails
   */
  public Token next() throws IOException {
    return getNext(null);
  }

  /**
   * Returns the next cleaned token of the wrapped stream, handing the given
   * token to the wrapped stream for reuse.
   *
   * @param reusableToken token passed on to the wrapped stream; if {@code null}
   *                      the wrapped stream's no-argument {@code next()} is used
   * @return the next token, or {@code null} if the wrapped stream returned {@code null}
   * @throws IOException if the wrapped stream fails
   */
  public Token next(Token reusableToken) throws IOException {
    return getNext(reusableToken);
  }

  /**
   * Fetches the next token from the input and cleans its term in place.
   *
   * @param reusableToken token to hand to the input, or {@code null}
   * @return the cleaned token, or {@code null} if the input returned {@code null}
   * @throws IOException if the input fails
   */
  private Token getNext(Token reusableToken) throws IOException {
    Token nextToken;
    if (reusableToken == null) {
      nextToken = input.next();
    } else {
      nextToken = input.next(reusableToken);
    }
    if (nextToken == null) {
      return null;
    }
    int newTermLength = cleanTerm(nextToken.termBuffer(), nextToken.termLength());
    nextToken.setTermLength(newTermLength);
    return nextToken;
  }

  /**
   * Compacts the first {@code length} characters of {@code buffer} in place and
   * returns the length of the cleaned term.
   *
   * @param buffer term characters; overwritten from the front with the kept characters
   * @param length number of valid characters in {@code buffer}
   * @return number of characters of the cleaned term at the start of {@code buffer}
   */
  private static int cleanTerm(char[] buffer, int length) {
    int kept = 0;
    // if a hyphen or a newlineChar or tabChar is in the token it is removed
    for (int i = 0; i < length; i++) {
      char c = buffer[i];
      if (c != '-' && c != '\n' && c != '\t') {
        buffer[kept++] = c;
      }
    }
    // remove the apostrophe with "s" but not always (e.g. not l'ogni but author's).
    // The check must look at the compacted term (first "kept" chars): the characters
    // between "kept" and "length" are stale leftovers of the original term and would
    // hide the suffix whenever a character was removed above.
    if (kept >= 2 && buffer[kept - 2] == '\''
        && (buffer[kept - 1] == 's' || buffer[kept - 1] == 'S')) {
      kept -= 2;
    }
    return kept;
  }
}