Mercurial > hg > mpdl-group
diff software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizer.java @ 0:408254cf2f1d
Erstellung
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Wed, 24 Nov 2010 17:24:23 +0100 |
parents | |
children | 2396a569e446 |
line wrap: on
line diff
package de.mpg.mpiwg.berlin.mpdl.lt.analyzer;

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;

import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;

/**
 * Tokenizer that splits a character stream at a fixed set of delimiter
 * characters (blank and common punctuation, see {@link #isTokenChar(char)}).
 *
 * <p>Each token carries the start/end character offsets of the text as it was
 * read from the stream. If a {@link MpdlNormalizer} was supplied, the token
 * text is replaced by the normalizer's output; the offsets are left untouched
 * and therefore still describe the un-normalized input.
 *
 * <p>Tokens longer than {@value #MAX_WORD_LEN} characters are split into
 * several consecutive tokens.
 */
public class MpdlTokenizer extends Tokenizer {
  private static final int MAX_WORD_LEN = 255;
  private static final int IO_BUFFER_SIZE = 1024;

  private final String language;  // TODO make the tokenizer language dependent
  private final MpdlNormalizer normalizer;  // may be null: token text is then emitted unchanged

  /** Number of characters consumed from the stream so far (plus one at end of stream). */
  private int offset = 0;
  /** Read position inside {@link #ioBuffer}. */
  private int bufferIndex = 0;
  /** Number of valid characters in {@link #ioBuffer}; -1 once the reader is exhausted. */
  private int dataLen = 0;

  /** Collects the characters of the token currently being scanned. */
  private final char[] buffer = new char[MAX_WORD_LEN];
  /** Chunk of raw input most recently read from the underlying reader. */
  private final char[] ioBuffer = new char[IO_BUFFER_SIZE];

  /**
   * Creates a tokenizer that emits token text without normalization.
   *
   * @param input    reader supplying the text to tokenize
   * @param language language of the text (stored, not yet evaluated)
   */
  public MpdlTokenizer(Reader input, String language) {
    this(input, language, null);
  }

  /**
   * Creates a tokenizer whose token text is passed through a normalizer.
   *
   * @param input      reader supplying the text to tokenize
   * @param language   language of the text (stored, not yet evaluated)
   * @param normalizer applied to the text of every token; {@code null} disables normalization
   */
  public MpdlTokenizer(Reader input, String language, MpdlNormalizer normalizer) {
    super(input);
    this.language = language;
    this.normalizer = normalizer;
  }

  /**
   * Returns true iff a character should be included in a token. This
   * tokenizer generates as tokens adjacent sequences of characters which
   * satisfy this predicate. Characters for which this is false are used to
   * define token boundaries and are not included in tokens.
   *
   * @param c character to classify
   * @return {@code false} for the delimiter characters listed below, {@code true} otherwise
   */
  protected boolean isTokenChar(char c) {
    switch (c) {
      case ' ':
      case '.':
      case ',':
      case '!':
      case '?':
      case ';':
      case ':':
      case '(':
      case ')':
      case '[':
      case ']':
      case '<':
      case '>':
      case '&':
      case '+':
      case '"':
      case '\'':
        return false;
      default:
        // '\t' and '\n' are intentionally NOT delimiters: words which continue
        // on another line must not be broken.
        return true;
    }
  }

  /**
   * Called on each token character to normalize it before it is added to the
   * token. The default implementation does nothing. Subclasses may use this
   * to, e.g., lowercase tokens.
   *
   * @param c character about to be appended to the current token
   * @return the character to store in the token
   */
  protected char normalize(char c) {
    return c;
  }

  /**
   * Points this tokenizer at a new reader and clears the scanning state.
   *
   * <p>Without clearing the state, a reused instance would report offsets
   * that continue from the previous stream and could emit characters still
   * buffered from the old reader.
   *
   * @param input reader supplying the next text to tokenize
   * @throws IOException if the superclass reset fails
   */
  @Override
  public void reset(Reader input) throws IOException {
    super.reset(input);
    offset = 0;
    bufferIndex = 0;
    dataLen = 0;
  }

  /**
   * Returns the next token in the stream, or null at EOS.
   *
   * @return the next token, or {@code null} when the reader is exhausted
   * @throws IOException if reading fails, or wrapping the
   *         {@link ApplicationException} thrown by the normalizer
   */
  @Override
  public final Token next() throws IOException {
    int length = 0;
    int start = offset;
    while (true) {
      offset++;
      if (bufferIndex >= dataLen) {
        // Current chunk is used up: fetch the next one from the reader.
        dataLen = input.read(ioBuffer);
        bufferIndex = 0;
      }
      if (dataLen == -1) {
        // End of stream: flush a pending token, otherwise signal EOS.
        if (length > 0) {
          break;
        }
        return null;
      }
      final char c = ioBuffer[bufferIndex++];
      if (isTokenChar(c)) {
        if (length == 0) {
          start = offset - 1;  // first character of a new token
        }
        buffer[length++] = normalize(c);
        if (length == MAX_WORD_LEN) {
          break;  // token buffer is full: emit what we have, continue with the rest next call
        }
      } else if (length > 0) {
        break;  // delimiter after at least one token character ends the token
      }
      // else: delimiter before any token character, keep skipping
    }

    Token newToken = new Token(start, start + length);
    if (normalizer == null) {
      newToken.setTermBuffer(buffer, 0, length);
    } else {
      try {
        String normalizedTokenText = normalizer.normalize(new String(buffer, 0, length));
        newToken.setTermBuffer(normalizedTokenText.toCharArray(), 0, normalizedTokenText.length());
      } catch (ApplicationException e) {
        throw new IOException(e);
      }
    }
    return newToken;
  }
}