mpdl-group: software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizer.java comparison

comparison software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizer.java @ 0:408254cf2f1d

Erstellung

author	Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date	Wed, 24 Nov 2010 17:24:23 +0100
parents
children	2396a569e446

comparison

equal deleted inserted replaced

--1:000000000000
+:408254cf2f1d
+package de.mpg.mpiwg.berlin.mpdl.lt.analyzer;
+import java.io.IOException;
+import java.io.Reader;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.Tokenizer;
+import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
+/**
+ * Tokenizer that splits the characters of a {@link Reader} into tokens at a
+ * fixed set of punctuation characters and the space character, optionally
+ * passing every finished token through an {@link MpdlNormalizer}.
+ */
+public class MpdlTokenizer extends Tokenizer {
+  /** Longest token emitted; a longer run of token characters is cut at this length. */
+  private static final int MAX_WORD_LEN = 255;
+  /** Size of the array handed to the reader on each read. */
+  private static final int IO_BUFFER_SIZE = 1024;
+  /**
+   * Characters that terminate a token and are never part of one. Tab and
+   * newline are intentionally absent, so a word that continues on another
+   * line is not broken apart.
+   */
+  private static final String TOKEN_DELIMITERS = " .,!?;:()[]<>&+\"'";
+
+  /** Language passed in by the caller; stored but not read in this class. */
+  private final String language;  // TODO make the tokenizer language dependent
+  /** Counter advanced once per character-read attempt; token offsets are derived from it. */
+  private int offset = 0;
+  /** Index of the next unread character in {@link #ioBuffer}. */
+  private int bufferIndex = 0;
+  /** Result of the most recent read: number of characters available, or -1 at end of input. */
+  private int dataLen = 0;
+  /** Collects the characters of the token currently being assembled. */
+  private final char[] buffer = new char[MAX_WORD_LEN];
+  /** Holds the characters most recently read from the input. */
+  private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
+  /** Whole-token normalizer; {@code null} when tokens are emitted as collected. */
+  private final MpdlNormalizer normalizer;
+
+  /**
+   * Creates a tokenizer that does no whole-token normalization.
+   *
+   * @param input    source of the characters to tokenize
+   * @param language language of the text
+   */
+  public MpdlTokenizer(Reader input, String language) {
+    this(input, language, null);
+  }
+
+  /**
+   * Creates a tokenizer whose tokens are rewritten by the given normalizer.
+   *
+   * @param input      source of the characters to tokenize
+   * @param language   language of the text
+   * @param normalizer applied to the text of every token; may be {@code null}
+   */
+  public MpdlTokenizer(Reader input, String language, MpdlNormalizer normalizer) {
+    super(input);
+    this.language = language;
+    this.normalizer = normalizer;
+  }
+
+  /**
+   * Decides whether a character belongs inside a token. Maximal runs of
+   * characters for which this returns true become tokens; every other
+   * character only marks a token boundary and is dropped.
+   *
+   * @param c the character to classify
+   * @return false for the space character and the punctuation characters
+   *         {@code . , ! ? ; : ( ) [ ] < > & + " '}; true for everything else
+   */
+  protected boolean isTokenChar(char c) {
+    return TOKEN_DELIMITERS.indexOf(c) < 0;
+  }
+
+  /**
+   * Hook invoked for each token character before it is stored. This
+   * implementation returns the character unchanged; subclasses may override
+   * it, for example to lowercase tokens.
+   *
+   * @param c the token character as read
+   * @return the character to store in the token
+   */
+  protected char normalize(char c) {
+    return c;
+  }
+
+  /**
+   * Returns the next token in the stream, or null at EOS.
+   *
+   * @throws IOException if reading the input fails, or wrapping the
+   *         {@link ApplicationException} thrown by the normalizer
+   */
+  public final Token next() throws IOException {
+    int tokenLength = 0;
+    int tokenStart = offset;
+    boolean tokenComplete = false;
+    while (!tokenComplete) {
+      offset++;
+      if (bufferIndex >= dataLen) {
+        // Everything read so far is consumed: fetch the next chunk.
+        dataLen = input.read(ioBuffer);
+        bufferIndex = 0;
+      }
+      if (dataLen == -1) {
+        // End of input: emit whatever was collected, or signal EOS.
+        if (tokenLength == 0) {
+          return null;
+        }
+        tokenComplete = true;
+      } else {
+        final char current = ioBuffer[bufferIndex++];
+        if (!isTokenChar(current)) {
+          // A delimiter ends a token in progress; leading delimiters are skipped.
+          tokenComplete = tokenLength > 0;
+        } else {
+          if (tokenLength == 0) {
+            tokenStart = offset - 1;  // first character of a new token
+          }
+          buffer[tokenLength] = normalize(current);
+          tokenLength++;
+          // A full buffer forces the token to end here.
+          tokenComplete = tokenLength == MAX_WORD_LEN;
+        }
+      }
+    }
+    return createToken(tokenStart, tokenLength);
+  }
+
+  /**
+   * Builds the token for the first {@code length} characters of
+   * {@link #buffer} and, when a normalizer is set, replaces the token text by
+   * its normalized form. The token offsets keep referring to the collected
+   * (un-normalized) span.
+   */
+  private Token createToken(int start, int length) throws IOException {
+    final Token token = new Token(start, start + length);
+    token.setTermBuffer(buffer, 0, length);
+    if (normalizer == null) {
+      return token;
+    }
+    final String rawText = new String(token.termBuffer(), 0, token.termLength());
+    try {
+      final String normalizedText = normalizer.normalize(rawText);
+      final int normalizedLength = normalizedText.length();
+      token.setTermBuffer(normalizedText.toCharArray(), 0, normalizedLength);
+    } catch (ApplicationException e) {
+      throw new IOException(e);
+    }
+    return token;
+  }
+}

Mercurial > hg > mpdl-group

comparison software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizer.java @ 0:408254cf2f1d