diff software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizer.java @ 0:408254cf2f1d

Erstellung
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Wed, 24 Nov 2010 17:24:23 +0100
parents
children 2396a569e446
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizer.java	Wed Nov 24 17:24:23 2010 +0100
@@ -0,0 +1,113 @@
+package de.mpg.mpiwg.berlin.mpdl.lt.analyzer;
+
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.Tokenizer;
+
+import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
+
+public class MpdlTokenizer extends Tokenizer {
+  private static final int MAX_WORD_LEN = 255;
+  private static final int IO_BUFFER_SIZE = 1024;
+  private String language;  // TODO make the tokenizer language dependent
+  private int offset = 0, bufferIndex = 0, dataLen = 0;
+  private char[] buffer = new char[MAX_WORD_LEN];
+  private char[] ioBuffer = new char[IO_BUFFER_SIZE];
+  private MpdlNormalizer normalizer;
+
+  public MpdlTokenizer(Reader input, String language) {
+    super(input);
+    this.language = language;
+  }
+
+  public MpdlTokenizer(Reader input, String language, MpdlNormalizer normalizer) {
+    super(input);
+    this.language = language;
+    this.normalizer = normalizer;
+  }
+
+  /** Returns true iff a character should be included in a token.  This
+   * tokenizer generates as tokens adjacent sequences of characters which
+   * satisfy this predicate.  Characters for which this is false are used to
+   * define token boundaries and are not included in tokens. */
+  protected boolean isTokenChar(char c) {
+    boolean isTokenChar = true;
+    switch (c) {
+      case ' ': isTokenChar = false; break;
+      case '.': isTokenChar = false; break;
+      case ',': isTokenChar = false; break;
+      case '!': isTokenChar = false; break;
+      case '?': isTokenChar = false; break;
+      case ';': isTokenChar = false; break;
+      case ':': isTokenChar = false; break;
+      case '(': isTokenChar = false; break;
+      case ')': isTokenChar = false; break;
+      case '[': isTokenChar = false; break;
+      case ']': isTokenChar = false; break;
+      case '<': isTokenChar = false; break;
+      case '>': isTokenChar = false; break;
+      case '&': isTokenChar = false; break;
+      case '+': isTokenChar = false; break;
+      case '"': isTokenChar = false; break;
+      case '\'': isTokenChar = false; break;
+      // case '\t': isTokenChar = false; break; 
+      // case '\n': isTokenChar = false; break;  // do not break words which are on another line
+    }
+    return isTokenChar;
+  }
+
+  /** Called on each token character to normalize it before it is added to the
+   * token.  The default implementation does nothing.  Subclasses may use this
+   * to, e.g., lowercase tokens. */
+  protected char normalize(char c) {
+    return c;
+  }
+
+  /** Returns the next token in the stream, or null at EOS. */
+  public final Token next() throws IOException {
+    int length = 0;
+    int start = offset;
+    while (true) {
+      final char c;
+      offset++;
+      if (bufferIndex >= dataLen) {
+        dataLen = input.read(ioBuffer);
+        bufferIndex = 0;
+      }
+      if (dataLen == -1) {
+        if (length > 0)
+          break;
+        else
+          return null;
+      } else {
+        c = ioBuffer[bufferIndex++];
+      }
+      if (isTokenChar(c)) {              // if it's a token char
+        if (length == 0)                 // start of token
+          start = offset - 1;
+        buffer[length++] = normalize(c); // buffer it, normalized
+        if (length == MAX_WORD_LEN)      // buffer overflow!
+          break;
+      } else if (length > 0)             // at non-Letter w/ chars
+        break;                           // return 'em
+    }
+    Token newToken = new Token(start, start + length);
+    newToken.setTermBuffer(buffer, 0, length);
+    if (normalizer != null) {
+      char[] termBuffer = newToken.termBuffer();
+      int termBufferLength = newToken.termLength();
+      String tokenText = new String(termBuffer, 0, termBufferLength);
+      try {
+        String normalizedTokenText = normalizer.normalize(tokenText);
+        int normalizedTokenTextLength = normalizedTokenText.length();
+        char[] normalizedTokenTextBuffer = normalizedTokenText.toCharArray();
+        newToken.setTermBuffer(normalizedTokenTextBuffer, 0, normalizedTokenTextLength);
+      } catch (ApplicationException e) {
+        throw new IOException(e);        
+      }
+    }
+    return newToken;
+  }
+}