diff software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizer.java @ 6:2396a569e446

new functions: externalObjects, normalizer, Unicode2Betacode
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 08 Feb 2011 14:54:09 +0100
parents 408254cf2f1d
children 5df60f24e997
line wrap: on
line diff
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizer.java	Tue Feb 08 14:36:38 2011 +0100
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizer.java	Tue Feb 08 14:54:09 2011 +0100
@@ -11,11 +11,14 @@
 public class MpdlTokenizer extends Tokenizer {
   private static final int MAX_WORD_LEN = 255;
   private static final int IO_BUFFER_SIZE = 1024;
-  private String language;  // TODO make the tokenizer language dependent
+  private static final String SPECIAL_NOT_WORD_DELIM_SYMBOL = String.valueOf('\u2424');  // marker character U+2424; stripped from tokens by removeSpecialSymbols()
+  private boolean regWithoutSemicolon = false;  // hack: in some cases there are words with a semicolon, then the normalization should be without semicolon
+  private boolean isInNotWordDelimMode = false;  // set by next() after the marker character was read; while set, blank, tab and newline do not end the token
   private int offset = 0, bufferIndex = 0, dataLen = 0;
   private char[] buffer = new char[MAX_WORD_LEN];
   private char[] ioBuffer = new char[IO_BUFFER_SIZE];
   private MpdlNormalizer normalizer;
+  private String language;  // language code; next() delegates to nextChinese() when it equals "zh"
 
   public MpdlTokenizer(Reader input, String language) {
     super(input);
@@ -28,12 +31,22 @@
     this.normalizer = normalizer;
   }
 
+  public void setRegWithoutSemicolon(boolean regWithoutSemicolon) {  // when set to true, isTokenChar() accepts ';' as part of a token
+    this.regWithoutSemicolon = regWithoutSemicolon;  
+  }
+  
+  public boolean isRegWithoutSemicolon() {  // true if isTokenChar() currently accepts ';' as part of a token
+    return regWithoutSemicolon;  
+  }
+  
   /** Returns true iff a character should be included in a token.  This
    * tokenizer generates as tokens adjacent sequences of characters which
    * satisfy this predicate.  Characters for which this is false are used to
    * define token boundaries and are not included in tokens. */
   protected boolean isTokenChar(char c) {
     boolean isTokenChar = true;
+    if (isRegWithoutSemicolon() && c == ';')  // hack: special case for regularization and normalization; feel free to remove it later
+      return true;
     switch (c) {
       case ' ': isTokenChar = false; break;
       case '.': isTokenChar = false; break;
@@ -51,12 +64,37 @@
       case '&': isTokenChar = false; break;
       case '+': isTokenChar = false; break;
       case '"': isTokenChar = false; break;
+      case '„': isTokenChar = false; break;
+      case '“': isTokenChar = false; break;
+      case '«': isTokenChar = false; break;
+      case '»': isTokenChar = false; break;
       case '\'': isTokenChar = false; break;
-      // case '\t': isTokenChar = false; break; 
-      // case '\n': isTokenChar = false; break;  // do not break words which are on another line
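+      case '\t': isTokenChar = false; break; // tab ends a token (it is kept only in not-word-delim mode, see isTokenCharInNotWordDelimMode)
+      case '\n': isTokenChar = false; break;  // newline ends a token (it is kept only in not-word-delim mode, see isTokenCharInNotWordDelimMode)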
     }
     return isTokenChar;
   }
+  
+  /** Returns true iff c is a blank, tab or newline AND the tokenizer is currently
+   * in not-word-delim mode (isInNotWordDelimMode is set); such whitespace is
+   * then kept inside the token although isTokenChar() rejects it. */
+  protected boolean isTokenCharInNotWordDelimMode(char c) {
+    if (! isInNotWordDelimMode) {
+      return false;  // mode is off: no additional characters are accepted
+    }
+    final boolean blank = (c == ' ');
+    final boolean tabOrNewline = (c == '\t') || (c == '\n');
+    return blank || tabOrNewline;
+  }
+  
+  /** Returns true iff c is U+2424 (the unicode symbol for newline), the marker
+   * character that makes next() switch into not-word-delim mode. */
+  protected boolean isSpecialNotWordDelimSymbol(char c) {
+    final char marker = '\u2424';
+    final boolean matchesMarker = (c == marker);
+    return matchesMarker;
+  }
+  
 
   /** Called on each token character to normalize it before it is added to the
    * token.  The default implementation does nothing.  Subclasses may use this
@@ -67,6 +105,8 @@
 
   /** Returns the next token in the stream, or null at EOS. */
   public final Token next() throws IOException {
+    if (language != null && language.equals("zh"))
+      return nextChinese();
     int length = 0;
     int start = offset;
     while (true) {
@@ -84,7 +124,13 @@
       } else {
         c = ioBuffer[bufferIndex++];
       }
-      if (isTokenChar(c)) {              // if it's a token char
+      if (isInNotWordDelimMode && isTokenChar(c) && (! isSpecialNotWordDelimSymbol(c))) {
+        isInNotWordDelimMode = false;
+      }
+      if (isSpecialNotWordDelimSymbol(c)) { 
+        isInNotWordDelimMode = true;
+      }
+      if (isTokenChar(c) || isTokenCharInNotWordDelimMode(c)) {              // if it's a token char
         if (length == 0)                 // start of token
           start = offset - 1;
         buffer[length++] = normalize(c); // buffer it, normalized
@@ -93,8 +139,10 @@
       } else if (length > 0)             // at non-Letter w/ chars
         break;                           // return 'em
     }
+    isInNotWordDelimMode = false;
     Token newToken = new Token(start, start + length);
     newToken.setTermBuffer(buffer, 0, length);
+    removeSpecialSymbols(newToken);  // remove some special symbols in token (e.g. symbol for word delimiting xml elements)
     if (normalizer != null) {
       char[] termBuffer = newToken.termBuffer();
       int termBufferLength = newToken.termLength();
@@ -110,4 +158,75 @@
     }
     return newToken;
   }
+  
+  /** Strips every occurrence of SPECIAL_NOT_WORD_DELIM_SYMBOL (the symbol which marks word
+   * delimiting xml elements) from the token's term text and returns the same token instance. */
+  private Token removeSpecialSymbols(Token token) {
+    String tokenText = new String(token.termBuffer(), 0, token.termLength());
+    // literal replace(): the marker is plain text, not a regular expression, so no pattern is compiled per token
+    String newTokenText = tokenText.replace(SPECIAL_NOT_WORD_DELIM_SYMBOL, "");
+    char[] newTokenTextBuffer = newTokenText.toCharArray();
+    token.setTermBuffer(newTokenTextBuffer, 0, newTokenTextBuffer.length);
+    return token;
+  }
+  
+
+  
+  /* 
+   * chinese Tokenizer: taken from org.apache.lucene.analysis.cn.ChineseTokenizer    
+   * 
+   */
+  private int length;  // Chinese mode: number of chars buffered for the pending token (next() declares its own local 'length')
+  private int start;  // Chinese mode: start offset of the pending token (next() declares its own local 'start')
+
+  private final void push(char c) {  // appends c, lower-cased, to the pending token
+    final char lowered = Character.toLowerCase(c);
+    if (length == 0) { start = offset - 1; }  // first char of the token: nextChinese() has already advanced offset past c
+    buffer[length++] = lowered;
+  }
+
+  /** Returns a Token for the buffered chars (offsets start .. start+length), or null if none are buffered. */
+  private final Token flush() {
+    if (length <= 0) {
+      return null;
+    }
+    return new Token(new String(buffer, 0, length), start, start + length);
+  }
+
+  public final Token nextChinese() throws IOException {  // returns the next token in Chinese mode, or null once the input is exhausted and nothing is buffered
+    length = 0;  // nothing buffered yet for this token
+    start = offset;
+    while (true) {
+      final char c;
+      offset++;  // count the character that is about to be consumed
+      if (bufferIndex >= dataLen) {  // ioBuffer is used up: refill it from the reader
+        dataLen = input.read(ioBuffer);
+        bufferIndex = 0;
+      }
+      if (dataLen == -1)   // end of input: emit whatever is buffered (flush() yields null if nothing is)
+        return flush();
+      else
+        c = ioBuffer[bufferIndex++];
+      switch(Character.getType(c)) {
+        case Character.DECIMAL_DIGIT_NUMBER:
+        case Character.LOWERCASE_LETTER:
+        case Character.UPPERCASE_LETTER:  // digits and cased letters accumulate into one token
+          push(c);
+          if (length == MAX_WORD_LEN)   // buffer is full: cut the token here
+            return flush();
+          break;
+        case Character.OTHER_LETTER:  // each OTHER_LETTER character is returned as a one-character token
+          if (length>0) {  // a token is pending: un-read c so the next call sees it again, then emit the pending token
+            bufferIndex--;
+            offset--;
+            return flush();
+          }
+          push(c);
+          return flush();
+        default:  // any other character type ends a pending token and is itself dropped
+          if (length>0) 
+            return flush();
+          break;
+      }
+    }
+  }
 }