diff software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/TokenizerNew.java @ 23:e845310098ba

diverse Korrekturen
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 27 Nov 2012 12:35:19 +0100
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/TokenizerNew.java	Tue Nov 27 12:35:19 2012 +0100
@@ -0,0 +1,162 @@
+package de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.ArrayList;
+
+import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
+import de.mpg.mpiwg.berlin.mpdl.lt.general.Language;
+import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.Normalizer;
+
+/**
+ * Simple character-based tokenizer.
+ * <p>
+ * Splits the input into maximal runs of token characters (see
+ * {@link #isTokenChar(char)}); for Chinese every token character becomes a
+ * token of its own. Each token records its position in the input, counted in
+ * UTF-16 chars, and the form produced by the {@link Normalizer} function "norm".
+ * @author jwillenborg
+ */
+public class TokenizerNew  {
+  /** Element mark characters (U+2424, U+2425) that are deleted from every token. */
+  private static final String ELEMENT_MARKS = "\u2424\u2425";
+  /** Blank, newline, tab, minus, soft hyphen: deleted so that the normalizer works properly. */
+  private static final String SPECIAL_SYMBOLS = " \n\t-\u00AD";
+
+  private final StringReader input;
+  // NOTE(review): the constructor always overwrites this default, also with null
+  private String language = "eng";  // default: english
+  private final Normalizer normalizer;
+  private ArrayList<Token> tokens;
+
+  /**
+   * @param input text to tokenize; {@link #tokenize()} reads it to the end and closes it
+   * @param language language passed to the normalizer and used to select Chinese tokenization
+   */
+  public TokenizerNew(StringReader input, String language) {
+    this.input = input;
+    this.language = language;
+    String[] normFunctions = {"norm"};
+    this.normalizer = new Normalizer(normFunctions, language);
+  }
+
+  /**
+   * Tokenizes the whole input and closes the reader.
+   * @return the tokens in input order, or null if the input contains no token
+   * @throws ApplicationException if reading the input fails
+   */
+  public ArrayList<Token> tokenize() throws ApplicationException {
+    boolean chinese = Language.getInstance().isChinese(language);
+    tokens = new ArrayList<Token>();
+    try {
+      if (chinese) {
+        tokenizeChinese();
+      } else {
+        tokenizeAll();
+      }
+    } catch (IOException e) {
+      throw new ApplicationException(e);
+    } finally {
+      input.close();  // release the reader even if reading or normalizing fails
+    }
+    return tokens.isEmpty() ? null : tokens;
+  }
+
+  /**
+   * Adds every maximal run of token characters as one token. The end position
+   * handed to the token is the index of the last char of the run (inclusive).
+   */
+  private void tokenizeAll() throws IOException, ApplicationException {
+    StringBuilder run = new StringBuilder();  // chars of the token being read
+    int runStart = -1;
+    int pos = 0;
+    int cInt;
+    while ((cInt = input.read()) != -1) {
+      char c = (char) cInt;
+      if (isTokenChar(c)) {
+        if (run.length() == 0) {
+          runStart = pos;
+        }
+        run.append(c);
+      } else if (run.length() > 0) {
+        addToken(runStart, pos - 1, run.toString());
+        run.setLength(0);
+      }
+      pos++;
+    }
+    // add last token if last char of input is a token char
+    if (run.length() > 0) {
+      addToken(runStart, pos - 1, run.toString());
+    }
+  }
+
+  /**
+   * Adds each token character as a single token. A surrogate pair (a character
+   * outside the Basic Multilingual Plane) is kept together as one token.
+   * NOTE(review): the end position is start + length (exclusive) here but
+   * inclusive in tokenizeAll(); confirm which one Token expects.
+   */
+  private void tokenizeChinese() throws IOException, ApplicationException {
+    int pos = 0;
+    int cInt;
+    while ((cInt = input.read()) != -1) {
+      char c = (char) cInt;
+      String tStr = String.valueOf(c);
+      if (Character.isHighSurrogate(c)) {
+        input.mark(1);  // look ahead for the low half of the pair
+        int next = input.read();
+        if (next != -1 && Character.isLowSurrogate((char) next)) {
+          tStr = tStr + (char) next;
+        } else {
+          input.reset();  // unpaired surrogate: handle the next char on its own
+        }
+      }
+      if (isTokenChar(c)) {
+        addToken(pos, pos + tStr.length(), tStr);
+      }
+      pos += tStr.length();
+    }
+  }
+
+  /** Returns false for whitespace, punctuation and the element mark U+2425, true otherwise. */
+  private boolean isTokenChar(char c) {
+    switch (c) {
+      case ' ': case '\t': case '\n':  // blank, tab and newline end a word
+      case '.': case ',': case '!': case '?': case ';': case ':':
+      case '(': case ')': case '[': case ']': case '{': case '}': case '<': case '>':
+      case '/': case '=': case '&': case '+': case '#':
+      case '"': case '„': case '“': case '«': case '»': case '\'':
+      case '\u2425':  // special char for marking xml elements
+        return false;
+      default:
+        return true;
+    }
+  }
+
+  /** Cleans the token text, normalizes it and appends the resulting token to {@link #tokens}. */
+  private void addToken(int startPos, int endPos, String tokenStr) throws ApplicationException {
+    String content = removeSpecialSymbols(removeElementMarks(tokenStr));
+    String contentNorm = normalizer.normalize(content);
+    Token token = new Token(startPos, endPos, content);
+    token.setContentNorm(contentNorm);
+    tokens.add(token);
+  }
+
+  private String removeElementMarks(String inputStr) {
+    return removeChars(inputStr, ELEMENT_MARKS);
+  }
+
+  private String removeSpecialSymbols(String inputStr) {
+    return removeChars(inputStr, SPECIAL_SYMBOLS);
+  }
+
+  /** Returns inputStr without the chars contained in unwanted; avoids compiling a regex per token. */
+  private static String removeChars(String inputStr, String unwanted) {
+    StringBuilder kept = new StringBuilder(inputStr.length());
+    for (int i = 0; i < inputStr.length(); i++) {
+      if (unwanted.indexOf(inputStr.charAt(i)) < 0) {
+        kept.append(inputStr.charAt(i));
+      }
+    }
+    return kept.toString();
+  }
+}
\ No newline at end of file