Mercurial > hg > mpdl-group

diff software/mpdl-services/mpiwg-mpdl-lt-web/src/de/mpg/mpiwg/berlin/mpdl/servlets/lt/Tokenize.java @ 19:4a3641ae14d2
Erstellung
author: Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date: Wed, 09 Nov 2011 15:32:05 +0100
children: 4ea0f81a5d08
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/software/mpdl-services/mpiwg-mpdl-lt-web/src/de/mpg/mpiwg/berlin/mpdl/servlets/lt/Tokenize.java	Wed Nov 09 15:32:05 2011 +0100
@@ -0,0 +1,233 @@
+package de.mpg.mpiwg.berlin.mpdl.servlets.lt;
+
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.PrintWriter;
+import java.io.StringReader;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Date;
+import java.util.Hashtable;
+
+import javax.servlet.ServletConfig;
+import javax.servlet.ServletException;
+import javax.servlet.http.HttpServlet;
+import javax.servlet.http.HttpServletRequest;
+import javax.servlet.http.HttpServletResponse;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
+import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
+import de.mpg.mpiwg.berlin.mpdl.lt.dict.app.Lexicon;
+import de.mpg.mpiwg.berlin.mpdl.lt.dict.db.LexHandler;
+import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma;
+import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.Token;
+import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.Tokenizer;
+import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.XmlTokenizer;
+import de.mpg.mpiwg.berlin.mpdl.servlets.util.ServletUtil;
+
+/**
+ * Servlet that tokenizes a text and optionally looks up dictionary entries
+ * for every token.
+ *
+ * <p>Request parameters read by {@link #doGet}:
+ * <ul>
+ *   <li>{@code inputString} - the text to tokenize (plain text or XML)</li>
+ *   <li>{@code srcUrl} - URL of the text to tokenize; takes precedence over {@code inputString}</li>
+ *   <li>{@code language} - defaults to {@code "eng"}</li>
+ *   <li>{@code normalization} - blank separated normalization functions, defaults to {@code "norm"}</li>
+ *   <li>{@code dictionary} - {@code "yes"} (default) to add dictionary entries to the tokens</li>
+ *   <li>{@code stopElements} - blank separated element names, only used for XML input</li>
+ *   <li>{@code outputFormat} - {@code "xml"} (default) or {@code "string"}</li>
+ *   <li>{@code outputOptions} - blank separated options, only used for XML input</li>
+ * </ul>
+ */
+public class Tokenize extends HttpServlet {
+  private static final long serialVersionUID = 1L;
+
+  /** Character encoding used for the request, the response and fetched URL content. */
+  private static final String ENCODING = "utf-8";
+
+  public Tokenize() {
+    super();
+  }
+
+  @Override
+  public void init(ServletConfig config) throws ServletException  {
+    super.init(config);
+  }
+
+  /**
+   * Tokenizes the text given by {@code srcUrl} or {@code inputString} and writes the
+   * result to the response.
+   *
+   * <p>Text that starts with {@code <} and ends with {@code >} is handed to the
+   * {@link XmlTokenizer}; everything else is tokenized as plain text and rendered
+   * according to {@code outputFormat}.
+   *
+   * @throws ServletException if tokenizing or the dictionary lookup fails with an
+   *         {@link ApplicationException}
+   * @throws IOException if the source URL cannot be read or the response cannot be written
+   */
+  @Override
+  protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
+    long beginMillis = System.currentTimeMillis();
+    request.setCharacterEncoding(ENCODING);
+    String inputString = request.getParameter("inputString");
+    String srcUrlStr = request.getParameter("srcUrl");
+    String language = getParameter(request, "language", "eng");
+    String normalization = getParameter(request, "normalization", "norm");
+    String dictionary = getParameter(request, "dictionary", "yes");
+    String stopElements = getParameter(request, "stopElements", "");
+    String outputFormat = getParameter(request, "outputFormat", "xml");
+    String outputOptionsStr = getParameter(request, "outputOptions", "");
+    String[] normFunctions = normalization.split(" ");
+    // note: splitting an empty string yields an array holding one empty string
+    String[] stopElementsArray = stopElements.split(" ");
+    String[] outputOptions = outputOptionsStr.split(" ");
+    try {
+      // NOTE(review): "string" output is served as text/html although it contains
+      // unescaped token text; text/plain looks safer - confirm that no client
+      // depends on the current content type before changing it.
+      if (outputFormat.equals("string")) {
+        response.setContentType("text/html");
+      } else {
+        response.setContentType("text/xml");
+      }
+      response.setCharacterEncoding(ENCODING);
+      PrintWriter out = response.getWriter();
+      boolean hasSrcUrl = srcUrlStr != null && ! srcUrlStr.isEmpty();
+      boolean hasInputString = inputString != null && ! inputString.isEmpty();
+      if (! hasSrcUrl && ! hasInputString) {
+        out.print("request parameter \"inputString\" or  \"srcUrl\" is empty. Please specify \"inputString\"");
+        out.close();
+        return;
+      }
+      // contains string or xml text; srcUrl wins if both parameters are given
+      String inputText = hasSrcUrl ? readUrl(srcUrlStr) : inputString;
+      inputText = inputText.trim();
+      // TODO check properly for xml type of the inputText
+      boolean inputTextIsXml = inputText.startsWith("<") && inputText.endsWith(">");
+      String result;
+      if (! inputTextIsXml) {
+        ArrayList<String> tokens = getToken(inputText, language, normFunctions);
+        Hashtable<String, ArrayList<Lexicon>> tokensDictionaries = null;
+        if (dictionary.equals("yes"))
+          tokensDictionaries = getTokensDictionaries(tokens, language);
+        String baseUrl = ServletUtil.getInstance().getBaseUrl(request);
+        String elapsedTime = String.valueOf(System.currentTimeMillis() - beginMillis);
+        if (outputFormat.equals("xml"))
+          result = createXmlOutputString(tokens, tokensDictionaries, baseUrl, elapsedTime);
+        else if (outputFormat.equals("string"))
+          result = createStringOutputString(tokens);
+        else  // the format name comes from the request, so it has to be escaped
+          result = "<result><error>outputFormat: \"" + escapeXml(outputFormat) + "\" is not supported</error></result>";
+      } else {
+        XmlTokenizer xmlTokenizer = new XmlTokenizer(new StringReader(inputText));
+        xmlTokenizer.setLanguage(language);
+        xmlTokenizer.setNormFunctions(normFunctions);
+        xmlTokenizer.setOutputOptions(outputOptions);
+        xmlTokenizer.setStopElements(stopElementsArray);
+        result = xmlTokenizer.tokenize();
+      }
+      if (result != null)
+        out.print(result);
+      out.close();
+    } catch (ApplicationException e) {
+      throw new ServletException(e);
+    }
+  }
+
+  /**
+   * Returns the named request parameter or the given default if the parameter is missing.
+   */
+  private String getParameter(HttpServletRequest request, String name, String defaultValue) {
+    String value = request.getParameter(name);
+    return (value == null) ? defaultValue : value;
+  }
+
+  /**
+   * Reads the complete content of the given URL as utf-8 text. The stream is closed
+   * even if reading fails.
+   *
+   * @throws IOException if the URL is malformed or cannot be read
+   */
+  private String readUrl(String srcUrlStr) throws IOException {
+    // NOTE(review): the URL comes straight from the request and every scheme that
+    // java.net.URL supports is accepted (including "file:"), so a caller can make
+    // the server read local files or internal hosts. Consider restricting this to
+    // http/https - confirm that no caller depends on other schemes first.
+    URL srcUrl = new URL(srcUrlStr);
+    InputStream in = new BufferedInputStream(srcUrl.openStream());
+    try {
+      return IOUtils.toString(in, ENCODING);
+    } finally {
+      in.close();
+    }
+  }
+
+  /**
+   * Looks up the dictionary entries of each distinct token.
+   *
+   * @return table from token to its dictionary entries; tokens for which the lookup
+   *         delivers null have no entry in the table
+   */
+  private Hashtable<String, ArrayList<Lexicon>> getTokensDictionaries(ArrayList<String> tokens, String language) throws ApplicationException {
+    Hashtable<String, ArrayList<Lexicon>> tokensDictionaries = new Hashtable<String, ArrayList<Lexicon>>();
+    LexHandler lexHandler = LexHandler.getInstance();
+    for (String token : tokens) {
+      if (tokensDictionaries.containsKey(token))
+        continue;  // the same token was already looked up
+      ArrayList<Lemma> lemmas = lexHandler.getLemmas(token, "form", language, "none");
+      ArrayList<Lexicon> dictionaries = lexHandler.getLexEntries(lemmas, language, null);
+      // Hashtable rejects null values with a NullPointerException
+      if (dictionaries != null)
+        tokensDictionaries.put(token, dictionaries);
+    }
+    return tokensDictionaries;
+  }
+
+  /**
+   * Tokenizes the given plain text.
+   *
+   * @return the content of all tokens in the order delivered by the tokenizer;
+   *         empty (never null) if the tokenizer delivers no tokens
+   * @throws ApplicationException if the tokenizer fails with an {@link IOException}
+   */
+  private ArrayList<String> getToken(String inputString, String language, String[] normFunctions) throws ApplicationException {
+    ArrayList<String> retTokens = new ArrayList<String>();
+    try {
+      Tokenizer tokenizer = new Tokenizer(new StringReader(inputString));
+      try {
+        tokenizer.setLanguage(language);
+        tokenizer.setNormFunctions(normFunctions);
+        ArrayList<Token> tokens = tokenizer.getTokens();
+        if (tokens != null) {
+          for (Token t : tokens) {
+            retTokens.add(t.getContent());
+          }
+        }
+        tokenizer.end();
+      } finally {
+        // close the tokenizer even if tokenizing fails
+        tokenizer.close();
+      }
+    } catch (IOException e) {
+      throw new ApplicationException(e);
+    }
+    return retTokens;
+  }
+
+  /**
+   * Builds the XML result document: provider, elapsed time and, if there are tokens,
+   * one {@code <token>} element per token together with its dictionary entries.
+   *
+   * @param tokensDictionaries dictionary entries by token; may be null
+   */
+  private String createXmlOutputString(ArrayList<String> tokens, Hashtable<String, ArrayList<Lexicon>> tokensDictionaries, String baseUrl, String elapsedTime) {
+    StringBuilder result = new StringBuilder("<result>");
+    result.append("<provider>MPIWG MPDL language technology service (see: ");
+    result.append(escapeXml(baseUrl));
+    result.append("), Max Planck Institute for the History of Science, Berlin.</provider>");
+    result.append("<elapsed-time-ms>").append(elapsedTime).append("</elapsed-time-ms>");
+    if (tokens != null && ! tokens.isEmpty()) {
+      boolean hasDictionaries = tokensDictionaries != null && ! tokensDictionaries.isEmpty();
+      result.append("<tokens>");
+      for (String token : tokens) {
+        result.append("<token>");
+        // token text is arbitrary input and must not break the markup
+        result.append("<name>").append(escapeXml(token)).append("</name>");
+        ArrayList<Lexicon> tokenDictionaries = hasDictionaries ? tokensDictionaries.get(token) : null;
+        if (tokenDictionaries != null) {
+          result.append("<dictionaries>");
+          for (Lexicon lexicon : tokenDictionaries) {
+            result.append(lexicon.toXmlString());
+          }
+          result.append("</dictionaries>");
+        }
+        result.append("</token>");
+      }
+      result.append("</tokens>");
+    }
+    result.append("</result>");
+    return result.toString();
+  }
+
+  /**
+   * Joins the tokens with single blanks.
+   *
+   * @return the joined tokens; the empty string if tokens is null or empty
+   */
+  private String createStringOutputString(ArrayList<String> tokens) {
+    StringBuilder result = new StringBuilder();
+    if (tokens != null) {
+      for (int i=0; i<tokens.size(); i++) {
+        if (i > 0)
+          result.append(' ');
+        result.append(tokens.get(i));
+      }
+    }
+    return result.toString();
+  }
+
+  /**
+   * Replaces the characters which are markup in XML element content
+   * ({@code &}, {@code <} and {@code >}) by their entities.
+   *
+   * @return the escaped text; the empty string if text is null
+   */
+  private static String escapeXml(String text) {
+    if (text == null)
+      return "";
+    StringBuilder escaped = new StringBuilder(text.length() + 16);
+    for (int i=0; i<text.length(); i++) {
+      char c = text.charAt(i);
+      switch (c) {
+        case '&': escaped.append("&amp;"); break;
+        case '<': escaped.append("&lt;"); break;
+        case '>': escaped.append("&gt;"); break;
+        default: escaped.append(c);
+      }
+    }
+    return escaped.toString();
+  }
+
+  /**
+   * Former variant of {@link #getToken}, which reads the terms through the
+   * {@link CharTermAttribute} of the tokenizer. Not called within this class;
+   * kept for reference.
+   *
+   * @throws ApplicationException if the tokenizer fails with an {@link IOException}
+   */
+  @SuppressWarnings("unused")
+  private ArrayList<String> getTokenOld(String inputString, String language, String[] normFunctions) throws ApplicationException {
+    ArrayList<String> tokens = new ArrayList<String>();
+    try {
+      Tokenizer tokenizer = new Tokenizer(new StringReader(inputString));
+      try {
+        tokenizer.setLanguage(language);
+        tokenizer.setNormFunctions(normFunctions);
+        CharTermAttribute charTermAttribute = tokenizer.getAttribute(CharTermAttribute.class);
+        while (tokenizer.incrementToken()) {
+          tokens.add(charTermAttribute.toString());
+        }
+        tokenizer.end();
+      } finally {
+        // close the tokenizer even if tokenizing fails
+        tokenizer.close();
+      }
+    } catch (IOException e) {
+      throw new ApplicationException(e);
+    }
+    return tokens;
+  }
+
+}
author	Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date	Wed, 09 Nov 2011 15:32:05 +0100
parents
children	4ea0f81a5d08