Mercurial > hg > mpdl-group

package de.mpg.mpiwg.berlin.mpdl.servlets.lt;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;
import java.io.StringReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.Date;
import java.util.Hashtable;

import javax.servlet.ServletConfig;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import org.apache.commons.io.IOUtils;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
import de.mpg.mpiwg.berlin.mpdl.lt.dict.app.Lexicon;
import de.mpg.mpiwg.berlin.mpdl.lt.dict.db.LexHandler;
import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma;
import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.Token;
import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.Tokenizer;
import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.XmlTokenizer;
import de.mpg.mpiwg.berlin.mpdl.servlets.util.ServletUtil;

public class Tokenize extends HttpServlet {
  private static final long serialVersionUID = 1L;

  public Tokenize() {
    super();
  }

  public void init(ServletConfig config) throws ServletException  {
    super.init(config);
  }

  protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
    Date begin = new Date();
    request.setCharacterEncoding("utf-8");
    response.setCharacterEncoding("utf-8");
    String inputString = request.getParameter("inputString");
    String srcUrlStr = request.getParameter("srcUrl");
    String language = request.getParameter("language");
    String normalization = request.getParameter("normalization");
    String dictionary = request.getParameter("dictionary");
    String stopElements = request.getParameter("stopElements");
    String outputFormat = request.getParameter("outputFormat");
    String outputOptionsStr = request.getParameter("outputOptions");
    if (language == null)
      language = "eng";
    if (normalization == null)
      normalization = "norm";
    String[] normFunctions = normalization.split(" ");
    if (dictionary == null)
      dictionary = "yes";
    if (stopElements == null)
      stopElements = "";
    String[] stopElementsArray = stopElements.split(" ");
    if (outputFormat == null)
      outputFormat = "xml";
    if (outputOptionsStr == null)
      outputOptionsStr = "";
    String[] outputOptions = outputOptionsStr.split(" ");
    String result = null;
    try {
      if (outputFormat.equals("xml")) {
        response.setContentType("text/xml");
      } else if (outputFormat.equals("string")) {
        response.setContentType("text/html");
      } else {
        response.setContentType("text/xml");
      }
      response.setCharacterEncoding("utf-8");
      PrintWriter out = response.getWriter();
      String inputText = null;  // contains string or xml text
      if ((inputString == null || inputString.isEmpty()) && (srcUrlStr == null || srcUrlStr.isEmpty())) {
        out.print("request parameter \"inputString\" or  \"srcUrl\" is empty. Please specify \"inputString\"");
        out.close();
        return;
      } else {
        if (srcUrlStr != null && ! srcUrlStr.isEmpty()) {
          URL srcUrl = new URL(srcUrlStr);
          InputStream inputStream = srcUrl.openStream();
          BufferedInputStream in = new BufferedInputStream(inputStream);
          inputText = IOUtils.toString(in, "utf-8");
          in.close();
        } else if (inputString != null && ! inputString.isEmpty()) {
          inputText = inputString;
        }
      }
      inputText = inputText.trim();
      // Tokenize
      boolean inputTextIsXml = false;
      if (inputText != null && inputText.startsWith("<")  && inputText.endsWith(">"))  // TODO check properly for xml type of the inputText
        inputTextIsXml = true;
      if (! inputTextIsXml) {
        ArrayList<String> tokens = getToken(inputText, language, normFunctions);
        Hashtable<String, ArrayList<Lexicon>> tokensDictionaries = null;
        if (dictionary.equals("yes")) {
          tokensDictionaries = new Hashtable<String, ArrayList<Lexicon>>();
          LexHandler lexHandler = LexHandler.getInstance();
          for (int i = 0; i < tokens.size(); i++) {
            String token = tokens.get(i);
            ArrayList<Lemma> lemmas = lexHandler.getLemmas(token, "form", language, "none");
            ArrayList<Lexicon> dictionaries = lexHandler.getLexEntries(lemmas, language, null);
            tokensDictionaries.put(token, dictionaries);
          }
        }
        String baseUrl = ServletUtil.getInstance().getBaseUrl(request);
        Date end = new Date();
        String elapsedTime = String.valueOf(end.getTime() - begin.getTime());
        if (outputFormat.equals("xml"))
          result = createXmlOutputString(tokens, tokensDictionaries, baseUrl, elapsedTime);
        else if (outputFormat.equals("string"))
          result = createStringOutputString(tokens);
        else
          result = "<result><error>outputFormat: \"" + outputFormat + "\" is not supported</error></result>";
      } else {
        StringReader xmlInputStringReader = new StringReader(inputText);
        XmlTokenizer xmlTokenizer = new XmlTokenizer(xmlInputStringReader);
        xmlTokenizer.setLanguage(language);
        xmlTokenizer.setNormFunctions(normFunctions);
        xmlTokenizer.setOutputOptions(outputOptions);
        if (stopElementsArray != null)
          xmlTokenizer.setStopElements(stopElementsArray);
        result = xmlTokenizer.tokenize();
      }
      if (result != null)
        out.print(result);
      out.close();
    } catch (ApplicationException e) {
      throw new ServletException(e);
    }
  }

  private ArrayList<String> getToken(String inputString, String language, String[] normFunctions) throws ApplicationException {
    ArrayList<String> retTokens = null;
    try {
      StringReader reader = new StringReader(inputString);
      Tokenizer tokenizer = new Tokenizer(reader);
      tokenizer.setLanguage(language);
      tokenizer.setNormFunctions(normFunctions);
      ArrayList<Token> tokens = tokenizer.getTokens();
      if (tokens != null) {
        retTokens = new ArrayList<String>();
        for (int i=0; i<tokens.size(); i++) {
          Token t = tokens.get(i);
          retTokens.add(t.getContent());
        }
      }
      tokenizer.end();
      tokenizer.close();
    } catch (IOException e) {
      throw new ApplicationException(e);
    }
    return retTokens;
  }

  private String createXmlOutputString(ArrayList<String> tokens, Hashtable<String, ArrayList<Lexicon>> tokensDictionaries, String baseUrl, String elapsedTime) {
    String result = "<result>";
    result = result + "<provider>" + "MPIWG MPDL language technology service (see: " + "" + baseUrl + "), Max Planck Institute for the History of Science, Berlin." + "</provider>";
    result = result + "<elapsed-time-ms>" + elapsedTime + "</elapsed-time-ms>";
    if (tokens != null && ! tokens.isEmpty()) {
      result = result + "<tokens>";
      for (int i=0; i<tokens.size(); i++) {
        String token = tokens.get(i);
        result = result + "<token>";
        result = result + "<name>" + token + "</name>";
        if (tokensDictionaries != null && ! tokensDictionaries.isEmpty()) {
          ArrayList<Lexicon> tokenDictionaries = tokensDictionaries.get(token);
          if (tokenDictionaries != null) {
            result = result + "<dictionaries>";
            for (int j=0; j<tokenDictionaries.size(); j++) {
              Lexicon lexicon = tokenDictionaries.get(j);
              result = result + lexicon.toXmlString();
            }
            result = result + "</dictionaries>";
          }
        }
        result = result + "</token>";
      }
      result = result + "</tokens>";
    }
    result = result + "</result>";
    return result;
  }

  private String createStringOutputString(ArrayList<String> tokens) {
    String result = "";
    if (tokens != null && ! tokens.isEmpty()) {
      for (int i=0; i<tokens.size(); i++) {
        String token = tokens.get(i);
        result = result + token + " ";
      }
      result = result.substring(0, result.length() - 1);  // without last blank
    }
    return result;
  }

  private ArrayList<String> getTokenOld(String inputString, String language, String[] normFunctions) throws ApplicationException {
    ArrayList<String> tokens = new ArrayList<String>();
    try {
      StringReader reader = new StringReader(inputString);
      Tokenizer tokenizer = new Tokenizer(reader);
      tokenizer.setLanguage(language);
      tokenizer.setNormFunctions(normFunctions);
      // tokenizer.reset();
      /*
      result = new MpdlFilter(result);  // filter to remove the hyphen in a token etc.
      result = new LowerCaseFilter(result);
      result = new StopFilter(result, stopSet);
      */
      CharTermAttribute charTermAttribute = tokenizer.getAttribute(CharTermAttribute.class);
      // Token token = tokenizer.getAttribute(Token.class);
      while (tokenizer.incrementToken()) {
        // String tokenStr = token.toString();
        String term = charTermAttribute.toString();
        tokens.add(term);
      }
      tokenizer.end();
      tokenizer.close();
    } catch (IOException e) {
      throw new ApplicationException(e);
    }
    return tokens;
  }

}
author	Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date	Wed, 09 Nov 2011 15:32:05 +0100
parents
children	4ea0f81a5d08