view software/mpdl-services-new/mpiwg-mpdl-cms-web/src/de/mpg/mpiwg/berlin/mpdl/servlets/lt/Tokenize.java @ 25:e9fe3186670c default tip

letzter Stand eingecheckt
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 21 May 2013 10:19:32 +0200
parents
children
line wrap: on
line source

package de.mpg.mpiwg.berlin.mpdl.servlets.lt;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;
import java.io.StringReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.Hashtable;

import javax.servlet.ServletConfig;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import org.apache.commons.io.IOUtils;

import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
import de.mpg.mpiwg.berlin.mpdl.lt.dict.app.Lexicon;
import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.Token;
import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.Tokenizer;
import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.XmlTokenizer;

public class Tokenize extends HttpServlet {
  private static final long serialVersionUID = 1L;

  public Tokenize() {
    super();
  }

  public void init(ServletConfig config) throws ServletException  {
    super.init(config);
  }

  protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
    doGet(request, response);
  }  

  protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
    request.setCharacterEncoding("utf-8");
    response.setCharacterEncoding("utf-8");
    String inputString = request.getParameter("inputString");
    String srcUrlStr = request.getParameter("srcUrl");
    String language = request.getParameter("language");
    String normalization = request.getParameter("normalization");
    String stopElements = request.getParameter("stopElements");
    String elements = request.getParameter("elements");
    String highlightTerms = request.getParameter("highlightTerms");
    String outputFormat = request.getParameter("outputFormat");
    String outputOptionsStr = request.getParameter("outputOptions");
    if (language == null)
      language = "eng";
    if (normalization == null)
      normalization = "norm";
    String[] normFunctions = normalization.split(" ");
    String[] stopElementsArray = null;
    if (stopElements != null && ! stopElements.equals(""))
      stopElementsArray = stopElements.split(" ");
    String[] elementsArray = null;
    if (elements != null && ! elements.equals(""))
      elementsArray = elements.split(" ");
    if (highlightTerms == null)
      highlightTerms = "";
    String[] highlightTermsArray = highlightTerms.split(" ");
    if (outputFormat == null)
      outputFormat = "xml";
    if (outputOptionsStr == null)
      outputOptionsStr = "";
    String[] outputOptions = outputOptionsStr.split(" ");
    String result = null;
    try {
      if (outputFormat.equals("xml")) {
        response.setContentType("text/xml");
      } else if (outputFormat.equals("string")) {
        response.setContentType("text/html");
      } else { 
        response.setContentType("text/xml");
      }
      response.setCharacterEncoding("utf-8");
      PrintWriter out = response.getWriter();
      String inputText = null;  // contains string or xml text
      if ((inputString == null || inputString.isEmpty()) && (srcUrlStr == null || srcUrlStr.isEmpty())) {
        out.print("request parameter \"inputString\" or  \"srcUrl\" is empty. Please specify \"inputString\"");
        out.close();
        return;
      } else {
        if (srcUrlStr != null && ! srcUrlStr.isEmpty()) {
          URL srcUrl = new URL(srcUrlStr);
          InputStream inputStream = srcUrl.openStream();
          BufferedInputStream in = new BufferedInputStream(inputStream);
          inputText = IOUtils.toString(in, "utf-8");
          in.close();
        } else if (inputString != null && ! inputString.isEmpty()) {
          inputText = inputString;
        }
      }
      inputText = inputText.trim();
      // Tokenize
      boolean inputTextIsXml = false;
      if (inputText != null && inputText.startsWith("<")  && inputText.endsWith(">"))  // TODO check properly for xml type of the inputText
        inputTextIsXml = true;
      if (! inputTextIsXml) {
        inputText = "<result>" + inputText + "</result>";
      }
      StringReader xmlInputStringReader = new StringReader(inputText);
      XmlTokenizer xmlTokenizer = new XmlTokenizer(xmlInputStringReader);
      xmlTokenizer.setDocIdentifier(srcUrlStr);  // TODO
      xmlTokenizer.setLanguage(language);
      xmlTokenizer.setNormFunctions(normFunctions);
      xmlTokenizer.setOutputFormat(outputFormat);
      xmlTokenizer.setOutputOptions(outputOptions);
      if (stopElementsArray != null && stopElementsArray.length > 0)
        xmlTokenizer.setStopElements(stopElementsArray);
      if (elementsArray != null && elementsArray.length > 0)
        xmlTokenizer.setElements(elementsArray);
      if (highlightTermsArray != null && highlightTermsArray.length > 0)
        xmlTokenizer.setHighlightTerms(highlightTermsArray);
      xmlTokenizer.tokenize();
      if (outputFormat != null && outputFormat.equals("xml")) {  
        result = xmlTokenizer.getXmlResult();
      } else {  // outputFormat == string
        result = xmlTokenizer.getStringResult();
      }
      if (result != null)
        out.print(result);
      out.close();
    } catch (ApplicationException e) { 
      throw new ServletException(e);
    }
  }

  private ArrayList<String> getToken(String inputString, String language, String[] normFunctions) throws ApplicationException {
    ArrayList<String> retTokens = null;
    try {
      StringReader reader = new StringReader(inputString);
      Tokenizer tokenizer = new Tokenizer(reader);
      tokenizer.setLanguage(language);
      tokenizer.setNormFunctions(normFunctions);
      ArrayList<Token> tokens = tokenizer.getTokens();
      if (tokens != null) {
        retTokens = new ArrayList<String>();
        for (int i=0; i<tokens.size(); i++) {
          Token t = tokens.get(i);
          String tokenStr = t.getContentOrig();
          if (useNormFunction(normFunctions))
            tokenStr = t.getContentNorm();
          retTokens.add(tokenStr);
        }
      }
      tokenizer.end();
      tokenizer.close();
    } catch (IOException e) {
      throw new ApplicationException(e);
    }
    return retTokens;
  }

  private String createXmlOutputString(ArrayList<String> tokens, Hashtable<String, ArrayList<Lexicon>> tokensDictionaries, String baseUrl, String elapsedTime) {
    StringBuilder result = new StringBuilder();
    result.append("<result>");
    result.append("<provider>" + "MPIWG MPDL language technology service (see: " + "" + baseUrl + "), Max Planck Institute for the History of Science, Berlin." + "</provider>");
    result.append("<elapsed-time-ms>" + elapsedTime + "</elapsed-time-ms>");
    if (tokens != null && ! tokens.isEmpty()) {
      result.append("<tokens>");
      for (int i=0; i<tokens.size(); i++) {
        String token = tokens.get(i);
        result.append("<token>");
        result.append("<name>" + token + "</name>");
        if (tokensDictionaries != null && ! tokensDictionaries.isEmpty()) {
          ArrayList<Lexicon> tokenDictionaries = tokensDictionaries.get(token);
          if (tokenDictionaries != null) {
            result.append("<dictionaries>");
            for (int j=0; j<tokenDictionaries.size(); j++) {
              Lexicon lexicon = tokenDictionaries.get(j);
              result.append(lexicon.toXmlString());
            }
            result.append("</dictionaries>");
          }
        }
        result.append("</token>");
      }
      result.append("</tokens>");
    }
    result.append("</result>");
    return result.toString();
  }  
  
  private String createStringOutputString(ArrayList<String> tokens) {
    StringBuilder result = new StringBuilder();
    if (tokens != null && ! tokens.isEmpty()) {
      for (int i=0; i<tokens.size(); i++) {
        String token = tokens.get(i);
        result.append(token + " ");
      }
      result.setLength(result.length() - 1); // without last blank
    }
    return result.toString();
  }  

  private boolean useNormFunction(String[] normFunctions) {
    boolean useNorm = false;
    for (int i=0; i< normFunctions.length; i++) {
      String function = normFunctions[i];
      if (function.equals("norm"))
        return true;
    }
    return useNorm;
  }

}