Mercurial > hg > mpdl-group
diff software/mpdl-services-new/mpiwg-mpdl-cms-web/src/de/mpg/mpiwg/berlin/mpdl/servlets/lt/Tokenize.java @ 25:e9fe3186670c default tip
letzter Stand eingecheckt
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 21 May 2013 10:19:32 +0200 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services-new/mpiwg-mpdl-cms-web/src/de/mpg/mpiwg/berlin/mpdl/servlets/lt/Tokenize.java Tue May 21 10:19:32 2013 +0200 @@ -0,0 +1,212 @@ +package de.mpg.mpiwg.berlin.mpdl.servlets.lt; + +import java.io.BufferedInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.PrintWriter; +import java.io.StringReader; +import java.net.URL; +import java.util.ArrayList; +import java.util.Hashtable; + +import javax.servlet.ServletConfig; +import javax.servlet.ServletException; +import javax.servlet.http.HttpServlet; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; + +import org.apache.commons.io.IOUtils; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.lt.dict.app.Lexicon; +import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.Token; +import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.Tokenizer; +import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.XmlTokenizer; + +public class Tokenize extends HttpServlet { + private static final long serialVersionUID = 1L; + + public Tokenize() { + super(); + } + + public void init(ServletConfig config) throws ServletException { + super.init(config); + } + + protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { + doGet(request, response); + } + + protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { + request.setCharacterEncoding("utf-8"); + response.setCharacterEncoding("utf-8"); + String inputString = request.getParameter("inputString"); + String srcUrlStr = request.getParameter("srcUrl"); + String language = request.getParameter("language"); + String normalization = request.getParameter("normalization"); + String stopElements = request.getParameter("stopElements"); + String elements = request.getParameter("elements"); + String highlightTerms = request.getParameter("highlightTerms"); + String outputFormat = request.getParameter("outputFormat"); + String outputOptionsStr = request.getParameter("outputOptions"); + if (language == null) + language = "eng"; + if (normalization == null) + normalization = "norm"; + String[] normFunctions = normalization.split(" "); + String[] stopElementsArray = null; + if (stopElements != null && ! stopElements.equals("")) + stopElementsArray = stopElements.split(" "); + String[] elementsArray = null; + if (elements != null && ! elements.equals("")) + elementsArray = elements.split(" "); + if (highlightTerms == null) + highlightTerms = ""; + String[] highlightTermsArray = highlightTerms.split(" "); + if (outputFormat == null) + outputFormat = "xml"; + if (outputOptionsStr == null) + outputOptionsStr = ""; + String[] outputOptions = outputOptionsStr.split(" "); + String result = null; + try { + if (outputFormat.equals("xml")) { + response.setContentType("text/xml"); + } else if (outputFormat.equals("string")) { + response.setContentType("text/html"); + } else { + response.setContentType("text/xml"); + } + response.setCharacterEncoding("utf-8"); + PrintWriter out = response.getWriter(); + String inputText = null; // contains string or xml text + if ((inputString == null || inputString.isEmpty()) && (srcUrlStr == null || srcUrlStr.isEmpty())) { + out.print("request parameter \"inputString\" or \"srcUrl\" is empty. Please specify \"inputString\""); + out.close(); + return; + } else { + if (srcUrlStr != null && ! srcUrlStr.isEmpty()) { + URL srcUrl = new URL(srcUrlStr); + InputStream inputStream = srcUrl.openStream(); + BufferedInputStream in = new BufferedInputStream(inputStream); + inputText = IOUtils.toString(in, "utf-8"); + in.close(); + } else if (inputString != null && ! inputString.isEmpty()) { + inputText = inputString; + } + } + inputText = inputText.trim(); + // Tokenize + boolean inputTextIsXml = false; + if (inputText != null && inputText.startsWith("<") && inputText.endsWith(">")) // TODO check properly for xml type of the inputText + inputTextIsXml = true; + if (! inputTextIsXml) { + inputText = "<result>" + inputText + "</result>"; + } + StringReader xmlInputStringReader = new StringReader(inputText); + XmlTokenizer xmlTokenizer = new XmlTokenizer(xmlInputStringReader); + xmlTokenizer.setDocIdentifier(srcUrlStr); // TODO + xmlTokenizer.setLanguage(language); + xmlTokenizer.setNormFunctions(normFunctions); + xmlTokenizer.setOutputFormat(outputFormat); + xmlTokenizer.setOutputOptions(outputOptions); + if (stopElementsArray != null && stopElementsArray.length > 0) + xmlTokenizer.setStopElements(stopElementsArray); + if (elementsArray != null && elementsArray.length > 0) + xmlTokenizer.setElements(elementsArray); + if (highlightTermsArray != null && highlightTermsArray.length > 0) + xmlTokenizer.setHighlightTerms(highlightTermsArray); + xmlTokenizer.tokenize(); + if (outputFormat != null && outputFormat.equals("xml")) { + result = xmlTokenizer.getXmlResult(); + } else { // outputFormat == string + result = xmlTokenizer.getStringResult(); + } + if (result != null) + out.print(result); + out.close(); + } catch (ApplicationException e) { + throw new ServletException(e); + } + } + + private ArrayList<String> getToken(String inputString, String language, String[] normFunctions) throws ApplicationException { + ArrayList<String> retTokens = null; + try { + StringReader reader = new StringReader(inputString); + Tokenizer tokenizer = new Tokenizer(reader); + tokenizer.setLanguage(language); + tokenizer.setNormFunctions(normFunctions); + ArrayList<Token> tokens = tokenizer.getTokens(); + if (tokens != null) { + retTokens = new ArrayList<String>(); + for (int i=0; i<tokens.size(); i++) { + Token t = tokens.get(i); + String tokenStr = t.getContentOrig(); + if (useNormFunction(normFunctions)) + tokenStr = t.getContentNorm(); + retTokens.add(tokenStr); + } + } + tokenizer.end(); + tokenizer.close(); + } catch (IOException e) { + throw new ApplicationException(e); + } + return retTokens; + } + + private String createXmlOutputString(ArrayList<String> tokens, Hashtable<String, ArrayList<Lexicon>> tokensDictionaries, String baseUrl, String elapsedTime) { + StringBuilder result = new StringBuilder(); + result.append("<result>"); + result.append("<provider>" + "MPIWG MPDL language technology service (see: " + "" + baseUrl + "), Max Planck Institute for the History of Science, Berlin." + "</provider>"); + result.append("<elapsed-time-ms>" + elapsedTime + "</elapsed-time-ms>"); + if (tokens != null && ! tokens.isEmpty()) { + result.append("<tokens>"); + for (int i=0; i<tokens.size(); i++) { + String token = tokens.get(i); + result.append("<token>"); + result.append("<name>" + token + "</name>"); + if (tokensDictionaries != null && ! tokensDictionaries.isEmpty()) { + ArrayList<Lexicon> tokenDictionaries = tokensDictionaries.get(token); + if (tokenDictionaries != null) { + result.append("<dictionaries>"); + for (int j=0; j<tokenDictionaries.size(); j++) { + Lexicon lexicon = tokenDictionaries.get(j); + result.append(lexicon.toXmlString()); + } + result.append("</dictionaries>"); + } + } + result.append("</token>"); + } + result.append("</tokens>"); + } + result.append("</result>"); + return result.toString(); + } + + private String createStringOutputString(ArrayList<String> tokens) { + StringBuilder result = new StringBuilder(); + if (tokens != null && ! tokens.isEmpty()) { + for (int i=0; i<tokens.size(); i++) { + String token = tokens.get(i); + result.append(token + " "); + } + result.setLength(result.length() - 1); // without last blank + } + return result.toString(); + } + + private boolean useNormFunction(String[] normFunctions) { + boolean useNorm = false; + for (int i=0; i< normFunctions.length; i++) { + String function = normFunctions[i]; + if (function.equals("norm")) + return true; + } + return useNorm; + } + +}