Mercurial > hg > mpdl-group
view software/mpdl-services-new/mpiwg-mpdl-cms-web/src/de/mpg/mpiwg/berlin/mpdl/servlets/lt/Tokenize.java @ 25:e9fe3186670c default tip
letzter Stand eingecheckt
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 21 May 2013 10:19:32 +0200 |
parents | |
children |
line wrap: on
line source
package de.mpg.mpiwg.berlin.mpdl.servlets.lt;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;
import java.io.StringReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.Hashtable;

import javax.servlet.ServletConfig;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import org.apache.commons.io.IOUtils;

import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
import de.mpg.mpiwg.berlin.mpdl.lt.dict.app.Lexicon;
import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.Token;
import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.Tokenizer;
import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.XmlTokenizer;

/**
 * Servlet that tokenizes a text or XML fragment with {@link XmlTokenizer} and writes the
 * tokenizer result to the response.
 *
 * <p>Request parameters read by {@link #doGet}:
 * <ul>
 *   <li>{@code inputString} - the text or XML to tokenize (used when {@code srcUrl} is absent)</li>
 *   <li>{@code srcUrl} - URL whose content is fetched (as UTF-8) and tokenized; wins over
 *       {@code inputString} when both are given</li>
 *   <li>{@code language} - defaults to {@code "eng"}</li>
 *   <li>{@code normalization} - blank separated list, defaults to {@code "norm"}</li>
 *   <li>{@code stopElements}, {@code elements}, {@code highlightTerms} - optional blank
 *       separated lists</li>
 *   <li>{@code outputFormat} - {@code "xml"} (default) or {@code "string"}</li>
 *   <li>{@code outputOptions} - blank separated list, defaults to the empty string</li>
 * </ul>
 */
public class Tokenize extends HttpServlet {
  private static final long serialVersionUID = 1L;

  private static final String ENCODING = "utf-8";

  public Tokenize() {
    super();
  }

  @Override
  public void init(ServletConfig config) throws ServletException {
    super.init(config);
  }

  /** Handles POST exactly like GET. */
  @Override
  protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
    doGet(request, response);
  }

  /**
   * Reads the request parameters, loads the input (from {@code srcUrl} or {@code inputString}),
   * runs the {@link XmlTokenizer} over it and prints the result.
   *
   * @throws ServletException wrapping any {@link ApplicationException} raised while tokenizing
   * @throws IOException if the source URL cannot be read or the response cannot be written
   */
  @Override
  protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
    request.setCharacterEncoding(ENCODING);
    String inputString = request.getParameter("inputString");
    String srcUrlStr = request.getParameter("srcUrl");
    String language = request.getParameter("language");
    String normalization = request.getParameter("normalization");
    String stopElements = request.getParameter("stopElements");
    String elements = request.getParameter("elements");
    String highlightTerms = request.getParameter("highlightTerms");
    String outputFormat = request.getParameter("outputFormat");
    String outputOptionsStr = request.getParameter("outputOptions");
    if (language == null) {
      language = "eng";
    }
    if (normalization == null) {
      normalization = "norm";
    }
    if (outputFormat == null) {
      outputFormat = "xml";
    }
    if (outputOptionsStr == null) {
      outputOptionsStr = "";
    }
    String[] normFunctions = normalization.split(" ");
    String[] outputOptions = outputOptionsStr.split(" ");
    String[] stopElementsArray = splitOrNull(stopElements);
    String[] elementsArray = splitOrNull(elements);
    // "".split(" ") yields {""}, so a missing parameter must be mapped to null explicitly;
    // otherwise the "length > 0" guard below never skips and an empty term is always set.
    String[] highlightTermsArray = splitOrNull(highlightTerms);

    // Only "string" is delivered as html; "xml" and every unknown format are declared as xml.
    response.setContentType("string".equals(outputFormat) ? "text/html" : "text/xml");
    response.setCharacterEncoding(ENCODING);
    PrintWriter out = response.getWriter();

    boolean hasSrcUrl = srcUrlStr != null && !srcUrlStr.isEmpty();
    boolean hasInputString = inputString != null && !inputString.isEmpty();
    if (!hasSrcUrl && !hasInputString) {
      out.print("request parameter \"inputString\" or \"srcUrl\" is empty. Please specify \"inputString\"");
      out.close();
      return;
    }

    try {
      // contains string or xml text; srcUrl takes precedence over inputString
      String inputText = hasSrcUrl ? readUrl(srcUrlStr) : inputString;
      inputText = inputText.trim();
      // TODO check properly for xml type of the inputText
      boolean inputTextIsXml = inputText.startsWith("<") && inputText.endsWith(">");
      if (!inputTextIsXml) {
        // plain text is wrapped so that the XML tokenizer gets a root element
        inputText = "<result>" + inputText + "</result>";
      }
      XmlTokenizer xmlTokenizer = new XmlTokenizer(new StringReader(inputText));
      xmlTokenizer.setDocIdentifier(srcUrlStr); // TODO
      xmlTokenizer.setLanguage(language);
      xmlTokenizer.setNormFunctions(normFunctions);
      xmlTokenizer.setOutputFormat(outputFormat);
      xmlTokenizer.setOutputOptions(outputOptions);
      if (stopElementsArray != null && stopElementsArray.length > 0) {
        xmlTokenizer.setStopElements(stopElementsArray);
      }
      if (elementsArray != null && elementsArray.length > 0) {
        xmlTokenizer.setElements(elementsArray);
      }
      if (highlightTermsArray != null && highlightTermsArray.length > 0) {
        xmlTokenizer.setHighlightTerms(highlightTermsArray);
      }
      xmlTokenizer.tokenize();
      String result;
      if (outputFormat.equals("xml")) {
        result = xmlTokenizer.getXmlResult();
      } else {
        // every other outputFormat is answered with the string result
        result = xmlTokenizer.getStringResult();
      }
      if (result != null) {
        out.print(result);
      }
      out.close();
    } catch (ApplicationException e) {
      throw new ServletException(e);
    }
  }

  /**
   * Splits a blank separated request parameter.
   *
   * @return the parts, or {@code null} if the parameter is missing or empty
   */
  private static String[] splitOrNull(String value) {
    if (value == null || value.isEmpty()) {
      return null;
    }
    return value.split(" ");
  }

  /**
   * Fetches the content behind the given URL as UTF-8 text. The stream is closed even when
   * reading fails.
   *
   * <p>NOTE(review): the URL comes straight from the request, so the server will fetch whatever
   * scheme/host {@link URL} accepts - consider restricting it to an allow-list.
   */
  private String readUrl(String srcUrlStr) throws IOException {
    URL srcUrl = new URL(srcUrlStr);
    InputStream in = new BufferedInputStream(srcUrl.openStream());
    try {
      return IOUtils.toString(in, ENCODING);
    } finally {
      in.close();
    }
  }

  /**
   * Tokenizes a plain string and returns the token texts: the normalized content if
   * {@code normFunctions} contains {@code "norm"}, the original content otherwise.
   * Not called from within this class.
   *
   * @return the token strings, or {@code null} if the tokenizer delivers no token list
   * @throws ApplicationException wrapping any {@link IOException} of the tokenizer
   */
  private ArrayList<String> getToken(String inputString, String language, String[] normFunctions) throws ApplicationException {
    ArrayList<String> retTokens = null;
    try {
      Tokenizer tokenizer = new Tokenizer(new StringReader(inputString));
      try {
        tokenizer.setLanguage(language);
        tokenizer.setNormFunctions(normFunctions);
        ArrayList<Token> tokens = tokenizer.getTokens();
        if (tokens != null) {
          boolean useNorm = useNormFunction(normFunctions); // loop invariant
          retTokens = new ArrayList<String>(tokens.size());
          for (Token token : tokens) {
            retTokens.add(useNorm ? token.getContentNorm() : token.getContentOrig());
          }
        }
        tokenizer.end();
      } finally {
        // release the tokenizer even if tokenizing failed
        tokenizer.close();
      }
    } catch (IOException e) {
      throw new ApplicationException(e);
    }
    return retTokens;
  }

  /**
   * Builds a {@code <result>} document listing the tokens and, where available, the
   * dictionaries found for each token. Text values are XML-escaped; the dictionary markup
   * delivered by {@link Lexicon#toXmlString()} is inserted as is.
   * Not called from within this class.
   */
  private String createXmlOutputString(ArrayList<String> tokens, Hashtable<String, ArrayList<Lexicon>> tokensDictionaries, String baseUrl, String elapsedTime) {
    StringBuilder result = new StringBuilder();
    result.append("<result>");
    result.append("<provider>");
    result.append("MPIWG MPDL language technology service (see: ");
    result.append(escapeXml(baseUrl));
    result.append("), Max Planck Institute for the History of Science, Berlin.");
    result.append("</provider>");
    result.append("<elapsed-time-ms>").append(escapeXml(elapsedTime)).append("</elapsed-time-ms>");
    if (tokens != null && !tokens.isEmpty()) {
      boolean hasDictionaries = tokensDictionaries != null && !tokensDictionaries.isEmpty();
      result.append("<tokens>");
      for (String token : tokens) {
        result.append("<token>");
        result.append("<name>").append(escapeXml(token)).append("</name>");
        // the lookup uses the raw (unescaped) token
        ArrayList<Lexicon> tokenDictionaries = hasDictionaries ? tokensDictionaries.get(token) : null;
        if (tokenDictionaries != null) {
          result.append("<dictionaries>");
          for (Lexicon lexicon : tokenDictionaries) {
            result.append(lexicon.toXmlString());
          }
          result.append("</dictionaries>");
        }
        result.append("</token>");
      }
      result.append("</tokens>");
    }
    result.append("</result>");
    return result.toString();
  }

  /**
   * Joins the tokens with single blanks (no trailing blank).
   * Not called from within this class.
   *
   * @return the joined tokens, or the empty string for a {@code null} or empty list
   */
  private String createStringOutputString(ArrayList<String> tokens) {
    StringBuilder result = new StringBuilder();
    if (tokens != null) {
      for (String token : tokens) {
        if (result.length() > 0) {
          result.append(' ');
        }
        result.append(token);
      }
    }
    return result.toString();
  }

  /**
   * @return {@code true} if {@code normFunctions} contains the function name {@code "norm"};
   *         {@code false} otherwise (also for a {@code null} array)
   */
  private boolean useNormFunction(String[] normFunctions) {
    if (normFunctions == null) {
      return false;
    }
    for (String function : normFunctions) {
      if ("norm".equals(function)) {
        return true;
      }
    }
    return false;
  }

  /**
   * Escapes the characters that would break XML element content ({@code &}, {@code <},
   * {@code >}). A {@code null} value is rendered as {@code "null"}, like string concatenation.
   */
  private static String escapeXml(String text) {
    String value = String.valueOf(text);
    StringBuilder escaped = new StringBuilder(value.length());
    for (int i = 0; i < value.length(); i++) {
      char c = value.charAt(i);
      switch (c) {
        case '&':
          escaped.append("&amp;");
          break;
        case '<':
          escaped.append("&lt;");
          break;
        case '>':
          escaped.append("&gt;");
          break;
        default:
          escaped.append(c);
          break;
      }
    }
    return escaped.toString();
  }
}