Mercurial > hg > mpdl-group
diff software/mpdl-services/mpiwg-mpdl-lt-web/src/de/mpg/mpiwg/berlin/mpdl/servlets/lt/Tokenize.java @ 19:4a3641ae14d2
Erstellung
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Wed, 09 Nov 2011 15:32:05 +0100 |
parents | |
children | 4ea0f81a5d08 |
line wrap: on
line diff
package de.mpg.mpiwg.berlin.mpdl.servlets.lt;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;
import java.io.StringReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.Date;
import java.util.Hashtable;

import javax.servlet.ServletConfig;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import org.apache.commons.io.IOUtils;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
import de.mpg.mpiwg.berlin.mpdl.lt.dict.app.Lexicon;
import de.mpg.mpiwg.berlin.mpdl.lt.dict.db.LexHandler;
import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma;
import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.Token;
import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.Tokenizer;
import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.XmlTokenizer;
import de.mpg.mpiwg.berlin.mpdl.servlets.util.ServletUtil;

/**
 * Servlet that tokenizes a text given either inline or by URL.
 *
 * <p>Request parameters read by {@link #doGet}:
 * <ul>
 *   <li>{@code inputString} - the text to tokenize</li>
 *   <li>{@code srcUrl} - URL of the text to tokenize; wins over {@code inputString} when both are given</li>
 *   <li>{@code language} - defaults to {@code "eng"}</li>
 *   <li>{@code normalization} - blank separated list, defaults to {@code "norm"}</li>
 *   <li>{@code dictionary} - {@code "yes"} (default) adds dictionary entries to the xml output</li>
 *   <li>{@code stopElements} - blank separated list, only used for xml input</li>
 *   <li>{@code outputFormat} - {@code "xml"} (default) or {@code "string"}; only evaluated for non-xml input</li>
 *   <li>{@code outputOptions} - blank separated list, only used for xml input</li>
 * </ul>
 *
 * <p>Input that starts with {@code <} and ends with {@code >} (after trimming) is treated as xml
 * and handed to {@link XmlTokenizer}; everything else is tokenized as plain text.
 */
public class Tokenize extends HttpServlet {
  private static final long serialVersionUID = 1L;

  /** Character encoding used for the request, the response and for reading {@code srcUrl}. */
  private static final String ENCODING = "utf-8";

  public Tokenize() {
    super();
  }

  @Override
  public void init(ServletConfig config) throws ServletException {
    super.init(config);
  }

  /**
   * Tokenizes the requested text and writes the result to the response.
   *
   * @throws ServletException if tokenizing or the dictionary lookup fails with an {@link ApplicationException}
   * @throws IOException if the source URL cannot be read or the response cannot be written
   */
  @Override
  protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
    Date begin = new Date();
    // must be set before the first parameter is read
    request.setCharacterEncoding(ENCODING);
    String inputString = request.getParameter("inputString");
    String srcUrlStr = request.getParameter("srcUrl");
    String language = defaultIfNull(request.getParameter("language"), "eng");
    String normalization = defaultIfNull(request.getParameter("normalization"), "norm");
    String dictionary = defaultIfNull(request.getParameter("dictionary"), "yes");
    String stopElements = defaultIfNull(request.getParameter("stopElements"), "");
    String outputFormat = defaultIfNull(request.getParameter("outputFormat"), "xml");
    String outputOptionsStr = defaultIfNull(request.getParameter("outputOptions"), "");
    String[] normFunctions = normalization.split(" ");
    String[] stopElementsArray = stopElements.split(" ");
    String[] outputOptions = outputOptionsStr.split(" ");

    // NOTE(review): "string" output is served as text/html although it carries unescaped
    // user text; text/plain would be safer - confirm that no client relies on text/html.
    if ("string".equals(outputFormat)) {
      response.setContentType("text/html");
    } else {
      response.setContentType("text/xml");
    }
    response.setCharacterEncoding(ENCODING);
    PrintWriter out = response.getWriter();

    boolean hasSrcUrl = srcUrlStr != null && ! srcUrlStr.isEmpty();
    boolean hasInputString = inputString != null && ! inputString.isEmpty();
    if (! hasSrcUrl && ! hasInputString) {
      out.print("request parameter \"inputString\" or \"srcUrl\" is empty. Please specify \"inputString\"");
      out.close();
      return;
    }

    try {
      String inputText = hasSrcUrl ? readSourceUrl(srcUrlStr) : inputString; // contains string or xml text
      inputText = inputText.trim();
      // TODO check properly for xml type of the inputText
      boolean inputTextIsXml = inputText.startsWith("<") && inputText.endsWith(">");
      String result;
      if (inputTextIsXml) {
        XmlTokenizer xmlTokenizer = new XmlTokenizer(new StringReader(inputText));
        xmlTokenizer.setLanguage(language);
        xmlTokenizer.setNormFunctions(normFunctions);
        xmlTokenizer.setOutputOptions(outputOptions);
        xmlTokenizer.setStopElements(stopElementsArray);
        result = xmlTokenizer.tokenize();
      } else {
        ArrayList<String> tokens = getToken(inputText, language, normFunctions);
        if ("xml".equals(outputFormat)) {
          // dictionary entries only appear in the xml output, so they are only looked up here
          Hashtable<String, ArrayList<Lexicon>> tokensDictionaries = null;
          if ("yes".equals(dictionary)) {
            tokensDictionaries = lookupDictionaries(tokens, language);
          }
          String baseUrl = ServletUtil.getInstance().getBaseUrl(request);
          Date end = new Date();
          String elapsedTime = String.valueOf(end.getTime() - begin.getTime());
          result = createXmlOutputString(tokens, tokensDictionaries, baseUrl, elapsedTime);
        } else if ("string".equals(outputFormat)) {
          result = createStringOutputString(tokens);
        } else {
          // outputFormat is user input: escape it before echoing it into the xml error message
          result = "<result><error>outputFormat: \"" + escapeXml(outputFormat) + "\" is not supported</error></result>";
        }
      }
      if (result != null) {
        out.print(result);
      }
      // The writer is deliberately not closed when an exception is thrown: closing it would
      // commit the response before the container gets the chance to report the error.
      out.close();
    } catch (ApplicationException e) {
      throw new ServletException(e);
    }
  }

  /** Returns {@code value}, or {@code defaultValue} if {@code value} is {@code null}. */
  private static String defaultIfNull(String value, String defaultValue) {
    return (value == null) ? defaultValue : value;
  }

  /**
   * Reads the content behind the given URL as utf-8 text; the stream is closed even if reading fails.
   *
   * <p>NOTE(review): the URL comes straight from the request and is opened without any
   * restriction of protocol or host, so a caller can make the server fetch arbitrary
   * resources. Consider a whitelist - not changed here because legitimate callers are unknown.
   */
  private String readSourceUrl(String srcUrlStr) throws IOException {
    URL srcUrl = new URL(srcUrlStr);
    InputStream in = new BufferedInputStream(srcUrl.openStream());
    try {
      return IOUtils.toString(in, ENCODING);
    } finally {
      in.close();
    }
  }

  /**
   * Looks up the dictionary entries of each distinct token.
   *
   * @return table from token to its dictionary entries; tokens for which the lookup returned
   *         {@code null} have no entry
   */
  private Hashtable<String, ArrayList<Lexicon>> lookupDictionaries(ArrayList<String> tokens, String language) throws ApplicationException {
    Hashtable<String, ArrayList<Lexicon>> tokensDictionaries = new Hashtable<String, ArrayList<Lexicon>>();
    LexHandler lexHandler = LexHandler.getInstance();
    for (int i = 0; i < tokens.size(); i++) {
      String token = tokens.get(i);
      // Hashtable rejects null keys; a token that is already present needs no second lookup
      if (token == null || tokensDictionaries.containsKey(token)) {
        continue;
      }
      ArrayList<Lemma> lemmas = lexHandler.getLemmas(token, "form", language, "none");
      ArrayList<Lexicon> dictionaries = lexHandler.getLexEntries(lemmas, language, null);
      // Hashtable rejects null values as well
      if (dictionaries != null) {
        tokensDictionaries.put(token, dictionaries);
      }
    }
    return tokensDictionaries;
  }

  /**
   * Tokenizes a plain text string.
   *
   * @return the content of each token in order; never {@code null}, empty if the tokenizer delivers no tokens
   * @throws ApplicationException if the tokenizer fails; an {@link IOException} is wrapped
   */
  private ArrayList<String> getToken(String inputString, String language, String[] normFunctions) throws ApplicationException {
    ArrayList<String> retTokens = new ArrayList<String>();
    try {
      Tokenizer tokenizer = new Tokenizer(new StringReader(inputString));
      try {
        tokenizer.setLanguage(language);
        tokenizer.setNormFunctions(normFunctions);
        ArrayList<Token> tokens = tokenizer.getTokens();
        if (tokens != null) {
          for (int i = 0; i < tokens.size(); i++) {
            retTokens.add(tokens.get(i).getContent());
          }
        }
        tokenizer.end();
      } finally {
        // release the tokenizer even if tokenizing fails
        tokenizer.close();
      }
    } catch (IOException e) {
      throw new ApplicationException(e);
    }
    return retTokens;
  }

  /**
   * Builds the xml response: provider, elapsed time and, if there are any tokens, one
   * {@code <token>} element per token containing its name and, when available, its dictionaries.
   *
   * @param tokensDictionaries dictionary entries by token, may be {@code null}
   */
  private String createXmlOutputString(ArrayList<String> tokens, Hashtable<String, ArrayList<Lexicon>> tokensDictionaries, String baseUrl, String elapsedTime) {
    StringBuilder result = new StringBuilder();
    result.append("<result>");
    result.append("<provider>");
    result.append("MPIWG MPDL language technology service (see: ");
    result.append(escapeXml(baseUrl));
    result.append("), Max Planck Institute for the History of Science, Berlin.");
    result.append("</provider>");
    result.append("<elapsed-time-ms>").append(elapsedTime).append("</elapsed-time-ms>");
    if (tokens != null && ! tokens.isEmpty()) {
      boolean hasDictionaries = tokensDictionaries != null && ! tokensDictionaries.isEmpty();
      result.append("<tokens>");
      for (int i = 0; i < tokens.size(); i++) {
        String token = tokens.get(i);
        result.append("<token>");
        // token text is taken from the input and may contain markup characters
        result.append("<name>").append(escapeXml(token)).append("</name>");
        if (hasDictionaries && token != null) {
          ArrayList<Lexicon> tokenDictionaries = tokensDictionaries.get(token);
          if (tokenDictionaries != null) {
            result.append("<dictionaries>");
            for (int j = 0; j < tokenDictionaries.size(); j++) {
              // toXmlString() is appended as is - presumably already well-formed xml; TODO confirm
              result.append(tokenDictionaries.get(j).toXmlString());
            }
            result.append("</dictionaries>");
          }
        }
        result.append("</token>");
      }
      result.append("</tokens>");
    }
    result.append("</result>");
    return result.toString();
  }

  /** Joins the tokens with single blanks; returns the empty string for {@code null} or no tokens. */
  private String createStringOutputString(ArrayList<String> tokens) {
    StringBuilder result = new StringBuilder();
    if (tokens != null) {
      for (int i = 0; i < tokens.size(); i++) {
        if (i > 0) {
          result.append(' ');
        }
        result.append(tokens.get(i));
      }
    }
    return result.toString();
  }

  /**
   * Replaces the five characters with predefined xml entities
   * ({@code & < > " '}) by their entity references.
   *
   * @return the escaped text, or the empty string if {@code text} is {@code null}
   */
  private static String escapeXml(String text) {
    if (text == null) {
      return "";
    }
    StringBuilder escaped = new StringBuilder(text.length() + 16);
    for (int i = 0; i < text.length(); i++) {
      char c = text.charAt(i);
      switch (c) {
        case '&':
          escaped.append("&amp;");
          break;
        case '<':
          escaped.append("&lt;");
          break;
        case '>':
          escaped.append("&gt;");
          break;
        case '"':
          escaped.append("&quot;");
          break;
        case '\'':
          escaped.append("&apos;");
          break;
        default:
          escaped.append(c);
      }
    }
    return escaped.toString();
  }

  /**
   * Legacy variant of {@link #getToken} that pulls the terms one by one through
   * {@code incrementToken()} and the {@link CharTermAttribute}. Not called from within this class.
   *
   * <p>NOTE(review): the original author left {@code tokenizer.reset()} and a filter chain
   * (MpdlFilter for hyphen removal etc., LowerCaseFilter, StopFilter) disabled here; whether
   * {@code reset()} is required before {@code incrementToken()} depends on the Tokenizer
   * implementation - confirm before reviving this method.
   *
   * @throws ApplicationException if the tokenizer fails; an {@link IOException} is wrapped
   */
  @SuppressWarnings("unused")
  private ArrayList<String> getTokenOld(String inputString, String language, String[] normFunctions) throws ApplicationException {
    ArrayList<String> tokens = new ArrayList<String>();
    try {
      Tokenizer tokenizer = new Tokenizer(new StringReader(inputString));
      try {
        tokenizer.setLanguage(language);
        tokenizer.setNormFunctions(normFunctions);
        CharTermAttribute charTermAttribute = tokenizer.getAttribute(CharTermAttribute.class);
        while (tokenizer.incrementToken()) {
          tokens.add(charTermAttribute.toString());
        }
        tokenizer.end();
      } finally {
        // release the tokenizer even if tokenizing fails
        tokenizer.close();
      }
    } catch (IOException e) {
      throw new ApplicationException(e);
    }
    return tokens;
  }

}