comparison software/mpdl-services-new/mpiwg-mpdl-cms-web/src/de/mpg/mpiwg/berlin/mpdl/servlets/lt/Tokenize.java @ 25:e9fe3186670c default tip

letzter Stand eingecheckt
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 21 May 2013 10:19:32 +0200
parents
children
comparison
equal deleted inserted replaced
23:e845310098ba 25:e9fe3186670c
1 package de.mpg.mpiwg.berlin.mpdl.servlets.lt;
2
3 import java.io.BufferedInputStream;
4 import java.io.IOException;
5 import java.io.InputStream;
6 import java.io.PrintWriter;
7 import java.io.StringReader;
8 import java.net.URL;
9 import java.util.ArrayList;
10 import java.util.Hashtable;
11
12 import javax.servlet.ServletConfig;
13 import javax.servlet.ServletException;
14 import javax.servlet.http.HttpServlet;
15 import javax.servlet.http.HttpServletRequest;
16 import javax.servlet.http.HttpServletResponse;
17
18 import org.apache.commons.io.IOUtils;
19
20 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
21 import de.mpg.mpiwg.berlin.mpdl.lt.dict.app.Lexicon;
22 import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.Token;
23 import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.Tokenizer;
24 import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.XmlTokenizer;
25
26 public class Tokenize extends HttpServlet {
27 private static final long serialVersionUID = 1L;
28
29 public Tokenize() {
30 super();
31 }
32
33 public void init(ServletConfig config) throws ServletException {
34 super.init(config);
35 }
36
37 protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
38 doGet(request, response);
39 }
40
41 protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
42 request.setCharacterEncoding("utf-8");
43 response.setCharacterEncoding("utf-8");
44 String inputString = request.getParameter("inputString");
45 String srcUrlStr = request.getParameter("srcUrl");
46 String language = request.getParameter("language");
47 String normalization = request.getParameter("normalization");
48 String stopElements = request.getParameter("stopElements");
49 String elements = request.getParameter("elements");
50 String highlightTerms = request.getParameter("highlightTerms");
51 String outputFormat = request.getParameter("outputFormat");
52 String outputOptionsStr = request.getParameter("outputOptions");
53 if (language == null)
54 language = "eng";
55 if (normalization == null)
56 normalization = "norm";
57 String[] normFunctions = normalization.split(" ");
58 String[] stopElementsArray = null;
59 if (stopElements != null && ! stopElements.equals(""))
60 stopElementsArray = stopElements.split(" ");
61 String[] elementsArray = null;
62 if (elements != null && ! elements.equals(""))
63 elementsArray = elements.split(" ");
64 if (highlightTerms == null)
65 highlightTerms = "";
66 String[] highlightTermsArray = highlightTerms.split(" ");
67 if (outputFormat == null)
68 outputFormat = "xml";
69 if (outputOptionsStr == null)
70 outputOptionsStr = "";
71 String[] outputOptions = outputOptionsStr.split(" ");
72 String result = null;
73 try {
74 if (outputFormat.equals("xml")) {
75 response.setContentType("text/xml");
76 } else if (outputFormat.equals("string")) {
77 response.setContentType("text/html");
78 } else {
79 response.setContentType("text/xml");
80 }
81 response.setCharacterEncoding("utf-8");
82 PrintWriter out = response.getWriter();
83 String inputText = null; // contains string or xml text
84 if ((inputString == null || inputString.isEmpty()) && (srcUrlStr == null || srcUrlStr.isEmpty())) {
85 out.print("request parameter \"inputString\" or \"srcUrl\" is empty. Please specify \"inputString\"");
86 out.close();
87 return;
88 } else {
89 if (srcUrlStr != null && ! srcUrlStr.isEmpty()) {
90 URL srcUrl = new URL(srcUrlStr);
91 InputStream inputStream = srcUrl.openStream();
92 BufferedInputStream in = new BufferedInputStream(inputStream);
93 inputText = IOUtils.toString(in, "utf-8");
94 in.close();
95 } else if (inputString != null && ! inputString.isEmpty()) {
96 inputText = inputString;
97 }
98 }
99 inputText = inputText.trim();
100 // Tokenize
101 boolean inputTextIsXml = false;
102 if (inputText != null && inputText.startsWith("<") && inputText.endsWith(">")) // TODO check properly for xml type of the inputText
103 inputTextIsXml = true;
104 if (! inputTextIsXml) {
105 inputText = "<result>" + inputText + "</result>";
106 }
107 StringReader xmlInputStringReader = new StringReader(inputText);
108 XmlTokenizer xmlTokenizer = new XmlTokenizer(xmlInputStringReader);
109 xmlTokenizer.setDocIdentifier(srcUrlStr); // TODO
110 xmlTokenizer.setLanguage(language);
111 xmlTokenizer.setNormFunctions(normFunctions);
112 xmlTokenizer.setOutputFormat(outputFormat);
113 xmlTokenizer.setOutputOptions(outputOptions);
114 if (stopElementsArray != null && stopElementsArray.length > 0)
115 xmlTokenizer.setStopElements(stopElementsArray);
116 if (elementsArray != null && elementsArray.length > 0)
117 xmlTokenizer.setElements(elementsArray);
118 if (highlightTermsArray != null && highlightTermsArray.length > 0)
119 xmlTokenizer.setHighlightTerms(highlightTermsArray);
120 xmlTokenizer.tokenize();
121 if (outputFormat != null && outputFormat.equals("xml")) {
122 result = xmlTokenizer.getXmlResult();
123 } else { // outputFormat == string
124 result = xmlTokenizer.getStringResult();
125 }
126 if (result != null)
127 out.print(result);
128 out.close();
129 } catch (ApplicationException e) {
130 throw new ServletException(e);
131 }
132 }
133
134 private ArrayList<String> getToken(String inputString, String language, String[] normFunctions) throws ApplicationException {
135 ArrayList<String> retTokens = null;
136 try {
137 StringReader reader = new StringReader(inputString);
138 Tokenizer tokenizer = new Tokenizer(reader);
139 tokenizer.setLanguage(language);
140 tokenizer.setNormFunctions(normFunctions);
141 ArrayList<Token> tokens = tokenizer.getTokens();
142 if (tokens != null) {
143 retTokens = new ArrayList<String>();
144 for (int i=0; i<tokens.size(); i++) {
145 Token t = tokens.get(i);
146 String tokenStr = t.getContentOrig();
147 if (useNormFunction(normFunctions))
148 tokenStr = t.getContentNorm();
149 retTokens.add(tokenStr);
150 }
151 }
152 tokenizer.end();
153 tokenizer.close();
154 } catch (IOException e) {
155 throw new ApplicationException(e);
156 }
157 return retTokens;
158 }
159
160 private String createXmlOutputString(ArrayList<String> tokens, Hashtable<String, ArrayList<Lexicon>> tokensDictionaries, String baseUrl, String elapsedTime) {
161 StringBuilder result = new StringBuilder();
162 result.append("<result>");
163 result.append("<provider>" + "MPIWG MPDL language technology service (see: " + "" + baseUrl + "), Max Planck Institute for the History of Science, Berlin." + "</provider>");
164 result.append("<elapsed-time-ms>" + elapsedTime + "</elapsed-time-ms>");
165 if (tokens != null && ! tokens.isEmpty()) {
166 result.append("<tokens>");
167 for (int i=0; i<tokens.size(); i++) {
168 String token = tokens.get(i);
169 result.append("<token>");
170 result.append("<name>" + token + "</name>");
171 if (tokensDictionaries != null && ! tokensDictionaries.isEmpty()) {
172 ArrayList<Lexicon> tokenDictionaries = tokensDictionaries.get(token);
173 if (tokenDictionaries != null) {
174 result.append("<dictionaries>");
175 for (int j=0; j<tokenDictionaries.size(); j++) {
176 Lexicon lexicon = tokenDictionaries.get(j);
177 result.append(lexicon.toXmlString());
178 }
179 result.append("</dictionaries>");
180 }
181 }
182 result.append("</token>");
183 }
184 result.append("</tokens>");
185 }
186 result.append("</result>");
187 return result.toString();
188 }
189
190 private String createStringOutputString(ArrayList<String> tokens) {
191 StringBuilder result = new StringBuilder();
192 if (tokens != null && ! tokens.isEmpty()) {
193 for (int i=0; i<tokens.size(); i++) {
194 String token = tokens.get(i);
195 result.append(token + " ");
196 }
197 result.setLength(result.length() - 1); // without last blank
198 }
199 return result.toString();
200 }
201
202 private boolean useNormFunction(String[] normFunctions) {
203 boolean useNorm = false;
204 for (int i=0; i< normFunctions.length; i++) {
205 String function = normFunctions[i];
206 if (function.equals("norm"))
207 return true;
208 }
209 return useNorm;
210 }
211
212 }