Mercurial > hg > mpdl-group
comparison software/mpdl-services-new/mpiwg-mpdl-cms-web/src/de/mpg/mpiwg/berlin/mpdl/servlets/lt/Tokenize.java @ 25:e9fe3186670c default tip
letzter Stand eingecheckt
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 21 May 2013 10:19:32 +0200 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
23:e845310098ba | 25:e9fe3186670c |
---|---|
1 package de.mpg.mpiwg.berlin.mpdl.servlets.lt; | |
2 | |
3 import java.io.BufferedInputStream; | |
4 import java.io.IOException; | |
5 import java.io.InputStream; | |
6 import java.io.PrintWriter; | |
7 import java.io.StringReader; | |
8 import java.net.URL; | |
9 import java.util.ArrayList; | |
10 import java.util.Hashtable; | |
11 | |
12 import javax.servlet.ServletConfig; | |
13 import javax.servlet.ServletException; | |
14 import javax.servlet.http.HttpServlet; | |
15 import javax.servlet.http.HttpServletRequest; | |
16 import javax.servlet.http.HttpServletResponse; | |
17 | |
18 import org.apache.commons.io.IOUtils; | |
19 | |
20 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; | |
21 import de.mpg.mpiwg.berlin.mpdl.lt.dict.app.Lexicon; | |
22 import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.Token; | |
23 import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.Tokenizer; | |
24 import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.XmlTokenizer; | |
25 | |
26 public class Tokenize extends HttpServlet { | |
27 private static final long serialVersionUID = 1L; | |
28 | |
29 public Tokenize() { | |
30 super(); | |
31 } | |
32 | |
33 public void init(ServletConfig config) throws ServletException { | |
34 super.init(config); | |
35 } | |
36 | |
37 protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { | |
38 doGet(request, response); | |
39 } | |
40 | |
41 protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { | |
42 request.setCharacterEncoding("utf-8"); | |
43 response.setCharacterEncoding("utf-8"); | |
44 String inputString = request.getParameter("inputString"); | |
45 String srcUrlStr = request.getParameter("srcUrl"); | |
46 String language = request.getParameter("language"); | |
47 String normalization = request.getParameter("normalization"); | |
48 String stopElements = request.getParameter("stopElements"); | |
49 String elements = request.getParameter("elements"); | |
50 String highlightTerms = request.getParameter("highlightTerms"); | |
51 String outputFormat = request.getParameter("outputFormat"); | |
52 String outputOptionsStr = request.getParameter("outputOptions"); | |
53 if (language == null) | |
54 language = "eng"; | |
55 if (normalization == null) | |
56 normalization = "norm"; | |
57 String[] normFunctions = normalization.split(" "); | |
58 String[] stopElementsArray = null; | |
59 if (stopElements != null && ! stopElements.equals("")) | |
60 stopElementsArray = stopElements.split(" "); | |
61 String[] elementsArray = null; | |
62 if (elements != null && ! elements.equals("")) | |
63 elementsArray = elements.split(" "); | |
64 if (highlightTerms == null) | |
65 highlightTerms = ""; | |
66 String[] highlightTermsArray = highlightTerms.split(" "); | |
67 if (outputFormat == null) | |
68 outputFormat = "xml"; | |
69 if (outputOptionsStr == null) | |
70 outputOptionsStr = ""; | |
71 String[] outputOptions = outputOptionsStr.split(" "); | |
72 String result = null; | |
73 try { | |
74 if (outputFormat.equals("xml")) { | |
75 response.setContentType("text/xml"); | |
76 } else if (outputFormat.equals("string")) { | |
77 response.setContentType("text/html"); | |
78 } else { | |
79 response.setContentType("text/xml"); | |
80 } | |
81 response.setCharacterEncoding("utf-8"); | |
82 PrintWriter out = response.getWriter(); | |
83 String inputText = null; // contains string or xml text | |
84 if ((inputString == null || inputString.isEmpty()) && (srcUrlStr == null || srcUrlStr.isEmpty())) { | |
85 out.print("request parameter \"inputString\" or \"srcUrl\" is empty. Please specify \"inputString\""); | |
86 out.close(); | |
87 return; | |
88 } else { | |
89 if (srcUrlStr != null && ! srcUrlStr.isEmpty()) { | |
90 URL srcUrl = new URL(srcUrlStr); | |
91 InputStream inputStream = srcUrl.openStream(); | |
92 BufferedInputStream in = new BufferedInputStream(inputStream); | |
93 inputText = IOUtils.toString(in, "utf-8"); | |
94 in.close(); | |
95 } else if (inputString != null && ! inputString.isEmpty()) { | |
96 inputText = inputString; | |
97 } | |
98 } | |
99 inputText = inputText.trim(); | |
100 // Tokenize | |
101 boolean inputTextIsXml = false; | |
102 if (inputText != null && inputText.startsWith("<") && inputText.endsWith(">")) // TODO check properly for xml type of the inputText | |
103 inputTextIsXml = true; | |
104 if (! inputTextIsXml) { | |
105 inputText = "<result>" + inputText + "</result>"; | |
106 } | |
107 StringReader xmlInputStringReader = new StringReader(inputText); | |
108 XmlTokenizer xmlTokenizer = new XmlTokenizer(xmlInputStringReader); | |
109 xmlTokenizer.setDocIdentifier(srcUrlStr); // TODO | |
110 xmlTokenizer.setLanguage(language); | |
111 xmlTokenizer.setNormFunctions(normFunctions); | |
112 xmlTokenizer.setOutputFormat(outputFormat); | |
113 xmlTokenizer.setOutputOptions(outputOptions); | |
114 if (stopElementsArray != null && stopElementsArray.length > 0) | |
115 xmlTokenizer.setStopElements(stopElementsArray); | |
116 if (elementsArray != null && elementsArray.length > 0) | |
117 xmlTokenizer.setElements(elementsArray); | |
118 if (highlightTermsArray != null && highlightTermsArray.length > 0) | |
119 xmlTokenizer.setHighlightTerms(highlightTermsArray); | |
120 xmlTokenizer.tokenize(); | |
121 if (outputFormat != null && outputFormat.equals("xml")) { | |
122 result = xmlTokenizer.getXmlResult(); | |
123 } else { // outputFormat == string | |
124 result = xmlTokenizer.getStringResult(); | |
125 } | |
126 if (result != null) | |
127 out.print(result); | |
128 out.close(); | |
129 } catch (ApplicationException e) { | |
130 throw new ServletException(e); | |
131 } | |
132 } | |
133 | |
134 private ArrayList<String> getToken(String inputString, String language, String[] normFunctions) throws ApplicationException { | |
135 ArrayList<String> retTokens = null; | |
136 try { | |
137 StringReader reader = new StringReader(inputString); | |
138 Tokenizer tokenizer = new Tokenizer(reader); | |
139 tokenizer.setLanguage(language); | |
140 tokenizer.setNormFunctions(normFunctions); | |
141 ArrayList<Token> tokens = tokenizer.getTokens(); | |
142 if (tokens != null) { | |
143 retTokens = new ArrayList<String>(); | |
144 for (int i=0; i<tokens.size(); i++) { | |
145 Token t = tokens.get(i); | |
146 String tokenStr = t.getContentOrig(); | |
147 if (useNormFunction(normFunctions)) | |
148 tokenStr = t.getContentNorm(); | |
149 retTokens.add(tokenStr); | |
150 } | |
151 } | |
152 tokenizer.end(); | |
153 tokenizer.close(); | |
154 } catch (IOException e) { | |
155 throw new ApplicationException(e); | |
156 } | |
157 return retTokens; | |
158 } | |
159 | |
160 private String createXmlOutputString(ArrayList<String> tokens, Hashtable<String, ArrayList<Lexicon>> tokensDictionaries, String baseUrl, String elapsedTime) { | |
161 StringBuilder result = new StringBuilder(); | |
162 result.append("<result>"); | |
163 result.append("<provider>" + "MPIWG MPDL language technology service (see: " + "" + baseUrl + "), Max Planck Institute for the History of Science, Berlin." + "</provider>"); | |
164 result.append("<elapsed-time-ms>" + elapsedTime + "</elapsed-time-ms>"); | |
165 if (tokens != null && ! tokens.isEmpty()) { | |
166 result.append("<tokens>"); | |
167 for (int i=0; i<tokens.size(); i++) { | |
168 String token = tokens.get(i); | |
169 result.append("<token>"); | |
170 result.append("<name>" + token + "</name>"); | |
171 if (tokensDictionaries != null && ! tokensDictionaries.isEmpty()) { | |
172 ArrayList<Lexicon> tokenDictionaries = tokensDictionaries.get(token); | |
173 if (tokenDictionaries != null) { | |
174 result.append("<dictionaries>"); | |
175 for (int j=0; j<tokenDictionaries.size(); j++) { | |
176 Lexicon lexicon = tokenDictionaries.get(j); | |
177 result.append(lexicon.toXmlString()); | |
178 } | |
179 result.append("</dictionaries>"); | |
180 } | |
181 } | |
182 result.append("</token>"); | |
183 } | |
184 result.append("</tokens>"); | |
185 } | |
186 result.append("</result>"); | |
187 return result.toString(); | |
188 } | |
189 | |
190 private String createStringOutputString(ArrayList<String> tokens) { | |
191 StringBuilder result = new StringBuilder(); | |
192 if (tokens != null && ! tokens.isEmpty()) { | |
193 for (int i=0; i<tokens.size(); i++) { | |
194 String token = tokens.get(i); | |
195 result.append(token + " "); | |
196 } | |
197 result.setLength(result.length() - 1); // without last blank | |
198 } | |
199 return result.toString(); | |
200 } | |
201 | |
202 private boolean useNormFunction(String[] normFunctions) { | |
203 boolean useNorm = false; | |
204 for (int i=0; i< normFunctions.length; i++) { | |
205 String function = normFunctions[i]; | |
206 if (function.equals("norm")) | |
207 return true; | |
208 } | |
209 return useNorm; | |
210 } | |
211 | |
212 } |