Mercurial > hg > mpdl-group
comparison software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/util/StringUtils.java @ 19:4a3641ae14d2
Erstellung
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Wed, 09 Nov 2011 15:32:05 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
18:dc5e9fcb3fdc | 19:4a3641ae14d2 |
---|---|
1 package de.mpg.mpiwg.berlin.mpdl.util; | |
2 | |
3 import java.io.UnsupportedEncodingException; | |
4 import java.net.URLEncoder; | |
5 import java.text.CharacterIterator; | |
6 import java.text.StringCharacterIterator; | |
7 import java.util.regex.Matcher; | |
8 import java.util.regex.Pattern; | |
9 | |
/**
 * Static helpers for escaping and unescaping text that is embedded in XML, HTML,
 * URLs, regular expressions and regex replacement strings.
 *
 * <P>The class cannot be instantiated. Every method requires a non-null argument
 * and throws {@link NullPointerException} otherwise.
 */
public class StringUtils {

  /** Character class of the ideographic ranges handled by {@link #zwsp(String)} (based on Unicode 3.2). */
  private static final String IDEOGRAPHIC = "[\u3300-\u33ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff]";

  /** Matches the empty position between two directly adjacent ideographic characters. */
  private static final Pattern IDEOGRAPH_BOUNDARY = Pattern.compile(
      "(?<=" + IDEOGRAPHIC + ")(?=" + IDEOGRAPHIC + ")"
  );

  /** Matches an opening script tag without attributes, ignoring case. */
  private static final Pattern SCRIPT = Pattern.compile(
      "<SCRIPT>", Pattern.CASE_INSENSITIVE
  );

  /** Matches a closing script tag, ignoring case. */
  private static final Pattern SCRIPT_END = Pattern.compile(
      "</SCRIPT>", Pattern.CASE_INSENSITIVE
  );

  private StringUtils() {
    // empty - prevent construction
  }

  /**
   * Puts a zero width space (U+200B) between every two adjacent ideographic
   * characters (e.g. in CJK Unified Ideographs).
   *
   * @param str the text to process
   * @return the text with a zero width space inserted at each boundary between
   *         two ideographic characters; all other text is unchanged
   */
  public static String zwsp(String str) {
    // A zero-width lookaround match finds every boundary in one pass, including
    // the overlapping pairs of a longer run of ideographs.
    return IDEOGRAPH_BOUNDARY.matcher(str).replaceAll("\u200b");
  }

  /**
   * Removes angle brackets from the text, both as literal characters and as the
   * escaped forms <tt>&amp;lt;</tt> and <tt>&amp;gt;</tt>.
   *
   * @param inputStr the text to clean
   * @return the text without any of the four sequences
   */
  public static String deleteSpecialXmlEntities(String inputStr) {
    // The literal brackets are removed first, then the escaped forms.
    return inputStr
        .replace("<", "")
        .replace(">", "")
        .replace("&lt;", "")
        .replace("&gt;", "");
  }

  /**
   * Replaces the five predefined XML entities (<tt>&amp;lt;</tt>, <tt>&amp;gt;</tt>,
   * <tt>&amp;quot;</tt>, <tt>&amp;apos;</tt> and <tt>&amp;amp;</tt>) with the
   * characters they stand for.
   *
   * @param inputStr the escaped text
   * @return the text with the entities resolved
   */
  public static String resolveXmlEntities(String inputStr) {
    // The ampersand entity is resolved last so that an escaped entity such as
    // "&amp;lt;" yields "&lt;" and is not unescaped a second time.
    return inputStr
        .replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&quot;", "\"")
        .replace("&apos;", "'")
        .replace("&amp;", "&");
  }

  /**
   * Replaces the characters <tt>&amp;</tt>, <tt>&lt;</tt>, <tt>&gt;</tt> and
   * <tt>&quot;</tt> with their predefined XML entities.
   *
   * <P>The apostrophe is deliberately left untouched.
   *
   * @param inputStr the raw text
   * @return the escaped text
   */
  public static String deresolveXmlEntities(String inputStr) {
    final StringBuilder buf = new StringBuilder(inputStr.length());
    for (int i = 0; i < inputStr.length(); i++) {
      final char c = inputStr.charAt(i);
      switch (c) {
        case '&': buf.append("&amp;"); break;
        case '<': buf.append("&lt;"); break;
        case '>': buf.append("&gt;"); break;
        case '"': buf.append("&quot;"); break;
        // the apostrophe is not escaped: doing so causes problems in DictionarizerContentHandler
        default: buf.append(c); break;
      }
    }
    return buf.toString();
  }

  /**
   * Escape characters for text appearing in HTML markup.
   *
   * <P>This method exists as a defence against Cross Site Scripting (XSS) hacks.
   * The idea is to neutralize control characters commonly used by scripts, such that
   * they will not be executed by the browser. This is done by replacing the control
   * characters with their escaped equivalents.
   * See {@code hirondelle.web4j.security.SafeText} as well.
   *
   * <P>The following characters are replaced with named character entities :
   * <table border='1' cellpadding='3' cellspacing='0'>
   * <tr><th> Character </th><th> Replacement </th></tr>
   * <tr><td> &lt; </td><td> &amp;lt; </td></tr>
   * <tr><td> &gt; </td><td> &amp;gt; </td></tr>
   * <tr><td> &amp; </td><td> &amp;amp; </td></tr>
   * <tr><td> &quot; </td><td> &amp;quot; </td></tr>
   * </table>
   *
   * <P>The following characters are replaced with a decimal character reference
   * that is zero-padded to three digits (for example <tt>!</tt> becomes
   * <tt>&amp;#033;</tt>): the tab character and
   * <pre>{@code ! # $ % ' ( ) * + , - . / : ; = ? @ [ \ ] ^ _ ` { | } ~}</pre>
   *
   * <P>Note that JSTL's {@code <c:out>} escapes <em>only</em> the characters
   * <tt>&lt;</tt>, <tt>&gt;</tt>, <tt>&amp;</tt>, <tt>&quot;</tt> and the apostrophe.
   *
   * @param aText the raw text
   * @return the text with all of the above characters escaped
   */
  public static String forHTML(String aText) {
    final StringBuilder result = new StringBuilder(aText.length());
    // An index loop is used instead of a CharacterIterator, whose DONE sentinel
    // would end the loop early at a U+FFFF character inside the text.
    for (int i = 0; i < aText.length(); i++) {
      final char character = aText.charAt(i);
      switch (character) {
        case '<': result.append("&lt;"); break;
        case '>': result.append("&gt;"); break;
        case '&': result.append("&amp;"); break;
        case '"': result.append("&quot;"); break;
        case '\t':
        case '!': case '#': case '$': case '%': case '\'':
        case '(': case ')': case '*': case '+': case ',':
        case '-': case '.': case '/': case ':': case ';':
        case '=': case '?': case '@': case '[': case '\\':
        case ']': case '^': case '_': case '`': case '{':
        case '|': case '}': case '~':
          addCharEntity(character, result);
          break;
        default:
          // the char is not a special one - add it to the result as is
          result.append(character);
          break;
      }
    }
    return result.toString();
  }

  /**
   * Escape all ampersand characters in a URL.
   *
   * <P>Replaces all <tt>'&amp;'</tt> characters with <tt>'&amp;amp;'</tt>.
   *
   * <P>An ampersand character may appear in the query string of a URL.
   * The ampersand character is indeed valid in a URL.
   * <em>However, URLs usually appear as an <tt>HREF</tt> attribute, and
   * such attributes have the additional constraint that ampersands
   * must be escaped.</em>
   *
   * <P>The JSTL {@code <c:url>} tag does indeed perform proper URL encoding of
   * query parameters. But it does not, in general, produce text which
   * is valid as an <tt>HREF</tt> attribute, simply because it does
   * not escape the ampersand character. This is a nuisance when
   * multiple query parameters appear in the URL, since it requires a little
   * extra work.
   *
   * @param aURL the URL to escape
   * @return the URL with every ampersand escaped
   */
  public static String forHrefAmpersand(String aURL) {
    return aURL.replace("&", "&amp;");
  }

  /**
   * Synonym for <tt>URLEncoder.encode(String, "UTF-8")</tt>.
   *
   * <P>Used to ensure that HTTP query strings are in proper form, by escaping
   * special characters such as spaces.
   *
   * <P>It is important to note that if a query string appears in an <tt>HREF</tt>
   * attribute, then there are two issues - ensuring the query string is valid HTTP
   * (it is URL-encoded), and ensuring it is valid HTML (ensuring the
   * ampersand is escaped).
   *
   * @param aURLFragment the text to encode
   * @return the URL-encoded text
   * @throws RuntimeException if the UTF-8 encoding is reported as unsupported
   */
  public static String forURL(String aURLFragment) {
    try {
      return URLEncoder.encode(aURLFragment, "UTF-8");
    }
    catch (UnsupportedEncodingException ex) {
      throw new RuntimeException("UTF-8 not supported", ex);
    }
  }

  /**
   * Escape characters for text appearing as XML data, between tags.
   *
   * <P>The following characters are replaced with corresponding character entities :
   * <table border='1' cellpadding='3' cellspacing='0'>
   * <tr><th> Character </th><th> Encoding </th></tr>
   * <tr><td> &lt; </td><td> &amp;lt; </td></tr>
   * <tr><td> &gt; </td><td> &amp;gt; </td></tr>
   * <tr><td> &amp; </td><td> &amp;amp; </td></tr>
   * <tr><td> &quot; </td><td> &amp;quot; </td></tr>
   * <tr><td> ' </td><td> &amp;#039; </td></tr>
   * </table>
   *
   * <P>Note that JSTL's {@code <c:out>} escapes the exact same set of
   * characters as this method. <span class='highlight'>That is, {@code <c:out>}
   * is good for escaping to produce valid XML, but not for producing safe
   * HTML.</span>
   *
   * @param aText the raw text
   * @return the escaped text
   */
  public static String forXML(String aText) {
    final StringBuilder result = new StringBuilder(aText.length());
    for (int i = 0; i < aText.length(); i++) {
      final char character = aText.charAt(i);
      switch (character) {
        case '<': result.append("&lt;"); break;
        case '>': result.append("&gt;"); break;
        case '"': result.append("&quot;"); break;
        // same numeric form that forHTML emits for the apostrophe
        case '\'': result.append("&#039;"); break;
        case '&': result.append("&amp;"); break;
        default:
          // the char is not a special one - add it to the result as is
          result.append(character);
          break;
      }
    }
    return result.toString();
  }

  /**
   * Return <tt>aText</tt> with all <tt>'&lt;'</tt> and <tt>'&gt;'</tt> characters
   * replaced by their escaped equivalents <tt>&amp;lt;</tt> and <tt>&amp;gt;</tt>.
   *
   * @param aText the raw text
   * @return the text with both angle brackets escaped
   */
  public static String toDisableTags(String aText) {
    final StringBuilder result = new StringBuilder(aText.length());
    for (int i = 0; i < aText.length(); i++) {
      final char character = aText.charAt(i);
      if (character == '<') {
        result.append("&lt;");
      }
      else if (character == '>') {
        result.append("&gt;");
      }
      else {
        // the char is not a special one - add it to the result as is
        result.append(character);
      }
    }
    return result.toString();
  }

  /**
   * Replace characters having special meaning in regular expressions
   * with their escaped equivalents, preceded by a backslash character.
   *
   * <P>The escaped characters are :
   * <ul>
   * <li>the dot
   * <li>the backslash
   * <li>?, * , and +
   * <li>&amp;
   * <li>:
   * <li>{ and }
   * <li>[ and ]
   * <li>( and )
   * <li>^ and $
   * </ul>
   *
   * <P>The alternation character <tt>|</tt> is not escaped by this method;
   * {@link Pattern#quote(String)} quotes arbitrary text as a whole.
   *
   * @param aRegexFragment the text to escape
   * @return the text with each of the listed characters preceded by a backslash
   */
  public static String forRegex(String aRegexFragment) {
    final StringBuilder result = new StringBuilder(aRegexFragment.length());
    // An index loop is used instead of a CharacterIterator, whose DONE sentinel
    // would end the loop early at a U+FFFF character inside the text.
    for (int i = 0; i < aRegexFragment.length(); i++) {
      final char character = aRegexFragment.charAt(i);
      switch (character) {
        case '.': case '\\': case '?': case '*': case '+':
        case '&': case ':': case '{': case '}': case '[':
        case ']': case '(': case ')': case '^': case '$':
          result.append('\\');
          break;
        default:
          // the char is not a special one - nothing to prepend
          break;
      }
      result.append(character);
    }
    return result.toString();
  }

  /**
   * Escape <tt>'$'</tt> and backslash characters in replacement strings.
   *
   * <P>Synonym for <tt>Matcher.quoteReplacement(String)</tt>.
   *
   * <P>The following methods use replacement strings which treat
   * <tt>'$'</tt> and the backslash as special characters:
   * <ul>
   * <li><tt>String.replaceAll(String, String)</tt>
   * <li><tt>String.replaceFirst(String, String)</tt>
   * <li><tt>Matcher.appendReplacement(StringBuffer, String)</tt>
   * </ul>
   *
   * <P>If replacement text can contain arbitrary characters, then you
   * will usually need to escape that text, to ensure special characters
   * are interpreted literally.
   *
   * @param aInput the replacement text to quote
   * @return the quoted replacement text
   */
  public static String forReplacementString(String aInput) {
    return Matcher.quoteReplacement(aInput);
  }

  /**
   * Disable all <tt>&lt;SCRIPT&gt;</tt> tags in <tt>aText</tt> by escaping their
   * opening angle bracket.
   *
   * <P>Insensitive to case; the replaced tag name is always written in upper case.
   * Only the exact forms <tt>&lt;SCRIPT&gt;</tt> and <tt>&lt;/SCRIPT&gt;</tt> are
   * matched, so an opening tag that carries attributes is left unchanged.
   *
   * @param aText the markup to process
   * @return the markup with the matched script tags disabled
   */
  public static String forScriptTagsOnly(String aText) {
    final String withoutOpeningTags = SCRIPT.matcher(aText).replaceAll("&lt;SCRIPT>");
    return SCRIPT_END.matcher(withoutOpeningTags).replaceAll("&lt;/SCRIPT>");
  }

  // PRIVATE //

  /**
   * Appends the decimal character reference for <tt>aIdx</tt> to <tt>aBuilder</tt>,
   * zero-padded to at least three digits (for example 9 is appended as
   * <tt>&amp;#009;</tt>).
   */
  private static void addCharEntity(int aIdx, StringBuilder aBuilder) {
    aBuilder.append("&#");
    if (aIdx <= 9) {
      aBuilder.append("00");
    }
    else if (aIdx <= 99) {
      aBuilder.append("0");
    }
    aBuilder.append(aIdx).append(';');
  }
}