Mercurial > hg > mpdl-group
comparison software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/util/StringUtilEscapeChars.java @ 0:408254cf2f1d
Erstellung
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Wed, 24 Nov 2010 17:24:23 +0100 |
parents | |
children | fba5577e49d9 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:408254cf2f1d |
---|---|
1 package de.mpg.mpiwg.berlin.mpdl.util; | |
2 | |
3 import java.io.UnsupportedEncodingException; | |
4 import java.net.URLEncoder; | |
5 import java.text.CharacterIterator; | |
6 import java.text.StringCharacterIterator; | |
7 import java.util.regex.Matcher; | |
8 import java.util.regex.Pattern; | |
9 | |
/**
 * Static helpers for escaping and unescaping characters that have a special
 * meaning in XML, HTML, URLs, regular expressions and regex replacement strings.
 *
 * <P>The class cannot be instantiated. None of the methods accepts
 * <tt>null</tt>; passing it results in a <tt>NullPointerException</tt>.
 */
public final class StringUtilEscapeChars {

  /** Matches a bare opening script tag, ignoring case. */
  private static final Pattern SCRIPT = Pattern.compile(
    "<SCRIPT>", Pattern.CASE_INSENSITIVE
  );

  /** Matches a bare closing script tag, ignoring case. */
  private static final Pattern SCRIPT_END = Pattern.compile(
    "</SCRIPT>", Pattern.CASE_INSENSITIVE
  );

  private StringUtilEscapeChars() {
    //empty - prevent construction
  }

  /**
   * Remove angle brackets and their escaped forms from the given text.
   *
   * <P>The following are deleted, in this order: {@code <}, {@code >},
   * {@code &lt;} and {@code &gt;}.
   *
   * <P>NOTE(review): the listing this method was reviewed from had HTML
   * entities decoded by the page renderer, so the four literals below may not
   * be the ones originally committed - confirm against the repository.
   *
   * @param inputStr the text to clean
   * @return <tt>inputStr</tt> without the four sequences listed above
   */
  public static String deleteSpecialXmlEntities(String inputStr) {
    String result = inputStr.replace("<", "");
    result = result.replace(">", "");
    result = result.replace("&lt;", "");
    result = result.replace("&gt;", "");
    return result;
  }

  /**
   * Replace the five predefined XML entities with the characters they stand for.
   *
   * <P>{@code &lt;}, {@code &gt;}, {@code &quot;}, {@code &apos;} and
   * {@code &amp;} become {@code <}, {@code >}, {@code "}, {@code '} and
   * {@code &} respectively. Every entity is resolved exactly once.
   *
   * @param inputStr the text containing entities
   * @return the text with the five entities resolved
   */
  public static String resolveXmlEntities(String inputStr) {
    String result = inputStr.replace("&lt;", "<");
    result = result.replace("&gt;", ">");
    result = result.replace("&quot;", "\"");
    result = result.replace("&apos;", "'");
    // The ampersand entity has to be resolved last: doing it first would turn
    // an escaped entity such as "&amp;lt;" into "&lt;" and then into "<".
    return result.replace("&amp;", "&");
  }

  /**
   * Replace {@code &}, {@code <}, {@code >} and {@code "} with their XML entities.
   *
   * <P>The apostrophe is deliberately left untouched, because escaping it
   * causes problems in DictionarizerContentHandler.
   *
   * @param inputStr the plain text
   * @return the text with the four characters replaced by entities
   */
  public static String deresolveXmlEntities(String inputStr) {
    final int length = inputStr.length();
    final StringBuilder result = new StringBuilder(length + 16);
    for (int i = 0; i < length; i++) {
      final char c = inputStr.charAt(i);
      switch (c) {
        case '&':
          result.append("&amp;");
          break;
        case '<':
          result.append("&lt;");
          break;
        case '>':
          result.append("&gt;");
          break;
        case '"':
          result.append("&quot;");
          break;
        default:
          result.append(c);
          break;
      }
    }
    return result.toString();
  }

  /**
   * Escape characters for text appearing in HTML markup.
   *
   * <P>This method exists as a defence against Cross Site Scripting (XSS) hacks.
   * The idea is to neutralize control characters commonly used by scripts, such
   * that they will not be executed by the browser. This is done by replacing the
   * control characters with their escaped equivalents.
   * See {@code hirondelle.web4j.security.SafeText} as well.
   *
   * <P>Replacements:
   * <ul>
   * <li>{@code <}, {@code >}, {@code &} and {@code "} become {@code &lt;},
   *     {@code &gt;}, {@code &amp;} and {@code &quot;}
   * <li>the tab character and each of
   *     <tt>! # $ % ' ( ) * + , - . / : ; = ? @ [ \ ] ^ _ ` { | } ~</tt>
   *     become a numeric character reference with a three digit, zero padded
   *     decimal code, for example {@code &#009;} for tab and {@code &#039;}
   *     for the apostrophe
   * </ul>
   * All other characters are copied unchanged.
   *
   * <P>Note that JSTL's {@code <c:out>} escapes only {@code <}, {@code >},
   * {@code &}, {@code "} and {@code '}.
   *
   * @param aText the text to escape
   * @return the escaped text
   */
  public static String forHTML(String aText) {
    final int length = aText.length();
    final StringBuilder result = new StringBuilder(length + 16);
    // Index loop instead of CharacterIterator: the iterator's DONE sentinel is
    // U+FFFF, so a text containing that character would be cut off there.
    for (int i = 0; i < length; i++) {
      final char character = aText.charAt(i);
      switch (character) {
        case '<':
          result.append("&lt;");
          break;
        case '>':
          result.append("&gt;");
          break;
        case '&':
          result.append("&amp;");
          break;
        case '"':
          result.append("&quot;");
          break;
        case '\t':
        case '!':
        case '#':
        case '$':
        case '%':
        case '\'':
        case '(':
        case ')':
        case '*':
        case '+':
        case ',':
        case '-':
        case '.':
        case '/':
        case ':':
        case ';':
        case '=':
        case '?':
        case '@':
        case '[':
        case '\\':
        case ']':
        case '^':
        case '_':
        case '`':
        case '{':
        case '|':
        case '}':
        case '~':
          addCharEntity(character, result);
          break;
        default:
          //the char is not a special one - add it to the result as is
          result.append(character);
          break;
      }
    }
    return result.toString();
  }

  /**
   * Escape all ampersand characters in a URL.
   *
   * <P>Replaces all {@code &} characters with {@code &amp;}.
   *
   * <P>An ampersand character may appear in the query string of a URL.
   * The ampersand character is indeed valid in a URL.
   * <em>However, URLs usually appear as an <tt>HREF</tt> attribute, and
   * such attributes have the additional constraint that ampersands
   * must be escaped.</em>
   *
   * <P>The JSTL {@code <c:url>} tag does indeed perform proper URL encoding of
   * query parameters. But it does not, in general, produce text which
   * is valid as an <tt>HREF</tt> attribute, simply because it does
   * not escape the ampersand character.
   *
   * <P>Every ampersand is replaced, including one that already starts an
   * entity, so the method must not be applied twice to the same URL.
   *
   * @param aURL the URL to escape
   * @return the URL with every ampersand escaped
   */
  public static String forHrefAmpersand(String aURL) {
    return aURL.replace("&", "&amp;");
  }

  /**
   * Synonym for <tt>URLEncoder.encode(String, "UTF-8")</tt>.
   *
   * <P>Used to ensure that HTTP query strings are in proper form, by escaping
   * special characters such as spaces.
   *
   * <P>It is important to note that if a query string appears in an <tt>HREF</tt>
   * attribute, then there are two issues - ensuring the query string is valid HTTP
   * (it is URL-encoded), and ensuring it is valid HTML (ensuring the
   * ampersand is escaped).
   *
   * @param aURLFragment the text to encode
   * @return the URL-encoded text
   * @throws RuntimeException if the UTF-8 encoding is not supported
   */
  public static String forURL(String aURLFragment) {
    try {
      return URLEncoder.encode(aURLFragment, "UTF-8");
    }
    catch (UnsupportedEncodingException ex) {
      throw new RuntimeException("UTF-8 not supported", ex);
    }
  }

  /**
   * Escape characters for text appearing as XML data, between tags.
   *
   * <P>{@code <}, {@code >}, {@code &}, {@code "} and {@code '} are replaced
   * with {@code &lt;}, {@code &gt;}, {@code &amp;}, {@code &quot;} and
   * {@code &#039;} respectively. All other characters are copied unchanged.
   *
   * <P>Note that JSTL's {@code <c:out>} escapes the exact same set of
   * characters as this method. That is, {@code <c:out>} is good for escaping
   * to produce valid XML, but not for producing safe HTML.
   *
   * @param aText the text to escape
   * @return the escaped text
   */
  public static String forXML(String aText) {
    final int length = aText.length();
    final StringBuilder result = new StringBuilder(length + 16);
    // Index loop instead of CharacterIterator: the iterator's DONE sentinel is
    // U+FFFF, so a text containing that character would be cut off there.
    for (int i = 0; i < length; i++) {
      final char character = aText.charAt(i);
      switch (character) {
        case '<':
          result.append("&lt;");
          break;
        case '>':
          result.append("&gt;");
          break;
        case '"':
          result.append("&quot;");
          break;
        case '\'':
          result.append("&#039;");
          break;
        case '&':
          result.append("&amp;");
          break;
        default:
          //the char is not a special one - add it to the result as is
          result.append(character);
          break;
      }
    }
    return result.toString();
  }

  /**
   * Return <tt>aText</tt> with all {@code <} and {@code >} characters
   * replaced by {@code &lt;} and {@code &gt;}.
   *
   * @param aText the text to process
   * @return the text with both angle brackets escaped
   */
  public static String toDisableTags(String aText) {
    final int length = aText.length();
    final StringBuilder result = new StringBuilder(length + 16);
    // Index loop instead of CharacterIterator: the iterator's DONE sentinel is
    // U+FFFF, so a text containing that character would be cut off there.
    for (int i = 0; i < length; i++) {
      final char character = aText.charAt(i);
      if (character == '<') {
        result.append("&lt;");
      }
      else if (character == '>') {
        result.append("&gt;");
      }
      else {
        //the char is not a special one - add it to the result as is
        result.append(character);
      }
    }
    return result.toString();
  }

  /**
   * Replace characters having special meaning in regular expressions
   * with their escaped equivalents, preceded by a '\' character.
   *
   * <P>The escaped characters are:
   * <tt>. \ ? * + &amp; : { } [ ] ( ) ^ $ |</tt>
   *
   * @param aRegexFragment the literal text to embed in a regular expression
   * @return the text with every listed character preceded by a backslash
   */
  public static String forRegex(String aRegexFragment) {
    final int length = aRegexFragment.length();
    final StringBuilder result = new StringBuilder(length + 16);
    // Index loop instead of CharacterIterator: the iterator's DONE sentinel is
    // U+FFFF, so a text containing that character would be cut off there.
    for (int i = 0; i < length; i++) {
      final char character = aRegexFragment.charAt(i);
      switch (character) {
        case '.':
        case '\\':
        case '?':
        case '*':
        case '+':
        case '&':
        case ':':
        case '{':
        case '}':
        case '[':
        case ']':
        case '(':
        case ')':
        case '^':
        case '$':
        case '|': // alternation operator - unescaped it would split the literal
          result.append('\\').append(character);
          break;
        default:
          //the char is not a special one - add it to the result as is
          result.append(character);
          break;
      }
    }
    return result.toString();
  }

  /**
   * Escape <tt>'$'</tt> and <tt>'\'</tt> characters in replacement strings.
   *
   * <P>Synonym for <tt>Matcher.quoteReplacement(String)</tt>.
   *
   * <P>The following methods use replacement strings which treat
   * <tt>'$'</tt> and <tt>'\'</tt> as special characters:
   * <ul>
   * <li><tt>String.replaceAll(String, String)</tt>
   * <li><tt>String.replaceFirst(String, String)</tt>
   * <li><tt>Matcher.appendReplacement(StringBuffer, String)</tt>
   * </ul>
   *
   * <P>If replacement text can contain arbitrary characters, then you
   * will usually need to escape that text, to ensure special characters
   * are interpreted literally.
   *
   * @param aInput the literal replacement text
   * @return the text, safe to use as a replacement string
   */
  public static String forReplacementString(String aInput) {
    return Matcher.quoteReplacement(aInput);
  }

  /**
   * Disable all {@code <SCRIPT>} tags in <tt>aText</tt>.
   *
   * <P>Insensitive to case. Each opening tag is replaced with
   * {@code &lt;SCRIPT&gt;} and each closing tag with {@code &lt;/SCRIPT&gt;}.
   *
   * <P>Only the bare forms {@code <SCRIPT>} and {@code </SCRIPT>} are matched;
   * a tag carrying attributes or inner whitespace is left untouched, so this
   * method alone is not a sufficient defence against script injection.
   *
   * @param aText the text to process
   * @return the text with bare script tags escaped
   */
  public static String forScriptTagsOnly(String aText) {
    final String openingDisabled = SCRIPT.matcher(aText).replaceAll("&lt;SCRIPT&gt;");
    return SCRIPT_END.matcher(openingDisabled).replaceAll("&lt;/SCRIPT&gt;");
  }

  // PRIVATE //

  /**
   * Append the numeric character reference for <tt>aIdx</tt> to
   * <tt>aBuilder</tt>, zero padded to at least three digits
   * (for example {@code &#009;} or {@code &#123;}).
   */
  private static void addCharEntity(int aIdx, StringBuilder aBuilder) {
    aBuilder.append("&#");
    if (aIdx <= 9) {
      aBuilder.append("00");
    }
    else if (aIdx <= 99) {
      aBuilder.append('0');
    }
    aBuilder.append(aIdx).append(';');
  }
}