comparison software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/util/StringUtilEscapeChars.java @ 0:408254cf2f1d

Erstellung
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Wed, 24 Nov 2010 17:24:23 +0100
parents
children fba5577e49d9
comparison
equal deleted inserted replaced
-1:000000000000 0:408254cf2f1d
1 package de.mpg.mpiwg.berlin.mpdl.util;
2
3 import java.io.UnsupportedEncodingException;
4 import java.net.URLEncoder;
5 import java.text.CharacterIterator;
6 import java.text.StringCharacterIterator;
7 import java.util.regex.Matcher;
8 import java.util.regex.Pattern;
9
/**
 * Static helpers for escaping and unescaping text that is embedded in XML, HTML,
 * URLs, regular expressions and regex replacement strings.
 *
 * <P>The class is a pure utility holder: it has no state and cannot be instantiated.
 * Unless stated otherwise, every method throws {@link NullPointerException} when it is
 * given a {@code null} argument.
 */
public class StringUtilEscapeChars {

  /** Matches the bare opening tag {@code <SCRIPT>} (no attributes), ignoring case. */
  private static final Pattern SCRIPT = Pattern.compile(
    "<SCRIPT>", Pattern.CASE_INSENSITIVE
  );

  /** Matches the bare closing tag {@code </SCRIPT>}, ignoring case. */
  private static final Pattern SCRIPT_END = Pattern.compile(
    "</SCRIPT>", Pattern.CASE_INSENSITIVE
  );

  private StringUtilEscapeChars() {
    //empty - prevent construction
  }

  /**
   * Remove the escaped angle brackets from a text, in both their single-escaped
   * ({@code &lt;}, {@code &gt;}) and double-escaped ({@code &amp;lt;}, {@code &amp;gt;}) form.
   *
   * @param inputStr text to clean
   * @return {@code inputStr} without the four entity sequences listed above
   */
  public static String deleteSpecialXmlEntities(String inputStr) {
    // Plain (non-regex) replacement: none of the patterns needs regex semantics.
    return inputStr
      .replace("&lt;", "")
      .replace("&gt;", "")
      .replace("&amp;lt;", "")
      .replace("&amp;gt;", "");
  }

  /**
   * Replace the five predefined XML entities by the characters they stand for.
   *
   * <P>{@code &amp;} is resolved <em>last</em>. Resolving it first would unescape a
   * double-escaped sequence twice (e.g. {@code &amp;lt;} would become {@code <} instead of
   * {@code &lt;}), which breaks the round trip with {@link #deresolveXmlEntities(String)}.
   *
   * @param inputStr text containing XML entities
   * @return the text with {@code &lt; &gt; &quot; &apos; &amp;} resolved exactly once
   */
  public static String resolveXmlEntities(String inputStr) {
    return inputStr
      .replace("&lt;", "<")
      .replace("&gt;", ">")
      .replace("&quot;", "\"")
      .replace("&apos;", "'")
      .replace("&amp;", "&");
  }

  /**
   * Escape the characters {@code &}, {@code <}, {@code >} and {@code "} as XML entities.
   *
   * <P>The apostrophe is deliberately left untouched: escaping it as {@code &apos;}
   * causes problems in DictionarizerContentHandler.
   *
   * @param inputStr raw text
   * @return the text with the four characters replaced by
   *         {@code &amp; &lt; &gt; &quot;}
   */
  public static String deresolveXmlEntities(String inputStr) {
    final int length = inputStr.length();
    final StringBuilder escaped = new StringBuilder(length);
    for (int i = 0; i < length; i++) {
      final char c = inputStr.charAt(i);
      switch (c) {
        case '&':
          escaped.append("&amp;");
          break;
        case '<':
          escaped.append("&lt;");
          break;
        case '>':
          escaped.append("&gt;");
          break;
        case '"':
          escaped.append("&quot;");
          break;
        default:
          escaped.append(c);
          break;
      }
    }
    return escaped.toString();
  }

  /**
   * Escape characters for text appearing in HTML markup.
   *
   * <P>This method exists as a defence against Cross Site Scripting (XSS) hacks.
   * The idea is to neutralize control characters commonly used by scripts, such that
   * they will not be executed by the browser. This is done by replacing the control
   * characters with their escaped equivalents.
   * See {@code hirondelle.web4j.security.SafeText} as well.
   *
   * <P>The following characters are replaced:
   * <table border='1' cellpadding='3' cellspacing='0'>
   * <tr><th> Character </th><th> Replacement </th></tr>
   * <tr><td> &lt; </td><td> {@code &lt;} </td></tr>
   * <tr><td> &gt; </td><td> {@code &gt;} </td></tr>
   * <tr><td> &amp; </td><td> {@code &amp;} </td></tr>
   * <tr><td> &quot; </td><td> {@code &quot;} </td></tr>
   * <tr><td> tab and each of
   *   ! # $ % ' ( ) * + , - . / : ; = ? &#064; [ \ ] ^ _ ` &#123; | &#125; ~ </td>
   *   <td> numeric reference with the three-digit, zero-padded decimal character
   *   code, e.g. {@code &#009;} for tab and {@code &#033;} for '!' </td></tr>
   * </table>
   *
   * <P>All other characters, including U+FFFF, are copied unchanged.
   *
   * @param aText raw text
   * @return the escaped text
   */
  public static String forHTML(String aText) {
    final int length = aText.length();
    final StringBuilder result = new StringBuilder(length);
    // Index-based loop: a CharacterIterator would stop at the first U+FFFF in the
    // input, because that value doubles as its end-of-text marker.
    for (int i = 0; i < length; i++) {
      final char character = aText.charAt(i);
      switch (character) {
        case '<':
          result.append("&lt;");
          break;
        case '>':
          result.append("&gt;");
          break;
        case '&':
          result.append("&amp;");
          break;
        case '"':
          result.append("&quot;");
          break;
        case '\t':
        case '!':
        case '#':
        case '$':
        case '%':
        case '\'':
        case '(':
        case ')':
        case '*':
        case '+':
        case ',':
        case '-':
        case '.':
        case '/':
        case ':':
        case ';':
        case '=':
        case '?':
        case '@':
        case '[':
        case '\\':
        case ']':
        case '^':
        case '_':
        case '`':
        case '{':
        case '|':
        case '}':
        case '~':
          // The entity number is simply the character's own code.
          addCharEntity(character, result);
          break;
        default:
          //the char is not a special one - add it to the result as is
          result.append(character);
          break;
      }
    }
    return result.toString();
  }

  /**
   * Escape all ampersand characters in a URL.
   *
   * <P>Replaces all {@code '&'} characters with {@code '&amp;'}.
   *
   * <P>An ampersand character may appear in the query string of a URL, where it is
   * valid. <em>However, URLs usually appear as an {@code HREF} attribute, and
   * such attributes have the additional constraint that ampersands must be escaped.</em>
   * URL-encoding the query parameters alone therefore does not produce text that is
   * valid inside an {@code HREF} attribute when several parameters are present.
   *
   * @param aURL the URL to escape
   * @return {@code aURL} with every {@code '&'} replaced by {@code '&amp;'}
   */
  public static String forHrefAmpersand(String aURL) {
    return aURL.replace("&", "&amp;");
  }

  /**
   * Synonym for {@code URLEncoder.encode(String, "UTF-8")}.
   *
   * <P>Used to ensure that HTTP query strings are in proper form, by escaping
   * special characters such as spaces.
   *
   * <P>It is important to note that if a query string appears in an {@code HREF}
   * attribute, then there are two issues - ensuring the query string is valid HTTP
   * (it is URL-encoded), and ensuring it is valid HTML (ensuring the
   * ampersand is escaped).
   *
   * @param aURLFragment the text to URL-encode
   * @return the URL-encoded text
   * @throws RuntimeException if the UTF-8 encoding is reported as unsupported
   */
  public static String forURL(String aURLFragment) {
    try {
      return URLEncoder.encode(aURLFragment, "UTF-8");
    }
    catch (UnsupportedEncodingException ex) {
      throw new RuntimeException("UTF-8 not supported", ex);
    }
  }

  /**
   * Escape characters for text appearing as XML data, between tags.
   *
   * <P>The following characters are replaced with corresponding character entities:
   * <table border='1' cellpadding='3' cellspacing='0'>
   * <tr><th> Character </th><th> Encoding </th></tr>
   * <tr><td> &lt; </td><td> {@code &lt;} </td></tr>
   * <tr><td> &gt; </td><td> {@code &gt;} </td></tr>
   * <tr><td> &amp; </td><td> {@code &amp;} </td></tr>
   * <tr><td> &quot; </td><td> {@code &quot;} </td></tr>
   * <tr><td> ' </td><td> {@code &#039;} </td></tr>
   * </table>
   *
   * <P>All other characters, including U+FFFF, are copied unchanged.
   *
   * <P>NOTE(review): the original documentation states that JSTL's {@code <c:out>}
   * escapes the exact same set of characters, i.e. that it is good for producing valid
   * XML but not for producing safe HTML - not verified here.
   *
   * @param aText raw text
   * @return the escaped text
   */
  public static String forXML(String aText) {
    final int length = aText.length();
    final StringBuilder result = new StringBuilder(length);
    for (int i = 0; i < length; i++) {
      final char character = aText.charAt(i);
      switch (character) {
        case '<':
          result.append("&lt;");
          break;
        case '>':
          result.append("&gt;");
          break;
        case '"':
          result.append("&quot;");
          break;
        case '\'':
          result.append("&#039;");
          break;
        case '&':
          result.append("&amp;");
          break;
        default:
          //the char is not a special one - add it to the result as is
          result.append(character);
          break;
      }
    }
    return result.toString();
  }

  /**
   * Return {@code aText} with all {@code '<'} and {@code '>'} characters
   * replaced by their escaped equivalents ({@code &lt;} and {@code &gt;}).
   *
   * @param aText raw text
   * @return the text with angle brackets escaped; every other character is unchanged
   */
  public static String toDisableTags(String aText) {
    final int length = aText.length();
    final StringBuilder result = new StringBuilder(length);
    for (int i = 0; i < length; i++) {
      final char character = aText.charAt(i);
      if (character == '<') {
        result.append("&lt;");
      }
      else if (character == '>') {
        result.append("&gt;");
      }
      else {
        //the char is not a special one - add it to the result as is
        result.append(character);
      }
    }
    return result.toString();
  }

  /**
   * Replace characters having special meaning in regular expressions
   * with their escaped equivalents, preceded by a '\' character.
   *
   * <P>The escaped characters include:
   * <ul>
   * <li>.
   * <li>\
   * <li>?, * , and +
   * <li>&amp;
   * <li>:
   * <li>&#123; and &#125;
   * <li>[ and ]
   * <li>( and )
   * <li>^ and $
   * <li>| (without this, a fragment such as {@code a|b} would still act as an
   *     alternation instead of matching literally)
   * </ul>
   *
   * @param aRegexFragment literal text that is to be embedded in a regular expression
   * @return the text with each of the characters above prefixed by a backslash
   */
  public static String forRegex(String aRegexFragment) {
    final int length = aRegexFragment.length();
    final StringBuilder result = new StringBuilder(length);
    for (int i = 0; i < length; i++) {
      final char character = aRegexFragment.charAt(i);
      switch (character) {
        case '.':
        case '\\':
        case '?':
        case '*':
        case '+':
        case '&':
        case ':':
        case '{':
        case '}':
        case '[':
        case ']':
        case '(':
        case ')':
        case '^':
        case '$':
        case '|':
          result.append('\\').append(character);
          break;
        default:
          //the char is not a special one - add it to the result as is
          result.append(character);
          break;
      }
    }
    return result.toString();
  }

  /**
   * Escape {@code '$'} and {@code '\'} characters in replacement strings.
   *
   * <P>Synonym for {@code Matcher.quoteReplacement(String)}.
   *
   * <P>The following methods use replacement strings which treat
   * {@code '$'} and {@code '\'} as special characters:
   * <ul>
   * <li>{@code String.replaceAll(String, String)}
   * <li>{@code String.replaceFirst(String, String)}
   * <li>{@code Matcher.appendReplacement(StringBuffer, String)}
   * </ul>
   *
   * <P>If replacement text can contain arbitrary characters, then you
   * will usually need to escape that text, to ensure special characters
   * are interpreted literally.
   *
   * @param aInput the intended literal replacement text
   * @return the text, quoted for use as a regex replacement string
   */
  public static String forReplacementString(String aInput) {
    return Matcher.quoteReplacement(aInput);
  }

  /**
   * Disable all {@code <SCRIPT>} tags in {@code aText} by replacing their leading
   * {@code '<'} with {@code &lt;}.
   *
   * <P>Insensitive to case; the replaced tag name is always written in upper case.
   *
   * <P>NOTE(review): only the bare forms {@code <SCRIPT>} and {@code </SCRIPT>} are
   * matched. A tag carrying attributes or inner whitespace (for example
   * {@code <script type="text/javascript">}) is not touched, so this method alone is
   * not a sufficient XSS defence.
   *
   * @param aText text possibly containing script tags
   * @return the text with bare script tags disabled
   */
  public static String forScriptTagsOnly(String aText) {
    final String openingDisabled = SCRIPT.matcher(aText).replaceAll("&lt;SCRIPT>");
    return SCRIPT_END.matcher(openingDisabled).replaceAll("&lt;/SCRIPT>");
  }

  // PRIVATE //

  /**
   * Append the numeric character reference for {@code aIdx} to {@code aBuilder},
   * zero-padding the decimal number to at least three digits (9 gives {@code &#009;}).
   */
  private static void addCharEntity(int aIdx, StringBuilder aBuilder) {
    aBuilder.append("&#");
    if (aIdx <= 9) {
      aBuilder.append("00");
    }
    else if (aIdx <= 99) {
      aBuilder.append('0');
    }
    aBuilder.append(aIdx).append(';');
  }
}
475 }