Mercurial > hg > mpdl-group
comparison software/mpdl-services/mpiwg-mpdl-xml/src/de/mpg/mpiwg/berlin/mpdl/util/StringUtils.java @ 23:e845310098ba
diverse Korrekturen
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 27 Nov 2012 12:35:19 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
22:6a45a982c333 | 23:e845310098ba |
---|---|
1 package de.mpg.mpiwg.berlin.mpdl.util; | |
2 | |
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.text.CharacterIterator;
import java.text.StringCharacterIterator;
import java.util.Collections;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
10 | |
/**
 * Static helpers for escaping, unescaping and normalising text that is embedded
 * in XML, HTML, URLs and regular expressions.
 *
 * <P>All methods are stateless. None of them accepts {@code null}; passing
 * {@code null} results in a {@link NullPointerException}.
 */
public class StringUtils {

  /** Matches the five predefined XML entities in their escaped form. */
  private static final Pattern XML_ENTITIES_PATTERN =
      Pattern.compile("&amp;|&lt;|&gt;|&quot;|&apos;");

  /** Matches either one newline or a run of blanks and tabs. */
  private static final Pattern NL_TAB_BLANK_PATTERN = Pattern.compile("\n|[ \t]+");

  /** Escaped XML entity -> the literal character it stands for. */
  private static final Map<String, String> XML_ENTITY_REPLACEMENTS;
  static {
    Map<String, String> replacements = new HashMap<String, String>();
    replacements.put("&amp;", "&");
    replacements.put("&lt;", "<");
    replacements.put("&gt;", ">");
    replacements.put("&quot;", "\"");
    replacements.put("&apos;", "'");
    XML_ENTITY_REPLACEMENTS = Collections.unmodifiableMap(replacements);
  }

  /** Character class of the ideographic ranges handled by {@link #zwsp(String)} (based on Unicode 3.2). */
  private static final String IDEOGRAPHIC =
      "[\u3300-\u33ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff]";

  /** An ideographic character that is directly followed by another one (look-ahead, not consumed). */
  private static final Pattern IDEOGRAPH_BEFORE_IDEOGRAPH =
      Pattern.compile("(" + IDEOGRAPHIC + ")(?=" + IDEOGRAPHIC + ")");

  /** Characters that {@link #forHTML(String)} replaces by a numeric character reference. */
  private static final String HTML_NUMERIC_ESCAPES = "\t!#$%'()*+,-./:;=?@[\\]^_`{|}~";

  /** Characters that {@link #forRegex(String)} prefixes with a backslash. */
  private static final String REGEX_SPECIALS = ".\\?*+&:{}[]()^$|";

  private static final Pattern SCRIPT =
      Pattern.compile("<SCRIPT>", Pattern.CASE_INSENSITIVE);
  private static final Pattern SCRIPT_END =
      Pattern.compile("</SCRIPT>", Pattern.CASE_INSENSITIVE);

  private StringUtils() {
    // empty - prevent construction
  }

  /**
   * Puts a zero width space (U+200B) between every two adjacent ideographic
   * characters (e.g. in CJK Unified Ideographs).
   *
   * @param str the text to process
   * @return the text with a zero width space inserted between neighbouring ideographs
   */
  public static String zwsp(String str) {
    // The look-ahead leaves the second ideograph unconsumed, so overlapping
    // pairs (A B C -> A|B|C) are handled in a single pass.
    return IDEOGRAPH_BEFORE_IDEOGRAPH.matcher(str).replaceAll("$1\u200b");
  }

  /**
   * Deletes escaped angle brackets from the text: {@code &lt;} and {@code &gt;}
   * as well as their doubly escaped forms {@code &amp;lt;} and {@code &amp;gt;}.
   *
   * @param inputStr the text to clean
   * @return the text without those entity sequences
   */
  public static String deleteSpecialXmlEntities(String inputStr) {
    // Literal replacement: none of the search strings is meant as a regex.
    return inputStr
        .replace("&lt;", "")
        .replace("&gt;", "")
        .replace("&amp;lt;", "")
        .replace("&amp;gt;", "");
  }

  /**
   * Replaces the five predefined XML entities ({@code &amp;}, {@code &lt;},
   * {@code &gt;}, {@code &quot;}, {@code &apos;}) by the characters they denote.
   *
   * <P>The text is scanned once, so the result of a replacement is never
   * resolved again ({@code &amp;lt;} becomes {@code &lt;}, not {@code <}).
   *
   * @param inputStr the escaped text
   * @return the text with the entities resolved
   */
  public static String resolveXmlEntities(String inputStr) {
    Matcher matcher = XML_ENTITIES_PATTERN.matcher(inputStr);
    StringBuffer resolved = new StringBuffer(inputStr.length());
    while (matcher.find()) {
      String replacement = XML_ENTITY_REPLACEMENTS.get(matcher.group());
      matcher.appendReplacement(resolved, Matcher.quoteReplacement(replacement));
    }
    matcher.appendTail(resolved);
    return resolved.toString();
  }

  /**
   * Removes every newline and collapses each run of blanks and tabs into a
   * single blank.
   *
   * @param inputStr the text to normalise
   * @return the normalised text
   */
  public static String removeNlTabBlanks(String inputStr) {
    Matcher matcher = NL_TAB_BLANK_PATTERN.matcher(inputStr);
    StringBuffer cleaned = new StringBuffer(inputStr.length());
    while (matcher.find()) {
      // appendReplacement must be called for every match; skipping it for
      // newlines would keep the newline or drop the text in front of it.
      boolean isNewline = "\n".equals(matcher.group());
      matcher.appendReplacement(cleaned, isNewline ? "" : " ");
    }
    matcher.appendTail(cleaned);
    return cleaned.toString();
  }

  /**
   * Former implementation of {@link #resolveXmlEntities(String)}, kept for reference.
   *
   * <P>Because {@code &amp;} is resolved first, doubly escaped input such as
   * {@code &amp;lt;} ends up as {@code <}; prefer {@link #resolveXmlEntities(String)}.
   */
  @SuppressWarnings("unused")
  private static String resolveXmlEntities_OldSlowMethod(String inputStr) {
    return inputStr
        .replace("&amp;", "&")
        .replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&quot;", "\"")
        .replace("&apos;", "'");
  }

  /**
   * Escapes {@code &}, {@code <}, {@code >} and {@code "} as XML entities.
   * The apostrophe is deliberately left untouched.
   *
   * @param inputStr the raw text
   * @return the escaped text
   */
  public static String deresolveXmlEntities(String inputStr) {
    StringBuilder buf = new StringBuilder(inputStr.length());
    for (int i = 0; i < inputStr.length(); i++) {
      char c = inputStr.charAt(i);
      switch (c) {
        case '&': buf.append("&amp;"); break;
        case '<': buf.append("&lt;"); break;
        case '>': buf.append("&gt;"); break;
        case '"': buf.append("&quot;"); break;
        // '\'' is not replaced by "&apos;": causes problems in XmlTokenizerContentHandler
        default: buf.append(c); break;
      }
    }
    return buf.toString();
  }

  /**
   * Escape characters for text appearing in HTML markup.
   *
   * <P>This method exists as a defence against Cross Site Scripting (XSS) hacks.
   * The idea is to neutralize control characters commonly used by scripts, such
   * that they will not be executed by the browser. This is done by replacing the
   * control characters with their escaped equivalents.
   *
   * <P>The following characters are replaced with named entities:
   * {@code <} with {@code &lt;}, {@code >} with {@code &gt;},
   * {@code &} with {@code &amp;} and {@code "} with {@code &quot;}.
   *
   * <P>The following characters are replaced with a three-digit, zero-padded
   * decimal character reference (for example the tab becomes {@code &#009;} and
   * {@code !} becomes {@code &#033;}): the tab character and
   * <pre>! # $ % ' ( ) * + , - . / : ; = ? &#064; [ \ ] ^ _ ` { | } ~</pre>
   *
   * <P>Note that JSTL's {@code <c:out>} escapes only {@code <}, {@code >},
   * {@code &}, {@code "} and {@code '}.
   *
   * @param aText the raw text
   * @return the text, safe for inclusion in HTML markup
   */
  public static String forHTML(String aText) {
    final StringBuilder result = new StringBuilder(aText.length());
    // Index loop instead of CharacterIterator: the iterator's DONE sentinel is
    // the char 0xFFFF, which would cut off input that contains that char.
    for (int i = 0; i < aText.length(); i++) {
      char character = aText.charAt(i);
      switch (character) {
        case '<': result.append("&lt;"); break;
        case '>': result.append("&gt;"); break;
        case '&': result.append("&amp;"); break;
        case '"': result.append("&quot;"); break;
        default:
          if (HTML_NUMERIC_ESCAPES.indexOf(character) >= 0) {
            addCharEntity(character, result);
          } else {
            // the char is not a special one: add it to the result as is
            result.append(character);
          }
          break;
      }
    }
    return result.toString();
  }

  /**
   * Escape all ampersand characters in a URL.
   *
   * <P>Replaces all {@code &} characters with {@code &amp;}.
   *
   * <P>An ampersand character may appear in the query string of a URL and is
   * valid there. <em>However, URLs usually appear as an <tt>HREF</tt> attribute,
   * and such attributes have the additional constraint that ampersands must be
   * escaped.</em>
   *
   * <P>The JSTL {@code <c:url>} tag does perform proper URL encoding of query
   * parameters, but it does not, in general, produce text which is valid as an
   * <tt>HREF</tt> attribute, because it does not escape the ampersand character.
   *
   * @param aURL the URL
   * @return the URL with every ampersand escaped
   */
  public static String forHrefAmpersand(String aURL) {
    return aURL.replace("&", "&amp;");
  }

  /**
   * Synonym for <tt>URLEncoder.encode(String, "UTF-8")</tt>.
   *
   * <P>Used to ensure that HTTP query strings are in proper form, by escaping
   * special characters such as spaces.
   *
   * <P>It is important to note that if a query string appears in an <tt>HREF</tt>
   * attribute, then there are two issues - ensuring the query string is valid
   * HTTP (it is URL-encoded), and ensuring it is valid HTML (ensuring the
   * ampersand is escaped).
   *
   * @param aURLFragment the text to encode
   * @return the URL-encoded text
   * @throws RuntimeException if the platform does not support UTF-8
   */
  public static String forURL(String aURLFragment) {
    try {
      return URLEncoder.encode(aURLFragment, "UTF-8");
    } catch (UnsupportedEncodingException ex) {
      throw new RuntimeException("UTF-8 not supported", ex);
    }
  }

  /**
   * Escape characters for text appearing as XML data, between tags.
   *
   * <P>The following characters are replaced with corresponding character
   * entities: {@code <} with {@code &lt;}, {@code >} with {@code &gt;},
   * {@code &} with {@code &amp;}, {@code "} with {@code &quot;} and
   * {@code '} with {@code &#039;}.
   *
   * <P>Note that JSTL's {@code <c:out>} escapes the exact same set of characters
   * as this method. That is, {@code <c:out>} is good for escaping to produce
   * valid XML, but not for producing safe HTML.
   *
   * @param aText the raw text
   * @return the text, safe for inclusion as XML character data
   */
  public static String forXML(String aText) {
    final StringBuilder result = new StringBuilder(aText.length());
    for (int i = 0; i < aText.length(); i++) {
      char character = aText.charAt(i);
      switch (character) {
        case '<': result.append("&lt;"); break;
        case '>': result.append("&gt;"); break;
        case '"': result.append("&quot;"); break;
        case '\'': result.append("&#039;"); break;
        case '&': result.append("&amp;"); break;
        default: result.append(character); break;
      }
    }
    return result.toString();
  }

  /**
   * Return <tt>aText</tt> with all {@code <} and {@code >} characters replaced
   * by their escaped equivalents {@code &lt;} and {@code &gt;}.
   *
   * @param aText the raw text
   * @return the text with angle brackets escaped
   */
  public static String toDisableTags(String aText) {
    final StringBuilder result = new StringBuilder(aText.length());
    for (int i = 0; i < aText.length(); i++) {
      char character = aText.charAt(i);
      if (character == '<') {
        result.append("&lt;");
      } else if (character == '>') {
        result.append("&gt;");
      } else {
        result.append(character);
      }
    }
    return result.toString();
  }

  /**
   * Replace characters having special meaning in regular expressions with
   * their escaped equivalents, preceded by a backslash character.
   *
   * <P>The escaped characters are:
   * <pre>. \ ? * + &amp; : { } [ ] ( ) ^ $ |</pre>
   *
   * @param aRegexFragment the text that is to be matched literally
   * @return the text with the characters above escaped
   */
  public static String forRegex(String aRegexFragment) {
    final StringBuilder result = new StringBuilder(aRegexFragment.length() + 8);
    for (int i = 0; i < aRegexFragment.length(); i++) {
      char character = aRegexFragment.charAt(i);
      if (REGEX_SPECIALS.indexOf(character) >= 0) {
        result.append('\\');
      }
      result.append(character);
    }
    return result.toString();
  }

  /**
   * Escape <tt>'$'</tt> and <tt>'\'</tt> characters in replacement strings.
   *
   * <P>Synonym for <tt>Matcher.quoteReplacement(String)</tt>.
   *
   * <P>The following methods use replacement strings which treat
   * <tt>'$'</tt> and <tt>'\'</tt> as special characters:
   * <ul>
   * <li><tt>String.replaceAll(String, String)</tt>
   * <li><tt>String.replaceFirst(String, String)</tt>
   * <li><tt>Matcher.appendReplacement(StringBuffer, String)</tt>
   * </ul>
   *
   * <P>If replacement text can contain arbitrary characters, then you will
   * usually need to escape that text, to ensure special characters are
   * interpreted literally.
   *
   * @param aInput the replacement text
   * @return the replacement text with its special characters escaped
   */
  public static String forReplacementString(String aInput) {
    return Matcher.quoteReplacement(aInput);
  }

  /**
   * Disable all {@code <SCRIPT>} tags in <tt>aText</tt> by escaping their
   * opening angle bracket.
   *
   * <P>Insensitive to case; the replaced tags are written in upper case.
   *
   * @param aText the raw text
   * @return the text with {@code <SCRIPT>} and {@code </SCRIPT>} neutralised
   */
  public static String forScriptTagsOnly(String aText) {
    String result = SCRIPT.matcher(aText).replaceAll("&lt;SCRIPT>");
    return SCRIPT_END.matcher(result).replaceAll("&lt;/SCRIPT>");
  }

  // PRIVATE //

  /**
   * Appends the decimal character reference for <tt>aIdx</tt>, zero-padded to
   * at least three digits (e.g. 9 gives {@code &#009;}).
   */
  private static void addCharEntity(int aIdx, StringBuilder aBuilder) {
    aBuilder.append("&#");
    if (aIdx <= 9) {
      aBuilder.append("00");
    } else if (aIdx <= 99) {
      aBuilder.append('0');
    }
    aBuilder.append(aIdx).append(';');
  }
}