Mercurial > hg > mpdl-group
comparison software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/DictionarizerContentHandler.java @ 10:59ff47d1e237
TEI Unterstützung, Fehlerbehebungen, externe Objekte
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Fri, 11 Mar 2011 13:33:26 +0100 |
parents | 2396a569e446 |
children | fba5577e49d9 |
comparison
equal
deleted
inserted
replaced
9:1ec29fdd0db8 | 10:59ff47d1e237 |
---|---|
43 public void characters(char[] c, int start, int length) throws SAXException { | 43 public void characters(char[] c, int start, int length) throws SAXException { |
44 char[] cCopy = new char[length]; | 44 char[] cCopy = new char[length]; |
45 System.arraycopy(c, start, cCopy, 0, length); | 45 System.arraycopy(c, start, cCopy, 0, length); |
46 String charactersStr = String.valueOf(cCopy); | 46 String charactersStr = String.valueOf(cCopy); |
47 if (charactersStr != null && ! charactersStr.equals("")) { | 47 if (charactersStr != null && ! charactersStr.equals("")) { |
48 // cause there are problems during xsl transformations with ideographic characters without zwsp characters we put always a zwsp between ideographic characters | |
49 charactersStr = zwsp(charactersStr); | |
48 if (currentElement != null) { | 50 if (currentElement != null) { |
49 Element charElement = new Element("characters", ELEMENT_TYPE_CHARACTERS); | 51 Element charElement = new Element("characters", ELEMENT_TYPE_CHARACTERS); |
50 charElement.value = StringUtilEscapeChars.deresolveXmlEntities(charactersStr); | 52 charElement.value = StringUtilEscapeChars.deresolveXmlEntities(charactersStr); |
51 if (currentElement.composites == null) | 53 if (currentElement.composites == null) |
52 currentElement.composites = new ArrayList<Element>(); | 54 currentElement.composites = new ArrayList<Element>(); |
147 return counter + 1; | 149 return counter + 1; |
148 } | 150 } |
149 | 151 |
150 private void write(String outStr) throws SAXException { | 152 private void write(String outStr) throws SAXException { |
151 outputXmlFragment += outStr; | 153 outputXmlFragment += outStr; |
154 } | |
155 | |
156 /** | |
157 * Puts a zwsp between two ideographic characters (e.g. in CJK Unified Ideographs) | |
158 * @param str | |
159 * @return | |
160 */ | |
161 private String zwsp(String str) { | |
162 // based on Unicode 3.2 | |
163 String ideographic = "[\u3300-\u33ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff]"; | |
164 String regex = "(" + ideographic + ")(" + ideographic + ")"; | |
165 String retStr = str.replaceAll(regex, "$1\u200b$2"); | |
166 retStr = retStr.replaceAll(regex, "$1\u200b$2"); | |
167 return retStr; | |
152 } | 168 } |
153 | 169 |
154 private class Element { | 170 private class Element { |
155 private int type; | 171 private int type; |
156 private String name; | 172 private String name; |