comparison software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/DictionarizerContentHandler.java @ 10:59ff47d1e237

TEI Unterstützung, Fehlerbehebungen, externe Objekte
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Fri, 11 Mar 2011 13:33:26 +0100
parents 2396a569e446
children fba5577e49d9
comparison
equal deleted inserted replaced
9:1ec29fdd0db8 10:59ff47d1e237
43 public void characters(char[] c, int start, int length) throws SAXException { 43 public void characters(char[] c, int start, int length) throws SAXException {
44 char[] cCopy = new char[length]; 44 char[] cCopy = new char[length];
45 System.arraycopy(c, start, cCopy, 0, length); 45 System.arraycopy(c, start, cCopy, 0, length);
46 String charactersStr = String.valueOf(cCopy); 46 String charactersStr = String.valueOf(cCopy);
47 if (charactersStr != null && ! charactersStr.equals("")) { 47 if (charactersStr != null && ! charactersStr.equals("")) {
48 // cause there are problems during xsl transformations with ideographic characters without zwsp characters we put always a zwsp between ideographic characters
49 charactersStr = zwsp(charactersStr);
48 if (currentElement != null) { 50 if (currentElement != null) {
49 Element charElement = new Element("characters", ELEMENT_TYPE_CHARACTERS); 51 Element charElement = new Element("characters", ELEMENT_TYPE_CHARACTERS);
50 charElement.value = StringUtilEscapeChars.deresolveXmlEntities(charactersStr); 52 charElement.value = StringUtilEscapeChars.deresolveXmlEntities(charactersStr);
51 if (currentElement.composites == null) 53 if (currentElement.composites == null)
52 currentElement.composites = new ArrayList<Element>(); 54 currentElement.composites = new ArrayList<Element>();
147 return counter + 1; 149 return counter + 1;
148 } 150 }
149 151
150 private void write(String outStr) throws SAXException { 152 private void write(String outStr) throws SAXException {
151 outputXmlFragment += outStr; 153 outputXmlFragment += outStr;
154 }
155
156 /**
157 * Puts a zwsp between two ideographic characters (e.g. in CJK Unified Ideographs)
158 * @param str
159 * @return
160 */
161 private String zwsp(String str) {
162 // based on Unicode 3.2
163 String ideographic = "[\u3300-\u33ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff]";
164 String regex = "(" + ideographic + ")(" + ideographic + ")";
165 String retStr = str.replaceAll(regex, "$1\u200b$2");
166 retStr = retStr.replaceAll(regex, "$1\u200b$2");
167 return retStr;
152 } 168 }
153 169
154 private class Element { 170 private class Element {
155 private int type; 171 private int type;
156 private String name; 172 private String name;