Mercurial > hg > mpdl-group
diff software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/DictionarizerContentHandler.java @ 10:59ff47d1e237
TEI Unterstützung, Fehlerbehebungen, externe Objekte
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Fri, 11 Mar 2011 13:33:26 +0100 |
parents | 2396a569e446 |
children | fba5577e49d9 |
line wrap: on
line diff
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/DictionarizerContentHandler.java Tue Feb 22 16:03:45 2011 +0100
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/DictionarizerContentHandler.java Fri Mar 11 13:33:26 2011 +0100
@@ -45,6 +45,8 @@
     System.arraycopy(c, start, cCopy, 0, length);
     String charactersStr = String.valueOf(cCopy);
     if (charactersStr != null && ! charactersStr.equals("")) {
+      // cause there are problems during xsl transformations with ideographic characters without zwsp characters we put always a zwsp between ideographic characters
+      charactersStr = zwsp(charactersStr);
       if (currentElement != null) {
         Element charElement = new Element("characters", ELEMENT_TYPE_CHARACTERS);
         charElement.value = StringUtilEscapeChars.deresolveXmlEntities(charactersStr);
@@ -151,6 +153,20 @@
     outputXmlFragment += outStr;
   }
 
+  /**
+   * Puts a zwsp between two ideographic characters (e.g. in CJK Unified Ideographs)
+   * @param str
+   * @return
+   */
+  private String zwsp(String str) {
+    // based on Unicode 3.2
+    String ideographic = "[\u3300-\u33ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff]";
+    String regex = "(" + ideographic + ")(" + ideographic + ")";
+    String retStr = str.replaceAll(regex, "$1\u200b$2");
+    retStr = retStr.replaceAll(regex, "$1\u200b$2");
+    return retStr;
+  }
+
   private class Element {
     private int type;
     private String name;