diff software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/DictionarizerContentHandler.java @ 10:59ff47d1e237

TEI Unterstützung, Fehlerbehebungen, externe Objekte
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Fri, 11 Mar 2011 13:33:26 +0100
parents 2396a569e446
children fba5577e49d9
line wrap: on
line diff
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/DictionarizerContentHandler.java	Tue Feb 22 16:03:45 2011 +0100
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/DictionarizerContentHandler.java	Fri Mar 11 13:33:26 2011 +0100
@@ -45,6 +45,8 @@
     System.arraycopy(c, start, cCopy, 0, length);
     String charactersStr = String.valueOf(cCopy);
     if (charactersStr != null && ! charactersStr.equals("")) {
+      // XSL transformations have problems with adjacent ideographic characters that are not separated by a zero width space (zwsp), so we always put a zwsp between ideographic characters
+      charactersStr = zwsp(charactersStr);
       if (currentElement != null) {
         Element charElement = new Element("characters", ELEMENT_TYPE_CHARACTERS);
         charElement.value = StringUtilEscapeChars.deresolveXmlEntities(charactersStr);
@@ -151,6 +153,20 @@
     outputXmlFragment += outStr;
   }
   
+  /**
+   * Puts a zero width space (zwsp, U+200B) between every two adjacent ideographic characters
+   * (BMP ranges U+3300-U+33FF, U+3400-U+4DBF, U+4E00-U+9FFF and U+F900-U+FAFF).
+   * @param str the text to process; must not be null
+   * @return str with a zwsp inserted at every boundary between two such characters
+   */
+  private String zwsp(String str) {
+    // based on Unicode 3.2
+    String ideographic = "[\u3300-\u33ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff]";
+    // zero-width lookbehind/lookahead match the boundary itself and consume no characters, so
+    // runs of three or more ideographs are fully separated in one pass (no second replaceAll)
+    return str.replaceAll("(?<=" + ideographic + ")(?=" + ideographic + ")", "\u200b");
+  }
+  
   private class Element {
     private int type;
     private String name;