changeset 12:fba5577e49d9

diverse Fehlerbehebungen
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 19 Apr 2011 16:51:26 +0200
parents d6f528ad5d96
children 469d927b9ca7
files software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/externalObjects/app/ExtElement.java software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/externalObjects/app/ExternalObjectsHandler.java software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/DictionarizerContentHandler.java software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/NormalizeCharsContentHandler.java software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Transcoder.java software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/converter/Converter.java software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/converter/PerseusContentHandler.java software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/util/StringUtilEscapeChars.java
diffstat 8 files changed, 87 insertions(+), 84 deletions(-) [+]
line wrap: on
line diff
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/externalObjects/app/ExtElement.java	Fri Mar 11 13:34:02 2011 +0100
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/externalObjects/app/ExtElement.java	Tue Apr 19 16:51:26 2011 +0200
@@ -7,10 +7,8 @@
 
 public class ExtElement extends ExtObject {
   private String pageNumber;
-  private String xmlNodeId;
-  private String before = "false";
-  private String charPos;
-  private String xpath;
+  private String xpath;  // path to element
+  private String point;  // ".0", ".1" or a positive integer
 
   public ExtElement() {
     this.type = "element";  
@@ -22,11 +20,19 @@
     String uid = xmlUtil.evaluateToString(xmlStr, "/object/@uid", null);
     String dateStr = xmlUtil.evaluateToString(xmlStr, "/object/@modificationDate", null);
     String docId = xmlUtil.evaluateToString(xmlStr, "/object/@documentId", null);
-    String pageNumber = xmlUtil.evaluateToString(xmlStr, "/object/@pageNumber", null);
-    String xmlNodeId = xmlUtil.evaluateToString(xmlStr, "/object/@xmlNodeId", null);
-    String before = xmlUtil.evaluateToString(xmlStr, "/object/@before", null);
-    String charPos = xmlUtil.evaluateToString(xmlStr, "/object/@charPos", null);
-    String xpath = xmlUtil.evaluateToString(xmlStr, "/object/@xpath", null);
+    String xpointer = xmlUtil.evaluateToString(xmlStr, "/object/@xpointer", null);
+    String pageNumber = null;
+    String xpath = null;
+    String point = null; 
+    if (xpointer != null) {
+      pageNumber = xpointer.replaceAll("#xpointer\\(id\\('page(.+)?'\\).*", "$1");
+      if (xpointer.contains("point(")) {
+        xpath = xpointer.replaceAll("#xpointer\\(id\\('page.+?'\\)(.*)?/point\\(.+?\\)\\)", "$1");
+        point = xpointer.replaceAll("#xpointer\\(id\\('page.+?'\\).*?/point\\((.+)?\\)\\)", "$1");
+      } else {
+        xpath = xpointer.replaceAll("#xpointer\\(id\\('page.+?'\\)(.*)?.*?\\)", "$1");
+      }
+    }
     String content = xmlUtil.evaluateToXmlString(xmlStr, "/object/content/*", null);
     Date modDate = xmlUtil.toDate(dateStr);
     ExtElement e = new ExtElement();
@@ -34,10 +40,8 @@
     e.setModificationDate(modDate);
     e.setDocumentId(docId);
     e.setPageNumber(pageNumber);
-    e.setXmlNodeId(xmlNodeId);
     e.setXpath(xpath);
-    e.setBefore(before);
-    e.setCharPos(charPos);
+    e.setPoint(point);
     e.setContent(content);
     return e;
   }
@@ -58,18 +62,17 @@
       xmlString = xmlString + " uid=\"" + uid + "\"";
     if (documentId != null)
       xmlString = xmlString + " documentId=\"" + documentId + "\"";
+    if (xpath != null)
+      xmlString = xmlString + " xmlNodeId=\"" + xpath + "\"";
     if (pageNumber != null)
-      xmlString = xmlString + " pageNumber=\"" + pageNumber + "\"";
-    if (xmlNodeId != null)
-      xmlString = xmlString + " xmlNodeId=\"" + xmlNodeId + "\"";
-    if (before != null)
-      xmlString = xmlString + " before=\"" + before + "\"";
-    if (charPos != null)
-      xmlString = xmlString + " charPos=\"" + charPos + "\"";
+      xmlString = xmlString + " xpointer=\"#xpointer(id('page" + pageNumber + "')";
     if (xpath != null)
-      xmlString = xmlString + " xpath=\"" + xpath + "\"";
-    xmlString = xmlString + ">";
+      xmlString = xmlString + xpath;
+    if (point != null)
+      xmlString = xmlString + "/point(" + point + ")";
+    xmlString = xmlString + ")\">";
     if (content != null) {
+      // TODO remove this again
       // write the uid and modificationDate into the content node
       if (! content.contains("uid")) {
         int firstClose = content.indexOf(">");
@@ -90,20 +93,12 @@
     this.xpath = xpath;
   }
 
-  public String getXmlNodeId() {
-    return xmlNodeId;
+  public String getPoint() {
+    return point;
   }
 
-  public void setXmlNodeId(String xmlNodeId) {
-    this.xmlNodeId = xmlNodeId;
-  }
-
-  public String getCharPos() {
-    return charPos;
-  }
-
-  public void setCharPos(String charPos) {
-    this.charPos = charPos;
+  public void setPoint(String point) {
+    this.point = point;
   }
 
   public String getPageNumber() {
@@ -114,19 +109,4 @@
     this.pageNumber = pageNumber;
   }
 
-  public boolean isBefore() {
-    if (before != null && before.equals("true"))
-      return true;
-    else 
-      return false;
-  }
-  
-  public String getBefore() {
-    return before;
-  }
-
-  public void setBefore(String before) {
-    this.before = before;
-  }
-
 }
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/externalObjects/app/ExternalObjectsHandler.java	Fri Mar 11 13:34:02 2011 +0100
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/externalObjects/app/ExternalObjectsHandler.java	Tue Apr 19 16:51:26 2011 +0200
@@ -102,7 +102,7 @@
     String docId = element.getDocumentId();
     String pageNumber = element.getPageNumber();
     String uid = element.getUid();
-    String xmlNodeId = element.getXmlNodeId();
+    String xpath = element.getXpath();
     String hashKey = docId + "###" + pageNumber;
     boolean updated = false;
     try {
@@ -118,8 +118,8 @@
         String foundValueStr = new String(foundValueBytes, "utf-8");
         ExtElement elem = ExtElement.parseXmlStr(foundValueStr);
         String elemUid = elem.getUid();
-        String elemXmlNodeId = elem.getXmlNodeId();
-        if (uid.equals(elemUid) && xmlNodeId.equals(elemXmlNodeId)) {
+        String elemXPath = elem.getXpath();
+        if (uid.equals(elemUid) && xpath.equals(elemXPath)) {
           cursor.delete();
           byte[] elementXmlStrBytes = elementXmlStr.getBytes("utf-8");
           DatabaseEntry dbEntryValue = new DatabaseEntry(elementXmlStrBytes);
@@ -143,7 +143,7 @@
     String docId = element.getDocumentId();
     String pageNumber = element.getPageNumber();
     String uid = element.getUid();
-    String xmlNodeId = element.getXmlNodeId();
+    String xpath = element.getXpath();
     String hashKey = docId + "###" + pageNumber;
     try {
       Database elementDB = dbEnvExternalObjects.getElementDB();
@@ -158,8 +158,8 @@
         String foundValueStr = new String(foundValueBytes, "utf-8");
         ExtElement elem = ExtElement.parseXmlStr(foundValueStr);
         String elemUid = elem.getUid();
-        String elemXmlNodeId = elem.getXmlNodeId();
-        if (uid.equals(elemUid) && xmlNodeId.equals(elemXmlNodeId)) {
+        String elemXPath = elem.getXpath();
+        if (uid.equals(elemUid) && xpath.equals(elemXPath)) {
           cursor.delete();
         }
         operationStatus = cursor.getNextDup(dbEntryKey, foundValue, LockMode.DEFAULT);
@@ -204,15 +204,15 @@
   private void test(ExtElement element) throws ApplicationException {
     String uid = element.getUid();
     String docId = element.getDocumentId();
-    String xmlNodeId = element.getXmlNodeId();
+    String xpath = element.getXpath();
     String pageNumber = element.getPageNumber();
     String xmlStr = element.getXmlString();
     if (uid == null)
       throw new ApplicationException("External object: no attribute \"uid\" specified in: " + xmlStr);
     if (docId == null)
       throw new ApplicationException("External object: no attribute \"documentId\" specified in: " + xmlStr);
-    if (xmlNodeId == null)
-      throw new ApplicationException("External object: no attribute \"xmlNodeId\" specified in: " + xmlStr);
+    if (xpath == null)
+      throw new ApplicationException("External object: no xpath in attribute \"xpointer\" specified in: " + xmlStr);
     if (pageNumber == null)
       throw new ApplicationException("External object: no attribute \"pageNumber\" specified in: " + xmlStr);
   }
@@ -396,8 +396,7 @@
       "<object type=\"" + "element" + "\" " + 
               "uid=\"" + "joe" + "\" " + 
               "documentId=\"" + "/archimedes/it/l223.xml" + "\" " + 
-              "pageNumber=\"" + "17" + "\" " + 
-              "xmlNodeId=\"" + xmlNodeId1 + "\"" + 
+              "xpointer=\"" + "#xpointer(id('page17')" + xmlNodeId1 + "\"" + 
               ">" +
        "</object>";
     ExtElement e1 = ExtElement.parseXmlStr(objectXmlStr1);
@@ -406,8 +405,7 @@
       "<object type=\"" + "element" + "\" " + 
               "uid=\"" + "michael" + "\" " + 
               "documentId=\"" + "/archimedes/it/l223.xml" + "\" " + 
-              "pageNumber=\"" + "17" + "\" " + 
-              "xmlNodeId=\"" + xmlNodeId2 + "\"" + 
+              "xpointer=\"" + "#xpointer(id('page17')" + xmlNodeId2 + "\"" + 
               ">" +
        "</object>";
     ExtElement e2 = ExtElement.parseXmlStr(objectXmlStr2);
@@ -442,7 +440,8 @@
     e.setModificationDate(now);
     e.setDocumentId("/archimedes/it/l223.xml");
     e.setPageNumber("17");
-    e.setXmlNodeId(sId);
+    e.setXpath(sId);
+    e.setPoint(".1");
     e.setContent("<note>This is a test note to element " + sId + " with <ref target=\"http://slime.de\">this external link</ref>" + "</note>");
     createExternalElement(e);
     
@@ -452,8 +451,8 @@
     e2.setModificationDate(now);
     e2.setDocumentId("/archimedes/it/l223.xml");
     e2.setPageNumber("17");
-    e2.setXmlNodeId(sId2);
-    e2.setCharPos("18");
+    e2.setXpath(sId2);
+    e2.setPoint("18");
     e2.setContent("<note>This is a test note to element " + sId2 + "</note>");
     createExternalElement(e2);
     
@@ -482,8 +481,7 @@
       "<object type=\"" + "element" + "\" " + 
               "uid=\"" + "joe" + "\" " + 
               "documentId=\"" + "/archimedes/it/l223.xml" + "\" " + 
-              "pageNumber=\"" + "17" + "\" " + 
-              "xmlNodeId=\"" + xmlNodeId + "\"" + 
+              "xpointer=\"" + "#xpointer(id('page17')" + xmlNodeId + "\"" + 
               ">" +
           "<content>" + "<note>This is a test note to element " + xmlNodeId + " with <ref target=\"http://slime.de\">this external link</ref>" + "</note>" + "</content>" +
        "</object>";
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/DictionarizerContentHandler.java	Fri Mar 11 13:34:02 2011 +0100
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/DictionarizerContentHandler.java	Tue Apr 19 16:51:26 2011 +0200
@@ -45,8 +45,6 @@
     System.arraycopy(c, start, cCopy, 0, length);
     String charactersStr = String.valueOf(cCopy);
     if (charactersStr != null && ! charactersStr.equals("")) {
-      // cause there are problems during xsl transformations with ideographic characters without zwsp characters we put always a zwsp between ideographic characters
-      charactersStr = zwsp(charactersStr);
       if (currentElement != null) {
         Element charElement = new Element("characters", ELEMENT_TYPE_CHARACTERS);
         charElement.value = StringUtilEscapeChars.deresolveXmlEntities(charactersStr);
@@ -68,6 +66,8 @@
 
   public void startPrefixMapping(String prefix, String uri) throws SAXException {
     xmlnsString += "xmlns:" + prefix + "=\"" + uri + "\" ";
+    if (prefix != null && prefix.equals(""))  
+      xmlnsString = "xmlns" + prefix + "=\"" + uri + "\" ";
   }
   
   public void endPrefixMapping(String prefix) throws SAXException {
@@ -153,20 +153,6 @@
     outputXmlFragment += outStr;
   }
   
-  /**
-   * Puts a zwsp between two ideographic characters (e.g. in CJK Unified Ideographs)
-   * @param str
-   * @return
-   */
-  private String zwsp(String str) {
-    // based on Unicode 3.2
-    String ideographic = "[\u3300-\u33ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff]";
-    String regex = "(" + ideographic + ")(" + ideographic + ")";
-    String retStr = str.replaceAll(regex, "$1\u200b$2");
-    retStr = retStr.replaceAll(regex, "$1\u200b$2");
-    return retStr;
-  }
-  
   private class Element {
     private int type;
     private String name;
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/NormalizeCharsContentHandler.java	Fri Mar 11 13:34:02 2011 +0100
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/NormalizeCharsContentHandler.java	Tue Apr 19 16:51:26 2011 +0200
@@ -58,6 +58,8 @@
 
   public void startPrefixMapping(String prefix, String uri) throws SAXException {
     xmlnsString += "xmlns:" + prefix + "=\"" + uri + "\" ";
+    if (prefix != null && prefix.equals(""))  
+      xmlnsString = "xmlns" + prefix + "=\"" + uri + "\" ";
   }
   
   public void endPrefixMapping(String prefix) throws SAXException {
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Transcoder.java	Fri Mar 11 13:34:02 2011 +0100
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Transcoder.java	Tue Apr 19 16:51:26 2011 +0200
@@ -46,6 +46,11 @@
         throw new ApplicationException(e);
       }
     }
+    // replace "small letter sigma" at the end of a word by the "small letter end sigma"
+    if (retStr != null && retStr.contains("σ")) {
+      retStr = retStr.replaceAll("(.*)σ(\\s)", "$1ς$2");
+      retStr = retStr.replaceAll("(.*)σ($)", "$1ς$2");
+    }
     return retStr;
     /* 
     // alternative to JFlex 
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/converter/Converter.java	Fri Mar 11 13:34:02 2011 +0100
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/converter/Converter.java	Tue Apr 19 16:51:26 2011 +0200
@@ -110,12 +110,10 @@
     String outputFileNameDonatusFrenchSup = OUT_DATA_DIR + "/" + "donatus-sup-fr-forms.xml";
     instance.donatusSupplementsConvert("donatus-sup", "fr", inputFileNameDonatusFrenchSup, outputFileNameDonatusFrenchSup);
     instance.forms = new Hashtable<String, Hashtable<String, Form>>();
-    */
     // Italian
     String inputFileNameItalian = ORIG_ITALIAN_DATA_DIR + "/" + "ital.hash";
     String outputFileNameItalian = OUT_DATA_DIR + "/" + "donatus-italian-forms.xml";
     instance.donatusItalianConvert("donatus", "it", inputFileNameItalian, outputFileNameItalian);
-    /*
     String inputFileNameDonatusItalianSup = ORIG_DONATUS_SUB_DATA_DIR + "/" + "donatus-sup-it-forms.csv";
     String outputFileNameDonatusItalianSup = OUT_DATA_DIR + "/" + "donatus-sup-it-forms.xml";
     instance.donatusSupplementsConvert("donatus-sup", "it", inputFileNameDonatusItalianSup, outputFileNameDonatusItalianSup);
@@ -461,6 +459,15 @@
     Transcoder transcoder = Transcoder.getInstance();
     String encodedUnicodeForm = transcoder.transcodeFromBetaCode2Unicode(formName);
     String encodedUnicodeLemma = transcoder.transcodeFromBetaCode2Unicode(lemmaName);
+    // replace "small letter sigma" at the end of a word by the "small letter end sigma"
+    if (encodedUnicodeForm != null && encodedUnicodeForm.endsWith("σ")) {
+      int length = encodedUnicodeForm.length();
+      encodedUnicodeForm = encodedUnicodeForm.substring(0, length - 1) + "ς";
+    }
+    if (encodedUnicodeLemma != null && encodedUnicodeLemma.endsWith("σ")) {
+      int length = encodedUnicodeLemma.length();
+      encodedUnicodeLemma = encodedUnicodeLemma.substring(0, length - 1) + "ς";
+    }
     form.setFormName(encodedUnicodeForm);
     form.setLemmaName(encodedUnicodeLemma);
     return form;
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/converter/PerseusContentHandler.java	Fri Mar 11 13:34:02 2011 +0100
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/converter/PerseusContentHandler.java	Tue Apr 19 16:51:26 2011 +0200
@@ -188,6 +188,15 @@
     Transcoder transcoder = Transcoder.getInstance();
     String encodedUnicodeForm = transcoder.transcodeFromBetaCode2Unicode(formName);
     String encodedUnicodeLemma = transcoder.transcodeFromBetaCode2Unicode(lemmaName);
+    // replace "small letter sigma" at the end of a word by the "small letter end sigma"
+    if (encodedUnicodeForm != null && encodedUnicodeForm.endsWith("σ")) {
+      int length = encodedUnicodeForm.length();
+      encodedUnicodeForm = encodedUnicodeForm.substring(0, length - 1) + "ς";
+    }
+    if (encodedUnicodeLemma != null && encodedUnicodeLemma.endsWith("σ")) {
+      int length = encodedUnicodeLemma.length();
+      encodedUnicodeLemma = encodedUnicodeLemma.substring(0, length - 1) + "ς";
+    }
     form.setFormName(encodedUnicodeForm);
     form.setLemmaName(encodedUnicodeLemma);
     return form;
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/util/StringUtilEscapeChars.java	Fri Mar 11 13:34:02 2011 +0100
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/util/StringUtilEscapeChars.java	Tue Apr 19 16:51:26 2011 +0200
@@ -8,6 +8,22 @@
 import java.util.regex.Pattern;
 
 public class StringUtilEscapeChars {
+
+  /**
+   * Puts a zero-width space (zwsp, U+200B) between every two adjacent ideographic characters (e.g. in CJK Unified Ideographs)
+   * @param str the text to process; must not be null, since replaceAll is called on it directly
+   * @return a copy of str with a zero-width space inserted between each pair of adjacent ideographic characters
+   */
+  public static String zwsp(String str) {
+    // character ranges based on Unicode 3.2; the replacement runs twice because regex matches do not overlap, so a single pass only separates every other pair
+    String ideographic = "[\u3300-\u33ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff]";
+    String regex = "(" + ideographic + ")(" + ideographic + ")";
+    String retStr = str.replaceAll(regex, "$1\u200b$2");
+    retStr = retStr.replaceAll(regex, "$1\u200b$2");
+    return retStr;
+  }
+  
+  
   public static String deleteSpecialXmlEntities(String inputStr) {
     inputStr = inputStr.replaceAll("&lt;", "");
     inputStr = inputStr.replaceAll("&gt;", "");