diff software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/xml/SchemaHandler.java @ 0:408254cf2f1d

Erstellung
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Wed, 24 Nov 2010 17:24:23 +0100
parents
children 59ff47d1e237
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/xml/SchemaHandler.java	Wed Nov 24 17:24:23 2010 +0100
@@ -0,0 +1,368 @@
+package de.mpg.mpiwg.berlin.mpdl.xml;
+
+import java.io.File;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Date;
+import java.util.Iterator;
+
+import javax.xml.namespace.NamespaceContext;
+
+import org.w3c.dom.Node;
+
+import de.mpg.mpiwg.berlin.mpdl.escidoc.MetadataRecord;
+import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
+import de.mpg.mpiwg.berlin.mpdl.general.MpdlConstants;
+import de.mpg.mpiwg.berlin.mpdl.schedule.MpdlDocOperation;
+import de.mpg.mpiwg.berlin.mpdl.util.StringUtilEscapeChars;
+import de.mpg.mpiwg.berlin.mpdl.util.Util;
+import de.mpg.mpiwg.berlin.mpdl.util.XmlUtil;
+
+public class SchemaHandler {
+
+  /**
+   * Validates a document file and extracts its metadata.
+   * <p>
+   * The checks run in this order: file name of the operation, doc base of the operation,
+   * RelaxNG schema validation, XML parsing, consistency between document type and doc base,
+   * and finally the mandatory metadata fields. On success the extracted metadata record is
+   * stored in the given operation via {@code setMdRecord}.
+   * 
+   * @param fileName path of the local file which is validated and parsed
+   * @param docOperation operation which delivers the doc base, the file name and the destination url
+   * @return doc root node of xml file
+   * @throws ApplicationException if one of the validations fails
+   */
+  public Node validate(String fileName, MpdlDocOperation docOperation) throws ApplicationException {
+    File destFile = new File(fileName);
+    String docBase = docOperation.getDocBase();
+    // file name validation
+    String fName = docOperation.getFileName();
+    if (fName == null || fName.trim().equals(""))
+      throw new ApplicationException("Your document file name is empty. Please specify a file name for your document.");
+    if (! fName.endsWith(".xml"))
+      throw new ApplicationException("Your document file name does not end with \".xml\". Please specify a file name with the suffix \".xml\" for your document.");
+    // doc base validation: the following steps dereference the doc base, so report a missing
+    // one as a readable error instead of running into a NullPointerException
+    if (docBase == null || docBase.trim().equals(""))
+      throw new ApplicationException("Your destination document base is empty. Please specify \"echo\" or \"archimedes\" in your destination document base.");
+    // RelaxNG schema validation
+    validateByRelaxNGSchema(destFile, docBase);
+    // parse validation
+    Node docNode = parse(destFile);
+    // first simple validations
+    validate(docNode, docBase);
+    // validate metadata
+    MetadataRecord mdRecord = getMetadataRecord(docNode, docOperation);
+    // getMetadataRecord delivers null for every doc base other than "echo" and "archimedes"
+    if (mdRecord == null)
+      throw new ApplicationException("Your destination document base \"" + docBase + "\" is not supported. Please specify \"echo\" or \"archimedes\" in your destination document base.");
+    validate(mdRecord);
+    docOperation.setMdRecord(mdRecord);
+    return docNode;
+  }
+  
+  /**
+   * Extracts the metadata record of a document according to the doc base of the operation.
+   * <p>
+   * For "echo" documents an identifier found in the document is kept and only a missing one is
+   * derived from the destination url; for "archimedes" documents the identifier is always derived
+   * from the destination url. Every delivered record gets the destination url as eXist identifier
+   * and the media type "fulltext".
+   * 
+   * @param documentNode root node of the parsed document
+   * @param docOperation operation which delivers the destination url and the doc base
+   * @return the metadata record, or null if the doc base is neither "echo" nor "archimedes"
+   * @throws ApplicationException if the metadata could not be read
+   */
+  public MetadataRecord getMetadataRecord(Node documentNode, MpdlDocOperation docOperation) throws ApplicationException {
+    String existId = docOperation.getDestUrl();
+    String docBase = docOperation.getDocBase();
+    MetadataRecord record = null;
+    if ("echo".equals(docBase)) {
+      record = getMetadataRecordEcho(documentNode);
+      // an identifier delivered by the document itself wins
+      if (record != null && record.getIdentifier() == null)
+        record.setIdentifier("ECHO:" + getIdByExistId(existId) + ".xml");
+    } else if ("archimedes".equals(docBase)) {
+      record = getMetadataRecordArchimedes(documentNode);
+      if (record != null)
+        record.setIdentifier("ARCHIMEDES:" + getIdByExistId(existId) + ".xml");
+    }
+    if (record == null)
+      return null;
+    record.setEXistIdentifier(existId);
+    record.setMediaType("fulltext");
+    return record;
+  }
+  
+  /**
+   * Delivers one name for each page break of the document.
+   * <p>
+   * For "echo" documents these are the values of the "file" attributes of all "echo:pb" elements.
+   * For "archimedes" documents the names are the running numbers "1" to "n" of the "pb" elements.
+   * 
+   * @param documentNode root node of the parsed document
+   * @param docBase doc base of the document, may be null
+   * @return the page break names; null if the doc base is neither "echo" nor "archimedes" or
+   *         if the "archimedes" evaluation delivers null
+   * @throws ApplicationException if the XPath evaluation fails
+   */
+  public ArrayList<String> getPBFileNames(Node documentNode, String docBase) throws ApplicationException {
+    if (docBase == null)
+      return null;
+    if (docBase.equals("echo")) {
+      XmlUtil xmlUtil = XmlUtil.getInstance();
+      NamespaceContext echoNs = getEchoNsContext();
+      return xmlUtil.evaluateToStringArray(documentNode, "//echo:pb/@file", echoNs);
+    }
+    if (docBase.equals("archimedes")) {
+      XmlUtil xmlUtil = XmlUtil.getInstance();
+      ArrayList<String> pageBreaks = xmlUtil.evaluateToStringArray(documentNode, "//pb", null);
+      if (pageBreaks == null)
+        return null;
+      ArrayList<String> numberedNames = new ArrayList<String>();
+      int pageBreakCount = pageBreaks.size();
+      int pageNumber = 1;
+      while (pageNumber <= pageBreakCount) {
+        // the running number serves as name of the page break
+        numberedNames.add(String.valueOf(pageNumber));
+        pageNumber++;
+      }
+      return numberedNames;
+    }
+    return null;
+  }
+  
+  /**
+   * Determines the directory which contains the page images of a document.
+   * <p>
+   * The document directory is "/permanent/archimedes/" + id for records with archimedes doc base
+   * and "/permanent/library/" + id otherwise, where the id is derived from the dublin core
+   * identifier of the record. An echo directory set in the record replaces that directory. The
+   * sub directory is looked up in the index.meta of the document directory; "pageimg" is used
+   * if that lookup delivers nothing.
+   * 
+   * @param mdRecord metadata record of the document
+   * @return document directory + "/" + page image sub directory
+   * @throws ApplicationException declared by the index.meta lookup
+   */
+  public String getPageImgDir(MetadataRecord mdRecord) throws ApplicationException {
+    // dublin core identifier: is used to find the digilib image directory for this document
+    String id = getIdByDCIdentifier(mdRecord.getIdentifier());
+    String rootDir = mdRecord.hasArchimedesDocBase() ? "/permanent/archimedes/" : "/permanent/library/";
+    String docDir = rootDir + id;
+    String echoDir = mdRecord.getEchoDir();
+    if (echoDir != null)
+      docDir = echoDir;
+    String subDir = getIndexMetaDataPageImg(docDir);
+    if (subDir == null)
+      subDir = "pageimg"; // default name: if digilib does not answer then this name is used
+    return docDir + "/" + subDir;
+  }
+  
+  /**
+   * Parses the given file into a document node.
+   * 
+   * @param file the xml file
+   * @return the node delivered by {@code XmlUtil.doc}
+   * @throws ApplicationException with a user oriented message if the file could not be read as xml
+   */
+  private Node parse(File file) throws ApplicationException {
+    XmlUtil xmlUtil = XmlUtil.getInstance();
+    try {
+      return xmlUtil.doc(file);
+    } catch (ApplicationException parseError) {
+      // NOTE(review): only the message of the original exception is kept, it is not chained as cause
+      throw new ApplicationException("Your source file is not valid: " + parseError.getMessage());
+    }
+  }
+  
+  /**
+   * Checks that the document type fits to the destination doc base.
+   * <p>
+   * An "archimedes" document (it has "/archimedes/info") is rejected for the doc base "echo" and
+   * an "echo" document (it has "/echo:echo/echo:metadata") is rejected for the doc base
+   * "archimedes". The doc base comparison is null safe, consistent with the public methods of
+   * this class which accept a null doc base; a null doc base passes this check.
+   * 
+   * @param docNode root node of the parsed document
+   * @param docBase destination doc base, may be null
+   * @throws ApplicationException if the XPath evaluation fails or the document type does not fit
+   */
+  private void validate(Node docNode, String docBase) throws ApplicationException {
+    XmlUtil xmlUtil = XmlUtil.getInstance();
+    NamespaceContext nsContext = getEchoNsContext();
+    String echoTest = null;
+    String archimedesTest = null;
+    try {
+      echoTest = xmlUtil.evaluateToString(docNode, "/echo:echo/echo:metadata", nsContext);
+      archimedesTest = xmlUtil.evaluateToString(docNode, "/archimedes/info", null);
+    } catch (ApplicationException e) {
+      throw new ApplicationException("Your source file is not an \"echo\" or \"archimedes\" file. Please proof that file.");
+    }
+    // constant first: no NullPointerException if the doc base is missing
+    if ("echo".equals(docBase) && archimedesTest != null)
+      throw new ApplicationException("Your source file is an \"archimedes\" file. " + "Please specify \"archimedes\" in your destination document base.");
+    if ("archimedes".equals(docBase) && echoTest != null)
+      throw new ApplicationException("Your source file is an \"echo\" file. " + "Please specify \"echo\" in your destination document base.");
+  }
+  
+  /**
+   * Validates the file against the echo RelaxNG schema if the doc base is "echo".
+   * For every other doc base (including null) no schema validation is done.
+   * 
+   * @param destFile the xml file which is validated
+   * @param docBase destination doc base, may be null
+   * @throws ApplicationException if the schema url could not be built or the validation fails
+   */
+  private void validateByRelaxNGSchema(File destFile, String docBase) throws ApplicationException {
+    XmlUtil xmlUtil = XmlUtil.getInstance();
+    // constant first: no NullPointerException if the doc base is missing
+    if ("echo".equals(docBase)) {
+      URL echoSchemaUrl = getEchoRelaxNGSchemaUrl();
+      xmlUtil.validateByRelaxNG(destFile, echoSchemaUrl);
+    }
+  }
+  
+  /**
+   * Builds the http url of the echo RelaxNG schema from the host name, port and schema path
+   * constants of {@code MpdlConstants}.
+   * 
+   * @return url of the echo RelaxNG schema
+   * @throws ApplicationException wraps the MalformedURLException if the url could not be built
+   */
+  private URL getEchoRelaxNGSchemaUrl() throws ApplicationException {
+    String hostAndPort = MpdlConstants.MPDL_EXIST_HOST_NAME + ":" + MpdlConstants.MPDL_EXIST_PORT;
+    String schemaUrlStr = "http://" + hostAndPort + MpdlConstants.MPDL_ECHO_RELAXNG_PATH;
+    try {
+      return new URL(schemaUrlStr);
+    } catch (MalformedURLException malformed) {
+      throw new ApplicationException(malformed);
+    }
+  }
+
+  private void validate(MetadataRecord mdRecord) throws ApplicationException {
+    String identifier = mdRecord.getIdentifier();
+    String creator = mdRecord.getCreator();
+    String title = mdRecord.getTitle();
+    if (identifier == null || identifier.trim().equals(""))
+      throw new ApplicationException("Your document file does not contain the metadata field: " + "identifier");
+    if (creator == null || creator.trim().equals(""))
+      throw new ApplicationException("Your document file does not contain the metadata field: " + "creator");
+    if (title == null || title.trim().equals(""))
+      throw new ApplicationException("Your document file does not contain the metadata field: " + "title");
+  }
+  
+  /**
+   * Reads the metadata of an "echo" document from the children of "/echo:echo/echo:metadata".
+   * <p>
+   * All text fields are passed through {@code StringUtilEscapeChars.deresolveXmlEntities}. The
+   * "dcterms:date" value is converted to a year by {@code Util.toYearStr}; the record date is the
+   * first of January of that year, or null if no year could be determined.
+   * 
+   * @param documentNode root node of the parsed document
+   * @return a new metadata record with doc base "echo" and mime type "text/xml"
+   * @throws ApplicationException if an XPath evaluation or the date conversion fails
+   */
+  private MetadataRecord getMetadataRecordEcho(Node documentNode) throws ApplicationException {
+    XmlUtil xmlUtil = XmlUtil.getInstance();
+    NamespaceContext nsContext = getEchoNsContext();
+    String identifier = evalEchoMetadataField(xmlUtil, documentNode, "dcterms:identifier", nsContext);
+    String creator = evalEchoMetadataField(xmlUtil, documentNode, "dcterms:creator", nsContext);
+    String title = evalEchoMetadataField(xmlUtil, documentNode, "dcterms:title", nsContext);
+    String language = evalEchoMetadataField(xmlUtil, documentNode, "dcterms:language", nsContext);
+    // the date is not read by the helper: the emptiness test has to happen before the entities are deresolved
+    String yearStr = xmlUtil.evaluateToString(documentNode, "/echo:echo/echo:metadata/dcterms:date", nsContext);
+    Date date = null; 
+    if (yearStr != null && ! yearStr.equals("")) {
+      yearStr = StringUtilEscapeChars.deresolveXmlEntities(yearStr);
+      yearStr = new Util().toYearStr(yearStr);  // test if possible etc
+      if (yearStr != null)
+        date = xmlUtil.toDate(yearStr + "-01-01T00:00:00.000Z");
+    }
+    String rights = evalEchoMetadataField(xmlUtil, documentNode, "dcterms:rights", nsContext);
+    String license = evalEchoMetadataField(xmlUtil, documentNode, "dcterms:license", nsContext);
+    String accessRights = evalEchoMetadataField(xmlUtil, documentNode, "dcterms:accessRights", nsContext);
+    String echoDir = evalEchoMetadataField(xmlUtil, documentNode, "echo:echodir", nsContext);
+    String echoLink = evalEchoMetadataField(xmlUtil, documentNode, "echo:echolink", nsContext);
+    MetadataRecord mdRecord = new MetadataRecord(identifier, language, creator, title, null, null, "text/xml", rights, date);
+    mdRecord.setDocBase("echo");
+    mdRecord.setLicense(license);
+    mdRecord.setAccessRights(accessRights);
+    mdRecord.setEchoLink(echoLink);
+    mdRecord.setEchoDir(echoDir);
+    return mdRecord;
+  }
+
+  /**
+   * Evaluates one child element of "/echo:echo/echo:metadata" and deresolves the xml entities of a non null result.
+   * 
+   * @param xmlUtil the xml utility which evaluates the XPath
+   * @param documentNode root node of the parsed document
+   * @param elementName prefixed name of the metadata element, e.g. "dcterms:title"
+   * @param nsContext namespace context which resolves the prefixes
+   * @return the deresolved value, or null if the evaluation delivers null
+   * @throws ApplicationException if the XPath evaluation fails
+   */
+  private String evalEchoMetadataField(XmlUtil xmlUtil, Node documentNode, String elementName, NamespaceContext nsContext) throws ApplicationException {
+    String value = xmlUtil.evaluateToString(documentNode, "/echo:echo/echo:metadata/" + elementName, nsContext);
+    if (value != null)
+      value = StringUtilEscapeChars.deresolveXmlEntities(value);
+    return value;
+  }
+
+  /**
+   * Reads the metadata of an "archimedes" document from the children of "/archimedes/info".
+   * <p>
+   * All text fields are passed through {@code StringUtilEscapeChars.deresolveXmlEntities}. The
+   * "date" value is converted to a year by {@code Util.toYearStr}; the record date is the first
+   * of January of that year, or null if no year could be determined. Rights, license and access
+   * rights are not read from the document but set to fixed values.
+   * 
+   * @param documentNode root node of the parsed document
+   * @return a new metadata record with doc base "archimedes" and mime type "text/xml"
+   * @throws ApplicationException if an XPath evaluation or the date conversion fails
+   */
+  private MetadataRecord getMetadataRecordArchimedes(Node documentNode) throws ApplicationException {
+    XmlUtil xmlUtil = XmlUtil.getInstance();
+    String identifier = evalArchimedesInfoField(xmlUtil, documentNode, "cvs_file");
+    String creator = evalArchimedesInfoField(xmlUtil, documentNode, "author");
+    String title = evalArchimedesInfoField(xmlUtil, documentNode, "title");
+    String language = evalArchimedesInfoField(xmlUtil, documentNode, "lang");
+    // the date is not read by the helper: the emptiness test has to happen before the entities are deresolved
+    String yearStr = xmlUtil.evaluateToString(documentNode, "/archimedes/info/date", null);
+    Date date = null; 
+    if (yearStr != null && ! yearStr.equals("")) {
+      yearStr = StringUtilEscapeChars.deresolveXmlEntities(yearStr);
+      yearStr = new Util().toYearStr(yearStr);  // test if possible etc
+      if (yearStr != null)
+        date = xmlUtil.toDate(yearStr + "-01-01T00:00:00.000Z");
+    }
+    // fixed values for all archimedes documents
+    String rights = "open access";
+    String license = "http://echo.mpiwg-berlin.mpg.de/policy/oa_basics/declaration";
+    String accessRights = "free";
+    MetadataRecord mdRecord = new MetadataRecord(identifier, language, creator, title, null, null, "text/xml", rights, date);
+    mdRecord.setDocBase("archimedes");
+    mdRecord.setLicense(license);
+    mdRecord.setAccessRights(accessRights);
+    return mdRecord;
+  }
+
+  /**
+   * Evaluates one child element of "/archimedes/info" and deresolves the xml entities of a non null result.
+   * 
+   * @param xmlUtil the xml utility which evaluates the XPath
+   * @param documentNode root node of the parsed document
+   * @param elementName name of the info element, e.g. "title"
+   * @return the deresolved value, or null if the evaluation delivers null
+   * @throws ApplicationException if the XPath evaluation fails
+   */
+  private String evalArchimedesInfoField(XmlUtil xmlUtil, Node documentNode, String elementName) throws ApplicationException {
+    String value = xmlUtil.evaluateToString(documentNode, "/archimedes/info/" + elementName, null);
+    if (value != null)
+      value = StringUtilEscapeChars.deresolveXmlEntities(value);
+    return value;
+  }
+
+  /**
+   * Asks the Texter servlet for the index.meta of the given document directory and reads the
+   * value of "/resource/meta/texttool/image" from it.
+   * <p>
+   * This is a best effort lookup: every exception which occurs while the index.meta is fetched
+   * or evaluated is ignored on purpose and leads to the result null.
+   * 
+   * @param imagesDocDirectory document directory, it is appended unencoded as "fn" parameter
+   * @return the value found in the index.meta, or null if nothing was found or the lookup failed
+   * @throws ApplicationException declared only; failures of the lookup are swallowed
+   */
+  private String getIndexMetaDataPageImg(String imagesDocDirectory) throws ApplicationException {
+    String texterServletUrl = "http://nausikaa2.mpiwg-berlin.mpg.de/digitallibrary/servlet/Texter";
+    XmlUtil xmlUtil = XmlUtil.getInstance();
+    try {
+      String indexMetaUrl = texterServletUrl + "?fn=" + imagesDocDirectory + "/index.meta";
+      Node indexMetaNode = xmlUtil.doc(indexMetaUrl);
+      return xmlUtil.evaluateToString(indexMetaNode, "/resource/meta/texttool/image", null);
+    } catch (Exception ignored) {
+      // return null if digilib does not work
+      return null;
+    }
+  }
+  
+  /**
+   * Derives the document id from an eXist identifier.
+   * <p>
+   * The id is the part between the second "/" (the search for the first one starts at index 2)
+   * and the following ".xml", e.g. "/echo/la/doc.xml" delivers "doc". If one of these delimiters
+   * is not found the identifier is delivered unchanged.
+   * 
+   * @param eXistIdentifier the eXist identifier, may be null
+   * @return the id, the unchanged identifier, or null for a null identifier
+   */
+  private String getIdByExistId(String eXistIdentifier) {
+    if (eXistIdentifier == null)
+      return null;
+    int firstSlash = eXistIdentifier.indexOf("/", 2);
+    if (firstSlash == -1)
+      return eXistIdentifier;
+    int secondSlash = eXistIdentifier.indexOf("/", firstSlash + 1);
+    if (secondSlash == -1)
+      return eXistIdentifier;
+    int idStart = secondSlash + 1;
+    int suffixStart = eXistIdentifier.indexOf(".xml", idStart);
+    if (suffixStart == -1)
+      return eXistIdentifier;
+    return eXistIdentifier.substring(idStart, suffixStart);
+  }
+  
+  private String getIdByDCIdentifier(String dcIdentifier) {
+    if (dcIdentifier == null || dcIdentifier.trim().equals(""))
+      return null;
+    // if dcIdentifier starts with "ECHO:" or "ARCHIMEDES:" then delete it
+    if (dcIdentifier.startsWith("ECHO:"))
+      dcIdentifier = dcIdentifier.substring(5);
+    if (dcIdentifier.startsWith("ARCHIMEDES:"))
+      dcIdentifier = dcIdentifier.substring(11);
+    // delete the .xml suffix if there is one
+    if (dcIdentifier.endsWith(".xml")) {
+      int size = dcIdentifier.length();
+      dcIdentifier = dcIdentifier.substring(0, size - 4);
+    }
+    return dcIdentifier;
+  }
+  
+  public NamespaceContext getEchoNsContext() {
+    NamespaceContext nsContext = new NamespaceContext() {
+      public String getNamespaceURI(String prefix) {
+        String uri;
+        if (prefix.equals("de"))
+          uri = "http://www.mpiwg-berlin.mpg.de/ns/de/1.0/";
+        else if (prefix.equals("echo"))
+          uri = "http://www.mpiwg-berlin.mpg.de/ns/echo/1.0/";
+        else if (prefix.equals("dc"))
+          uri = "http://purl.org/dc/elements/1.1/";
+        else if (prefix.equals("dcterms"))
+          uri = "http://purl.org/dc/terms";
+        else if (prefix.equals("dcq"))
+          uri = "http://purl.org/dc/qualifiers/1.0/";
+        else if (prefix.equals("xhtml"))
+          uri = "http://www.w3.org/1999/xhtml";
+        else if (prefix.equals("dct"))
+          uri = "http://purl.org/dc/terms/1.0/";
+        else if (prefix.equals("xlink"))
+          uri = "http://www.w3.org/1999/xlink";
+        else if (prefix.equals("rdf"))
+          uri = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
+        else if (prefix.equals("xsi"))
+          uri = "http://www.w3.org/2001/XMLSchema-instance";
+        else if (prefix.equals("mml"))
+          uri = "http://www.w3.org/1998/Math/MathML";
+        else
+          uri = null;
+        return uri;
+      }
+      
+      public String getPrefix(String uri) {
+        if (uri.equals("http://www.mpiwg-berlin.mpg.de/ns/de/1.0/"))
+          return "de";
+        else if (uri.equals("http://www.mpiwg-berlin.mpg.de/ns/echo/1.0/"))
+          return "echo";
+        else if (uri.equals("http://purl.org/dc/elements/1.1/"))
+          return "dc";
+        else if (uri.equals("http://purl.org/dc/terms"))
+          return "dcterms";
+        else if (uri.equals("http://purl.org/dc/qualifiers/1.0/"))
+          return "dcq";
+        else if (uri.equals("http://www.w3.org/1999/xhtml"))
+          return "xhtml";
+        else if (uri.equals("http://purl.org/dc/terms/1.0/"))
+          return "dct";
+        else if (uri.equals("http://www.w3.org/1999/xlink"))
+          return "xlink";
+        else if (uri.equals("http://www.w3.org/1999/02/22-rdf-syntax-ns#"))
+          return "rdf";
+        else if (uri.equals("http://www.w3.org/2001/XMLSchema-instance"))
+          return "xsi";
+        else if (uri.equals("http://www.w3.org/1998/Math/MathML"))
+          return "mml";
+        else
+          return null;
+      }
+
+      public Iterator getPrefixes(String namespace) {
+        return null;
+      }
+    };
+    return nsContext;    
+  }
+
+  
+}