view software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/xml/SchemaHandler.java @ 16:257f67be5c00

diverse Fehlerbehebungen
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 27 Sep 2011 16:40:57 +0200
parents 59ff47d1e237
children
line wrap: on
line source

package de.mpg.mpiwg.berlin.mpdl.xml;

import java.io.File;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;

import javax.xml.namespace.NamespaceContext;

import org.w3c.dom.Node;

import de.mpg.mpiwg.berlin.mpdl.escidoc.MetadataRecord;
import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
import de.mpg.mpiwg.berlin.mpdl.general.MpdlConstants;
import de.mpg.mpiwg.berlin.mpdl.schedule.MpdlDocOperation;
import de.mpg.mpiwg.berlin.mpdl.util.StringUtilEscapeChars;
import de.mpg.mpiwg.berlin.mpdl.util.Util;
import de.mpg.mpiwg.berlin.mpdl.util.XmlUtil;

public class SchemaHandler {

  /**
   * Validates a local XML document file and extracts its metadata record.
   *
   * The steps run in this order: destination file name check, RelaxNG schema
   * validation, XML parsing, document type plausibility check, metadata
   * extraction and metadata check. When every step succeeds the metadata
   * record is stored on {@code docOperation}.
   *
   * @param fileName local file name to validate
   * @param docOperation operation that supplies the document base, the destination file name and the destination URL
   * @return doc root node of xml file
   * @throws ApplicationException if the file name is empty or has the wrong suffix, or if one of the validation steps fails
   */
  public Node validate(String fileName, MpdlDocOperation docOperation) throws ApplicationException {
    final File sourceFile = new File(fileName);
    final String docBase = docOperation.getDocBase();

    // file name validation: the destination name must not be blank
    final String targetName = docOperation.getFileName();
    final boolean nameMissing = targetName == null || targetName.trim().equals("");
    if (nameMissing) {
      throw new ApplicationException("Your document file name is empty. Please specify a file name for your document.");
    }
    // the ".xml" suffix is only enforced when a document base other than "diverse" is given
    final boolean suffixRequired = docBase != null && ! docBase.equals("diverse");
    if (suffixRequired && ! targetName.endsWith(".xml")) {
      throw new ApplicationException("Your document file name does not end with \".xml\". Please specify a file name with the suffix \".xml\" for your document.");
    }

    // RelaxNG schema validation
    validateByRelaxNGSchema(sourceFile, docBase);
    // parse validation
    final Node rootNode = parse(sourceFile);
    // first simple validations
    validate(rootNode, docBase);
    // validate metadata
    final MetadataRecord record = getMetadataRecord(rootNode, docOperation);
    validate(record);
    docOperation.setMdRecord(record);
    return rootNode;
  }
  
  /**
   * Builds the metadata record of a document according to the document base
   * of the given operation ("echo", "archimedes", "tei" or "diverse").
   *
   * The record identifier is derived from the destination URL of the
   * operation, prefixed with "ECHO:", "ARCHIMEDES:" or "TEI:" and suffixed
   * with ".xml" (no prefix and no suffix for "diverse"). An "echo" record
   * keeps the identifier found in the document if there is one. For
   * "diverse" the language is taken from the operation.
   *
   * @param documentNode root node of the parsed document
   * @param docOperation operation that supplies document base, destination URL and language
   * @return the metadata record, or {@code null} if the document base is null or not one of the four known values
   * @throws ApplicationException if reading the metadata from the document fails
   */
  public MetadataRecord getMetadataRecord(Node documentNode, MpdlDocOperation docOperation) throws ApplicationException {
    final String existId = docOperation.getDestUrl();
    final String docBase = docOperation.getDocBase();
    MetadataRecord record = null;

    if ("echo".equals(docBase)) {
      record = getMetadataRecordEcho(documentNode);
      // only derive an identifier when the document itself does not carry one
      if (record != null && record.getIdentifier() == null) {
        record.setIdentifier("ECHO:" + getIdByExistId(existId) + ".xml");
      }
    } else if ("archimedes".equals(docBase)) {
      record = getMetadataRecordArchimedes(documentNode);
      if (record != null) {
        record.setIdentifier("ARCHIMEDES:" + getIdByExistId(existId) + ".xml");
      }
    } else if ("tei".equals(docBase)) {
      record = getMetadataRecordTEI(documentNode);
      if (record != null) {
        record.setIdentifier("TEI:" + getIdByExistId(existId) + ".xml");
      }
    } else if ("diverse".equals(docBase)) {
      record = getMetadataRecordDiverse(documentNode);
      if (record != null) {
        record.setIdentifier(getIdByExistId(existId));
        record.setLanguage(docOperation.getLanguage());
      }
    }

    if (record == null) {
      return null;
    }
    record.setEXistIdentifier(existId);
    record.setMediaType("fulltext");
    return record;
  }
  
  /**
   * Delivers one name per page break of the document.
   *
   * For "echo" documents these are the values selected by the XPath
   * {@code //echo:pb/@file}. For "archimedes" documents the names are the
   * running numbers "1", "2", ... with one entry per result of {@code //pb}.
   *
   * @param documentNode root node of the parsed document
   * @param docBase document base of the document
   * @return the page break names; {@code null} for any other document base, or when the "archimedes" XPath evaluation yields null
   * @throws ApplicationException if the XPath evaluation fails
   */
  public ArrayList<String> getPBFileNames(Node documentNode, String docBase) throws ApplicationException {
    if ("echo".equals(docBase)) {
      return XmlUtil.getInstance().evaluateToStringArray(documentNode, "//echo:pb/@file", getEchoNsContext());
    }
    if ("archimedes".equals(docBase)) {
      final ArrayList<String> pageBreaks = XmlUtil.getInstance().evaluateToStringArray(documentNode, "//pb", null);
      if (pageBreaks == null) {
        return null;
      }
      // the page breaks carry no file name here: use the 1-based position as name
      final ArrayList<String> numberedNames = new ArrayList<String>();
      final int pageBreakCount = pageBreaks.size();
      for (int pageNumber = 1; pageNumber <= pageBreakCount; pageNumber++) {
        numberedNames.add(String.valueOf(pageNumber));
      }
      return numberedNames;
    }
    return null;
  }
  
  /**
   * Determines the directory of the page images of a document.
   *
   * The document directory is "/permanent/archimedes/" + id for records with
   * archimedes document base and "/permanent/library/" + id otherwise, where
   * id is derived from the record identifier; an echo directory set on the
   * record replaces it. The sub directory is the one found via the index.meta
   * lookup for that directory, or "pageimg" if the lookup delivers nothing.
   *
   * @param mdRecord metadata record of the document
   * @return document directory + "/" + page image sub directory
   * @throws ApplicationException declared by the index.meta lookup
   */
  public String getPageImgDir(MetadataRecord mdRecord) throws ApplicationException {
    // dublin core identifier: is used to find the digilib image directory for this document
    final String docId = getIdByDCIdentifier(mdRecord.getIdentifier());
    String docDirectory = mdRecord.hasArchimedesDocBase()
        ? "/permanent/archimedes/" + docId
        : "/permanent/library/" + docId;
    final String echoDirectory = mdRecord.getEchoDir();
    if (echoDirectory != null) {
      docDirectory = echoDirectory;
    }
    // default name "pageimg": used if digilib does not answer
    final String indexMetaSubDir = getIndexMetaDataPageImg(docDirectory);
    final String subDirectory = (indexMetaSubDir != null) ? indexMetaSubDir : "pageimg";
    return docDirectory + "/" + subDirectory;
  }
  
  /**
   * Parses the given file into a document node.
   *
   * @param file local XML file
   * @return the node delivered by {@code XmlUtil.doc(File)}
   * @throws ApplicationException with a user oriented message (containing the original message) if parsing fails
   */
  private Node parse(File file) throws ApplicationException {
    final XmlUtil xmlUtil = XmlUtil.getInstance();
    try {
      return xmlUtil.doc(file);
    } catch (ApplicationException parseProblem) {
      throw new ApplicationException("Your source file is not valid: " + parseProblem.getMessage());
    }
  }
  
  /**
   * Plausibility check: rejects a document whose content looks like it belongs
   * to a different document base than the requested one.
   *
   * Three XPath probes are evaluated (echo metadata, archimedes info, TEI
   * header); a probe result other than null is taken as "the document is of
   * that type". Nothing is checked for a null or "diverse" document base.
   *
   * NOTE(review): the TEI probe "/TEI/teiHeader" is evaluated without a
   * namespace context whereas getMetadataRecordTEI reads the same elements in
   * the TEI namespace; whether the probe can match namespaced TEI documents
   * depends on XmlUtil - confirm.
   *
   * @param docNode root node of the parsed document
   * @param docBase requested destination document base, may be null
   * @throws ApplicationException if the document type contradicts the document base or a probe cannot be evaluated
   */
  private void validate(Node docNode, String docBase) throws ApplicationException {
    // the public entry point tolerates a missing document base, so do not fail with a NullPointerException here
    if (docBase == null || docBase.equals("diverse"))
      return;
    XmlUtil xmlUtil = XmlUtil.getInstance();
    NamespaceContext nsContext = getEchoNsContext();
    String echoTest = null;
    String archimedesTest = null;
    String teiTest = null;
    try {
      echoTest = xmlUtil.evaluateToString(docNode, "/echo:echo/echo:metadata", nsContext);
      archimedesTest = xmlUtil.evaluateToString(docNode, "/archimedes/info", null);
      teiTest = xmlUtil.evaluateToString(docNode, "/TEI/teiHeader", null);
    } catch (ApplicationException e) {
      throw new ApplicationException("Your source file is not an \"echo\" or \"archimedes\" file. Please proof that file.");
    }
    if (docBase.equals("echo") && archimedesTest != null)
      throw new ApplicationException("Your source file is an \"archimedes\" file. " + "Please specify \"archimedes\" in your destination document base.");
    if (docBase.equals("echo") && teiTest != null)
      throw new ApplicationException("Your source file is a \"TEI\" file. " + "Please specify \"TEI\" in your destination document base.");
    if (docBase.equals("archimedes") && echoTest != null)
      throw new ApplicationException("Your source file is an \"echo\" file. " + "Please specify \"echo\" in your destination document base.");
    // fixed: this message used to claim the file is an "archimedes" file although the TEI probe matched
    if (docBase.equals("archimedes") && teiTest != null)
      throw new ApplicationException("Your source file is a \"TEI\" file. " + "Please specify \"TEI\" in your destination document base.");
    if (docBase.equals("tei") && archimedesTest != null)
      throw new ApplicationException("Your source file is an \"archimedes\" file. " + "Please specify \"archimedes\" in your destination document base.");
    if (docBase.equals("tei") && echoTest != null)
      throw new ApplicationException("Your source file is an \"echo\" file. " + "Please specify \"echo\" in your destination document base.");
  }
  
  /**
   * Validates the file against the RelaxNG schema of its document base.
   * Only the document bases "echo" and "tei" have a schema; for every other
   * value (including null) nothing is validated.
   *
   * @param destFile local XML file to validate
   * @param docBase destination document base, may be null
   * @throws ApplicationException if the schema URL cannot be built or the validation fails
   */
  private void validateByRelaxNGSchema(File destFile, String docBase) throws ApplicationException {
    XmlUtil xmlUtil = XmlUtil.getInstance();
    // constant-first comparison: a null document base must not raise a NullPointerException
    if ("echo".equals(docBase)) {
      URL echoSchemaUrl = getEchoRelaxNGSchemaUrl();
      xmlUtil.validateByRelaxNG(destFile, echoSchemaUrl);
    } else if ("tei".equals(docBase)) {
      URL teiSchemaUrl = getTeiLiteRelaxNGSchemaUrl();
      xmlUtil.validateByRelaxNG(destFile, teiSchemaUrl);
    }
  }
  
  /**
   * Builds the URL of the echo RelaxNG schema from the configured eXist host
   * name, port and schema path.
   *
   * @return the schema URL
   * @throws ApplicationException wrapping the MalformedURLException if the composed string is not a valid URL
   */
  private URL getEchoRelaxNGSchemaUrl() throws ApplicationException {
    final String schemaLocation = "http://" + MpdlConstants.MPDL_EXIST_HOST_NAME + ":" + MpdlConstants.MPDL_EXIST_PORT + MpdlConstants.MPDL_ECHO_RELAXNG_PATH;
    try {
      return new URL(schemaLocation);
    } catch (MalformedURLException malformed) {
      throw new ApplicationException(malformed);
    }
  }

  /**
   * Builds the URL of the TEI Lite RelaxNG schema from the configured eXist
   * host name, port and schema path.
   *
   * @return the schema URL
   * @throws ApplicationException wrapping the MalformedURLException if the composed string is not a valid URL
   */
  private URL getTeiLiteRelaxNGSchemaUrl() throws ApplicationException {
    final String schemaLocation = "http://" + MpdlConstants.MPDL_EXIST_HOST_NAME + ":" + MpdlConstants.MPDL_EXIST_PORT + MpdlConstants.MPDL_TEILITE_RELAXNG_PATH;
    try {
      return new URL(schemaLocation);
    } catch (MalformedURLException malformed) {
      throw new ApplicationException(malformed);
    }
  }

  /**
   * Checks that the metadata record carries the mandatory fields identifier,
   * creator and title (in this order). Records with document base "diverse"
   * are not checked.
   *
   * @param mdRecord metadata record to check; null means that no metadata could be built for the document
   * @throws ApplicationException if the record is null or a mandatory field is null or blank
   */
  private void validate(MetadataRecord mdRecord) throws ApplicationException {
    // getMetadataRecord delivers null for a missing or unknown document base:
    // report that as an application error instead of failing with a NullPointerException
    if (mdRecord == null)
      throw new ApplicationException("Your document could not be validated: no metadata could be read because the destination document base is missing or not supported.");
    // constant-first comparison: a record without document base is checked like a non-"diverse" record
    if ("diverse".equals(mdRecord.getDocBase()))
      return;
    String[] fieldNames = {"identifier", "creator", "title"};
    String[] fieldValues = {mdRecord.getIdentifier(), mdRecord.getCreator(), mdRecord.getTitle()};
    for (int i = 0; i < fieldNames.length; i++) {
      String value = fieldValues[i];
      if (value == null || value.trim().equals(""))
        throw new ApplicationException("Your document file does not contain the metadata field: " + fieldNames[i]);
    }
  }
  
  /**
   * Reads the metadata of an "echo" document from the children of
   * /echo:echo/echo:metadata (identifier, creator, title, language, date,
   * rights, license, accessRights, echodir, echolink).
   *
   * Every value that is found is passed through
   * StringUtilEscapeChars.deresolveXmlEntities. A non empty date value is
   * reduced to a year string by Util.toYearStr; if that delivers a year the
   * record date is the first of January of that year, otherwise the date
   * stays null.
   *
   * @param documentNode root node of the parsed document
   * @return a new metadata record with document base "echo" and type "text/xml"
   * @throws ApplicationException if an XPath evaluation or the date conversion fails
   */
  private MetadataRecord getMetadataRecordEcho(Node documentNode) throws ApplicationException {
    final XmlUtil xml = XmlUtil.getInstance();
    final NamespaceContext echoNs = getEchoNsContext();

    String identifier = xml.evaluateToString(documentNode, "/echo:echo/echo:metadata/dcterms:identifier", echoNs);
    identifier = (identifier == null) ? null : StringUtilEscapeChars.deresolveXmlEntities(identifier);

    String creator = xml.evaluateToString(documentNode, "/echo:echo/echo:metadata/dcterms:creator", echoNs);
    creator = (creator == null) ? null : StringUtilEscapeChars.deresolveXmlEntities(creator);

    String title = xml.evaluateToString(documentNode, "/echo:echo/echo:metadata/dcterms:title", echoNs);
    title = (title == null) ? null : StringUtilEscapeChars.deresolveXmlEntities(title);

    String language = xml.evaluateToString(documentNode, "/echo:echo/echo:metadata/dcterms:language", echoNs);
    language = (language == null) ? null : StringUtilEscapeChars.deresolveXmlEntities(language);

    final String rawDate = xml.evaluateToString(documentNode, "/echo:echo/echo:metadata/dcterms:date", echoNs);
    Date date = null;
    if (rawDate != null && rawDate.length() > 0) {
      // toYearStr may deliver null when no year can be derived from the value
      final String year = new Util().toYearStr(StringUtilEscapeChars.deresolveXmlEntities(rawDate));
      if (year != null) {
        date = XmlUtil.getInstance().toDate(year + "-01-01T00:00:00.000Z");
      }
    }

    String rights = xml.evaluateToString(documentNode, "/echo:echo/echo:metadata/dcterms:rights", echoNs);
    rights = (rights == null) ? null : StringUtilEscapeChars.deresolveXmlEntities(rights);

    String license = xml.evaluateToString(documentNode, "/echo:echo/echo:metadata/dcterms:license", echoNs);
    license = (license == null) ? null : StringUtilEscapeChars.deresolveXmlEntities(license);

    String accessRights = xml.evaluateToString(documentNode, "/echo:echo/echo:metadata/dcterms:accessRights", echoNs);
    accessRights = (accessRights == null) ? null : StringUtilEscapeChars.deresolveXmlEntities(accessRights);

    String echoDir = xml.evaluateToString(documentNode, "/echo:echo/echo:metadata/echo:echodir", echoNs);
    echoDir = (echoDir == null) ? null : StringUtilEscapeChars.deresolveXmlEntities(echoDir);

    String echoLink = xml.evaluateToString(documentNode, "/echo:echo/echo:metadata/echo:echolink", echoNs);
    echoLink = (echoLink == null) ? null : StringUtilEscapeChars.deresolveXmlEntities(echoLink);

    final MetadataRecord record = new MetadataRecord(identifier, language, creator, title, null, null, "text/xml", rights, date);
    record.setDocBase("echo");
    record.setLicense(license);
    record.setAccessRights(accessRights);
    record.setEchoLink(echoLink);
    record.setEchoDir(echoDir);
    return record;
  }

  /**
   * Reads the metadata of an "archimedes" document from the children of
   * /archimedes/info (cvs_file as identifier, author as creator, title, lang,
   * date).
   *
   * Every value that is found is passed through
   * StringUtilEscapeChars.deresolveXmlEntities. A non empty date value is
   * reduced to a year string by Util.toYearStr; if that delivers a year the
   * record date is the first of January of that year. Rights, license and
   * access rights are fixed values.
   *
   * @param documentNode root node of the parsed document
   * @return a new metadata record with document base "archimedes" and type "text/xml"
   * @throws ApplicationException if an XPath evaluation or the date conversion fails
   */
  private MetadataRecord getMetadataRecordArchimedes(Node documentNode) throws ApplicationException {
    final XmlUtil xml = XmlUtil.getInstance();

    String identifier = xml.evaluateToString(documentNode, "/archimedes/info/cvs_file", null);
    identifier = (identifier == null) ? null : StringUtilEscapeChars.deresolveXmlEntities(identifier);

    String creator = xml.evaluateToString(documentNode, "/archimedes/info/author", null);
    creator = (creator == null) ? null : StringUtilEscapeChars.deresolveXmlEntities(creator);

    String title = xml.evaluateToString(documentNode, "/archimedes/info/title", null);
    title = (title == null) ? null : StringUtilEscapeChars.deresolveXmlEntities(title);

    String language = xml.evaluateToString(documentNode, "/archimedes/info/lang", null);
    language = (language == null) ? null : StringUtilEscapeChars.deresolveXmlEntities(language);

    final String rawDate = xml.evaluateToString(documentNode, "/archimedes/info/date", null);
    Date date = null;
    if (rawDate != null && rawDate.length() > 0) {
      // toYearStr may deliver null when no year can be derived from the value
      final String year = new Util().toYearStr(StringUtilEscapeChars.deresolveXmlEntities(rawDate));
      if (year != null) {
        date = XmlUtil.getInstance().toDate(year + "-01-01T00:00:00.000Z");
      }
    }

    // archimedes documents get fixed rights information
    final MetadataRecord record = new MetadataRecord(identifier, language, creator, title, null, null, "text/xml", "open access", date);
    record.setDocBase("archimedes");
    record.setLicense("http://echo.mpiwg-berlin.mpg.de/policy/oa_basics/declaration");
    record.setAccessRights("free");
    return record;
  }

  /**
   * Reads the metadata of a "tei" document from its teiHeader (author, title,
   * language ident, publication date, availability and availability status),
   * using the TEI namespace context.
   *
   * Every value that is found is passed through
   * StringUtilEscapeChars.deresolveXmlEntities. A non empty date value is
   * reduced to a year string by Util.toYearStr; if that delivers a year the
   * record date is the first of January of that year. Missing rights default
   * to "open access", missing access rights to "free"; the license is a fixed
   * value. The record is created without identifier.
   *
   * @param documentNode root node of the parsed document
   * @return a new metadata record with document base "tei" and type "text/xml"
   * @throws ApplicationException if an XPath evaluation or the date conversion fails
   */
  private MetadataRecord getMetadataRecordTEI(Node documentNode) throws ApplicationException {
    final XmlUtil xml = XmlUtil.getInstance();
    final NamespaceContext teiNs = getTeiNsContext();

    String creator = xml.evaluateToString(documentNode, "/TEI:TEI/TEI:teiHeader/TEI:fileDesc/TEI:titleStmt/TEI:author", teiNs);
    creator = (creator == null) ? null : StringUtilEscapeChars.deresolveXmlEntities(creator);

    String title = xml.evaluateToString(documentNode, "/TEI:TEI/TEI:teiHeader/TEI:fileDesc/TEI:titleStmt/TEI:title", teiNs);
    title = (title == null) ? null : StringUtilEscapeChars.deresolveXmlEntities(title);

    String language = xml.evaluateToString(documentNode, "/TEI:TEI/TEI:teiHeader/TEI:profileDesc/TEI:langUsage/TEI:language/@ident", teiNs);
    language = (language == null) ? null : StringUtilEscapeChars.deresolveXmlEntities(language);

    final String rawDate = xml.evaluateToString(documentNode, "/TEI:TEI/TEI:teiHeader/TEI:fileDesc/TEI:publicationStmt/TEI:date", teiNs);
    Date date = null;
    if (rawDate != null && rawDate.length() > 0) {
      // toYearStr may deliver null when no year can be derived from the value
      final String year = new Util().toYearStr(StringUtilEscapeChars.deresolveXmlEntities(rawDate));
      if (year != null) {
        date = XmlUtil.getInstance().toDate(year + "-01-01T00:00:00.000Z");
      }
    }

    final String availability = xml.evaluateToString(documentNode, "/TEI:TEI/TEI:teiHeader/TEI:fileDesc/TEI:publicationStmt/TEI:availability", teiNs);
    final String rights = StringUtilEscapeChars.deresolveXmlEntities((availability != null) ? availability : "open access");

    final String availabilityStatus = xml.evaluateToString(documentNode, "/TEI:TEI/TEI:teiHeader/TEI:fileDesc/TEI:publicationStmt/TEI:availability/@status", teiNs);
    final String accessRights = StringUtilEscapeChars.deresolveXmlEntities((availabilityStatus != null) ? availabilityStatus : "free");

    final MetadataRecord record = new MetadataRecord(null, language, creator, title, null, null, "text/xml", rights, date);
    record.setDocBase("tei");
    record.setLicense("http://echo.mpiwg-berlin.mpg.de/policy/oa_basics/declaration");
    record.setAccessRights(accessRights);
    return record;
  }

  /**
   * Creates the metadata record of a "diverse" document. Nothing is read from
   * the document: the record only carries fixed rights ("open access"),
   * license and access rights ("free") values.
   *
   * @param documentNode root node of the parsed document (not used)
   * @return a new metadata record with document base "diverse"
   * @throws ApplicationException declared for symmetry with the other record builders; not thrown by the visible code unless deresolveXmlEntities throws it
   */
  private MetadataRecord getMetadataRecordDiverse(Node documentNode) throws ApplicationException {
    final String accessRights = StringUtilEscapeChars.deresolveXmlEntities("free");
    final MetadataRecord record = new MetadataRecord(null, null, null, null, null, null, null, "open access", null);
    record.setDocBase("diverse");
    record.setLicense("http://echo.mpiwg-berlin.mpg.de/policy/oa_basics/declaration");
    record.setAccessRights(accessRights);
    return record;
  }

  /**
   * Looks up the page image sub directory of a document directory: the
   * index.meta file of the directory is requested from the Texter servlet and
   * the value of /resource/meta/texttool/image is read from it.
   *
   * This is a best effort lookup: any exception while requesting or
   * evaluating the index.meta is swallowed on purpose and leads to null.
   *
   * @param imagesDocDirectory directory of the document images
   * @return the value found in the index.meta, or {@code null} if there is none or the lookup fails
   * @throws ApplicationException only if obtaining the XmlUtil instance fails
   */
  private String getIndexMetaDataPageImg(String imagesDocDirectory) throws ApplicationException {
    final String indexMetaUrl = "http://nausikaa2.mpiwg-berlin.mpg.de/digitallibrary/servlet/Texter" + "?fn=" + imagesDocDirectory + "/index.meta";
    final XmlUtil xmlUtil = XmlUtil.getInstance();
    try {
      final Node indexMetaNode = xmlUtil.doc(indexMetaUrl);
      return xmlUtil.evaluateToString(indexMetaNode, "/resource/meta/texttool/image", null);
    } catch (Exception ignored) {
      // return null if digilib does not work
      return null;
    }
  }
  
  /**
   * Extracts the document id from an eXist identifier: the text between the
   * second "/" that is found (the search for the first one starts at index 2)
   * and the next ".xml". Example: "/echo/la/Benedetti_1585.xml" delivers
   * "Benedetti_1585".
   *
   * @param eXistIdentifier eXist identifier of the document, may be null
   * @return the id; the unchanged identifier if one of the delimiters is not found; {@code null} for null
   */
  private String getIdByExistId(String eXistIdentifier) {
    if (eXistIdentifier == null) {
      return null;
    }
    final int outerSlash = eXistIdentifier.indexOf('/', 2);
    final int innerSlash = eXistIdentifier.indexOf('/', outerSlash + 1);
    final int suffixStart = eXistIdentifier.indexOf(".xml", innerSlash + 1);
    final boolean wellFormed = outerSlash != -1 && innerSlash != -1 && suffixStart != -1;
    return wellFormed ? eXistIdentifier.substring(innerSlash + 1, suffixStart) : eXistIdentifier;
  }
  
  /**
   * Reduces a dublin core identifier to the bare document id: a leading
   * "ECHO:", "ARCHIMEDES:" or "TEI:" and a trailing ".xml" are removed.
   *
   * "TEI:" is handled as well because getMetadataRecord assigns identifiers
   * with that prefix to "tei" documents; without it the prefix would end up
   * in the image directory path built from the id.
   *
   * @param dcIdentifier dublin core identifier, may be null
   * @return the id, or {@code null} if the identifier is null or blank
   */
  private String getIdByDCIdentifier(String dcIdentifier) {
    if (dcIdentifier == null || dcIdentifier.trim().equals(""))
      return null;
    // delete the document base prefix if there is one
    String[] docBasePrefixes = {"ECHO:", "ARCHIMEDES:", "TEI:"};
    for (int i = 0; i < docBasePrefixes.length; i++) {
      String prefix = docBasePrefixes[i];
      if (dcIdentifier.startsWith(prefix))
        dcIdentifier = dcIdentifier.substring(prefix.length());
    }
    // delete the .xml suffix if there is one
    if (dcIdentifier.endsWith(".xml"))
      dcIdentifier = dcIdentifier.substring(0, dcIdentifier.length() - ".xml".length());
    return dcIdentifier;
  }
  
  /**
   * Delivers the namespace context used for XPath expressions on "echo"
   * documents. It binds the prefixes de, echo, dc, dcterms, dcq, xhtml, dct,
   * xlink, rdf, xsi and mml.
   *
   * Unbound prefixes and unbound namespace URIs deliver null, as before.
   * (The NamespaceContext contract names XMLConstants.NULL_NS_URI for unbound
   * prefixes; the null result is kept for backward compatibility.)
   * Null arguments raise an IllegalArgumentException and getPrefixes never
   * delivers null, as the NamespaceContext contract requires.
   *
   * @return a new namespace context
   */
  public NamespaceContext getEchoNsContext() {
    // prefix / namespace URI pairs; every prefix and every URI occurs exactly once
    final String[][] bindings = {
      {"de", "http://www.mpiwg-berlin.mpg.de/ns/de/1.0/"},
      {"echo", "http://www.mpiwg-berlin.mpg.de/ns/echo/1.0/"},
      {"dc", "http://purl.org/dc/elements/1.1/"},
      {"dcterms", "http://purl.org/dc/terms"},
      {"dcq", "http://purl.org/dc/qualifiers/1.0/"},
      {"xhtml", "http://www.w3.org/1999/xhtml"},
      {"dct", "http://purl.org/dc/terms/1.0/"},
      {"xlink", "http://www.w3.org/1999/xlink"},
      {"rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#"},
      {"xsi", "http://www.w3.org/2001/XMLSchema-instance"},
      {"mml", "http://www.w3.org/1998/Math/MathML"}
    };
    NamespaceContext nsContext = new NamespaceContext() {
      public String getNamespaceURI(String prefix) {
        if (prefix == null)
          throw new IllegalArgumentException("prefix must not be null");
        for (int i = 0; i < bindings.length; i++) {
          if (bindings[i][0].equals(prefix))
            return bindings[i][1];
        }
        return null;
      }

      public String getPrefix(String uri) {
        if (uri == null)
          throw new IllegalArgumentException("namespace URI must not be null");
        for (int i = 0; i < bindings.length; i++) {
          if (bindings[i][1].equals(uri))
            return bindings[i][0];
        }
        return null;
      }

      public Iterator<String> getPrefixes(String namespace) {
        // every URI has at most one prefix here, so the result has zero or one element
        ArrayList<String> prefixes = new ArrayList<String>();
        String prefix = getPrefix(namespace);
        if (prefix != null)
          prefixes.add(prefix);
        // the contract asks for an iterator that does not support remove()
        return java.util.Collections.unmodifiableList(prefixes).iterator();
      }
    };
    return nsContext;
  }

  /**
   * Delivers the namespace context used for XPath expressions on "tei"
   * documents. It binds the prefixes TEI, xhtml, xlink and mml.
   *
   * Unbound prefixes and unbound namespace URIs deliver null, as before.
   * (The NamespaceContext contract names XMLConstants.NULL_NS_URI for unbound
   * prefixes; the null result is kept for backward compatibility.)
   * Null arguments raise an IllegalArgumentException and getPrefixes never
   * delivers null, as the NamespaceContext contract requires.
   *
   * @return a new namespace context
   */
  public NamespaceContext getTeiNsContext() {
    // prefix / namespace URI pairs; every prefix and every URI occurs exactly once
    final String[][] bindings = {
      {"TEI", "http://www.tei-c.org/ns/1.0"},
      {"xhtml", "http://www.w3.org/1999/xhtml"},
      {"xlink", "http://www.w3.org/1999/xlink"},
      {"mml", "http://www.w3.org/1998/Math/MathML"}
    };
    NamespaceContext nsContext = new NamespaceContext() {
      public String getNamespaceURI(String prefix) {
        if (prefix == null)
          throw new IllegalArgumentException("prefix must not be null");
        for (int i = 0; i < bindings.length; i++) {
          if (bindings[i][0].equals(prefix))
            return bindings[i][1];
        }
        return null;
      }

      public String getPrefix(String uri) {
        if (uri == null)
          throw new IllegalArgumentException("namespace URI must not be null");
        for (int i = 0; i < bindings.length; i++) {
          if (bindings[i][1].equals(uri))
            return bindings[i][0];
        }
        return null;
      }

      public Iterator<String> getPrefixes(String namespace) {
        // every URI has at most one prefix here, so the result has zero or one element
        ArrayList<String> prefixes = new ArrayList<String>();
        String prefix = getPrefix(namespace);
        if (prefix != null)
          prefixes.add(prefix);
        // the contract asks for an iterator that does not support remove()
        return java.util.Collections.unmodifiableList(prefixes).iterator();
      }
    };
    return nsContext;
  }
  
}