Mercurial > hg > mpdl-group
diff software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/xml/SchemaHandler.java @ 0:408254cf2f1d
Erstellung
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Wed, 24 Nov 2010 17:24:23 +0100 |
parents | |
children | 59ff47d1e237 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/xml/SchemaHandler.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,368 @@ +package de.mpg.mpiwg.berlin.mpdl.xml; + +import java.io.File; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.ArrayList; +import java.util.Date; +import java.util.Iterator; + +import javax.xml.namespace.NamespaceContext; + +import org.w3c.dom.Node; + +import de.mpg.mpiwg.berlin.mpdl.escidoc.MetadataRecord; +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.general.MpdlConstants; +import de.mpg.mpiwg.berlin.mpdl.schedule.MpdlDocOperation; +import de.mpg.mpiwg.berlin.mpdl.util.StringUtilEscapeChars; +import de.mpg.mpiwg.berlin.mpdl.util.Util; +import de.mpg.mpiwg.berlin.mpdl.util.XmlUtil; + +public class SchemaHandler { + + /** + * + * @param fileName + * @param docOperation + * @return doc root node of xml file + * @throws ApplicationException + */ + public Node validate(String fileName, MpdlDocOperation docOperation) throws ApplicationException { + File destFile = new File(fileName); + String docBase = docOperation.getDocBase(); + // file name validation + String fName = docOperation.getFileName(); + if (fName == null || fName.trim().equals("")) + throw new ApplicationException("Your document file name is empty. Please specify a file name for your document."); + if (! fName.endsWith(".xml")) + throw new ApplicationException("Your document file name does not end with \".xml\". Please specify a file name with the suffix \".xml\" for your document."); + // RelaxNG schema validation + validateByRelaxNGSchema(destFile, docBase); + // parse validation + Node docNode = parse(destFile); + // first simple validations + validate(docNode, docBase); + // validate metadata + MetadataRecord mdRecord = getMetadataRecord(docNode, docOperation); + validate(mdRecord); + docOperation.setMdRecord(mdRecord); + return docNode; + } + + public MetadataRecord getMetadataRecord(Node documentNode, MpdlDocOperation docOperation) throws ApplicationException { + MetadataRecord mdRecord = null; + String eXistIdentifier = docOperation.getDestUrl(); + String docBase = docOperation.getDocBase(); + if (docBase != null && docBase.equals("echo")) { + mdRecord = getMetadataRecordEcho(documentNode); + if (mdRecord != null) { + String identifier = mdRecord.getIdentifier(); + if (identifier == null) { + String id = getIdByExistId(eXistIdentifier); + mdRecord.setIdentifier("ECHO:" + id + ".xml"); + } + } + } else if (docBase != null && docBase.equals("archimedes")) { + mdRecord = getMetadataRecordArchimedes(documentNode); + if (mdRecord != null) { + String id = getIdByExistId(eXistIdentifier); + mdRecord.setIdentifier("ARCHIMEDES:" + id + ".xml"); + } + } + if (mdRecord != null) { + mdRecord.setEXistIdentifier(eXistIdentifier); + mdRecord.setMediaType("fulltext"); + } + return mdRecord; + } + + public ArrayList<String> getPBFileNames(Node documentNode, String docBase) throws ApplicationException { + ArrayList<String> pbFileNamesArrayStr = null; + if (docBase != null && docBase.equals("echo")) { + XmlUtil xmlUtil = XmlUtil.getInstance(); + NamespaceContext nsContext = getEchoNsContext(); + pbFileNamesArrayStr = xmlUtil.evaluateToStringArray(documentNode, "//echo:pb/@file", nsContext); + } else if (docBase != null && docBase.equals("archimedes")) { + XmlUtil xmlUtil = XmlUtil.getInstance(); + ArrayList<String> pbsStrArray = xmlUtil.evaluateToStringArray(documentNode, "//pb", null); + if (pbsStrArray != null) { + pbFileNamesArrayStr = new ArrayList<String>(); + int countPBs = pbsStrArray.size(); + for (int i=1; i<=countPBs; i++) { + pbFileNamesArrayStr.add("" + i); // empty names for each page break + } + } + } + return pbFileNamesArrayStr; + } + + public String getPageImgDir(MetadataRecord mdRecord) throws ApplicationException { + String dcId = mdRecord.getIdentifier(); // dublin core identifier: is used to find the digilib image directory for this document + String id = getIdByDCIdentifier(dcId); + String imagesDocDirectory = "/permanent/library/" + id; + if (mdRecord.hasArchimedesDocBase()) + imagesDocDirectory = "/permanent/archimedes/" + id; + String echoDir = mdRecord.getEchoDir(); + if (echoDir != null) + imagesDocDirectory = echoDir; + String pageImgSubDir = "pageimg"; // default name: if digilib does not answer then this name is used + String indexMetaPageImgDir = getIndexMetaDataPageImg(imagesDocDirectory); + if (indexMetaPageImgDir != null) + pageImgSubDir = indexMetaPageImgDir; + String pageImgDir = imagesDocDirectory + "/" + pageImgSubDir; + return pageImgDir; + } + + private Node parse(File file) throws ApplicationException { + XmlUtil xmlUtil = XmlUtil.getInstance(); + Node retNode = null; + try { + retNode = xmlUtil.doc(file); + } catch (ApplicationException e) { + throw new ApplicationException("Your source file is not valid: " + e.getMessage()); + } + return retNode; + } + + private void validate(Node docNode, String docBase) throws ApplicationException { + XmlUtil xmlUtil = XmlUtil.getInstance(); + NamespaceContext nsContext = getEchoNsContext(); + String echoTest = null; + String archimedesTest = null; + try { + echoTest = xmlUtil.evaluateToString(docNode, "/echo:echo/echo:metadata", nsContext); + archimedesTest = xmlUtil.evaluateToString(docNode, "/archimedes/info", null); + } catch (ApplicationException e) { + throw new ApplicationException("Your source file is not an \"echo\" or \"archimedes\" file. Please proof that file."); + } + if (docBase.equals("echo") && archimedesTest != null) + throw new ApplicationException("Your source file is an \"archimedes\" file. " + "Please specify \"archimedes\" in your destination document base."); + if (docBase.equals("archimedes") && echoTest != null) + throw new ApplicationException("Your source file is an \"echo\" file. " + "Please specify \"echo\" in your destination document base."); + } + + private void validateByRelaxNGSchema(File destFile, String docBase) throws ApplicationException { + XmlUtil xmlUtil = XmlUtil.getInstance(); + if (docBase.equals("echo")) { + URL echoSchemaUrl = getEchoRelaxNGSchemaUrl(); + xmlUtil.validateByRelaxNG(destFile, echoSchemaUrl); + } + } + + private URL getEchoRelaxNGSchemaUrl() throws ApplicationException { + String echoSchemaUrlStr = "http://" + MpdlConstants.MPDL_EXIST_HOST_NAME + ":" + MpdlConstants.MPDL_EXIST_PORT + MpdlConstants.MPDL_ECHO_RELAXNG_PATH; + URL echoSchemaUrl = null; + try { + echoSchemaUrl = new URL(echoSchemaUrlStr); + } catch (MalformedURLException e) { + throw new ApplicationException(e); + } + return echoSchemaUrl; + } + + private void validate(MetadataRecord mdRecord) throws ApplicationException { + String identifier = mdRecord.getIdentifier(); + String creator = mdRecord.getCreator(); + String title = mdRecord.getTitle(); + if (identifier == null || identifier.trim().equals("")) + throw new ApplicationException("Your document file does not contain the metadata field: " + "identifier"); + if (creator == null || creator.trim().equals("")) + throw new ApplicationException("Your document file does not contain the metadata field: " + "creator"); + if (title == null || title.trim().equals("")) + throw new ApplicationException("Your document file does not contain the metadata field: " + "title"); + } + + private MetadataRecord getMetadataRecordEcho(Node documentNode) throws ApplicationException { + XmlUtil xmlUtil = XmlUtil.getInstance(); + NamespaceContext nsContext = getEchoNsContext(); + String identifier = xmlUtil.evaluateToString(documentNode, "/echo:echo/echo:metadata/dcterms:identifier", nsContext); + if (identifier != null) + identifier = StringUtilEscapeChars.deresolveXmlEntities(identifier); + String creator = xmlUtil.evaluateToString(documentNode, "/echo:echo/echo:metadata/dcterms:creator", nsContext); + if (creator != null) + creator = StringUtilEscapeChars.deresolveXmlEntities(creator); + String title = xmlUtil.evaluateToString(documentNode, "/echo:echo/echo:metadata/dcterms:title", nsContext); + if (title != null) + title = StringUtilEscapeChars.deresolveXmlEntities(title); + String language = xmlUtil.evaluateToString(documentNode, "/echo:echo/echo:metadata/dcterms:language", nsContext); + if (language != null) + language = StringUtilEscapeChars.deresolveXmlEntities(language); + String yearStr = xmlUtil.evaluateToString(documentNode, "/echo:echo/echo:metadata/dcterms:date", nsContext); + Date date = null; + if (yearStr != null && ! yearStr.equals("")) { + yearStr = StringUtilEscapeChars.deresolveXmlEntities(yearStr); + yearStr = new Util().toYearStr(yearStr); // test if possible etc + if (yearStr != null) + date = XmlUtil.getInstance().toDate(yearStr + "-01-01T00:00:00.000Z"); + } + String rights = xmlUtil.evaluateToString(documentNode, "/echo:echo/echo:metadata/dcterms:rights", nsContext); + if (rights != null) + rights = StringUtilEscapeChars.deresolveXmlEntities(rights); + String license = xmlUtil.evaluateToString(documentNode, "/echo:echo/echo:metadata/dcterms:license", nsContext); + if (license != null) + license = StringUtilEscapeChars.deresolveXmlEntities(license); + String accessRights = xmlUtil.evaluateToString(documentNode, "/echo:echo/echo:metadata/dcterms:accessRights", nsContext); + if (accessRights != null) + accessRights = StringUtilEscapeChars.deresolveXmlEntities(accessRights); + String echoDir = xmlUtil.evaluateToString(documentNode, "/echo:echo/echo:metadata/echo:echodir", nsContext); + if (echoDir != null) + echoDir = StringUtilEscapeChars.deresolveXmlEntities(echoDir); + String echoLink = xmlUtil.evaluateToString(documentNode, "/echo:echo/echo:metadata/echo:echolink", nsContext); + if (echoLink != null) + echoLink = StringUtilEscapeChars.deresolveXmlEntities(echoLink); + MetadataRecord mdRecord = new MetadataRecord(identifier, language, creator, title, null, null, "text/xml", rights, date); + mdRecord.setDocBase("echo"); + mdRecord.setLicense(license); + mdRecord.setAccessRights(accessRights); + mdRecord.setEchoLink(echoLink); + mdRecord.setEchoDir(echoDir); + return mdRecord; + } + + private MetadataRecord getMetadataRecordArchimedes(Node documentNode) throws ApplicationException { + XmlUtil xmlUtil = XmlUtil.getInstance(); + String identifier = xmlUtil.evaluateToString(documentNode, "/archimedes/info/cvs_file", null); + if (identifier != null) + identifier = StringUtilEscapeChars.deresolveXmlEntities(identifier); + String creator = xmlUtil.evaluateToString(documentNode, "/archimedes/info/author", null); + if (creator != null) + creator = StringUtilEscapeChars.deresolveXmlEntities(creator); + String title = xmlUtil.evaluateToString(documentNode, "/archimedes/info/title", null); + if (title != null) + title = StringUtilEscapeChars.deresolveXmlEntities(title); + String language = xmlUtil.evaluateToString(documentNode, "/archimedes/info/lang", null); + if (language != null) + language = StringUtilEscapeChars.deresolveXmlEntities(language); + String yearStr = xmlUtil.evaluateToString(documentNode, "/archimedes/info/date", null); + Date date = null; + if (yearStr != null && ! yearStr.equals("")) { + yearStr = StringUtilEscapeChars.deresolveXmlEntities(yearStr); + yearStr = new Util().toYearStr(yearStr); // test if possible etc + if (yearStr != null) + date = XmlUtil.getInstance().toDate(yearStr + "-01-01T00:00:00.000Z"); + } + String rights = "open access"; + String license = "http://echo.mpiwg-berlin.mpg.de/policy/oa_basics/declaration"; + String accessRights = "free"; + MetadataRecord mdRecord = new MetadataRecord(identifier, language, creator, title, null, null, "text/xml", rights, date); + mdRecord.setDocBase("archimedes"); + mdRecord.setLicense(license); + mdRecord.setAccessRights(accessRights); + return mdRecord; + } + + private String getIndexMetaDataPageImg(String imagesDocDirectory) throws ApplicationException { + String resultStr = null; + String nausikaaURLTexter = "http://nausikaa2.mpiwg-berlin.mpg.de/digitallibrary/servlet/Texter"; + XmlUtil xmlUtil = XmlUtil.getInstance(); + String pageImageDirectory = null; + try { + Node imagesDocDirectoryIndexMetaNode = xmlUtil.doc(nausikaaURLTexter + "?fn=" + imagesDocDirectory + "/index.meta"); + pageImageDirectory = xmlUtil.evaluateToString(imagesDocDirectoryIndexMetaNode, "/resource/meta/texttool/image", null); + } catch (Exception e) { + // return null if digilib does not work + } + if (pageImageDirectory != null) { + resultStr = pageImageDirectory; + } + return resultStr; + } + + private String getIdByExistId(String eXistIdentifier) { + String id = null; + if (eXistIdentifier == null) + return null; + int firstDelimPos = eXistIdentifier.indexOf("/", 2); + int secondDelimPos = eXistIdentifier.indexOf("/", firstDelimPos + 1); + int thirdDelimPos = eXistIdentifier.indexOf(".xml", secondDelimPos + 1); + if (firstDelimPos == -1 || secondDelimPos == -1 || thirdDelimPos == -1) + id = eXistIdentifier; + else + id = eXistIdentifier.substring(secondDelimPos + 1, thirdDelimPos); + return id; + } + + private String getIdByDCIdentifier(String dcIdentifier) { + if (dcIdentifier == null || dcIdentifier.trim().equals("")) + return null; + // if dcIdentifier starts with "ECHO:" or "ARCHIMEDES:" then delete it + if (dcIdentifier.startsWith("ECHO:")) + dcIdentifier = dcIdentifier.substring(5); + if (dcIdentifier.startsWith("ARCHIMEDES:")) + dcIdentifier = dcIdentifier.substring(11); + // delete the .xml suffix if there is one + if (dcIdentifier.endsWith(".xml")) { + int size = dcIdentifier.length(); + dcIdentifier = dcIdentifier.substring(0, size - 4); + } + return dcIdentifier; + } + + public NamespaceContext getEchoNsContext() { + NamespaceContext nsContext = new NamespaceContext() { + public String getNamespaceURI(String prefix) { + String uri; + if (prefix.equals("de")) + uri = "http://www.mpiwg-berlin.mpg.de/ns/de/1.0/"; + else if (prefix.equals("echo")) + uri = "http://www.mpiwg-berlin.mpg.de/ns/echo/1.0/"; + else if (prefix.equals("dc")) + uri = "http://purl.org/dc/elements/1.1/"; + else if (prefix.equals("dcterms")) + uri = "http://purl.org/dc/terms"; + else if (prefix.equals("dcq")) + uri = "http://purl.org/dc/qualifiers/1.0/"; + else if (prefix.equals("xhtml")) + uri = "http://www.w3.org/1999/xhtml"; + else if (prefix.equals("dct")) + uri = "http://purl.org/dc/terms/1.0/"; + else if (prefix.equals("xlink")) + uri = "http://www.w3.org/1999/xlink"; + else if (prefix.equals("rdf")) + uri = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"; + else if (prefix.equals("xsi")) + uri = "http://www.w3.org/2001/XMLSchema-instance"; + else if (prefix.equals("mml")) + uri = "http://www.w3.org/1998/Math/MathML"; + else + uri = null; + return uri; + } + + public String getPrefix(String uri) { + if (uri.equals("http://www.mpiwg-berlin.mpg.de/ns/de/1.0/")) + return "de"; + else if (uri.equals("http://www.mpiwg-berlin.mpg.de/ns/echo/1.0/")) + return "echo"; + else if (uri.equals("http://purl.org/dc/elements/1.1/")) + return "dc"; + else if (uri.equals("http://purl.org/dc/terms")) + return "dcterms"; + else if (uri.equals("http://purl.org/dc/qualifiers/1.0/")) + return "dcq"; + else if (uri.equals("http://www.w3.org/1999/xhtml")) + return "xhtml"; + else if (uri.equals("http://purl.org/dc/terms/1.0/")) + return "dct"; + else if (uri.equals("http://www.w3.org/1999/xlink")) + return "xlink"; + else if (uri.equals("http://www.w3.org/1999/02/22-rdf-syntax-ns#")) + return "rdf"; + else if (uri.equals("http://www.w3.org/2001/XMLSchema-instance")) + return "xsi"; + else if (uri.equals("http://www.w3.org/1998/Math/MathML")) + return "mml"; + else + return null; + } + + public Iterator getPrefixes(String namespace) { + return null; + } + }; + return nsContext; + } + + +}