Mercurial > hg > mpdl-group
changeset 10:59ff47d1e237
TEI Unterstützung, Fehlerbehebungen, externe Objekte
line wrap: on
line diff
--- a/software/eXist/mpdl-modules/mpdl-system.properties Tue Feb 22 16:03:45 2011 +0100 +++ b/software/eXist/mpdl-modules/mpdl-system.properties Fri Mar 11 13:33:26 2011 +0100 @@ -5,6 +5,7 @@ exist.adminUserName=admin exist.adminUserPW= exist.echoRelaxNGPath=/exist/rest/db/mpdl/schema/echo/echo.rnc +exist.teiRelaxNGPath=/exist/rest/db/mpdl/schema/tei/tei_allPlus.rnc # eSciDoc settings escidoc.hostname=euler.mpiwg-berlin.mpg.de
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/client/DocumentHandler.java Tue Feb 22 16:03:45 2011 +0100 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/client/DocumentHandler.java Fri Mar 11 13:33:26 2011 +0100 @@ -44,7 +44,7 @@ private MpdlXmlRpcDocHandler mpdlXmlRpcDocHandler; private ESciDocIngestor eSciDocIngestor; - private String[] docBases = {"archimedes", "echo"}; + private String[] docBases = {"archimedes", "echo", "tei"}; private String[] languages = {"ar", "de", "el", "en", "fr", "it", "la", "nl", "zh"}; private String documentRootCollectionMorph = "/db/mpdl/documents/morph"; private String documentRootCollectionStandard = "/db/mpdl/documents/standard"; @@ -282,17 +282,19 @@ FilenameFilter filter = new FilenameFilterExtension("xml"); File[] files = localFileDir.listFiles(filter); System.out.println("Adding all documents in path: \"" + localFileDirStr + "\" to eXist collection: \"" + documentCollection + "\" ..."); - for (int k=0; k < files.length; k++) { - File f = files[k]; - String localFileNameWithoutPath = f.getName(); - String fullLocalFileName = f.getPath(); - String srcUrl = "file://" + fullLocalFileName; - MpdlDocOperation docOperation = new MpdlDocOperation("updateExist", srcUrl, null, docBase, language, localFileNameWithoutPath); - long begin = new Date().getTime(); - doOperation(docOperation); - long end = new Date().getTime(); - System.out.println("Added document \"" + fullLocalFileName + "\" to eXist collection: \"" + documentCollection + "\" (" + (end - begin) + " ms)" ); - counter++; + if (files != null) { + for (int k=0; k < files.length; k++) { + File f = files[k]; + String localFileNameWithoutPath = f.getName(); + String fullLocalFileName = f.getPath(); + String srcUrl = "file://" + fullLocalFileName; + MpdlDocOperation docOperation = new MpdlDocOperation("updateExist", srcUrl, null, docBase, language, localFileNameWithoutPath); + long begin = new Date().getTime(); + doOperation(docOperation); + long end 
= new Date().getTime(); + System.out.println("Added document \"" + fullLocalFileName + "\" to eXist collection: \"" + documentCollection + "\" (" + (end - begin) + " ms)" ); + counter++; + } } } } @@ -311,30 +313,32 @@ FilenameFilter filter = new FilenameFilterExtension("xml"); File[] files = localFileDir.listFiles(filter); System.out.println("Generating Pdf/Html documents in path: \"" + localFileDirStr + "\" ..."); - for (int k=0; k < files.length; k++) { - File f = files[k]; - String localFileName = f.getName(); - String fullLocalFileName = f.getPath(); - String srcUrl = "file://" + fullLocalFileName; - String localFileNameWithoutExtension = localFileName.substring(0, localFileName.length() - 4); // without ".xml" - String fullLocalPdfFileName = MpdlConstants.MPDL_EXIST_DATA_DIR + "/documents/" + docBase + "/" + language + "/" + localFileNameWithoutExtension + ".pdf"; - File localPdfFile = new File(fullLocalPdfFileName); - boolean pdfFileAlreadyExists = localPdfFile.exists(); - // generate Pdf/Html file only if pdf file does not already exist - if (! 
pdfFileAlreadyExists) { - MpdlDocOperation docOperation = new MpdlDocOperation("generatePdf", srcUrl, null, docBase, language, localFileName); - SchemaHandler schemaHandler = new SchemaHandler(); - schemaHandler.validate(fullLocalFileName, docOperation); - long begin = new Date().getTime(); - MetadataRecord mdRecord = docOperation.getMdRecord(); // after validation, docOperation has a mdRecord - mpdlRenderer.createFile(true, true, "text", mdRecord); // generate Pdf/Html document - long end = new Date().getTime(); - System.out.println("Generate Pdf/Html document for: \"" + fullLocalFileName + "\" (" + (end - begin) + " ms)" ); - counter++; - try { - Thread.sleep(60000); // delay so that called servers (digilib, eXist) are not stressed too much - } catch (InterruptedException e) { - throw new ApplicationException(e); + if (files != null) { + for (int k=0; k < files.length; k++) { + File f = files[k]; + String localFileName = f.getName(); + String fullLocalFileName = f.getPath(); + String srcUrl = "file://" + fullLocalFileName; + String localFileNameWithoutExtension = localFileName.substring(0, localFileName.length() - 4); // without ".xml" + String fullLocalPdfFileName = MpdlConstants.MPDL_EXIST_DATA_DIR + "/documents/" + docBase + "/" + language + "/" + localFileNameWithoutExtension + ".pdf"; + File localPdfFile = new File(fullLocalPdfFileName); + boolean pdfFileAlreadyExists = localPdfFile.exists(); + // generate Pdf/Html file only if pdf file does not already exist + if (! 
pdfFileAlreadyExists) { + MpdlDocOperation docOperation = new MpdlDocOperation("generatePdf", srcUrl, null, docBase, language, localFileName); + SchemaHandler schemaHandler = new SchemaHandler(); + schemaHandler.validate(fullLocalFileName, docOperation); + long begin = new Date().getTime(); + MetadataRecord mdRecord = docOperation.getMdRecord(); // after validation, docOperation has a mdRecord + mpdlRenderer.createFile(true, true, "text", mdRecord); // generate Pdf/Html document + long end = new Date().getTime(); + System.out.println("Generate Pdf/Html document for: \"" + fullLocalFileName + "\" (" + (end - begin) + " ms)" ); + counter++; + try { + Thread.sleep(60000); // delay so that called servers (digilib, eXist) are not stressed too much + } catch (InterruptedException e) { + throw new ApplicationException(e); + } } } }
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/externalObjects/app/ExternalObjectsHandler.java Tue Feb 22 16:03:45 2011 +0100 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/externalObjects/app/ExternalObjectsHandler.java Fri Mar 11 13:33:26 2011 +0100 @@ -12,8 +12,10 @@ import com.sleepycat.je.OperationStatus; import com.sleepycat.je.Transaction; +import de.mpg.mpiwg.berlin.mpdl.schedule.MpdlDocOperation; import de.mpg.mpiwg.berlin.mpdl.util.StringUtilEscapeChars; import de.mpg.mpiwg.berlin.mpdl.util.Util; +import de.mpg.mpiwg.berlin.mpdl.xml.SchemaHandler; import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; import de.mpg.mpiwg.berlin.mpdl.externalObjects.db.DbEnvExternalObjects; import de.mpg.mpiwg.berlin.mpdl.general.MpdlConstants; @@ -369,8 +371,9 @@ getInstance(); instance.beginOperation(); System.out.print("Start ..."); - instance.deleteSampleData(); - instance.createSampleData(); + // instance.validateSampleDoc(); + // instance.deleteSampleData(); + // instance.createSampleData(); // instance.updateSampleData(); instance.readSampleData(); instance.end(); @@ -380,6 +383,13 @@ System.out.println("Needed time: " + elapsedTime + " seconds"); } + private void validateSampleDoc() throws ApplicationException { + SchemaHandler schemaHandler = new SchemaHandler(); + MpdlDocOperation docOp = new MpdlDocOperation("bla", "bla", "", "tei", "en", "Test_1789.xml"); + String localFileName = "/Users/jwillenborg/texts/mpdl/documents/tei/en/Test_1789.xml"; + schemaHandler.validate(localFileName, docOp); + } + private void deleteSampleData() throws ApplicationException { String xmlNodeId1 = "/archimedes[1]/text[1]/body[1]/chap[1]/p[1]/s[2]"; String objectXmlStr1 = @@ -433,7 +443,7 @@ e.setDocumentId("/archimedes/it/l223.xml"); e.setPageNumber("17"); e.setXmlNodeId(sId); - e.setContent("<note>This is a test note to element " + sId + " with <seg xlink:href=\"http://slime.de\">this external link</seg>" + "</note>"); + 
e.setContent("<note>This is a test note to element " + sId + " with <ref target=\"http://slime.de\">this external link</ref>" + "</note>"); createExternalElement(e); ExtElement e2 = new ExtElement(); @@ -475,7 +485,7 @@ "pageNumber=\"" + "17" + "\" " + "xmlNodeId=\"" + xmlNodeId + "\"" + ">" + - "<content>" + "<note>This is a test note to element " + xmlNodeId + " with <seg xlink:href=\"http://slime.de\">this external link</seg>" + "</note>" + "</content>" + + "<content>" + "<note>This is a test note to element " + xmlNodeId + " with <ref target=\"http://slime.de\">this external link</ref>" + "</note>" + "</content>" + "</object>"; ExtElement e = ExtElement.parseXmlStr(objectXmlStr); e.setModificationDate(now);
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/general/MpdlConstants.java Tue Feb 22 16:03:45 2011 +0100 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/general/MpdlConstants.java Fri Mar 11 13:33:26 2011 +0100 @@ -24,6 +24,7 @@ public static String MPDL_EXIST_ADMIN_USER_NAME = MPDL_SYSTEM_PROPERTIES.getProperty("exist.adminUserName"); public static String MPDL_EXIST_ADMIN_USER_PW = MPDL_SYSTEM_PROPERTIES.getProperty("exist.adminUserPW"); public static String MPDL_ECHO_RELAXNG_PATH = MPDL_SYSTEM_PROPERTIES.getProperty("exist.echoRelaxNGPath"); + public static String MPDL_TEILITE_RELAXNG_PATH = MPDL_SYSTEM_PROPERTIES.getProperty("exist.teiRelaxNGPath"); // eSciDoc settings public static String MPDL_ESCIDOC_HOST_NAME = MPDL_SYSTEM_PROPERTIES.getProperty("escidoc.hostname");
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlStemmer.java Tue Feb 22 16:03:45 2011 +0100 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlStemmer.java Fri Mar 11 13:33:26 2011 +0100 @@ -48,8 +48,10 @@ } } } - // if not found in MorphologyCache use Snowball + // if not found then use the term itself as the stem if (stem == null) { + stem = term; + /* Snowball stemming: if not found in MorphologyCache use Snowball stem = stemBySnowball(term, language); // if term is not equal to the base form and also the stem is not too short (> 2 characters) then add this Snowball form to the dynamic morphology cache if ((! stem.equals(term)) && stem.length() > 2) { @@ -64,6 +66,7 @@ Logger.getLogger(MpdlStemmer.class).warn("MorphologyCache: an exception was caught while indexing a document: " + e.getMessage(), e); } } + */ } return stem; }
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/DictionarizerContentHandler.java Tue Feb 22 16:03:45 2011 +0100 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/DictionarizerContentHandler.java Fri Mar 11 13:33:26 2011 +0100 @@ -45,6 +45,8 @@ System.arraycopy(c, start, cCopy, 0, length); String charactersStr = String.valueOf(cCopy); if (charactersStr != null && ! charactersStr.equals("")) { + // cause there are problems during xsl transformations with ideographic characters without zwsp characters we put always a zwsp between ideographic characters + charactersStr = zwsp(charactersStr); if (currentElement != null) { Element charElement = new Element("characters", ELEMENT_TYPE_CHARACTERS); charElement.value = StringUtilEscapeChars.deresolveXmlEntities(charactersStr); @@ -151,6 +153,20 @@ outputXmlFragment += outStr; } + /** + * Puts a zwsp between two ideographic characters (e.g. in CJK Unified Ideographs) + * @param str + * @return + */ + private String zwsp(String str) { + // based on Unicode 3.2 + String ideographic = "[\u3300-\u33ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff]"; + String regex = "(" + ideographic + ")(" + ideographic + ")"; + String retStr = str.replaceAll(regex, "$1\u200b$2"); + retStr = retStr.replaceAll(regex, "$1\u200b$2"); + return retStr; + } + private class Element { private int type; private String name;
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/app/MorphologyCache.java Tue Feb 22 16:03:45 2011 +0100 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/app/MorphologyCache.java Fri Mar 11 13:33:26 2011 +0100 @@ -325,16 +325,18 @@ private ArrayList<Lemma> readLemmasByFormName(String lang, String formName) throws ApplicationException { String language = Language.getInstance().getLanguageId(lang); ArrayList<Lemma> lemmasStatic = dbMorphHandlerStatic.readLemmas(language, formName); - ArrayList<Lemma> lemmasDynamic = dbMorphHandlerDynamic.readLemmas(language, formName); - lemmasStatic.addAll(lemmasDynamic); + // is set off because Snowball is not used anymore + // ArrayList<Lemma> lemmasDynamic = dbMorphHandlerDynamic.readLemmas(language, formName); + // lemmasStatic.addAll(lemmasDynamic); return lemmasStatic; } private ArrayList<Form> readFormsByLemmaName(String lang, String lemmaName) throws ApplicationException { String language = Language.getInstance().getLanguageId(lang); ArrayList<Form> formsStatic = dbMorphHandlerStatic.readForms(language, lemmaName); - ArrayList<Form> formsDynamic = dbMorphHandlerDynamic.readForms(language, lemmaName); - formsStatic.addAll(formsDynamic); + // is set off because Snowball is not used anymore + // ArrayList<Form> formsDynamic = dbMorphHandlerDynamic.readForms(language, lemmaName); + // formsStatic.addAll(formsDynamic); return formsStatic; }
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/util/MpdlITextRenderer.java Tue Feb 22 16:03:45 2011 +0100 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/util/MpdlITextRenderer.java Fri Mar 11 13:33:26 2011 +0100 @@ -381,6 +381,8 @@ String pbTag = "echo:pb"; if (docBase != null && docBase.equals("archimedes")) pbTag = "pb"; + else if (docBase != null && docBase.equals("tei")) + pbTag = "TEI:pb"; try { HttpClient httpClient = new HttpClient(); String requestName = "/mpdl/interface/xquery.xql?document=" + docName + "&xquery=count(//" + pbTag + ")";
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/xml/SchemaHandler.java Tue Feb 22 16:03:45 2011 +0100 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/xml/SchemaHandler.java Fri Mar 11 13:33:26 2011 +0100 @@ -23,7 +23,7 @@ /** * - * @param fileName + * @param fileName local file name to validate * @param docOperation * @return doc root node of xml file * @throws ApplicationException @@ -69,6 +69,12 @@ String id = getIdByExistId(eXistIdentifier); mdRecord.setIdentifier("ARCHIMEDES:" + id + ".xml"); } + } else if (docBase != null && docBase.equals("tei")) { + mdRecord = getMetadataRecordTEI(documentNode); + if (mdRecord != null) { + String id = getIdByExistId(eXistIdentifier); + mdRecord.setIdentifier("TEI:" + id + ".xml"); + } } if (mdRecord != null) { mdRecord.setEXistIdentifier(eXistIdentifier); @@ -130,16 +136,26 @@ NamespaceContext nsContext = getEchoNsContext(); String echoTest = null; String archimedesTest = null; + String teiTest = null; try { echoTest = xmlUtil.evaluateToString(docNode, "/echo:echo/echo:metadata", nsContext); archimedesTest = xmlUtil.evaluateToString(docNode, "/archimedes/info", null); + teiTest = xmlUtil.evaluateToString(docNode, "/TEI/teiHeader", null); } catch (ApplicationException e) { throw new ApplicationException("Your source file is not an \"echo\" or \"archimedes\" file. Please proof that file."); } if (docBase.equals("echo") && archimedesTest != null) throw new ApplicationException("Your source file is an \"archimedes\" file. " + "Please specify \"archimedes\" in your destination document base."); + if (docBase.equals("echo") && teiTest != null) + throw new ApplicationException("Your source file is a \"TEI\" file. " + "Please specify \"TEI\" in your destination document base."); if (docBase.equals("archimedes") && echoTest != null) throw new ApplicationException("Your source file is an \"echo\" file. 
" + "Please specify \"echo\" in your destination document base."); + if (docBase.equals("archimedes") && teiTest != null) + throw new ApplicationException("Your source file is a \"archimedes\" file. " + "Please specify \"TEI\" in your destination document base."); + if (docBase.equals("tei") && archimedesTest != null) + throw new ApplicationException("Your source file is an \"archimedes\" file. " + "Please specify \"archimedes\" in your destination document base."); + if (docBase.equals("tei") && echoTest != null) + throw new ApplicationException("Your source file is an \"echo\" file. " + "Please specify \"echo\" in your destination document base."); } private void validateByRelaxNGSchema(File destFile, String docBase) throws ApplicationException { @@ -147,6 +163,9 @@ if (docBase.equals("echo")) { URL echoSchemaUrl = getEchoRelaxNGSchemaUrl(); xmlUtil.validateByRelaxNG(destFile, echoSchemaUrl); + } else if (docBase.equals("tei")) { + URL teiSchemaUrl = getTeiLiteRelaxNGSchemaUrl(); + xmlUtil.validateByRelaxNG(destFile, teiSchemaUrl); } } @@ -161,6 +180,17 @@ return echoSchemaUrl; } + private URL getTeiLiteRelaxNGSchemaUrl() throws ApplicationException { + String schemaUrlStr = "http://" + MpdlConstants.MPDL_EXIST_HOST_NAME + ":" + MpdlConstants.MPDL_EXIST_PORT + MpdlConstants.MPDL_TEILITE_RELAXNG_PATH; + URL schemaUrl = null; + try { + schemaUrl = new URL(schemaUrlStr); + } catch (MalformedURLException e) { + throw new ApplicationException(e); + } + return schemaUrl; + } + private void validate(MetadataRecord mdRecord) throws ApplicationException { String identifier = mdRecord.getIdentifier(); String creator = mdRecord.getCreator(); @@ -252,6 +282,42 @@ return mdRecord; } + private MetadataRecord getMetadataRecordTEI(Node documentNode) throws ApplicationException { + XmlUtil xmlUtil = XmlUtil.getInstance(); + NamespaceContext nsContext = getTeiNsContext(); + String creator = xmlUtil.evaluateToString(documentNode, 
"/TEI:TEI/TEI:teiHeader/TEI:fileDesc/TEI:titleStmt/TEI:author", nsContext); + if (creator != null) + creator = StringUtilEscapeChars.deresolveXmlEntities(creator); + String title = xmlUtil.evaluateToString(documentNode, "/TEI:TEI/TEI:teiHeader/TEI:fileDesc/TEI:titleStmt/TEI:title", nsContext); + if (title != null) + title = StringUtilEscapeChars.deresolveXmlEntities(title); + String language = xmlUtil.evaluateToString(documentNode, "/TEI:TEI/TEI:teiHeader/TEI:profileDesc/TEI:langUsage/TEI:language/@ident", nsContext); + if (language != null) + language = StringUtilEscapeChars.deresolveXmlEntities(language); + String yearStr = xmlUtil.evaluateToString(documentNode, "/TEI:TEI/TEI:teiHeader/TEI:fileDesc/TEI:publicationStmt/TEI:date", nsContext); + Date date = null; + if (yearStr != null && ! yearStr.equals("")) { + yearStr = StringUtilEscapeChars.deresolveXmlEntities(yearStr); + yearStr = new Util().toYearStr(yearStr); // test if possible etc + if (yearStr != null) + date = XmlUtil.getInstance().toDate(yearStr + "-01-01T00:00:00.000Z"); + } + String rights = xmlUtil.evaluateToString(documentNode, "/TEI:TEI/TEI:teiHeader/TEI:fileDesc/TEI:publicationStmt/TEI:availability", nsContext); + if (rights == null) + rights = "open access"; + rights = StringUtilEscapeChars.deresolveXmlEntities(rights); + String license = "http://echo.mpiwg-berlin.mpg.de/policy/oa_basics/declaration"; + String accessRights = xmlUtil.evaluateToString(documentNode, "/TEI:TEI/TEI:teiHeader/TEI:fileDesc/TEI:publicationStmt/TEI:availability/@status", nsContext); + if (accessRights == null) + accessRights = "free"; + accessRights = StringUtilEscapeChars.deresolveXmlEntities(accessRights); + MetadataRecord mdRecord = new MetadataRecord(null, language, creator, title, null, null, "text/xml", rights, date); + mdRecord.setDocBase("tei"); + mdRecord.setLicense(license); + mdRecord.setAccessRights(accessRights); + return mdRecord; + } + private String getIndexMetaDataPageImg(String imagesDocDirectory) throws 
ApplicationException { String resultStr = null; String nausikaaURLTexter = "http://nausikaa2.mpiwg-berlin.mpg.de/digitallibrary/servlet/Texter"; @@ -364,5 +430,41 @@ return nsContext; } + public NamespaceContext getTeiNsContext() { + NamespaceContext nsContext = new NamespaceContext() { + public String getNamespaceURI(String prefix) { + String uri; + if (prefix.equals("TEI")) + uri = "http://www.tei-c.org/ns/1.0"; + else if (prefix.equals("xhtml")) + uri = "http://www.w3.org/1999/xhtml"; + else if (prefix.equals("xlink")) + uri = "http://www.w3.org/1999/xlink"; + else if (prefix.equals("mml")) + uri = "http://www.w3.org/1998/Math/MathML"; + else + uri = null; + return uri; + } + + public String getPrefix(String uri) { + if (uri.equals("http://www.tei-c.org/ns/1.0")) + return "TEI"; + else if (uri.equals("http://www.w3.org/1999/xhtml")) + return "xhtml"; + else if (uri.equals("http://www.w3.org/1999/xlink")) + return "xlink"; + else if (uri.equals("http://www.w3.org/1998/Math/MathML")) + return "mml"; + else + return null; + } + + public Iterator getPrefixes(String namespace) { + return null; + } + }; + return nsContext; + } }