# HG changeset patch # User Josef Willenborg # Date 1317134457 -7200 # Node ID 257f67be5c0085a2201683b531740f272192aac6 # Parent e99964f390e4f584bc210925a7275362da2cd00d diverse Fehlerbehebungen diff -r e99964f390e4 -r 257f67be5c00 software/eXist/mpdl-modules/src/de/.DS_Store Binary file software/eXist/mpdl-modules/src/de/.DS_Store has changed diff -r e99964f390e4 -r 257f67be5c00 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/.DS_Store Binary file software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/.DS_Store has changed diff -r e99964f390e4 -r 257f67be5c00 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/client/DocumentHandler.java --- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/client/DocumentHandler.java Mon Aug 29 17:40:19 2011 +0200 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/client/DocumentHandler.java Tue Sep 27 16:40:57 2011 +0200 @@ -44,7 +44,7 @@ private MpdlXmlRpcDocHandler mpdlXmlRpcDocHandler; private ESciDocIngestor eSciDocIngestor; - private String[] docBases = {"archimedes", "echo", "tei"}; + private String[] docBases = {"archimedes", "diverse", "echo", "tei"}; private String[] languages = {"ar", "de", "el", "en", "fr", "it", "la", "nl", "zh"}; private String documentRootCollectionMorph = "/db/mpdl/documents/morph"; private String documentRootCollectionStandard = "/db/mpdl/documents/standard"; @@ -92,7 +92,7 @@ deleteDocumentCollections(); createDocumentCollections(); - saveDocumentFiles(); + // saveDocumentFiles(); endOperation(); System.out.println("The DocumentHandler needed: " + (endOfOperation - beginOfOperation) + " ms" ); } @@ -204,10 +204,13 @@ // validation docOperation.setStatus("validate document: " + eXistIdentifier); schemaHandler.validate(destFileName, docOperation); - // save regularizations of the document - docOperation.setStatus(operationName + " regularizations of document: " + eXistIdentifier + " on eXist server"); - RegularizationManager regManager = RegularizationManager.getInstance(); - regManager.saveRegularizations(language, destFileName); + String docBase = docOperation.getDocBase(); + if (docBase != null && docBase.equals("echo")) { + // save regularizations of the document + docOperation.setStatus(operationName + " regularizations of document: " + eXistIdentifier + " on eXist server"); + RegularizationManager regManager = RegularizationManager.getInstance(); + regManager.saveRegularizations(language, destFileName); + } // perform operation on eXist docOperation.setStatus(operationName + " document: " + eXistIdentifier + " on eXist server"); mpdlXmlRpcDocHandler.saveDocumentFile(docOperation); @@ -228,9 +231,10 @@ String operationName = docOperation.getName(); String eXistIdentifier = docOperation.getDestUrl(); String fileName = docOperation.getFileName(); + String docBase = docOperation.getDocBase(); if (fileName == null || fileName.trim().equals("")) throw new ApplicationException("Your document file name is empty. Please specify a file name for your document."); - if (! fileName.endsWith(".xml")) + if (! fileName.endsWith(".xml") && docBase != null && ! docBase.equals("diverse")) throw new ApplicationException("Your document file name does not end with \".xml\". Please specify a file name with the suffix \".xml\" for your document."); boolean docExists = mpdlXmlRpcDocHandler.documentExists(docOperation); if (! docExists) @@ -238,13 +242,18 @@ // perform operation docOperation.setStatus(operationName + " document: " + eXistIdentifier + " on eXist server"); // delete file on local file system: xml, pdf and html - String eXistIdentifierWithoutExtension = eXistIdentifier.substring(0, eXistIdentifier.length() - 4); // without ".xml" + int lastDot = eXistIdentifier.lastIndexOf("."); + String eXistIdentifierWithoutExtension = eXistIdentifier.substring(0, lastDot); String destFileNameXml = MpdlConstants.MPDL_EXIST_DATA_DIR + "/documents" + eXistIdentifier; FileUtil.getInstance().deleteFile(destFileNameXml); boolean includePdf = docOperation.includePdf(); if (includePdf) { + String eXistIdentifierExtension = eXistIdentifier.substring(lastDot + 1); String destFileNamePdf = MpdlConstants.MPDL_EXIST_DATA_DIR + "/documents" + eXistIdentifierWithoutExtension + ".pdf"; String destFileNameHtml = MpdlConstants.MPDL_EXIST_DATA_DIR + "/documents" + eXistIdentifierWithoutExtension + ".html"; + if (eXistIdentifierExtension != null && eXistIdentifierExtension.equals("html")) { + destFileNameHtml = MpdlConstants.MPDL_EXIST_DATA_DIR + "/documents" + eXistIdentifierWithoutExtension + "-gen.html"; + } FileUtil.getInstance().deleteFile(destFileNamePdf); FileUtil.getInstance().deleteFile(destFileNameHtml); } diff -r e99964f390e4 -r 257f67be5c00 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/doc/GetDocServlet.java --- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/doc/GetDocServlet.java Mon Aug 29 17:40:19 2011 +0200 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/doc/GetDocServlet.java Tue Sep 27 16:40:57 2011 +0200 @@ -20,6 +20,10 @@ protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { String docEXistIdentifier = request.getParameter("doc"); + if (docEXistIdentifier == null || docEXistIdentifier.isEmpty()) { + write(response, "Parameter: \"doc\" is not set. Please set parameter \"doc\"."); + return; + } String docFileName = MpdlConstants.MPDL_EXIST_DATA_DIR + "/documents" + docEXistIdentifier; File docFile = new File(docFileName); if (docFile.exists()) diff -r e99964f390e4 -r 257f67be5c00 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlNormalizer.java --- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlNormalizer.java Mon Aug 29 17:40:19 2011 +0200 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlNormalizer.java Tue Sep 27 16:40:57 2011 +0200 @@ -301,7 +301,7 @@ if (mode == DISPLAY) mpdlNormalizerLex.yybegin(MpdlNormalizerLexFR.DISP); else if (mode == DICTIONARY) - mpdlNormalizerLex.yybegin(MpdlNormalizerLexFR.CELEX); + mpdlNormalizerLex.yybegin(MpdlNormalizerLexFR.DICT_ASCII); else if (mode == SEARCH) mpdlNormalizerLex.yybegin(MpdlNormalizerLexFR.SEARCH); while (token != null) { diff -r e99964f390e4 -r 257f67be5c00 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizer.java --- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizer.java Mon Aug 29 17:40:19 2011 +0200 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizer.java Tue Sep 27 16:40:57 2011 +0200 @@ -12,7 +12,6 @@ private static final int MAX_WORD_LEN = 255; private static final int IO_BUFFER_SIZE = 1024; private static String SPECIAL_NOT_WORD_DELIM_SYMBOL = new Character('\u2424').toString(); - private boolean regWithoutSemicolon = false; // hack: in some cases there are words with a semicolon, then the normalization should be without semicolon private boolean isInNotWordDelimMode = false; private int offset = 0, bufferIndex = 0, dataLen = 0; private char[] buffer = new char[MAX_WORD_LEN]; @@ -31,22 +30,12 @@ this.normalizer = normalizer; } - public void setRegWithoutSemicolon(boolean regWithoutSemicolon) { - this.regWithoutSemicolon = regWithoutSemicolon; - } - - public boolean isRegWithoutSemicolon() { - return regWithoutSemicolon; - } - /** Returns true iff a character should be included in a token. This * tokenizer generates as tokens adjacent sequences of characters which * satisfy this predicate. Characters for which this is false are used to * define token boundaries and are not included in tokens. */ protected boolean isTokenChar(char c) { boolean isTokenChar = true; - if (isRegWithoutSemicolon() && c == ';') // hack: special case for regularization and normalization; feel free to remove it later - return true; switch (c) { case ' ': isTokenChar = false; break; case '.': isTokenChar = false; break; diff -r e99964f390e4 -r 257f67be5c00 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizerAnalyzer.java --- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizerAnalyzer.java Mon Aug 29 17:40:19 2011 +0200 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizerAnalyzer.java Tue Sep 27 16:40:57 2011 +0200 @@ -16,7 +16,6 @@ public class MpdlTokenizerAnalyzer extends Analyzer { protected String language = MpdlConstants.DEFAULT_LANGUAGE; protected MpdlNormalizer normalizer = null; - private boolean regWithoutSemicolon = false; // hack: in some cases there are words with a semicolon, then the normalization should be without semicolon public MpdlTokenizerAnalyzer(String language) { this.language = language; @@ -28,17 +27,8 @@ this.normalizer = normalizer; } - public void setRegWithoutSemicolon(boolean regWithoutSemicolon) { - this.regWithoutSemicolon = regWithoutSemicolon; - } - - public boolean isRegWithoutSemicolon() { - return regWithoutSemicolon; - } - public TokenStream tokenStream(String fieldName, Reader reader) { MpdlTokenizer tmpTokenizer = new MpdlTokenizer(reader, language, normalizer); - tmpTokenizer.setRegWithoutSemicolon(regWithoutSemicolon); // hack: feel free to remove it later TokenStream result = (TokenStream) tmpTokenizer; result = new MpdlFilter(result); // filter to remove the hyphen in a token etc. result = new LowerCaseFilter(result); @@ -50,7 +40,6 @@ try { Reader reader = new StringReader(inputString); MpdlTokenizer tmpTokenizer = new MpdlTokenizer(reader, language, normalizer); - tmpTokenizer.setRegWithoutSemicolon(regWithoutSemicolon); // hack: feel free to remove it later TokenStream result = (TokenStream) tmpTokenizer; result = new MpdlFilter(result); // filter to remove the hyphen in a token etc. result = new LowerCaseFilter(result); diff -r e99964f390e4 -r 257f67be5c00 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexDE.java --- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexDE.java Mon Aug 29 17:40:19 2011 +0200 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexDE.java Tue Sep 27 16:40:57 2011 +0200 @@ -1,11 +1,11 @@ -/* The following code was generated by JFlex 1.4.3 on 03.08.11 18:24 */ +/* The following code was generated by JFlex 1.4.3 on 05.09.11 10:34 */ /* * Normalization rules for German text * [this is a JFlex specification] * * Wolfgang Schmidle - * version 2011-07-12 + * version 2011-08-10 * */ @@ -15,7 +15,7 @@ /** * This class is a scanner generated by * JFlex 1.4.3 - * on 03.08.11 18:24 from the specification file + * on 05.09.11 10:34 from the specification file * MpdlNormalizerLexDE.lex */ public class MpdlNormalizerLexDE { @@ -27,12 +27,13 @@ private static final int ZZ_BUFFERSIZE = 16384; /** lexical states */ - public static final int SEARCH = 6; + public static final int SEARCH = 10; + public static final int DICT_ASCII = 6; + public static final int SEARCH_ASCII = 12; public static final int DICT = 4; public static final int YYINITIAL = 0; - public static final int CELEX = 8; public static final int DISP = 2; - public static final int GRIMM = 10; + public static final int GRIMM = 8; /** * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l @@ -41,7 +42,7 @@ * l is of the form l = 2*k, k a non negative integer */ private static final int ZZ_LEXSTATE[] = { - 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5 + 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6 }; /** @@ -65,12 +66,12 @@ private static final int [] ZZ_ACTION = zzUnpackAction(); private static final String ZZ_ACTION_PACKED_0 = - "\6\0\2\1\1\2\1\3\1\4\3\1\1\5\1\6"+ - "\1\3\3\1\1\7\1\10\1\11\1\12\1\13\1\14"+ + "\7\0\2\1\1\2\1\3\1\4\3\1\1\5\1\3"+ + "\3\1\1\6\1\7\1\10\1\11\1\12\1\13\1\14"+ "\1\15\1\16\1\17"; private static int [] zzUnpackAction() { - int [] result = new int[29]; + int [] result = new int[30]; int offset = 0; offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result); return result; @@ -96,12 +97,12 @@ private static final String ZZ_ROWMAP_PACKED_0 = "\0\0\0\21\0\42\0\63\0\104\0\125\0\146\0\167"+ - "\0\146\0\146\0\146\0\210\0\231\0\252\0\146\0\146"+ - "\0\167\0\273\0\314\0\335\0\146\0\146\0\146\0\146"+ - "\0\146\0\146\0\146\0\146\0\146"; + "\0\210\0\167\0\167\0\167\0\231\0\252\0\273\0\167"+ + "\0\210\0\314\0\335\0\356\0\167\0\167\0\167\0\167"+ + "\0\167\0\167\0\167\0\167\0\167\0\167"; private static int [] zzUnpackRowMap() { - int [] result = new int[29]; + int [] result = new int[30]; int offset = 0; offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result); return result; @@ -124,23 +125,25 @@ private static final int [] ZZ_TRANS = zzUnpackTrans(); private static final String ZZ_TRANS_PACKED_0 = - "\1\7\1\10\1\7\1\0\1\7\1\10\1\11\1\10"+ - "\1\7\1\10\6\7\1\12\1\7\1\10\1\7\1\13"+ - "\1\7\1\10\1\11\1\14\1\7\1\15\1\7\1\16"+ - "\4\7\1\12\1\7\1\10\1\7\1\17\1\7\1\10"+ - "\1\11\1\14\1\7\1\15\1\7\1\16\4\7\1\12"+ - "\1\7\1\10\1\7\1\20\1\7\1\10\1\11\1\14"+ - "\1\7\1\15\1\7\1\16\4\7\2\12\1\21\1\12"+ - "\1\17\1\7\1\10\1\11\1\22\1\12\1\23\1\12"+ - "\1\24\1\25\1\26\1\27\1\30\1\12\1\7\1\10"+ - "\1\7\1\17\1\7\1\10\1\11\1\14\1\7\1\15"+ - "\1\7\1\16\3\7\1\31\1\12\23\0\1\7\20\0"+ - "\1\7\5\0\1\32\1\0\1\33\10\0\1\7\7\0"+ - "\1\34\20\0\1\35\10\0\1\7\5\0\1\32\1\0"+ - "\1\27\10\0\1\7\7\0\1\25\20\0\1\26\6\0"; + "\1\10\1\11\1\10\1\0\1\10\1\11\1\12\1\11"+ + "\1\10\1\11\6\10\1\13\1\10\1\11\1\10\1\14"+ + "\1\10\1\11\1\12\1\15\1\10\1\16\1\10\1\17"+ + "\4\10\1\13\1\10\1\11\1\10\1\20\1\10\1\11"+ + "\1\12\1\15\1\10\1\16\1\10\1\17\4\10\2\13"+ + "\1\21\1\13\1\20\1\10\1\11\1\12\1\22\1\13"+ + "\1\23\1\13\1\24\1\25\1\26\1\27\1\30\1\13"+ + "\1\10\1\11\1\10\1\20\1\10\1\11\1\12\1\15"+ + "\1\10\1\16\1\10\1\17\3\10\1\31\1\13\1\10"+ + "\1\11\1\10\1\32\1\10\1\11\1\12\1\15\1\10"+ + "\1\16\1\10\1\17\4\10\2\13\1\21\1\13\1\32"+ + "\1\10\1\11\1\12\1\22\1\13\1\23\1\13\1\24"+ + "\1\25\1\26\1\27\1\30\1\13\23\0\1\10\20\0"+ + "\1\10\5\0\1\33\1\0\1\34\10\0\1\10\7\0"+ + "\1\35\20\0\1\36\10\0\1\10\5\0\1\33\1\0"+ + "\1\27\10\0\1\10\7\0\1\25\20\0\1\26\6\0"; private static int [] zzUnpackTrans() { - int [] result = new int[238]; + int [] result = new int[255]; int offset = 0; offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result); return result; @@ -178,10 +181,10 @@ private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute(); private static final String ZZ_ATTRIBUTE_PACKED_0 = - "\6\0\1\11\1\1\3\11\3\1\2\11\4\1\11\11"; + "\7\0\1\11\1\1\3\11\3\1\1\11\4\1\12\11"; private static int [] zzUnpackAttribute() { - int [] result = new int[29]; + int [] result = new int[30]; int offset = 0; offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result); return result; @@ -249,6 +252,8 @@ private boolean zzEOFDone; /* user code: */ + public static final int CELEX = DICT_ASCII; + private String original = ""; private String normalized = ""; private int problem = 0; @@ -558,7 +563,7 @@ zzMarkedPos = zzMarkedPosL; switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) { - case 11: + case 10: { add("sz"); } case 16: break; @@ -566,7 +571,7 @@ { problem = 1; add(yytext()); } case 17: break; - case 7: + case 6: { add("ae"); } case 18: break; @@ -585,11 +590,11 @@ { add("ü"); } case 21: break; - case 9: + case 8: { add("ue"); } case 22: break; - case 6: + case 11: { switch (problem) { case 1: return original; default: return normalized.replaceAll(LB, "").toLowerCase(); @@ -608,11 +613,11 @@ { add(yytext()); } case 26: break; - case 10: + case 9: { add("ss"); } case 27: break; - case 8: + case 7: { add("oe"); } case 28: break; diff -r e99964f390e4 -r 257f67be5c00 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexEL.java --- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexEL.java Mon Aug 29 17:40:19 2011 +0200 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexEL.java Tue Sep 27 16:40:57 2011 +0200 @@ -1,4 +1,4 @@ -/* The following code was generated by JFlex 1.4.3 on 03.08.11 18:23 */ +/* The following code was generated by JFlex 1.4.3 on 05.09.11 10:35 */ /* * Normalization rules for Greek text @@ -15,7 +15,7 @@ /** * This class is a scanner generated by * JFlex 1.4.3 - * on 03.08.11 18:23 from the specification file + * on 05.09.11 10:35 from the specification file * MpdlNormalizerLexEL.lex */ public class MpdlNormalizerLexEL { diff -r e99964f390e4 -r 257f67be5c00 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexFR.java --- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexFR.java Mon Aug 29 17:40:19 2011 +0200 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexFR.java Tue Sep 27 16:40:57 2011 +0200 @@ -1,11 +1,11 @@ -/* The following code was generated by JFlex 1.4.3 on 03.08.11 18:24 */ +/* The following code was generated by JFlex 1.4.3 on 05.09.11 10:35 */ /* * Normalization rules for French text * [this is a JFlex specification] * * Wolfgang Schmidle - * version 2011-07-12 + * version 2011-08-10 * */ @@ -15,7 +15,7 @@ /** * This class is a scanner generated by * JFlex 1.4.3 - * on 03.08.11 18:24 from the specification file + * on 05.09.11 10:35 from the specification file * MpdlNormalizerLexFR.lex */ public class MpdlNormalizerLexFR { @@ -27,10 +27,10 @@ private static final int ZZ_BUFFERSIZE = 16384; /** lexical states */ + public static final int DICT_ASCII = 8; public static final int SEARCH = 6; public static final int DICT = 4; public static final int YYINITIAL = 0; - public static final int CELEX = 8; public static final int DISP = 2; /** diff -r e99964f390e4 -r 257f67be5c00 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexLA.java --- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexLA.java Mon Aug 29 17:40:19 2011 +0200 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexLA.java Tue Sep 27 16:40:57 2011 +0200 @@ -1,4 +1,4 @@ -/* The following code was generated by JFlex 1.4.3 on 21.07.11 11:22 */ +/* The following code was generated by JFlex 1.4.3 on 05.09.11 10:35 */ /* * Normalization rules for Latin text @@ -15,7 +15,7 @@ /** * This class is a scanner generated by * JFlex 1.4.3 - * on 21.07.11 11:22 from the specification file + * on 05.09.11 10:35 from the specification file * MpdlNormalizerLexLA.lex */ public class MpdlNormalizerLexLA { @@ -27,10 +27,10 @@ private static final int ZZ_BUFFERSIZE = 16384; /** lexical states */ - public static final int RENAISSANCE_DICT = 10; - public static final int RENAISSANCE_DISP = 8; - public static final int SEARCH = 6; - public static final int DICT = 4; + public static final int RENAISSANCE_DICT = 8; + public static final int SEARCH = 10; + public static final int RENAISSANCE_DISP = 4; + public static final int DICT = 6; public static final int YYINITIAL = 0; public static final int RENAISSANCE_SEARCH = 12; public static final int DISP = 2; @@ -42,7 +42,7 @@ * l is of the form l = 2*k, k a non negative integer */ private static final int ZZ_LEXSTATE[] = { - 0, 0, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6 + 0, 0, 1, 2, 1, 2, 3, 4, 3, 4, 5, 6, 5, 6 }; /** diff -r e99964f390e4 -r 257f67be5c00 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/NormDictContentHandler.java --- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/NormDictContentHandler.java Mon Aug 29 17:40:19 2011 +0200 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/NormDictContentHandler.java Tue Sep 27 16:40:57 2011 +0200 @@ -8,6 +8,7 @@ import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.MpdlNormalizer; import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.MpdlTokenizerAnalyzer; +import de.mpg.mpiwg.berlin.mpdl.lt.general.Language; import de.mpg.mpiwg.berlin.mpdl.lt.lex.db.LexHandler; import de.mpg.mpiwg.berlin.mpdl.util.StringUtilEscapeChars; @@ -48,9 +49,17 @@ } public void endDocument() throws SAXException { - String rootElemToStr = rootElement.toXmlString(); - write(rootElemToStr); - write("\n"); + try { + String rootElemToStr = rootElement.toXmlString(); + // hack: in echo documents the spaces between sentences should be removed + if (rootElemToStr != null && rootElemToStr.startsWith("[ \n\t]+ereignis enthüllte" is replaced by "Naturereignisenthüllte" - if (name.equals("lb") || name.equals("cb") || name.equals("figure") || name.equals("image") || name.equals("handwritten") || name.equals("anchor")) + if (name.equals("lb") || name.equals("br") || name.equals("cb") || name.equals("figure") || name.equals("image") || name.equals("handwritten") || name.equals("anchor")) isWordDelimiterElement = false; return isWordDelimiterElement; } @@ -202,7 +211,7 @@ if (composite.value != null && ! composite.value.equals("")) { String compositeValueStr = composite.value; compositeValueStr = compositeValueStr.replaceAll("\n", ""); // remove all newlines, they are no separators for words. - compositeValueStr = compositeValueStr.replaceAll(" +", " "); // if there are many Blanks make them to one + compositeValueStr = compositeValueStr.replaceAll("[ \t]+", " "); // if there are many Blanks/Tabs make them to one compositesCharsWithMarks = compositesCharsWithMarks + compositeValueStr; } } else { @@ -251,7 +260,6 @@ mpdlNormalizer.setNormMode(MpdlNormalizer.DISPLAY); } MpdlTokenizerAnalyzer tokenAnalyzer = new MpdlTokenizerAnalyzer(mpdlNormalizer, language); - tokenAnalyzer.setRegWithoutSemicolon(true); // hack: feel free to remove it later ArrayList wordTokens = tokenAnalyzer.getToken(charactersStr); int endPos = 0; for (int i=0; i < wordTokens.size(); i++) { @@ -335,7 +343,7 @@ lexForms = lexForms + lexEntryKey + " "; } lexForms = lexForms.substring(0, lexForms.length() - 1); - lexWord = "" + displayWordDeresolved + ""; + lexWord = "" + displayWordDeresolved + ""; } else { lexWord = displayWordDeresolved; } diff -r e99964f390e4 -r 257f67be5c00 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/NormalizeCharsContentHandler.java --- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/NormalizeCharsContentHandler.java Mon Aug 29 17:40:19 2011 +0200 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/NormalizeCharsContentHandler.java Tue Sep 27 16:40:57 2011 +0200 @@ -104,7 +104,6 @@ String retStr = ""; try { MpdlTokenizerAnalyzer tokenizerAnalyzer = new MpdlTokenizerAnalyzer(language); - tokenizerAnalyzer.setRegWithoutSemicolon(true); // hack: feel free to remove it later ArrayList wordTokens = tokenizerAnalyzer.getToken(charactersStr); int endPos = 0; for (int i=0; i < wordTokens.size(); i++) { diff -r e99964f390e4 -r 257f67be5c00 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Language.java --- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Language.java Mon Aug 29 17:40:19 2011 +0200 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Language.java Tue Sep 27 16:40:57 2011 +0200 @@ -2,6 +2,11 @@ import java.util.HashMap; +/** + * + * Language codes from ISO 639-3 + * + */ public class Language { private static Language instance; private static HashMap languageIds = new HashMap(); @@ -16,15 +21,22 @@ private void init() { languageIds.put("ar", "ar"); + languageIds.put("ara", "ar"); languageIds.put("de", "de"); + languageIds.put("ger", "de"); + languageIds.put("deu", "de"); languageIds.put("el", "el"); languageIds.put("grc", "el"); languageIds.put("en", "en"); + languageIds.put("eng", "en"); languageIds.put("fr", "fr"); + languageIds.put("fra", "fr"); languageIds.put("it", "it"); + languageIds.put("ita", "it"); languageIds.put("la", "la"); languageIds.put("lat", "la"); languageIds.put("nl", "nl"); + languageIds.put("nld", "nl"); languageIds.put("zh", "zh"); languageIds.put("zho", "zh"); languageIds.put("zho-Hant", "zh"); diff -r e99964f390e4 -r 257f67be5c00 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/util/MpdlITextRenderer.java --- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/util/MpdlITextRenderer.java Mon Aug 29 17:40:19 2011 +0200 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/util/MpdlITextRenderer.java Tue Sep 27 16:40:57 2011 +0200 @@ -89,9 +89,14 @@ String language = mdRecord.getLanguage(); if (eXistIdentifier == null) throw new ApplicationException("Pdf/Html-Generation failed: no eXist-Identifier given in mdRecord"); - String eXistIdentifierWithoutExtension = eXistIdentifier.substring(0, eXistIdentifier.length() - 4); // without ".xml" + int lastDot = eXistIdentifier.lastIndexOf("."); + String eXistIdentifierWithoutExtension = eXistIdentifier.substring(0, lastDot); + String eXistIdentifierExtension = eXistIdentifier.substring(lastDot + 1); String destFileNamePdf = MpdlConstants.MPDL_EXIST_DATA_DIR + "/documents" + eXistIdentifierWithoutExtension + ".pdf"; String destFileNameHtml = MpdlConstants.MPDL_EXIST_DATA_DIR + "/documents" + eXistIdentifierWithoutExtension + ".html"; + if (eXistIdentifierExtension != null && eXistIdentifierExtension.equals("html")) { + destFileNameHtml = MpdlConstants.MPDL_EXIST_DATA_DIR + "/documents" + eXistIdentifierWithoutExtension + "-gen.html"; + } String destFileNameHtmlPdfTmp = MpdlConstants.MPDL_EXIST_DATA_DIR + "/documents" + eXistIdentifierWithoutExtension + "-4Pdf.html"; try { // start document @@ -287,8 +292,10 @@ mdRecordStr = mdRecordStr + ". " + title; if (year != null && ! year.equals("")) mdRecordStr = mdRecordStr + ". " + year + "."; - else - mdRecordStr = mdRecordStr + "."; + if (mdRecordStr.isEmpty()) { + String eXistId = mdRecord.getEXistIdentifier(); + mdRecordStr = mdRecordStr + eXistId; + } } return mdRecordStr; } @@ -383,6 +390,8 @@ pbTag = "pb"; else if (docBase != null && docBase.equals("tei")) pbTag = "TEI:pb"; + else + pbTag = "*:pb"; try { HttpClient httpClient = new HttpClient(); String requestName = "/mpdl/interface/xquery.xql?document=" + docName + "&xquery=count(//" + pbTag + ")"; diff -r e99964f390e4 -r 257f67be5c00 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/util/XmlUtil.java --- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/util/XmlUtil.java Mon Aug 29 17:40:19 2011 +0200 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/util/XmlUtil.java Tue Sep 27 16:40:57 2011 +0200 @@ -13,11 +13,14 @@ import java.util.ArrayList; import java.util.Date; import java.util.Iterator; +import java.util.Properties; import javax.xml.XMLConstants; import javax.xml.namespace.NamespaceContext; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.parsers.SAXParser; +import javax.xml.parsers.SAXParserFactory; import javax.xml.transform.OutputKeys; import javax.xml.transform.Source; import javax.xml.transform.Transformer; @@ -25,8 +28,10 @@ import javax.xml.transform.TransformerException; import javax.xml.transform.TransformerFactory; import javax.xml.transform.dom.DOMSource; +import javax.xml.transform.sax.SAXResult; import javax.xml.transform.sax.SAXSource; import javax.xml.transform.stream.StreamResult; +import javax.xml.transform.stream.StreamSource; import javax.xml.validation.Schema; import javax.xml.validation.SchemaFactory; import javax.xml.validation.Validator; @@ -45,6 +50,7 @@ import org.w3c.dom.NodeList; import org.xml.sax.InputSource; import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; @@ -140,6 +146,17 @@ return root; } + public void parse(File xmlFile) throws ApplicationException { + try { + SAXParserFactory factory = SAXParserFactory.newInstance(); + SAXParser saxParser = factory.newSAXParser(); + DefaultHandler dh = new DefaultHandler(); + saxParser.parse(xmlFile, dh); + } catch (Exception e) { + throw new ApplicationException(e); + } + } + public void validateByRelaxNG(File xmlFile, URL schemaUrl) throws ApplicationException { System.setProperty(SchemaFactory.class.getName() + ":" + XMLConstants.RELAXNG_NS_URI, "com.thaiopensource.relaxng.jaxp.CompactSyntaxSchemaFactory"); SchemaFactory factory = SchemaFactory.newInstance(XMLConstants.RELAXNG_NS_URI); @@ -649,4 +666,54 @@ } return xmlString; } + + public String transform(String xmlString, String xslFileName, Properties outputProperties) throws ApplicationException { + String resultString = null; + try { + StreamSource xslSource = new StreamSource(xslFileName); + Transformer transformer = TransformerFactory.newInstance(net.sf.saxon.TransformerFactoryImpl.class.getName(), null).newTransformer(xslSource); + if (outputProperties != null) { + String propValue = outputProperties.getProperty("method"); + if (propValue != null) + transformer.setOutputProperty(OutputKeys.METHOD, propValue); + propValue = outputProperties.getProperty("indent"); + if (propValue != null) + transformer.setOutputProperty(OutputKeys.INDENT, propValue); + propValue = outputProperties.getProperty("media-type"); + if (propValue != null) + transformer.setOutputProperty(OutputKeys.MEDIA_TYPE, propValue); + propValue = outputProperties.getProperty("omit-xml-declaration"); + if (propValue != null) + transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, propValue); + propValue = outputProperties.getProperty("encoding"); + if (propValue != null) + transformer.setOutputProperty(OutputKeys.ENCODING, propValue); + } + StreamResult result = new StreamResult(new StringWriter()); + StreamSource source = new StreamSource(new StringReader(xmlString)); + transformer.transform(source, result); + resultString = result.getWriter().toString(); + } catch (TransformerConfigurationException e) { + throw new ApplicationException(e); + } catch (TransformerException e) { + throw new ApplicationException(e); + } + return resultString; + } + + public SAXResult transformToSaxResult(String xmlString, String xslString, Properties outputProperties) throws ApplicationException { + SAXResult result = new SAXResult(); + try { + StreamSource xslSource = new StreamSource(new StringReader(xslString)); + Transformer transformer = TransformerFactory.newInstance().newTransformer(xslSource); + transformer.setOutputProperties(outputProperties); + StreamSource source = new StreamSource(new StringReader(xmlString)); + transformer.transform(source, result); + } catch (TransformerConfigurationException e) { + throw new ApplicationException(e); + } catch (TransformerException e) { + throw new ApplicationException(e); + } + return result; + } } diff -r e99964f390e4 -r 257f67be5c00 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/xml/SchemaHandler.java --- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/xml/SchemaHandler.java Mon Aug 29 17:40:19 2011 +0200 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/xml/SchemaHandler.java Tue Sep 27 16:40:57 2011 +0200 @@ -35,7 +35,7 @@ String fName = docOperation.getFileName(); if (fName == null || fName.trim().equals("")) throw new ApplicationException("Your document file name is empty. Please specify a file name for your document."); - if (! fName.endsWith(".xml")) + if (! fName.endsWith(".xml") && docBase != null && ! docBase.equals("diverse")) throw new ApplicationException("Your document file name does not end with \".xml\". Please specify a file name with the suffix \".xml\" for your document."); // RelaxNG schema validation validateByRelaxNGSchema(destFile, docBase); @@ -75,6 +75,14 @@ String id = getIdByExistId(eXistIdentifier); mdRecord.setIdentifier("TEI:" + id + ".xml"); } + } else if (docBase != null && docBase.equals("diverse")) { + mdRecord = getMetadataRecordDiverse(documentNode); + if (mdRecord != null) { + String id = getIdByExistId(eXistIdentifier); + mdRecord.setIdentifier(id); + String lang = docOperation.getLanguage(); + mdRecord.setLanguage(lang); + } } if (mdRecord != null) { mdRecord.setEXistIdentifier(eXistIdentifier); @@ -132,6 +140,8 @@ } private void validate(Node docNode, String docBase) throws ApplicationException { + if (docBase.equals("diverse")) + return; XmlUtil xmlUtil = XmlUtil.getInstance(); NamespaceContext nsContext = getEchoNsContext(); String echoTest = null; @@ -192,6 +202,9 @@ } private void validate(MetadataRecord mdRecord) throws ApplicationException { + String docBase = mdRecord.getDocBase(); + if (docBase.equals("diverse")) + return; String identifier = mdRecord.getIdentifier(); String creator = mdRecord.getCreator(); String title = mdRecord.getTitle(); @@ -318,6 +331,18 @@ return mdRecord; } + private MetadataRecord getMetadataRecordDiverse(Node documentNode) throws ApplicationException { + String rights = "open access"; + String license = "http://echo.mpiwg-berlin.mpg.de/policy/oa_basics/declaration"; + String accessRights = "free"; + accessRights = StringUtilEscapeChars.deresolveXmlEntities(accessRights); + MetadataRecord mdRecord = new MetadataRecord(null, null, null, null, null, null, null, rights, null); + mdRecord.setDocBase("diverse"); + mdRecord.setLicense(license); + mdRecord.setAccessRights(accessRights); + return mdRecord; + } + private String getIndexMetaDataPageImg(String imagesDocDirectory) throws ApplicationException { String resultStr = null; String nausikaaURLTexter = "http://nausikaa2.mpiwg-berlin.mpg.de/digitallibrary/servlet/Texter"; diff -r e99964f390e4 -r 257f67be5c00 software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdldoc/MPDLDocModule.java --- a/software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdldoc/MPDLDocModule.java Mon Aug 29 17:40:19 2011 +0200 +++ b/software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdldoc/MPDLDocModule.java Tue Sep 27 16:40:57 2011 +0200 @@ -39,6 +39,7 @@ new FunctionDef(GetESciDocs.signature, GetESciDocs.class), new FunctionDef(GetJobs.signature, GetJobs.class), new FunctionDef(GetESciDocContainerIdByExistId.signature, GetESciDocContainerIdByExistId.class), + new FunctionDef(Transform.signature, Transform.class), new FunctionDef(Html2Pdf.signature, Html2Pdf.class) }; diff -r e99964f390e4 -r 257f67be5c00 software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdldoc/Transform.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdldoc/Transform.java Tue Sep 27 16:40:57 2011 +0200 @@ -0,0 +1,91 @@ +/* + * eXist Open Source Native XML Database: Extension module + * Copyright (C) 2008 Josef Willenborg + * jwillenborg@mpiwg-berlin.mpg.de + * http://www.mpiwg-berlin.mpg.de + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * $Id: $ + */ +package org.exist.xquery.modules.mpdldoc; + +import java.util.Properties; + +import org.exist.dom.QName; +import org.exist.xquery.BasicFunction; +import org.exist.xquery.Cardinality; +import org.exist.xquery.FunctionSignature; +import org.exist.xquery.XPathException; +import org.exist.xquery.XQueryContext; +import org.exist.xquery.value.Sequence; +import org.exist.xquery.value.SequenceType; +import org.exist.xquery.value.StringValue; +import org.exist.xquery.value.Type; +import org.exist.xquery.value.ValueSequence; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.util.XmlUtil; + +/** + * @author Josef Willenborg (jwillenborg@mpiwg-berlin.mpg.de) + */ +public class Transform extends BasicFunction { + + public final static FunctionSignature signature = + new FunctionSignature( + new QName("transform", MPDLDocModule.NAMESPACE_URI, MPDLDocModule.PREFIX), + "A function which transforms the input xml string by the xsl stylesheet given as xslFileName and outputs it as a string.", + new SequenceType[] { + new SequenceType(Type.STRING, Cardinality.EXACTLY_ONE), + new SequenceType(Type.STRING, Cardinality.EXACTLY_ONE), + new SequenceType(Type.STRING, Cardinality.EXACTLY_ONE) + }, + new SequenceType(Type.STRING, Cardinality.EXACTLY_ONE)); + + public Transform(XQueryContext context) { + super(context, signature); + } + + public Sequence eval(Sequence[] args, Sequence contextSequence) throws XPathException { + try { + Sequence firstSeq = args[0]; + Sequence secondSeq = args[1]; + Sequence thirdSeq = args[2]; + if (firstSeq.isEmpty() || secondSeq.isEmpty()) + return Sequence.EMPTY_SEQUENCE; + String xmlStr = firstSeq.getStringValue(); + String xslFileName = secondSeq.getStringValue(); + String outputPropertiesStr = thirdSeq.getStringValue(); + Properties outputProperties = new Properties(); + if (outputPropertiesStr != null && ! outputPropertiesStr.equals("")) { + String[] outputProps = outputPropertiesStr.split(" "); + for (int i=0; i