Mercurial > hg > mpdl-group

Binary file software/eXist/mpdl-modules/src/de/.DS_Store has changed
Binary file software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/.DS_Store has changed
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/client/DocumentHandler.java	Mon Aug 29 17:40:19 2011 +0200
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/client/DocumentHandler.java	Tue Sep 27 16:40:57 2011 +0200
@@ -44,7 +44,7 @@
   private MpdlXmlRpcDocHandler mpdlXmlRpcDocHandler;
   private ESciDocIngestor eSciDocIngestor;

-  private String[] docBases = {"archimedes", "echo", "tei"};
+  private String[] docBases = {"archimedes", "diverse", "echo", "tei"};
   private String[] languages = {"ar", "de", "el", "en", "fr", "it", "la", "nl", "zh"};
   private String documentRootCollectionMorph = "/db/mpdl/documents/morph";
   private String documentRootCollectionStandard = "/db/mpdl/documents/standard";
@@ -92,7 +92,7 @@

     deleteDocumentCollections();
     createDocumentCollections();
-    saveDocumentFiles();
+    // saveDocumentFiles();
     endOperation();
     System.out.println("The DocumentHandler needed: " + (endOfOperation - beginOfOperation) + " ms" );
   }
@@ -204,10 +204,13 @@
       //  validation
       docOperation.setStatus("validate document: " + eXistIdentifier);
       schemaHandler.validate(destFileName, docOperation);
-      // save regularizations of the document
-      docOperation.setStatus(operationName + " regularizations of document: " + eXistIdentifier + " on eXist server");
-      RegularizationManager regManager = RegularizationManager.getInstance();
-      regManager.saveRegularizations(language, destFileName);
+      String docBase = docOperation.getDocBase();
+      if (docBase != null && docBase.equals("echo")) {
+        // save regularizations of the document
+        docOperation.setStatus(operationName + " regularizations of document: " + eXistIdentifier + " on eXist server");
+        RegularizationManager regManager = RegularizationManager.getInstance();
+        regManager.saveRegularizations(language, destFileName);
+      }
       // perform operation on eXist
       docOperation.setStatus(operationName + " document: " + eXistIdentifier + " on eXist server");
       mpdlXmlRpcDocHandler.saveDocumentFile(docOperation);
@@ -228,9 +231,10 @@
     String operationName = docOperation.getName();
     String eXistIdentifier = docOperation.getDestUrl();
     String fileName = docOperation.getFileName();
+    String docBase = docOperation.getDocBase();
     if (fileName == null || fileName.trim().equals(""))
       throw new ApplicationException("Your document file name is empty. Please specify a file name for your document.");
-    if (! fileName.endsWith(".xml"))
+    if (! fileName.endsWith(".xml") && docBase != null &&  ! docBase.equals("diverse"))
       throw new ApplicationException("Your document file name does not end with \".xml\". Please specify a file name with the suffix \".xml\" for your document.");
     boolean docExists = mpdlXmlRpcDocHandler.documentExists(docOperation);
     if (! docExists)
@@ -238,13 +242,18 @@
     // perform operation
     docOperation.setStatus(operationName + " document: " + eXistIdentifier + " on eXist server");
     // delete file on local file system: xml, pdf and html
-    String eXistIdentifierWithoutExtension = eXistIdentifier.substring(0, eXistIdentifier.length() - 4);  // without ".xml"
+    int lastDot = eXistIdentifier.lastIndexOf(".");
+    String eXistIdentifierWithoutExtension = eXistIdentifier.substring(0, lastDot);
     String destFileNameXml = MpdlConstants.MPDL_EXIST_DATA_DIR + "/documents" + eXistIdentifier;
     FileUtil.getInstance().deleteFile(destFileNameXml);
     boolean includePdf = docOperation.includePdf();
     if (includePdf) {
+      String eXistIdentifierExtension = eXistIdentifier.substring(lastDot + 1);
       String destFileNamePdf = MpdlConstants.MPDL_EXIST_DATA_DIR + "/documents" + eXistIdentifierWithoutExtension + ".pdf";
       String destFileNameHtml = MpdlConstants.MPDL_EXIST_DATA_DIR + "/documents" + eXistIdentifierWithoutExtension + ".html";
+      if (eXistIdentifierExtension != null && eXistIdentifierExtension.equals("html")) {
+        destFileNameHtml = MpdlConstants.MPDL_EXIST_DATA_DIR + "/documents" + eXistIdentifierWithoutExtension + "-gen.html";
+      }
       FileUtil.getInstance().deleteFile(destFileNamePdf);
       FileUtil.getInstance().deleteFile(destFileNameHtml);
     }
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/doc/GetDocServlet.java	Mon Aug 29 17:40:19 2011 +0200
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/doc/GetDocServlet.java	Tue Sep 27 16:40:57 2011 +0200
@@ -20,6 +20,10 @@

   protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
     String docEXistIdentifier = request.getParameter("doc");
+    if (docEXistIdentifier == null || docEXistIdentifier.isEmpty()) {
+      write(response, "Parameter: \"doc\" is not set. Please set parameter \"doc\".");
+      return;
+    }
     String docFileName = MpdlConstants.MPDL_EXIST_DATA_DIR + "/documents" + docEXistIdentifier;
     File docFile = new File(docFileName);
     if (docFile.exists())
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlNormalizer.java	Mon Aug 29 17:40:19 2011 +0200
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlNormalizer.java	Tue Sep 27 16:40:57 2011 +0200
@@ -301,7 +301,7 @@
         if (mode == DISPLAY)
           mpdlNormalizerLex.yybegin(MpdlNormalizerLexFR.DISP);
         else if (mode == DICTIONARY)
-          mpdlNormalizerLex.yybegin(MpdlNormalizerLexFR.CELEX);
+          mpdlNormalizerLex.yybegin(MpdlNormalizerLexFR.DICT_ASCII);
         else if (mode == SEARCH)
           mpdlNormalizerLex.yybegin(MpdlNormalizerLexFR.SEARCH);
         while (token != null) {
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizer.java	Mon Aug 29 17:40:19 2011 +0200
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizer.java	Tue Sep 27 16:40:57 2011 +0200
@@ -12,7 +12,6 @@
   private static final int MAX_WORD_LEN = 255;
   private static final int IO_BUFFER_SIZE = 1024;
   private static String SPECIAL_NOT_WORD_DELIM_SYMBOL = new Character('\u2424').toString();
-  private boolean regWithoutSemicolon = false;  // hack: in some cases there are words with a semicolon, then the normalization should be without semicolon
   private boolean isInNotWordDelimMode = false;
   private int offset = 0, bufferIndex = 0, dataLen = 0;
   private char[] buffer = new char[MAX_WORD_LEN];
@@ -31,22 +30,12 @@
     this.normalizer = normalizer;
   }

-  public void setRegWithoutSemicolon(boolean regWithoutSemicolon) {
-    this.regWithoutSemicolon = regWithoutSemicolon;
-  }
-
-  public boolean isRegWithoutSemicolon() {
-    return regWithoutSemicolon;
-  }
-
   /** Returns true iff a character should be included in a token.  This
    * tokenizer generates as tokens adjacent sequences of characters which
    * satisfy this predicate.  Characters for which this is false are used to
    * define token boundaries and are not included in tokens. */
   protected boolean isTokenChar(char c) {
     boolean isTokenChar = true;
-    if (isRegWithoutSemicolon() && c == ';')  // hack: special case for regularization and normalization; feel free to remove it later
-      return true;
     switch (c) {
       case ' ': isTokenChar = false; break;
       case '.': isTokenChar = false; break;
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizerAnalyzer.java	Mon Aug 29 17:40:19 2011 +0200
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizerAnalyzer.java	Tue Sep 27 16:40:57 2011 +0200
@@ -16,7 +16,6 @@
 public class MpdlTokenizerAnalyzer extends Analyzer {
   protected String language = MpdlConstants.DEFAULT_LANGUAGE;
   protected MpdlNormalizer normalizer = null;
-  private boolean regWithoutSemicolon = false;  // hack: in some cases there are words with a semicolon, then the normalization should be without semicolon

   public MpdlTokenizerAnalyzer(String language) {
     this.language = language;
@@ -28,17 +27,8 @@
     this.normalizer = normalizer;
   }

-  public void setRegWithoutSemicolon(boolean regWithoutSemicolon) {
-    this.regWithoutSemicolon = regWithoutSemicolon;
-  }
-
-  public boolean isRegWithoutSemicolon() {
-    return regWithoutSemicolon;
-  }
-
   public TokenStream tokenStream(String fieldName, Reader reader) {
     MpdlTokenizer tmpTokenizer = new MpdlTokenizer(reader, language, normalizer);
-    tmpTokenizer.setRegWithoutSemicolon(regWithoutSemicolon); // hack: feel free to remove it later
     TokenStream result = (TokenStream) tmpTokenizer;
     result = new MpdlFilter(result);  // filter to remove the hyphen in a token etc.
     result = new LowerCaseFilter(result);
@@ -50,7 +40,6 @@
     try {
       Reader reader = new StringReader(inputString);
       MpdlTokenizer tmpTokenizer = new MpdlTokenizer(reader, language, normalizer);
-      tmpTokenizer.setRegWithoutSemicolon(regWithoutSemicolon);  // hack: feel free to remove it later
       TokenStream result = (TokenStream) tmpTokenizer;
       result = new MpdlFilter(result);  // filter to remove the hyphen in a token etc.
       result = new LowerCaseFilter(result);
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexDE.java	Mon Aug 29 17:40:19 2011 +0200
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexDE.java	Tue Sep 27 16:40:57 2011 +0200
@@ -1,11 +1,11 @@
-/* The following code was generated by JFlex 1.4.3 on 03.08.11 18:24 */
+/* The following code was generated by JFlex 1.4.3 on 05.09.11 10:34 */

 /*
  * Normalization rules for German text
  * [this is a JFlex specification]
  *
  * Wolfgang Schmidle
- * version 2011-07-12
+ * version 2011-08-10
  *
  */

@@ -15,7 +15,7 @@
 /**
  * This class is a scanner generated by
  * <a href="http://www.jflex.de/">JFlex</a> 1.4.3
- * on 03.08.11 18:24 from the specification file
+ * on 05.09.11 10:34 from the specification file
  * <tt>MpdlNormalizerLexDE.lex</tt>
  */
 public class MpdlNormalizerLexDE {
@@ -27,12 +27,13 @@
   private static final int ZZ_BUFFERSIZE = 16384;

   /** lexical states */
-  public static final int SEARCH = 6;
+  public static final int SEARCH = 10;
+  public static final int DICT_ASCII = 6;
+  public static final int SEARCH_ASCII = 12;
   public static final int DICT = 4;
   public static final int YYINITIAL = 0;
-  public static final int CELEX = 8;
   public static final int DISP = 2;
-  public static final int GRIMM = 10;
+  public static final int GRIMM = 8;

   /**
    * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l
@@ -41,7 +42,7 @@
    * l is of the form l = 2*k, k a non negative integer
    */
   private static final int ZZ_LEXSTATE[] = {
-     0,  0,  1,  1,  2,  2,  3,  3,  4,  4,  5, 5
+     0,  0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6, 6
   };

   /**
@@ -65,12 +66,12 @@
   private static final int [] ZZ_ACTION = zzUnpackAction();

   private static final String ZZ_ACTION_PACKED_0 =
-    "\6\0\2\1\1\2\1\3\1\4\3\1\1\5\1\6"+
-    "\1\3\3\1\1\7\1\10\1\11\1\12\1\13\1\14"+
+    "\7\0\2\1\1\2\1\3\1\4\3\1\1\5\1\3"+
+    "\3\1\1\6\1\7\1\10\1\11\1\12\1\13\1\14"+
     "\1\15\1\16\1\17";

   private static int [] zzUnpackAction() {
-    int [] result = new int[29];
+    int [] result = new int[30];
     int offset = 0;
     offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
     return result;
@@ -96,12 +97,12 @@

   private static final String ZZ_ROWMAP_PACKED_0 =
     "\0\0\0\21\0\42\0\63\0\104\0\125\0\146\0\167"+
-    "\0\146\0\146\0\146\0\210\0\231\0\252\0\146\0\146"+
-    "\0\167\0\273\0\314\0\335\0\146\0\146\0\146\0\146"+
-    "\0\146\0\146\0\146\0\146\0\146";
+    "\0\210\0\167\0\167\0\167\0\231\0\252\0\273\0\167"+
+    "\0\210\0\314\0\335\0\356\0\167\0\167\0\167\0\167"+
+    "\0\167\0\167\0\167\0\167\0\167\0\167";

   private static int [] zzUnpackRowMap() {
-    int [] result = new int[29];
+    int [] result = new int[30];
     int offset = 0;
     offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
     return result;
@@ -124,23 +125,25 @@
   private static final int [] ZZ_TRANS = zzUnpackTrans();

   private static final String ZZ_TRANS_PACKED_0 =
-    "\1\7\1\10\1\7\1\0\1\7\1\10\1\11\1\10"+
-    "\1\7\1\10\6\7\1\12\1\7\1\10\1\7\1\13"+
-    "\1\7\1\10\1\11\1\14\1\7\1\15\1\7\1\16"+
-    "\4\7\1\12\1\7\1\10\1\7\1\17\1\7\1\10"+
-    "\1\11\1\14\1\7\1\15\1\7\1\16\4\7\1\12"+
-    "\1\7\1\10\1\7\1\20\1\7\1\10\1\11\1\14"+
-    "\1\7\1\15\1\7\1\16\4\7\2\12\1\21\1\12"+
-    "\1\17\1\7\1\10\1\11\1\22\1\12\1\23\1\12"+
-    "\1\24\1\25\1\26\1\27\1\30\1\12\1\7\1\10"+
-    "\1\7\1\17\1\7\1\10\1\11\1\14\1\7\1\15"+
-    "\1\7\1\16\3\7\1\31\1\12\23\0\1\7\20\0"+
-    "\1\7\5\0\1\32\1\0\1\33\10\0\1\7\7\0"+
-    "\1\34\20\0\1\35\10\0\1\7\5\0\1\32\1\0"+
-    "\1\27\10\0\1\7\7\0\1\25\20\0\1\26\6\0";
+    "\1\10\1\11\1\10\1\0\1\10\1\11\1\12\1\11"+
+    "\1\10\1\11\6\10\1\13\1\10\1\11\1\10\1\14"+
+    "\1\10\1\11\1\12\1\15\1\10\1\16\1\10\1\17"+
+    "\4\10\1\13\1\10\1\11\1\10\1\20\1\10\1\11"+
+    "\1\12\1\15\1\10\1\16\1\10\1\17\4\10\2\13"+
+    "\1\21\1\13\1\20\1\10\1\11\1\12\1\22\1\13"+
+    "\1\23\1\13\1\24\1\25\1\26\1\27\1\30\1\13"+
+    "\1\10\1\11\1\10\1\20\1\10\1\11\1\12\1\15"+
+    "\1\10\1\16\1\10\1\17\3\10\1\31\1\13\1\10"+
+    "\1\11\1\10\1\32\1\10\1\11\1\12\1\15\1\10"+
+    "\1\16\1\10\1\17\4\10\2\13\1\21\1\13\1\32"+
+    "\1\10\1\11\1\12\1\22\1\13\1\23\1\13\1\24"+
+    "\1\25\1\26\1\27\1\30\1\13\23\0\1\10\20\0"+
+    "\1\10\5\0\1\33\1\0\1\34\10\0\1\10\7\0"+
+    "\1\35\20\0\1\36\10\0\1\10\5\0\1\33\1\0"+
+    "\1\27\10\0\1\10\7\0\1\25\20\0\1\26\6\0";

   private static int [] zzUnpackTrans() {
-    int [] result = new int[238];
+    int [] result = new int[255];
     int offset = 0;
     offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
     return result;
@@ -178,10 +181,10 @@
   private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();

   private static final String ZZ_ATTRIBUTE_PACKED_0 =
-    "\6\0\1\11\1\1\3\11\3\1\2\11\4\1\11\11";
+    "\7\0\1\11\1\1\3\11\3\1\1\11\4\1\12\11";

   private static int [] zzUnpackAttribute() {
-    int [] result = new int[29];
+    int [] result = new int[30];
     int offset = 0;
     offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
     return result;
@@ -249,6 +252,8 @@
   private boolean zzEOFDone;

   /* user code: */
+	public static final int CELEX = DICT_ASCII;
+
 	private String original = "";
 	private String normalized = "";
 	private int problem = 0;
@@ -558,7 +563,7 @@
       zzMarkedPos = zzMarkedPosL;

       switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
-        case 11:
+        case 10:
           { add("sz");
           }
         case 16: break;
@@ -566,7 +571,7 @@
           { problem = 1; add(yytext());
           }
         case 17: break;
-        case 7:
+        case 6:
           { add("ae");
           }
         case 18: break;
@@ -585,11 +590,11 @@
           { add("ü");
           }
         case 21: break;
-        case 9:
+        case 8:
           { add("ue");
           }
         case 22: break;
-        case 6:
+        case 11:
           { switch (problem) {
 			case 1: return original;
 			default: return normalized.replaceAll(LB, "").toLowerCase();
@@ -608,11 +613,11 @@
           { add(yytext());
           }
         case 26: break;
-        case 10:
+        case 9:
           { add("ss");
           }
         case 27: break;
-        case 8:
+        case 7:
           { add("oe");
           }
         case 28: break;
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexEL.java	Mon Aug 29 17:40:19 2011 +0200
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexEL.java	Tue Sep 27 16:40:57 2011 +0200
@@ -1,4 +1,4 @@
-/* The following code was generated by JFlex 1.4.3 on 03.08.11 18:23 */
+/* The following code was generated by JFlex 1.4.3 on 05.09.11 10:35 */

 /*
  * Normalization rules for Greek text
@@ -15,7 +15,7 @@
 /**
  * This class is a scanner generated by
  * <a href="http://www.jflex.de/">JFlex</a> 1.4.3
- * on 03.08.11 18:23 from the specification file
+ * on 05.09.11 10:35 from the specification file
  * <tt>MpdlNormalizerLexEL.lex</tt>
  */
 public class MpdlNormalizerLexEL {
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexFR.java	Mon Aug 29 17:40:19 2011 +0200
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexFR.java	Tue Sep 27 16:40:57 2011 +0200
@@ -1,11 +1,11 @@
-/* The following code was generated by JFlex 1.4.3 on 03.08.11 18:24 */
+/* The following code was generated by JFlex 1.4.3 on 05.09.11 10:35 */

 /*
  * Normalization rules for French text
  * [this is a JFlex specification]
  *
  * Wolfgang Schmidle
- * version 2011-07-12
+ * version 2011-08-10
  *
  */

@@ -15,7 +15,7 @@
 /**
  * This class is a scanner generated by
  * <a href="http://www.jflex.de/">JFlex</a> 1.4.3
- * on 03.08.11 18:24 from the specification file
+ * on 05.09.11 10:35 from the specification file
  * <tt>MpdlNormalizerLexFR.lex</tt>
  */
 public class MpdlNormalizerLexFR {
@@ -27,10 +27,10 @@
   private static final int ZZ_BUFFERSIZE = 16384;

   /** lexical states */
+  public static final int DICT_ASCII = 8;
   public static final int SEARCH = 6;
   public static final int DICT = 4;
   public static final int YYINITIAL = 0;
-  public static final int CELEX = 8;
   public static final int DISP = 2;

   /**
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexLA.java	Mon Aug 29 17:40:19 2011 +0200
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexLA.java	Tue Sep 27 16:40:57 2011 +0200
@@ -1,4 +1,4 @@
-/* The following code was generated by JFlex 1.4.3 on 21.07.11 11:22 */
+/* The following code was generated by JFlex 1.4.3 on 05.09.11 10:35 */

 /*
  * Normalization rules for Latin text
@@ -15,7 +15,7 @@
 /**
  * This class is a scanner generated by
  * <a href="http://www.jflex.de/">JFlex</a> 1.4.3
- * on 21.07.11 11:22 from the specification file
+ * on 05.09.11 10:35 from the specification file
  * <tt>MpdlNormalizerLexLA.lex</tt>
  */
 public class MpdlNormalizerLexLA {
@@ -27,10 +27,10 @@
   private static final int ZZ_BUFFERSIZE = 16384;

   /** lexical states */
-  public static final int RENAISSANCE_DICT = 10;
-  public static final int RENAISSANCE_DISP = 8;
-  public static final int SEARCH = 6;
-  public static final int DICT = 4;
+  public static final int RENAISSANCE_DICT = 8;
+  public static final int SEARCH = 10;
+  public static final int RENAISSANCE_DISP = 4;
+  public static final int DICT = 6;
   public static final int YYINITIAL = 0;
   public static final int RENAISSANCE_SEARCH = 12;
   public static final int DISP = 2;
@@ -42,7 +42,7 @@
    * l is of the form l = 2*k, k a non negative integer
    */
   private static final int ZZ_LEXSTATE[] = {
-     0,  0,  1,  2,  3,  4,  5,  6,  1,  2,  3,  4,  5, 6
+     0,  0,  1,  2,  1,  2,  3,  4,  3,  4,  5,  6,  5, 6
   };

   /**
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/NormDictContentHandler.java	Mon Aug 29 17:40:19 2011 +0200
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/NormDictContentHandler.java	Tue Sep 27 16:40:57 2011 +0200
@@ -8,6 +8,7 @@
 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
 import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.MpdlNormalizer;
 import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.MpdlTokenizerAnalyzer;
+import de.mpg.mpiwg.berlin.mpdl.lt.general.Language;
 import de.mpg.mpiwg.berlin.mpdl.lt.lex.db.LexHandler;
 import de.mpg.mpiwg.berlin.mpdl.util.StringUtilEscapeChars;

@@ -48,9 +49,17 @@
   }

   public void endDocument() throws SAXException {
-    String rootElemToStr = rootElement.toXmlString();
-    write(rootElemToStr);
-    write("\n");
+    try {
+      String rootElemToStr = rootElement.toXmlString();
+      // hack: in echo documents the spaces between sentences should be removed
+      if (rootElemToStr != null && rootElemToStr.startsWith("<echo") && Language.getInstance().isChinese(language)) {
+        rootElemToStr = rootElemToStr.replaceAll("</s>[ \n\t]+<s", "</s><s");
+      }
+      write(rootElemToStr);
+      write("\n");
+    } catch (NullPointerException e) {
+      throw new SAXException(e);
+    }
   }

   public void characters(char[] c, int start, int length) throws SAXException {
@@ -173,7 +182,7 @@
       boolean isWordDelimiterElement = true;
       // "note" causes problems: word after the note is not recognized
       // "emph" causes problems: e.g. "Natur<emph>ereignis</emph> enthüllte" is replaced by "Natur<emph><w>ereignis</w></emph>enthüllte"
-      if (name.equals("lb") || name.equals("cb") || name.equals("figure") || name.equals("image") || name.equals("handwritten") || name.equals("anchor"))
+      if (name.equals("lb") || name.equals("br") || name.equals("cb") || name.equals("figure") || name.equals("image") || name.equals("handwritten") || name.equals("anchor"))
         isWordDelimiterElement = false;
       return isWordDelimiterElement;
     }
@@ -202,7 +211,7 @@
               if (composite.value != null && ! composite.value.equals("")) {
                 String compositeValueStr = composite.value;
                 compositeValueStr = compositeValueStr.replaceAll("\n", ""); // remove all newlines, they are no separators for words.
-                compositeValueStr = compositeValueStr.replaceAll(" +", " "); // if there are many Blanks make them to one
+                compositeValueStr = compositeValueStr.replaceAll("[ \t]+", " "); // if there are many Blanks/Tabs make them to one
                 compositesCharsWithMarks = compositesCharsWithMarks + compositeValueStr;
               }
             } else {
@@ -251,7 +260,6 @@
           mpdlNormalizer.setNormMode(MpdlNormalizer.DISPLAY);
         }
         MpdlTokenizerAnalyzer tokenAnalyzer = new MpdlTokenizerAnalyzer(mpdlNormalizer, language);
-        tokenAnalyzer.setRegWithoutSemicolon(true);  // hack: feel free to remove it later
         ArrayList<Token> wordTokens = tokenAnalyzer.getToken(charactersStr);
         int endPos = 0;
         for (int i=0; i < wordTokens.size(); i++) {
@@ -335,7 +343,7 @@
           lexForms = lexForms + lexEntryKey + " ";
         }
         lexForms = lexForms.substring(0, lexForms.length() - 1);
-        lexWord = "<w lang=\"" + language + "\"" + " form=\"" + wordForm + "\"" + " lexForms=\"" + lexForms + "\">" + displayWordDeresolved + "</w>";
+        lexWord = "<w lang=\"" + lang + "\"" + " form=\"" + wordForm + "\"" + " lexForms=\"" + lexForms + "\">" + displayWordDeresolved + "</w>";
       } else {
         lexWord = displayWordDeresolved;
       }
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/NormalizeCharsContentHandler.java	Mon Aug 29 17:40:19 2011 +0200
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/NormalizeCharsContentHandler.java	Tue Sep 27 16:40:57 2011 +0200
@@ -104,7 +104,6 @@
     String retStr = "";
     try {
       MpdlTokenizerAnalyzer tokenizerAnalyzer = new MpdlTokenizerAnalyzer(language);
-      tokenizerAnalyzer.setRegWithoutSemicolon(true);  // hack: feel free to remove it later
       ArrayList<Token> wordTokens = tokenizerAnalyzer.getToken(charactersStr);
       int endPos = 0;
       for (int i=0; i < wordTokens.size(); i++) {
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Language.java	Mon Aug 29 17:40:19 2011 +0200
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Language.java	Tue Sep 27 16:40:57 2011 +0200
@@ -2,6 +2,11 @@

 import java.util.HashMap;

+/**
+ *
+ * Language codes from ISO 639-3
+ *
+ */
 public class Language {
   private static Language instance;
   private static HashMap<String, String> languageIds = new HashMap<String, String>();
@@ -16,15 +21,22 @@

   private void init() {
     languageIds.put("ar", "ar");
+    languageIds.put("ara", "ar");
     languageIds.put("de", "de");
+    languageIds.put("ger", "de");
+    languageIds.put("deu", "de");
     languageIds.put("el", "el");
     languageIds.put("grc", "el");
     languageIds.put("en", "en");
+    languageIds.put("eng", "en");
     languageIds.put("fr", "fr");
+    languageIds.put("fra", "fr");
     languageIds.put("it", "it");
+    languageIds.put("ita", "it");
     languageIds.put("la", "la");
     languageIds.put("lat", "la");
     languageIds.put("nl", "nl");
+    languageIds.put("nld", "nl");
     languageIds.put("zh", "zh");
     languageIds.put("zho", "zh");
     languageIds.put("zho-Hant", "zh");
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/util/MpdlITextRenderer.java	Mon Aug 29 17:40:19 2011 +0200
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/util/MpdlITextRenderer.java	Tue Sep 27 16:40:57 2011 +0200
@@ -89,9 +89,14 @@
     String language = mdRecord.getLanguage();
     if (eXistIdentifier == null)
       throw new ApplicationException("Pdf/Html-Generation failed: no eXist-Identifier given in mdRecord");
-    String eXistIdentifierWithoutExtension = eXistIdentifier.substring(0, eXistIdentifier.length() - 4);  // without ".xml"
+    int lastDot = eXistIdentifier.lastIndexOf(".");
+    String eXistIdentifierWithoutExtension = eXistIdentifier.substring(0, lastDot);
+    String eXistIdentifierExtension = eXistIdentifier.substring(lastDot + 1);
     String destFileNamePdf = MpdlConstants.MPDL_EXIST_DATA_DIR + "/documents" + eXistIdentifierWithoutExtension + ".pdf";
     String destFileNameHtml = MpdlConstants.MPDL_EXIST_DATA_DIR + "/documents" + eXistIdentifierWithoutExtension + ".html";
+    if (eXistIdentifierExtension != null && eXistIdentifierExtension.equals("html")) {
+      destFileNameHtml = MpdlConstants.MPDL_EXIST_DATA_DIR + "/documents" + eXistIdentifierWithoutExtension + "-gen.html";
+    }
     String destFileNameHtmlPdfTmp = MpdlConstants.MPDL_EXIST_DATA_DIR + "/documents" + eXistIdentifierWithoutExtension + "-4Pdf.html";
     try {
       // start document
@@ -287,8 +292,10 @@
         mdRecordStr = mdRecordStr + ". " + title;
       if (year != null && ! year.equals(""))
         mdRecordStr = mdRecordStr + ". " + year + ".";
-      else
-        mdRecordStr = mdRecordStr + ".";
+      if (mdRecordStr.isEmpty()) {
+        String eXistId = mdRecord.getEXistIdentifier();
+        mdRecordStr = mdRecordStr + eXistId;
+      }
     }
     return mdRecordStr;
   }
@@ -383,6 +390,8 @@
       pbTag = "pb";
     else if (docBase != null && docBase.equals("tei"))
       pbTag = "TEI:pb";
+    else
+      pbTag = "*:pb";
     try {
       HttpClient httpClient = new HttpClient();
       String requestName = "/mpdl/interface/xquery.xql?document=" + docName + "&xquery=count(//" + pbTag + ")";
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/util/XmlUtil.java	Mon Aug 29 17:40:19 2011 +0200
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/util/XmlUtil.java	Tue Sep 27 16:40:57 2011 +0200
@@ -13,11 +13,14 @@
 import java.util.ArrayList;
 import java.util.Date;
 import java.util.Iterator;
+import java.util.Properties;

 import javax.xml.XMLConstants;
 import javax.xml.namespace.NamespaceContext;
 import javax.xml.parsers.DocumentBuilder;
 import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
 import javax.xml.transform.OutputKeys;
 import javax.xml.transform.Source;
 import javax.xml.transform.Transformer;
@@ -25,8 +28,10 @@
 import javax.xml.transform.TransformerException;
 import javax.xml.transform.TransformerFactory;
 import javax.xml.transform.dom.DOMSource;
+import javax.xml.transform.sax.SAXResult;
 import javax.xml.transform.sax.SAXSource;
 import javax.xml.transform.stream.StreamResult;
+import javax.xml.transform.stream.StreamSource;
 import javax.xml.validation.Schema;
 import javax.xml.validation.SchemaFactory;
 import javax.xml.validation.Validator;
@@ -45,6 +50,7 @@
 import org.w3c.dom.NodeList;
 import org.xml.sax.InputSource;
 import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;

 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;

@@ -140,6 +146,17 @@
     return root;
   }

+  public void parse(File xmlFile) throws ApplicationException {
+    try {
+      SAXParserFactory factory = SAXParserFactory.newInstance();
+      SAXParser saxParser = factory.newSAXParser();
+      DefaultHandler dh = new DefaultHandler();
+      saxParser.parse(xmlFile, dh);
+    } catch (Exception e) {
+      throw new ApplicationException(e);
+    }
+  }
+
   public void validateByRelaxNG(File xmlFile, URL schemaUrl) throws ApplicationException {
     System.setProperty(SchemaFactory.class.getName() + ":" + XMLConstants.RELAXNG_NS_URI, "com.thaiopensource.relaxng.jaxp.CompactSyntaxSchemaFactory");
     SchemaFactory factory = SchemaFactory.newInstance(XMLConstants.RELAXNG_NS_URI);
@@ -649,4 +666,54 @@
     }
     return xmlString;
   }
+
+  public String transform(String xmlString, String xslFileName, Properties outputProperties) throws ApplicationException {
+    String resultString = null;
+    try {
+      StreamSource xslSource = new StreamSource(xslFileName);
+      Transformer transformer = TransformerFactory.newInstance(net.sf.saxon.TransformerFactoryImpl.class.getName(), null).newTransformer(xslSource);
+      if (outputProperties != null) {
+        String propValue = outputProperties.getProperty("method");
+        if (propValue != null)
+          transformer.setOutputProperty(OutputKeys.METHOD, propValue);
+        propValue = outputProperties.getProperty("indent");
+        if (propValue != null)
+          transformer.setOutputProperty(OutputKeys.INDENT, propValue);
+        propValue = outputProperties.getProperty("media-type");
+        if (propValue != null)
+          transformer.setOutputProperty(OutputKeys.MEDIA_TYPE, propValue);
+        propValue = outputProperties.getProperty("omit-xml-declaration");
+        if (propValue != null)
+          transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, propValue);
+        propValue = outputProperties.getProperty("encoding");
+        if (propValue != null)
+          transformer.setOutputProperty(OutputKeys.ENCODING, propValue);
+      }
+      StreamResult result = new StreamResult(new StringWriter());
+      StreamSource source = new StreamSource(new StringReader(xmlString));
+      transformer.transform(source, result);
+      resultString = result.getWriter().toString();
+    } catch (TransformerConfigurationException e) {
+      throw new ApplicationException(e);
+    } catch (TransformerException e) {
+      throw new ApplicationException(e);
+    }
+    return resultString;
+  }
+
+  public SAXResult transformToSaxResult(String xmlString, String xslString, Properties outputProperties) throws ApplicationException {
+    SAXResult result = new SAXResult();
+    try {
+      StreamSource xslSource = new StreamSource(new StringReader(xslString));
+      Transformer transformer = TransformerFactory.newInstance().newTransformer(xslSource);
+      transformer.setOutputProperties(outputProperties);
+      StreamSource source = new StreamSource(new StringReader(xmlString));
+      transformer.transform(source, result);
+    } catch (TransformerConfigurationException e) {
+      throw new ApplicationException(e);
+    } catch (TransformerException e) {
+      throw new ApplicationException(e);
+    }
+    return result;
+  }
 }
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/xml/SchemaHandler.java	Mon Aug 29 17:40:19 2011 +0200
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/xml/SchemaHandler.java	Tue Sep 27 16:40:57 2011 +0200
@@ -35,7 +35,7 @@
     String fName = docOperation.getFileName();
     if (fName == null || fName.trim().equals(""))
       throw new ApplicationException("Your document file name is empty. Please specify a file name for your document.");
-    if (! fName.endsWith(".xml"))
+    if (! fName.endsWith(".xml")  && docBase != null &&  ! docBase.equals("diverse"))
       throw new ApplicationException("Your document file name does not end with \".xml\". Please specify a file name with the suffix \".xml\" for your document.");
     // RelaxNG schema validation
     validateByRelaxNGSchema(destFile, docBase);
@@ -75,6 +75,14 @@
         String id = getIdByExistId(eXistIdentifier);
         mdRecord.setIdentifier("TEI:" + id + ".xml");
       }
+    } else if (docBase != null && docBase.equals("diverse")) {
+      mdRecord = getMetadataRecordDiverse(documentNode);
+      if (mdRecord != null) {
+        String id = getIdByExistId(eXistIdentifier);
+        mdRecord.setIdentifier(id);
+        String lang = docOperation.getLanguage();
+        mdRecord.setLanguage(lang);
+      }
     }
     if (mdRecord != null) {
       mdRecord.setEXistIdentifier(eXistIdentifier);
@@ -132,6 +140,8 @@
   }

   private void validate(Node docNode, String docBase) throws ApplicationException {
+    if (docBase.equals("diverse"))
+      return;
     XmlUtil xmlUtil = XmlUtil.getInstance();
     NamespaceContext nsContext = getEchoNsContext();
     String echoTest = null;
@@ -192,6 +202,9 @@
   }

   private void validate(MetadataRecord mdRecord) throws ApplicationException {
+    String docBase = mdRecord.getDocBase();
+    if (docBase.equals("diverse"))
+      return;
     String identifier = mdRecord.getIdentifier();
     String creator = mdRecord.getCreator();
     String title = mdRecord.getTitle();
@@ -318,6 +331,18 @@
     return mdRecord;
   }

+  /**
+   * Creates the metadata record for docBase "diverse" from fixed open-access values; documentNode is not read.
+   */
+  private MetadataRecord getMetadataRecordDiverse(Node documentNode) throws ApplicationException {
+    String freeAccess = StringUtilEscapeChars.deresolveXmlEntities("free");
+    MetadataRecord diverseRecord = new MetadataRecord(null, null, null, null, null, null, null, "open access", null);
+    diverseRecord.setDocBase("diverse");
+    diverseRecord.setLicense("http://echo.mpiwg-berlin.mpg.de/policy/oa_basics/declaration");
+    diverseRecord.setAccessRights(freeAccess);
+    return diverseRecord;
+  }
+
   private String getIndexMetaDataPageImg(String imagesDocDirectory) throws ApplicationException {
     String resultStr = null;
     String nausikaaURLTexter = "http://nausikaa2.mpiwg-berlin.mpg.de/digitallibrary/servlet/Texter";
--- a/software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdldoc/MPDLDocModule.java	Mon Aug 29 17:40:19 2011 +0200
+++ b/software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdldoc/MPDLDocModule.java	Tue Sep 27 16:40:57 2011 +0200
@@ -39,6 +39,7 @@
     new FunctionDef(GetESciDocs.signature, GetESciDocs.class),
     new FunctionDef(GetJobs.signature, GetJobs.class),
     new FunctionDef(GetESciDocContainerIdByExistId.signature, GetESciDocContainerIdByExistId.class),
+    new FunctionDef(Transform.signature, Transform.class),
     new FunctionDef(Html2Pdf.signature, Html2Pdf.class)
 	};
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdldoc/Transform.java	Tue Sep 27 16:40:57 2011 +0200
@@ -0,0 +1,91 @@
+/*
+ *  eXist Open Source Native XML Database: Extension module
+ *  Copyright (C) 2008 Josef Willenborg
+ *  jwillenborg@mpiwg-berlin.mpg.de
+ *  http://www.mpiwg-berlin.mpg.de
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public License
+ *  as published by the Free Software Foundation; either version 2
+ *  of the License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ *  $Id:  $
+ */
+package org.exist.xquery.modules.mpdldoc;
+
+import java.util.Properties;
+
+import org.exist.dom.QName;
+import org.exist.xquery.BasicFunction;
+import org.exist.xquery.Cardinality;
+import org.exist.xquery.FunctionSignature;
+import org.exist.xquery.XPathException;
+import org.exist.xquery.XQueryContext;
+import org.exist.xquery.value.Sequence;
+import org.exist.xquery.value.SequenceType;
+import org.exist.xquery.value.StringValue;
+import org.exist.xquery.value.Type;
+import org.exist.xquery.value.ValueSequence;
+
+import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
+import de.mpg.mpiwg.berlin.mpdl.util.XmlUtil;
+
+/**
+ * XQuery extension function "transform": passes an XML string and a stylesheet argument to
+ * XmlUtil.transform and returns the transformation result as a string.
+ * @author Josef Willenborg (jwillenborg@mpiwg-berlin.mpg.de)
+ */
+public class Transform extends BasicFunction {
+
+  public final static FunctionSignature signature =
+    new FunctionSignature(
+      new QName("transform", MPDLDocModule.NAMESPACE_URI, MPDLDocModule.PREFIX),
+      "A function which transforms the input xml string by the xsl stylesheet given as xslFileName and outputs it as a string.",
+      new SequenceType[] {
+        new SequenceType(Type.STRING, Cardinality.EXACTLY_ONE),
+        new SequenceType(Type.STRING, Cardinality.EXACTLY_ONE),
+        new SequenceType(Type.STRING, Cardinality.EXACTLY_ONE)
+      },
+      new SequenceType(Type.STRING, Cardinality.EXACTLY_ONE));
+
+  public Transform(XQueryContext context) {
+    super(context, signature);
+  }
+
+  /**
+   * Evaluates the function. args[0] is the XML string, args[1] the stylesheet argument and args[2] a
+   * whitespace separated list of key=value output properties; tokens without '=' are ignored.
+   * @return a one-item sequence holding the result string, or the empty sequence if args[0] or args[1] is empty
+   * @throws XPathException with the message of an ApplicationException raised while transforming
+   */
+  public Sequence eval(Sequence[] args, Sequence contextSequence) throws XPathException {
+    if (args[0].isEmpty() || args[1].isEmpty())
+      return Sequence.EMPTY_SEQUENCE;
+    Properties outputProperties = new Properties();
+    String outputPropertiesStr = args[2].getStringValue();
+    String[] outputProps = (outputPropertiesStr == null) ? new String[0] : outputPropertiesStr.trim().split("\\s+");
+    for (String prop : outputProps) {
+      // a token without '=' (e.g. a stray word) used to make substring(0, -1) throw; such tokens are skipped
+      int index = prop.indexOf('=');
+      if (index >= 0)
+        outputProperties.setProperty(prop.substring(0, index), prop.substring(index + 1));
+    }
+    try {
+      String resultStr = XmlUtil.getInstance().transform(args[0].getStringValue(), args[1].getStringValue(), outputProperties);
+      ValueSequence resultSequence = new ValueSequence();
+      resultSequence.add(new StringValue(resultStr));
+      return resultSequence;
+    } catch (ApplicationException e) {
+      throw new XPathException(e.getMessage());
+    }
+  }
+}