# HG changeset patch
# User Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
# Date 1314632402 -7200
# Node ID 5df60f24e9977ba8d1f1883eee95b292d3928b91
# Parent  469d927b9ca728fd6d026e8c1c68f2c3b26fcc37
diverse Fehlerbehebungen

diff -r 469d927b9ca7 -r 5df60f24e997 software/eXist/mpdl-modules/src/de/mpg/.DS_Store
Binary file software/eXist/mpdl-modules/src/de/mpg/.DS_Store has changed
diff -r 469d927b9ca7 -r 5df60f24e997 software/eXist/mpdl-modules/src/de/mpg/mpiwg/.DS_Store
Binary file software/eXist/mpdl-modules/src/de/mpg/mpiwg/.DS_Store has changed
diff -r 469d927b9ca7 -r 5df60f24e997 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/.DS_Store
Binary file software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/.DS_Store has changed
diff -r 469d927b9ca7 -r 5df60f24e997 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/.DS_Store
Binary file software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/.DS_Store has changed
diff -r 469d927b9ca7 -r 5df60f24e997 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/escidoc/TestESciDoc.java
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/escidoc/TestESciDoc.java	Tue Apr 19 16:51:59 2011 +0200
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/escidoc/TestESciDoc.java	Mon Aug 29 17:40:02 2011 +0200
@@ -61,11 +61,11 @@
       TestESciDoc test = new TestESciDoc();
       test.init("jwillenborg");  // init eSciDoc-Session with cookie as user jwillenborg
       
-      // test.grant("urte", "admin");
-      String uid = test.getUserId("urte");
+      // test.grant("schoepfl", "admin");
+      String uid = test.getUserId("schoepfl");
       String users = test.getAllUsers();
-      String grantAdmin = test.getGrantHrefByUserNameAndRoleName("urte", "escidoc:role-system-administrator");
-      String grants = test.getGrantsByUserName("urte");
+      String grantAdmin = test.getGrantHrefByUserNameAndRoleName("schoepfl", "escidoc:role-system-administrator");
+      String grants = test.getGrantsByUserName("schoepfl");
       String bla = "";
 
       // test.testSchemaValidation();
diff -r 469d927b9ca7 -r 5df60f24e997 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/externalObjects/app/ExtElement.java
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/externalObjects/app/ExtElement.java	Tue Apr 19 16:51:59 2011 +0200
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/externalObjects/app/ExtElement.java	Mon Aug 29 17:40:02 2011 +0200
@@ -25,24 +25,24 @@
     String xpath = null;
     String point = null; 
     if (xpointer != null) {
-      pageNumber = xpointer.replaceAll("#xpointer\\(id\\('page(.+)?'\\).*", "$1");
+      pageNumber = xpointer.replaceAll("id\\('page(.+)?'\\).*", "$1");
       if (xpointer.contains("point(")) {
-        xpath = xpointer.replaceAll("#xpointer\\(id\\('page.+?'\\)(.*)?/point\\(.+?\\)\\)", "$1");
-        point = xpointer.replaceAll("#xpointer\\(id\\('page.+?'\\).*?/point\\((.+)?\\)\\)", "$1");
+        xpath = xpointer.replaceAll("id\\('page.+?'\\)(.*)?/point\\(.+?\\)", "$1");
+        point = xpointer.replaceAll("id\\('page.+?'\\).*?/point\\((.+)?\\)", "$1");
       } else {
-        xpath = xpointer.replaceAll("#xpointer\\(id\\('page.+?'\\)(.*)?.*?\\)", "$1");
+        xpath = xpointer.replaceAll("id\\('page.+?'\\)(.*)?.*?", "$1");
       }
     }
-    String content = xmlUtil.evaluateToXmlString(xmlStr, "/object/content/*", null);
+    String content = xmlUtil.evaluateToXmlString(xmlStr, "/object/*", null);
     Date modDate = xmlUtil.toDate(dateStr);
     ExtElement e = new ExtElement();
+    e.setContent(content);
     e.setUid(uid);
     e.setModificationDate(modDate);
     e.setDocumentId(docId);
     e.setPageNumber(pageNumber);
     e.setXpath(xpath);
     e.setPoint(point);
-    e.setContent(content);
     return e;
   }
 
@@ -65,21 +65,14 @@
     if (xpath != null)
       xmlString = xmlString + " xmlNodeId=\"" + xpath + "\"";
     if (pageNumber != null)
-      xmlString = xmlString + " xpointer=\"#xpointer(id('page" + pageNumber + "')";
+      xmlString = xmlString + " xpointer=\"id('page" + pageNumber + "')";
     if (xpath != null)
       xmlString = xmlString + xpath;
     if (point != null)
       xmlString = xmlString + "/point(" + point + ")";
-    xmlString = xmlString + ")\">";
+    xmlString = xmlString + "\">";
     if (content != null) {
-      // TODO wieder ausbauen
-      // write the uid and modificationDate into the content node
-      if (! content.contains("uid")) {
-        int firstClose = content.indexOf(">");
-        if (firstClose != -1)
-          content = content.substring(0, firstClose) + " uid=\"" + uid + "\" modificationDate=\"" + modificationDate + "\" " + content.substring(firstClose);
-      }
-      xmlString = xmlString + "<content>" + content + "</content>";
+      xmlString = xmlString + content;
     }
     xmlString = xmlString + "</object>";
     return xmlString;
diff -r 469d927b9ca7 -r 5df60f24e997 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/externalObjects/app/ExtObject.java
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/externalObjects/app/ExtObject.java	Tue Apr 19 16:51:59 2011 +0200
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/externalObjects/app/ExtObject.java	Mon Aug 29 17:40:02 2011 +0200
@@ -3,6 +3,7 @@
 import java.util.Date;
 
 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
+import de.mpg.mpiwg.berlin.mpdl.util.XmlUtil;
 
 public class ExtObject {
   protected String type; // is set by subclass: element, query, ...
@@ -29,6 +30,14 @@
 
   public void setUid(String uid) {
     this.uid = uid;
+    // Mirror the uid into the content node: insert the attribute into the first tag, or overwrite existing uid="..." values.
+    if (content != null && uid != null && ! content.contains("uid=")) {  // look for the attribute, not the bare substring (e.g. "guide", "fluid")
+      int firstClose = content.indexOf(">");
+      if (firstClose != -1)
+        content = content.substring(0, firstClose) + " uid=\"" + uid + "\"" + content.substring(firstClose);
+    } else if (content != null && uid != null) {  // quoteReplacement: a '$' or '\' inside the uid must be inserted literally
+      content = content.replaceAll("uid=\".*?\"", java.util.regex.Matcher.quoteReplacement("uid=\"" + uid + "\""));
+    }
   }
 
   public Date getModificationDate() {
@@ -37,6 +46,16 @@
 
   public void setModificationDate(Date modificationDate) {
     this.modificationDate = modificationDate;
+    // Mirror the modificationDate into the content node, always formatted through XmlUtil.toXsDate.
+    if (content != null && modificationDate != null) {
+      String modDateStr = XmlUtil.getInstance().toXsDate(modificationDate);  // one format for both insert and overwrite
+      int firstClose = content.indexOf(">");
+      if (content.contains("modificationDate=")) {  // attribute already present: overwrite its value
+        content = content.replaceAll("modificationDate=\".*?\"", "modificationDate=\"" + modDateStr + "\"");
+      } else if (firstClose != -1) {  // otherwise insert it before the first '>'
+        content = content.substring(0, firstClose) + " modificationDate=\"" + modDateStr + "\" " + content.substring(firstClose);
+      }
+    }
   }
 
   public String getDocumentId() {
diff -r 469d927b9ca7 -r 5df60f24e997 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/externalObjects/app/ExternalObjectsHandler.java
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/externalObjects/app/ExternalObjectsHandler.java	Tue Apr 19 16:51:59 2011 +0200
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/externalObjects/app/ExternalObjectsHandler.java	Mon Aug 29 17:40:02 2011 +0200
@@ -72,11 +72,11 @@
     try {
       test(element);
       String content = element.getContent();
+      Date now = new Date();
+      element.setModificationDate(now);
       String valueStr = element.getXmlString();
       if (content == null)
         throw new ApplicationException("External object: no content element specified in: " + valueStr);
-      Date now = new Date();
-      element.setModificationDate(now);
       String docId = element.getDocumentId();
       String pageNumber = element.getPageNumber();
       String keyStr = docId + "###" + pageNumber;
@@ -94,11 +94,11 @@
   private void updateDBExternalElement(ExtElement element) throws ApplicationException {
     test(element);
     String content = element.getContent();
+    Date now = new Date();
+    element.setModificationDate(now);
     String elementXmlStr = element.getXmlString();
     if (content == null)
       throw new ApplicationException("External object: no content element specified in: " + elementXmlStr);
-    Date now = new Date();
-    element.setModificationDate(now);
     String docId = element.getDocumentId();
     String pageNumber = element.getPageNumber();
     String uid = element.getUid();
@@ -391,21 +391,21 @@
   }
   
   private void deleteSampleData() throws ApplicationException {
-    String xmlNodeId1 = "/archimedes[1]/text[1]/body[1]/chap[1]/p[1]/s[2]";
+    String xmlNodeId1 = "/TEI[1]/text[1]/body[1]/p[1]/s[1]";
     String objectXmlStr1 = 
       "<object type=\"" + "element" + "\" " + 
-              "uid=\"" + "joe" + "\" " + 
-              "documentId=\"" + "/archimedes/it/l223.xml" + "\" " + 
-              "xpointer=\"" + "#xpointer(id('page17')" + xmlNodeId1 + "\"" + 
+              "uid=\"" + "joe@mpiwg-berlin.mpg.de" + "\" " + 
+              "documentId=\"" + "/tei/en/Test_1789.xml" + "\" " + 
+              "xpointer=\"" + "id('page2')" + xmlNodeId1 + "\"" + 
               ">" +
        "</object>";
     ExtElement e1 = ExtElement.parseXmlStr(objectXmlStr1);
-    String xmlNodeId2 = "/archimedes[1]/text[1]/body[1]/chap[1]/p[1]/s[4]";
+    String xmlNodeId2 = "/TEI[1]/text[1]/body[1]/p[1]/s[2]";
     String objectXmlStr2 = 
       "<object type=\"" + "element" + "\" " + 
-              "uid=\"" + "michael" + "\" " + 
-              "documentId=\"" + "/archimedes/it/l223.xml" + "\" " + 
-              "xpointer=\"" + "#xpointer(id('page17')" + xmlNodeId2 + "\"" + 
+              "uid=\"" + "michael@mpiwg-berlin.mpg.de" + "\" " + 
+              "documentId=\"" + "/tei/en/Test_1789.xml" + "\" " + 
+              "xpointer=\"" + "id('page2')" + xmlNodeId2 + "\"" + 
               ">" +
        "</object>";
     ExtElement e2 = ExtElement.parseXmlStr(objectXmlStr2);
@@ -413,8 +413,8 @@
     deleteExternalElement(e2);
     
     ExtQuery q = new ExtQuery();
-    q.setUid("joe");
-    q.setDocumentId("/archimedes/it/l223.xml");
+    q.setUid("joe@mpiwg-berlin.mpg.de");
+    q.setDocumentId("/tei/en/Test_1789.xml");
     ArrayList<ExtObject> objects = readExternalObjects(q); 
     for (int i=0; i<objects.size(); i++) {
       ExtObject o = objects.get(i);
@@ -422,8 +422,8 @@
     }
 
     ExtQuery q2 = new ExtQuery();
-    q2.setUid("michael");
-    q2.setDocumentId("/archimedes/it/l223.xml");
+    q2.setUid("michael@mpiwg-berlin.mpg.de");
+    q2.setDocumentId("/tei/en/Test_1789.xml");
     objects = readExternalObjects(q2); 
     for (int i=0; i<objects.size(); i++) {
       ExtObject o = objects.get(i);
@@ -434,40 +434,40 @@
   private void createSampleData() throws ApplicationException {
     Date now = new Date();
 
-    String sId = "/archimedes[1]/text[1]/body[1]/chap[1]/p[1]/s[2]";
+    String sId = "/TEI[1]/text[1]/body[1]/p[1]/s[1]";
     ExtElement e = new ExtElement();
-    e.setUid("joe");
+    e.setUid("joe@mpiwg-berlin.mpg.de");
     e.setModificationDate(now);
-    e.setDocumentId("/archimedes/it/l223.xml");
-    e.setPageNumber("17");
+    e.setDocumentId("/tei/en/Test_1789.xml");
+    e.setPageNumber("2");
     e.setXpath(sId);
     e.setPoint(".1");
-    e.setContent("<note>This is a test note to element " + sId + " with <ref target=\"http://slime.de\">this external link</ref>" + "</note>");
+    e.setContent("<note>This is an annotation of element " + sId + " with <ref target=\"http://slime.de\">this external link</ref>" + "</note>");
     createExternalElement(e);
     
     ExtElement e2 = new ExtElement();
-    String sId2 = "/archimedes[1]/text[1]/body[1]/chap[1]/p[1]/s[4]";
-    e2.setUid("michael");
+    String sId2 = "/TEI[1]/text[1]/body[1]/p[1]/s[2]";
+    e2.setUid("michael@mpiwg-berlin.mpg.de");
     e2.setModificationDate(now);
-    e2.setDocumentId("/archimedes/it/l223.xml");
-    e2.setPageNumber("17");
+    e2.setDocumentId("/tei/en/Test_1789.xml");
+    e2.setPageNumber("2");
     e2.setXpath(sId2);
     e2.setPoint("18");
-    e2.setContent("<note>This is a test note to element " + sId2 + "</note>");
+    e2.setContent("<note>This is an annotation of element " + sId2 + "</note>");
     createExternalElement(e2);
     
     ExtQuery q1 = new ExtQuery();
-    q1.setUid("joe");
-    q1.setDocumentId("/archimedes/it/l223.xml");
+    q1.setUid("joe@mpiwg-berlin.mpg.de");
+    q1.setDocumentId("/tei/en/Test_1789.xml");
     q1.setQueryType("fulltext");
-    q1.setQueryName("seminario");
+    q1.setQueryName("test");
     createExternalObject(q1);
     
     ExtQuery q2 = new ExtQuery();
-    q2.setUid("michael");
-    q2.setDocumentId("/archimedes/it/l223.xml");
+    q2.setUid("michael@mpiwg-berlin.mpg.de");
+    q2.setDocumentId("/tei/en/Test_1789.xml");
     q2.setQueryType("url");
-    String url = "http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/page-query-result.xql?document=/archimedes/it/l223.xml&pn=17&mode=text&query-type=fulltextMorph&query=seminario&query-result-pn=1";
+    String url = "http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/page-query-result.xql?document=/tei/en/Test_1789.xml&pn=2&mode=text&query-type=fulltextMorph&query=test&query-result-pn=1";
     String urlDeresolved = StringUtilEscapeChars.deresolveXmlEntities(url);
     q2.setQueryName(urlDeresolved);
     createExternalObject(q2);
@@ -476,14 +476,14 @@
 
   private void updateSampleData() throws ApplicationException {
     Date now = new Date();
-    String xmlNodeId = "/archimedes[1]/text[1]/body[1]/chap[1]/p[1]/s[2]";
+    String xmlNodeId = "/TEI[1]/text[1]/body[1]/p[1]/s[1]";
     String objectXmlStr = 
       "<object type=\"" + "element" + "\" " + 
-              "uid=\"" + "joe" + "\" " + 
-              "documentId=\"" + "/archimedes/it/l223.xml" + "\" " + 
-              "xpointer=\"" + "#xpointer(id('page17')" + xmlNodeId + "\"" + 
+              "uid=\"" + "joe@mpiwg-berlin.mpg.de" + "\" " + 
+              "documentId=\"" + "/tei/en/Test_1789.xml" + "\" " + 
+              "xpointer=\"" + "id('page2')" + xmlNodeId + "\"" + 
               ">" +
-          "<content>" + "<note>This is a test note to element " + xmlNodeId + " with <ref target=\"http://slime.de\">this external link</ref>" + "</note>" + "</content>" +
+          "<content>" + "<note>This is an annotation of element " + xmlNodeId + " with <ref target=\"http://slime.de\">this external link</ref>" + "</note>" + "</content>" +
        "</object>";
     ExtElement e = ExtElement.parseXmlStr(objectXmlStr);
     e.setModificationDate(now);
@@ -492,20 +492,20 @@
     
   private void readSampleData() throws ApplicationException {
     ExtElement elem = new ExtElement();
-    elem.setDocumentId("/archimedes/it/l223.xml");
-    elem.setPageNumber("17");
+    elem.setDocumentId("/tei/en/Test_1789.xml");
+    elem.setPageNumber("2");
     ArrayList<ExtElement> elements = readExternalElements(elem); 
     System.out.println(elements);
     
     ExtQuery q1 = new ExtQuery();
-    q1.setUid("joe");
-    q1.setDocumentId("/archimedes/it/l223.xml");
+    q1.setUid("joe@mpiwg-berlin.mpg.de");
+    q1.setDocumentId("/tei/en/Test_1789.xml");
     ArrayList<ExtObject> objects = readExternalObjects(q1); 
     System.out.println(objects);
 
     ExtQuery q2 = new ExtQuery();
-    q2.setUid("michael");
-    q2.setDocumentId("/archimedes/it/l223.xml");
+    q2.setUid("michael@mpiwg-berlin.mpg.de");
+    q2.setDocumentId("/tei/en/Test_1789.xml");
     objects = readExternalObjects(q2); 
     System.out.println(objects);
   }
diff -r 469d927b9ca7 -r 5df60f24e997 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/general/MpdlConstants.java
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/general/MpdlConstants.java	Tue Apr 19 16:51:59 2011 +0200
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/general/MpdlConstants.java	Mon Aug 29 17:40:02 2011 +0200
@@ -25,6 +25,7 @@
   public static String MPDL_EXIST_ADMIN_USER_PW = MPDL_SYSTEM_PROPERTIES.getProperty("exist.adminUserPW");
   public static String MPDL_ECHO_RELAXNG_PATH = MPDL_SYSTEM_PROPERTIES.getProperty("exist.echoRelaxNGPath");
   public static String MPDL_TEILITE_RELAXNG_PATH = MPDL_SYSTEM_PROPERTIES.getProperty("exist.teiRelaxNGPath");
+  public static boolean MPDL_GENERATE_PDF = Boolean.parseBoolean(MPDL_SYSTEM_PROPERTIES.getProperty("exist.generatePdf"));  // false when the property is missing
   
   // eSciDoc settings
   public static String MPDL_ESCIDOC_HOST_NAME = MPDL_SYSTEM_PROPERTIES.getProperty("escidoc.hostname");
diff -r 469d927b9ca7 -r 5df60f24e997 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/.DS_Store
Binary file software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/.DS_Store has changed
diff -r 469d927b9ca7 -r 5df60f24e997 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlMorphAnalyzer.java
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlMorphAnalyzer.java	Tue Apr 19 16:51:59 2011 +0200
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlMorphAnalyzer.java	Mon Aug 29 17:40:02 2011 +0200
@@ -143,7 +143,7 @@
    * Creates a TokenStream which tokenizes all the text in the provided Reader.
    *
    * @return A TokenStream build from a StandardTokenizer filtered with
-   *         StandardFilter, LowerCaseFilter, StopFilter, DonatusStemFilter
+   *         MpdlFilter, LowerCaseFilter, StopFilter, MpdlStemFilter
    */
   public TokenStream tokenStream(String fieldName, Reader reader) {
     MpdlNormalizer mpdlNormalizer = new MpdlNormalizer(language);
diff -r 469d927b9ca7 -r 5df60f24e997 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlNormalizer.java
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlNormalizer.java	Tue Apr 19 16:51:59 2011 +0200
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlNormalizer.java	Mon Aug 29 17:40:02 2011 +0200
@@ -19,9 +19,10 @@
 import de.mpg.mpiwg.berlin.mpdl.lt.general.Language;
 
 public class MpdlNormalizer {
-  public static int MODE_4LEXICA = 1;  // normalization for lexica etc. which have sometimes only ascii in it
-  public static int MODE_4HUMAN_READERS = 2;  // normalization for human readers
-  private int normMode = MODE_4LEXICA;  // Default 
+  public static int DISPLAY = 1;  // normalization in DISPLAY mode
+  public static int DICTIONARY = 2;  // normalization in DICTIONARY mode
+  public static int SEARCH = 3;  // normalization in SEARCH mode; never used so far in indexing because it does not support the morph. lexicons such as CELEX (e.g. eingeschränkt would not be stemmed to eingeschraenkt) 
+  private int normMode = DICTIONARY;  // Default e.g. for indexing and querying
   private String[] normFunctionsToUse = {"reg", "norm"};  // default is to use all of these normalization functions
   private String language;
   private int[] offsets;
@@ -36,6 +37,10 @@
     this.language = language;
   }
 
+  public String getLanguage() {  // returns the language value currently held by this normalizer
+    return language;  
+  }
+  
   public void setNormMode(int normMode) {
     this.normMode = normMode;  
   }
@@ -61,10 +66,12 @@
     }
     if (useNormFunction()) {
       // normalize the string by string replacements
-      if (normMode == MODE_4LEXICA)
-        normStr = normalize4Lexica(normStr, null);
-      else if (normMode == MODE_4HUMAN_READERS)
-        normStr = normalize4HumanReaders(normStr);
+      if (normMode == DICTIONARY) {
+        normStr = normalize(normStr, DICTIONARY);
+      } else if (normMode == DISPLAY)
+        normStr = normalize(normStr, DISPLAY);
+      else if (normMode == SEARCH)
+        normStr = normalize(normStr, SEARCH);
     }
     return normStr;
   }
@@ -89,7 +96,269 @@
     return useNorm;
   }
 
+  public String deNormalizeToRegExpr(String s) {
+    // Expands s into a regular expression that also matches un-normalized spellings (ligatures, sharp s, accented letters, long s); other languages return s unchanged. TODO all characters in all languages
+    if (language.equals("la") || language.equals("lat")) {
+      StringBuffer buf = new StringBuffer();
+      if (s.indexOf("ae") != -1) {
+        String str1 = s;
+        str1 = str1.replaceAll("ae", "\u0119");
+        String str2 = s;
+        str2 = str2.replaceAll("ae", "\u00c6");
+        String str3 = s;
+        str3 = str3.replaceAll("ae", "\u00e6");
+        buf.append(str1 + "|" + str2 + "|" + str3 + "|");
+      }
+      if (s.indexOf("oe") != -1) {
+        String str1 = s;
+        str1 = str1.replaceAll("oe", "\u0152");
+        String str2 = s;
+        str2 = str2.replaceAll("oe", "\u0153");
+        buf.append(str1 + "|" + str2 + "|");
+      }
+      if (s.indexOf("ss") != -1) {
+        String str1 = s;
+        str1 = str1.replaceAll("ss", "\u00df");
+        buf.append(str1 + "|");
+      }
+      boolean beginWord = true;
+      for (int i = 0; i < s.length(); i++) {
+        char c = s.charAt(i);
+        if (! beginWord) 
+          c = Character.toLowerCase(c);
+        beginWord = Character.isWhitespace(c);
+        String replace = new String();
+        switch (c) {
+          case 'a': replace = "[a\u00c0\u00c1\u00c2\u00c4\u00e0\u00e1\u00e2\u00e4]"; break; 
+          case 'c': replace = "[c\u00c7\u00e7]"; break;
+          case 'e': replace = "[e\u00c8\u00c9\u00ca\u00cb\u00e8\u00e9\u00ea\u00eb\u0113\u0115\u1ebd]"; break; 
+          case 'i': replace = "[ij\u00cc\u00cd\u00ce\u00cf\u00ec\u00ed\u00ee\u00ef\u012a\u012b\u012c\u012d]"; break;
+          case 'o': replace = "[o\u00d2\u00d3\u00d4\u00d6\u00f2\u00f3\u00f4\u00f6\u014c\u014d\u014e\u014f]"; break; 
+          case 'u': replace = "[uv\u00d9\u00da\u00db\u00dc\u00f9\u00fa\u00fb\u00fc\u016a\u016b\u016c\u016d]"; break; 
+          case 's': replace = "[s\u017f]"; break; 
+          default: replace += c; break;
+        }
+        buf.append(replace);
+      }
+      return buf.toString();
+    } else if (language.equals("en")) {
+      StringBuffer buf = new StringBuffer();
+      if (s.indexOf("ae") != -1) {
+        String str1 = s;
+        str1 = str1.replaceAll("ae", "\u0119");
+        String str2 = s;
+        str2 = str2.replaceAll("ae", "\u00c6");
+        String str3 = s;
+        str3 = str3.replaceAll("ae", "\u00e6");
+        buf.append(str1 + "|" + str2 + "|" + str3 + "|");
+      }
+      if (s.indexOf("oe") != -1) {
+        String str1 = s;
+        str1 = str1.replaceAll("oe", "\u0152");
+        String str2 = s;
+        str2 = str2.replaceAll("oe", "\u0153");
+        buf.append(str1 + "|" + str2 + "|");
+      }
+      if (s.indexOf("ss") != -1) {
+        String str1 = s;
+        str1 = str1.replaceAll("ss", "\u00df");
+        buf.append(str1 + "|");
+      }
+      boolean beginWord = true;
+      for (int i = 0; i < s.length(); i++) {
+        char c = s.charAt(i);
+        if (! beginWord) 
+          c = Character.toLowerCase(c);
+        beginWord = Character.isWhitespace(c);
+        String replace = new String();
+        switch (c) {
+          case 'a': replace = "[a\u00c0\u00c1\u00c2\u00c4\u00e0\u00e1\u00e2\u00e4]"; break; 
+          case 'c': replace = "[c\u00c7\u00e7]"; break;
+          case 'e': replace = "[e\u00c8\u00c9\u00ca\u00cb\u00e8\u00e9\u00ea\u00eb\u0113\u0115\u1ebd]"; break;  // last entry is U+1EBD (e with tilde), same as the Latin branch
+          case 'i': replace = "[i\u00cc\u00cd\u00ce\u00cf\u00ec\u00ed\u00ee\u00ef\u012a\u012b\u012c\u012d]"; break;
+          case 'o': replace = "[o\u00d2\u00d3\u00d4\u00d6\u00f2\u00f3\u00f4\u00f6\u014c\u014d\u014e\u014f]"; break;  // nothing may follow the class, or every 'o' would require that extra character
+          case 'u': replace = "[u\u00d9\u00da\u00db\u00dc\u00f9\u00fa\u00fb\u00fc\u016a\u016b\u016c\u016d]"; break; 
+          case 's': replace = "[s\u017f]"; break; 
+          default: replace += c; break;
+        }
+        buf.append(replace);
+      }
+      return buf.toString();
+    } else if (language.equals("de")) {
+      StringBuffer buf = new StringBuffer();
+      if (s.indexOf("ss") != -1) {
+        String str1 = s;
+        str1 = str1.replaceAll("ss", "\u00df");
+        buf.append(str1 + "|");
+      }
+      if (s.indexOf("ae") != -1) {
+        String str1 = s;
+        str1 = str1.replaceAll("ae", "\u00e4");
+        buf.append(str1 + "|");
+      }
+      if (s.indexOf("oe") != -1) {
+        String str1 = s;
+        str1 = str1.replaceAll("oe", "\u00f6");
+        buf.append(str1 + "|");
+      }
+      if (s.indexOf("ue") != -1) {
+        String str1 = s;
+        str1 = str1.replaceAll("ue", "\u00fc");
+        buf.append(str1 + "|");
+      }
+      boolean beginWord = true;
+      for (int i = 0; i < s.length(); i++) {
+        char c = s.charAt(i);
+        if (! beginWord) 
+          c = Character.toLowerCase(c);
+        beginWord = Character.isWhitespace(c);
+        String replace = new String();
+        switch (c) {
+          case 'e': replace = "[e\u00e9]"; break; 
+          default: replace += c; break;
+        }
+        buf.append(replace);
+      }
+      return buf.toString();
+    } else {      // unknown or no language
+      return s;
+    }
+  }
+
+  private String normalize(String s, int mode) {  // runs s through the MpdlNormalizerLex* lexer chosen by this.language, started in the lexer state that corresponds to mode
+    String inputStr = s;
+    StringReader strReader = new StringReader(inputStr + "\n");  // NOTE(review): the appended newline is presumably needed by the lexer rules - confirm
+    String retStr = "";
+    String token = "";
+    try {
+      if (Language.getInstance().isLatin(language)) {
+        MpdlNormalizerLexLA mpdlNormalizerLex = new MpdlNormalizerLexLA(strReader);
+        if (mode == DISPLAY)
+          mpdlNormalizerLex.yybegin(MpdlNormalizerLexLA.DISP);
+        else if (mode == DICTIONARY)
+          mpdlNormalizerLex.yybegin(MpdlNormalizerLexLA.DICT);
+        else if (mode == SEARCH)
+          mpdlNormalizerLex.yybegin(MpdlNormalizerLexLA.SEARCH);
+        while (token != null) {  // concatenate the tokens until yylex() yields null
+          token = mpdlNormalizerLex.yylex();
+          if (token != null)
+            retStr += token;
+        }
+      } else if (Language.getInstance().isArabic(language)) {
+        MpdlNormalizerLexAR mpdlNormalizerLex = new MpdlNormalizerLexAR(strReader);
+        if (mode == DISPLAY)
+          mpdlNormalizerLex.yybegin(MpdlNormalizerLexAR.DISP);
+        else if (mode == DICTIONARY)
+          mpdlNormalizerLex.yybegin(MpdlNormalizerLexAR.DICT);
+        else if (mode == SEARCH)
+          mpdlNormalizerLex.yybegin(MpdlNormalizerLexAR.SEARCH);
+        while (token != null) {
+          token = mpdlNormalizerLex.yylex();
+          if (token != null)
+            retStr += token;
+        }
+      } else if (Language.getInstance().isGerman(language)) {
+        MpdlNormalizerLexDE mpdlNormalizerLex = new MpdlNormalizerLexDE(strReader);
+        if (mode == DISPLAY)
+          mpdlNormalizerLex.yybegin(MpdlNormalizerLexDE.DISP);
+        else if (mode == DICTIONARY)
+          mpdlNormalizerLex.yybegin(MpdlNormalizerLexDE.CELEX);  // German maps DICTIONARY mode to the CELEX state
+        else if (mode == SEARCH)
+          mpdlNormalizerLex.yybegin(MpdlNormalizerLexDE.SEARCH);
+        while (token != null) {
+          token = mpdlNormalizerLex.yylex();
+          if (token != null)
+            retStr += token;
+        }
+      } else if (Language.getInstance().isGreek(language)) {
+        MpdlNormalizerLexEL mpdlNormalizerLex = new MpdlNormalizerLexEL(strReader);
+        if (mode == DISPLAY)
+          mpdlNormalizerLex.yybegin(MpdlNormalizerLexEL.DISP);
+        else if (mode == DICTIONARY)
+          mpdlNormalizerLex.yybegin(MpdlNormalizerLexEL.SIGMA);  // Greek maps DICTIONARY mode to the SIGMA state
+        else if (mode == SEARCH)
+          mpdlNormalizerLex.yybegin(MpdlNormalizerLexEL.SEARCH);
+        while (token != null) {
+          token = mpdlNormalizerLex.yylex();
+          if (token != null)
+            retStr += token;
+        }
+      } else if (Language.getInstance().isEnglish(language)) {
+        MpdlNormalizerLexEN mpdlNormalizerLex = new MpdlNormalizerLexEN(strReader);
+        if (mode == DISPLAY)
+          mpdlNormalizerLex.yybegin(MpdlNormalizerLexEN.DISP);
+        else if (mode == DICTIONARY)
+          mpdlNormalizerLex.yybegin(MpdlNormalizerLexEN.DICT);
+        else if (mode == SEARCH)
+          mpdlNormalizerLex.yybegin(MpdlNormalizerLexEN.SEARCH);
+        while (token != null) {
+          token = mpdlNormalizerLex.yylex();
+          if (token != null)
+            retStr += token;
+        }
+      } else if (Language.getInstance().isFrench(language)) {
+        MpdlNormalizerLexFR mpdlNormalizerLex = new MpdlNormalizerLexFR(strReader);
+        if (mode == DISPLAY)
+          mpdlNormalizerLex.yybegin(MpdlNormalizerLexFR.DISP);
+        else if (mode == DICTIONARY)
+          mpdlNormalizerLex.yybegin(MpdlNormalizerLexFR.CELEX);  // French maps DICTIONARY mode to the CELEX state
+        else if (mode == SEARCH)
+          mpdlNormalizerLex.yybegin(MpdlNormalizerLexFR.SEARCH);
+        while (token != null) {
+          token = mpdlNormalizerLex.yylex();
+          if (token != null)
+            retStr += token;
+        }
+      } else if (Language.getInstance().isItalian(language)) {
+        MpdlNormalizerLexIT mpdlNormalizerLex = new MpdlNormalizerLexIT(strReader);
+        if (mode == DISPLAY)
+          mpdlNormalizerLex.yybegin(MpdlNormalizerLexIT.DISP);
+        else if (mode == DICTIONARY)
+          mpdlNormalizerLex.yybegin(MpdlNormalizerLexIT.DICT);
+        else if (mode == SEARCH)
+          mpdlNormalizerLex.yybegin(MpdlNormalizerLexIT.SEARCH);
+        while (token != null) {
+          token = mpdlNormalizerLex.yylex();
+          if (token != null)
+            retStr += token;
+        }
+      } else if (Language.getInstance().isDutch(language)) {
+        MpdlNormalizerLexNL mpdlNormalizerLex = new MpdlNormalizerLexNL(strReader);
+        if (mode == DISPLAY)
+          mpdlNormalizerLex.yybegin(MpdlNormalizerLexNL.DISP);
+        else if (mode == DICTIONARY)
+          mpdlNormalizerLex.yybegin(MpdlNormalizerLexNL.DICT);
+        else if (mode == SEARCH)
+          mpdlNormalizerLex.yybegin(MpdlNormalizerLexNL.SEARCH);
+        while (token != null) {
+          token = mpdlNormalizerLex.yylex();
+          if (token != null)
+            retStr += token;
+        }
+      } else if (Language.getInstance().isChinese(language)) {
+        MpdlNormalizerLexZH mpdlNormalizerLex = new MpdlNormalizerLexZH(strReader);
+        if (mode == DISPLAY)
+          mpdlNormalizerLex.yybegin(MpdlNormalizerLexZH.DISP);
+        else if (mode == DICTIONARY)
+          mpdlNormalizerLex.yybegin(MpdlNormalizerLexZH.DICT);
+        else if (mode == SEARCH)
+          mpdlNormalizerLex.yybegin(MpdlNormalizerLexZH.SEARCH);
+        while (token != null) {
+          token = mpdlNormalizerLex.yylex();
+          if (token != null)
+            retStr += token;
+        }
+      } else {
+        retStr = s;  // no lexer for this language: return the string unchanged
+      }
+    } catch (IOException e ) {
+      // deliberately ignored: reading from an in-memory StringReader is not expected to fail; if it does, the tokens collected so far are returned
+    }
+    return retStr;
+  }
+  
   /**
+   * Old code from Arboreal (Malcolm Hyman) 
    * Applies the normalization rules in <code>language</code> to
    * <code>s</code>, with offset tracking.<p>
    *
@@ -521,7 +790,7 @@
           case '\u00b3': replace = "3"; break;
           case '\u2074': replace = "4"; break;
           case '\u2075': replace = "5"; break;
-          // original by Malcolm Hyman: with the following replacements // TODO uncomment these 3 lines
+          // original by Malcolm Hyman: with the following replacements 
           // case '\u3000': replace = " "; break;
           // case '\u3001': replace = ","; break;
           // case '\u3002': replace = "."; break;
@@ -892,221 +1161,6 @@
     }
   }
   
-  public String deNormalizeToRegExpr(String s) {
-    // TODO all characters in all languages
-    if (language.equals("la") || language.equals("lat")) {
-      StringBuffer buf = new StringBuffer();
-      if (s.indexOf("ae") != -1) {
-        String str1 = s;
-        str1 = str1.replaceAll("ae", "\u0119");
-        String str2 = s;
-        str2 = str2.replaceAll("ae", "\u00c6");
-        String str3 = s;
-        str3 = str3.replaceAll("ae", "\u00e6");
-        buf.append(str1 + "|" + str2 + "|" + str3 + "|");
-      }
-      if (s.indexOf("oe") != -1) {
-        String str1 = s;
-        str1 = str1.replaceAll("oe", "\u0152");
-        String str2 = s;
-        str2 = str2.replaceAll("oe", "\u0153");
-        buf.append(str1 + "|" + str2 + "|");
-      }
-      if (s.indexOf("ss") != -1) {
-        String str1 = s;
-        str1 = str1.replaceAll("ss", "\u00df");
-        buf.append(str1 + "|");
-      }
-      boolean beginWord = true;
-      for (int i = 0; i < s.length(); i++) {
-        char c = s.charAt(i);
-        if (! beginWord) 
-          c = Character.toLowerCase(c);
-        beginWord = Character.isWhitespace(c);
-        String replace = new String();
-        switch (c) {
-          case 'a': replace = "[a\u00c0\u00c1\u00c2\u00c4\u00e0\u00e1\u00e2\u00e4]"; break; 
-          case 'c': replace = "[c\u00c7\u00e7]"; break;
-          case 'e': replace = "[e\u00c8\u00c9\u00ca\u00cb\u00e8\u00e9\u00ea\u00eb\u0113\u0115\u1ebd]"; break; 
-          case 'i': replace = "[ij\u00cc\u00cd\u00ce\u00cf\u00ec\u00ed\u00ee\u00ef\u012a\u012b\u012c\u012d]"; break;
-          case 'o': replace = "[o\u00d2\u00d3\u00d4\u00d6\u00f2\u00f3\u00f4\u00f6\u014c\u014d\u014e\u014f]"; break; 
-          case 'u': replace = "[uv\u00d9\u00da\u00db\u00dc\u00f9\u00fa\u00fb\u00fc\u016a\u016b\u016c\u016d]"; break; 
-          case 's': replace = "[s\u017f]"; break; 
-          default: replace += c; break;
-        }
-        buf.append(replace);
-      }
-      return buf.toString();
-    } else if (language.equals("en")) {
-      StringBuffer buf = new StringBuffer();
-      if (s.indexOf("ae") != -1) {
-        String str1 = s;
-        str1 = str1.replaceAll("ae", "\u0119");
-        String str2 = s;
-        str2 = str2.replaceAll("ae", "\u00c6");
-        String str3 = s;
-        str3 = str3.replaceAll("ae", "\u00e6");
-        buf.append(str1 + "|" + str2 + "|" + str3 + "|");
-      }
-      if (s.indexOf("oe") != -1) {
-        String str1 = s;
-        str1 = str1.replaceAll("oe", "\u0152");
-        String str2 = s;
-        str2 = str2.replaceAll("oe", "\u0153");
-        buf.append(str1 + "|" + str2 + "|");
-      }
-      if (s.indexOf("ss") != -1) {
-        String str1 = s;
-        str1 = str1.replaceAll("ss", "\u00df");
-        buf.append(str1 + "|");
-      }
-      boolean beginWord = true;
-      for (int i = 0; i < s.length(); i++) {
-        char c = s.charAt(i);
-        if (! beginWord) 
-          c = Character.toLowerCase(c);
-        beginWord = Character.isWhitespace(c);
-        String replace = new String();
-        switch (c) {
-          case 'a': replace = "[a\u00c0\u00c1\u00c2\u00c4\u00e0\u00e1\u00e2\u00e4]"; break; 
-          case 'c': replace = "[c\u00c7\u00e7]"; break;
-          case 'e': replace = "[e\u00c8\u00c9\u00ca\u00cb\u00e8\u00e9\u00ea\u00eb\u0113\u0115\u1e8d]"; break; 
-          case 'i': replace = "[i\u00cc\u00cd\u00ce\u00cf\u00ec\u00ed\u00ee\u00ef\u012a\u012b\u012c\u012d]"; break;
-          case 'o': replace = "[o\u00d2\u00d3\u00d4\u00d6\u00f2\u00f3\u00f4\u00f6\u014c\u014d\u014e\u014f]‚"; break; 
-          case 'u': replace = "[u\u00d9\u00da\u00db\u00dc\u00f9\u00fa\u00fb\u00fc\u016a\u016b\u016c\u016d]"; break; 
-          case 's': replace = "[s\u017f]"; break; 
-          default: replace += c; break;
-        }
-        buf.append(replace);
-      }
-      return buf.toString();
-    } else if (language.equals("de")) {
-      StringBuffer buf = new StringBuffer();
-      if (s.indexOf("ss") != -1) {
-        String str1 = s;
-        str1 = str1.replaceAll("ss", "\u00df");
-        buf.append(str1 + "|");
-      }
-      if (s.indexOf("ae") != -1) {
-        String str1 = s;
-        str1 = str1.replaceAll("ae", "\u00e4");
-        buf.append(str1 + "|");
-      }
-      if (s.indexOf("oe") != -1) {
-        String str1 = s;
-        str1 = str1.replaceAll("oe", "\u00f6");
-        buf.append(str1 + "|");
-      }
-      if (s.indexOf("ue") != -1) {
-        String str1 = s;
-        str1 = str1.replaceAll("ue", "\u00fc");
-        buf.append(str1 + "|");
-      }
-      boolean beginWord = true;
-      for (int i = 0; i < s.length(); i++) {
-        char c = s.charAt(i);
-        if (! beginWord) 
-          c = Character.toLowerCase(c);
-        beginWord = Character.isWhitespace(c);
-        String replace = new String();
-        switch (c) {
-          case 'e': replace = "[e\u00e9]"; break; 
-          default: replace += c; break;
-        }
-        buf.append(replace);
-      }
-      return buf.toString();
-    } else {      // unknown or no language
-      return s;
-    }
-  }
-
-  private String normalize4HumanReaders(String s) {
-    StringReader strReader = new StringReader(s + "\n");
-    String retStr = "";
-    String token = "";
-    try {
-      if (Language.getInstance().isLatin(language)) {
-        MpdlNormalizerLexLA mpdlNormalizerLex = new MpdlNormalizerLexLA(strReader);
-        mpdlNormalizerLex.yybegin(MpdlNormalizerLexLA.DISP);
-        while (token != null) {
-          token = mpdlNormalizerLex.yylex();
-          if (token != null)
-            retStr += token;
-        }
-      } else if (Language.getInstance().isArabic(language)) {
-        MpdlNormalizerLexAR mpdlNormalizerLex = new MpdlNormalizerLexAR(strReader);
-        mpdlNormalizerLex.yybegin(MpdlNormalizerLexAR.DISP);
-        while (token != null) {
-          token = mpdlNormalizerLex.yylex();
-          if (token != null)
-            retStr += token;
-        }
-      } else if (Language.getInstance().isGerman(language)) {
-        MpdlNormalizerLexDE mpdlNormalizerLex = new MpdlNormalizerLexDE(strReader);
-        mpdlNormalizerLex.yybegin(MpdlNormalizerLexDE.DISP);
-        while (token != null) {
-          token = mpdlNormalizerLex.yylex();
-          if (token != null)
-            retStr += token;
-        }
-      } else if (Language.getInstance().isGreek(language)) {
-        MpdlNormalizerLexEL mpdlNormalizerLex = new MpdlNormalizerLexEL(strReader);
-        mpdlNormalizerLex.yybegin(MpdlNormalizerLexEL.DISP);
-        while (token != null) {
-          token = mpdlNormalizerLex.yylex();
-          if (token != null)
-            retStr += token;
-        }
-      } else if (Language.getInstance().isEnglish(language)) {
-        MpdlNormalizerLexEN mpdlNormalizerLex = new MpdlNormalizerLexEN(strReader);
-        mpdlNormalizerLex.yybegin(MpdlNormalizerLexEN.DISP);
-        while (token != null) {
-          token = mpdlNormalizerLex.yylex();
-          if (token != null)
-            retStr += token;
-        }
-      } else if (Language.getInstance().isFrench(language)) {
-        MpdlNormalizerLexFR mpdlNormalizerLex = new MpdlNormalizerLexFR(strReader);
-        mpdlNormalizerLex.yybegin(MpdlNormalizerLexFR.DISP);
-        while (token != null) {
-          token = mpdlNormalizerLex.yylex();
-          if (token != null)
-            retStr += token;
-        }
-      } else if (Language.getInstance().isItalian(language)) {
-        MpdlNormalizerLexIT mpdlNormalizerLex = new MpdlNormalizerLexIT(strReader);
-        mpdlNormalizerLex.yybegin(MpdlNormalizerLexIT.DISP);
-        while (token != null) {
-          token = mpdlNormalizerLex.yylex();
-          if (token != null)
-            retStr += token;
-        }
-      } else if (Language.getInstance().isDutch(language)) {
-        MpdlNormalizerLexNL mpdlNormalizerLex = new MpdlNormalizerLexNL(strReader);
-        mpdlNormalizerLex.yybegin(MpdlNormalizerLexNL.DISP);
-        while (token != null) {
-          token = mpdlNormalizerLex.yylex();
-          if (token != null)
-            retStr += token;
-        }
-      } else if (Language.getInstance().isChinese(language)) {
-        MpdlNormalizerLexZH mpdlNormalizerLex = new MpdlNormalizerLexZH(strReader);
-        mpdlNormalizerLex.yybegin(MpdlNormalizerLexZH.DISP);
-        while (token != null) {
-          token = mpdlNormalizerLex.yylex();
-          if (token != null)
-            retStr += token;
-        }
-      } else {
-        return normalize4Lexica(s, null);  // old function
-      }
-    } catch (IOException e ) {
-      // nothing cause IOException is not needed for a StringReader
-    }
-    return retStr;
-  }
-
   /*
   // explicit words
   normStr = normStr.replaceAll("aliàs", "alias");
@@ -1165,7 +1219,6 @@
     } else if (Language.getInstance().isLatin(language)) {
       retStr = "AEIOUaeiouÆœęàèòù";
     }
-    // TODO all languages
     return retStr;
   }
 
@@ -1180,7 +1233,6 @@
                "bcdfghklmnpqrstvwxz" +
                "ſß";  // long/sharp S
     } 
-    // TODO all languages
     return retStr;
   }
 
diff -r 469d927b9ca7 -r 5df60f24e997 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizer.java
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizer.java	Tue Apr 19 16:51:59 2011 +0200
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizer.java	Mon Aug 29 17:40:02 2011 +0200
@@ -59,10 +59,15 @@
       case ')': isTokenChar = false; break;
       case '[': isTokenChar = false; break;
       case ']': isTokenChar = false; break;
+      case '{': isTokenChar = false; break;
+      case '}': isTokenChar = false; break;
       case '<': isTokenChar = false; break;
       case '>': isTokenChar = false; break;
+      case '/': isTokenChar = false; break;
+      case '=': isTokenChar = false; break;
       case '&': isTokenChar = false; break;
       case '+': isTokenChar = false; break;
+      case '#': isTokenChar = false; break;
       case '"': isTokenChar = false; break;
       case '„': isTokenChar = false; break;
       case '“': isTokenChar = false; break;
@@ -71,6 +76,7 @@
       case '\'': isTokenChar = false; break;
       case '\t': isTokenChar = false; break; // do not break words which have tabs in it
       case '\n': isTokenChar = false; break;  // do not break words which are on another line 
+      case '\u2425': isTokenChar = false; break;  // special char for marking xml elements 
     }
     return isTokenChar;
   }
@@ -80,8 +86,9 @@
     if (isInNotWordDelimMode) {
       switch (c) {
         case ' ': isTokenCharInNotWordDelimMode = true; break;
+        case '-': isTokenCharInNotWordDelimMode = true; break;
         case '\t': isTokenCharInNotWordDelimMode = true; break; 
-        case '\n': isTokenCharInNotWordDelimMode = true; break; 
+        case '\n': isTokenCharInNotWordDelimMode = true; break;
       }
     }
     return isTokenCharInNotWordDelimMode;
@@ -206,7 +213,8 @@
         return flush();
       else
         c = ioBuffer[bufferIndex++];
-      switch(Character.getType(c)) {
+      int charType = Character.getType(c);
+      switch(charType) {
         case Character.DECIMAL_DIGIT_NUMBER:
         case Character.LOWERCASE_LETTER:
         case Character.UPPERCASE_LETTER:
@@ -222,6 +230,11 @@
           }
           push(c);
           return flush();
+        case Character.SURROGATE:  // newly added: fix for tickets 121/117: recognition of code points above U+FFFF
+          push(c);
+          if (length == MAX_WORD_LEN) 
+            return flush();
+          break;
         default:
           if (length>0) 
             return flush();
diff -r 469d927b9ca7 -r 5df60f24e997 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexAR.java
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexAR.java	Tue Apr 19 16:51:59 2011 +0200
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexAR.java	Mon Aug 29 17:40:02 2011 +0200
@@ -1,12 +1,11 @@
-/* The following code was generated by JFlex 1.4.3 on 22.02.11 12:02 */
+/* The following code was generated by JFlex 1.4.3 on 21.07.11 11:22 */
 
 /*
  * Normalization rules for Arabic text
  * [this is a JFlex specification]
  *
  * Wolfgang Schmidle 
- * version 0.96
- * 2011-02-21
+ * version 2011-02-28
  *
  */
 
@@ -16,7 +15,7 @@
 /**
  * This class is a scanner generated by 
  * <a href="http://www.jflex.de/">JFlex</a> 1.4.3
- * on 22.02.11 12:02 from the specification file
+ * on 21.07.11 11:22 from the specification file
  * <tt>MpdlNormalizerLexAR.lex</tt>
  */
 public class MpdlNormalizerLexAR {
@@ -40,14 +39,16 @@
    * l is of the form l = 2*k, k a non negative integer
    */
   private static final int ZZ_LEXSTATE[] = { 
-     0,  0,  1,  1,  2,  2,  1, 1
+     0,  0,  1,  1,  2,  2,  3, 3
   };
 
   /** 
    * Translates characters to character classes
    */
   private static final String ZZ_CMAP_PACKED = 
-    "\12\0\1\1\65\0\1\2\uffbf\0";
+    "\12\0\1\3\25\0\1\2\14\0\1\1\2\0\1\1\17\0\1\4"+
+    "\40\0\1\1\2\0\1\1\20\0\1\1\5\0\1\1\1\0\1\1"+
+    "\uff82\0";
 
   /** 
    * Translates characters to character classes
@@ -60,10 +61,10 @@
   private static final int [] ZZ_ACTION = zzUnpackAction();
 
   private static final String ZZ_ACTION_PACKED_0 =
-    "\3\0\1\1\1\2\1\3\1\4";
+    "\4\0\2\1\1\2\1\3\1\4\1\5";
 
   private static int [] zzUnpackAction() {
-    int [] result = new int[7];
+    int [] result = new int[10];
     int offset = 0;
     offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
     return result;
@@ -88,10 +89,11 @@
   private static final int [] ZZ_ROWMAP = zzUnpackRowMap();
 
   private static final String ZZ_ROWMAP_PACKED_0 =
-    "\0\0\0\3\0\6\0\11\0\11\0\11\0\11";
+    "\0\0\0\5\0\12\0\17\0\24\0\31\0\24\0\24"+
+    "\0\24\0\24";
 
   private static int [] zzUnpackRowMap() {
-    int [] result = new int[7];
+    int [] result = new int[10];
     int offset = 0;
     offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
     return result;
@@ -114,11 +116,12 @@
   private static final int [] ZZ_TRANS = zzUnpackTrans();
 
   private static final String ZZ_TRANS_PACKED_0 =
-    "\1\4\1\0\1\5\1\4\1\6\1\5\1\4\1\7"+
-    "\1\5\3\0";
+    "\1\5\1\6\1\5\1\0\1\7\1\5\1\6\1\5"+
+    "\1\10\1\7\1\5\1\6\1\5\1\11\1\7\1\5"+
+    "\1\6\1\5\1\12\1\7\7\0\1\5\2\0";
 
   private static int [] zzUnpackTrans() {
-    int [] result = new int[12];
+    int [] result = new int[30];
     int offset = 0;
     offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
     return result;
@@ -156,10 +159,10 @@
   private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
 
   private static final String ZZ_ATTRIBUTE_PACKED_0 =
-    "\3\0\4\11";
+    "\4\0\1\11\1\1\4\11";
 
   private static int [] zzUnpackAttribute() {
-    int [] result = new int[7];
+    int [] result = new int[10];
     int offset = 0;
     offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
     return result;
@@ -236,6 +239,8 @@
 		normalized += norm;
 	}
 
+	private static final String LB = "[\u002d\u00ad] ";
+
 
   /**
    * Creates a new scanner
@@ -267,7 +272,7 @@
     char [] map = new char[0x10000];
     int i = 0;  /* index in packed string  */
     int j = 0;  /* index in unpacked array */
-    while (i < 10) {
+    while (i < 42) {
       int  count = packed.charAt(i++);
       char value = packed.charAt(i++);
       do map[j++] = value; while (--count > 0);
@@ -534,28 +539,35 @@
       zzMarkedPos = zzMarkedPosL;
 
       switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
+        case 5: 
+          { switch (problem) {
+			case 1: return original;
+			default: return normalized.replaceAll(LB, "");
+		}
+          }
+        case 6: break;
         case 4: 
           { switch (problem) {
 			case 1: return "";
-			default: return normalized;
+			default: return normalized.replaceAll(LB, "");
 		}
           }
-        case 5: break;
+        case 7: break;
         case 2: 
           { problem = 1; add(yytext());
           }
-        case 6: break;
+        case 8: break;
         case 3: 
           { switch (problem) {
 			case 1: return original;
 			default: return normalized;
 		}
           }
-        case 7: break;
+        case 9: break;
         case 1: 
           { add(yytext());
           }
-        case 8: break;
+        case 10: break;
         default: 
           if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
             zzAtEOF = true;
diff -r 469d927b9ca7 -r 5df60f24e997 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexAR.lex
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexAR.lex	Tue Apr 19 16:51:59 2011 +0200
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexAR.lex	Mon Aug 29 17:40:02 2011 +0200
@@ -3,8 +3,7 @@
  * [this is a JFlex specification]
  *
  * Wolfgang Schmidle 
- * version 0.96
- * 2011-02-21
+ * version 2011-02-28
  *
  */
 
@@ -30,17 +29,24 @@
 		original += yytext(); 
 		normalized += norm;
 	}
+
+	private static final String LB = "[\u002d\u00ad] ";
 %}
 
+hyphen = [-\u00ad]  // hyphen and soft hyphen (4-digit escape: JFlex 1.4.3 does not support the braced form)
+LB = {hyphen} \u0020
+// lb = ({hyphen} \u0020)?
+
 END = \n
 
 %%
 
 @ { problem = 1; add(yytext()); }
+{LB} { add(yytext()); }
 . { add(yytext()); }
 
 
-<DISP, SEARCH> {
+<DISP> {
 
 {END} {
 		switch (problem) {
@@ -55,7 +61,17 @@
 {END} {
 		switch (problem) {
 			case 1: return "";
-			default: return normalized;
+			default: return normalized.replaceAll(LB, "");
+		}
+	}
+}
+
+<SEARCH> {
+
+{END} {
+		switch (problem) {
+			case 1: return original;
+			default: return normalized.replaceAll(LB, "");
 		}
 	}
 }
@@ -65,7 +81,7 @@
 
 Annahmen:
 - die Routine wird wortweise aufgerufen, mit einem \n am Ende des Strings
-- Wörter mit Zeilenumbrüchen wurden bereits wieder zusammengesetzt
+- Zeilenumbrüche innerhalb des Wortes sind als hyphen/soft hyphen plus space markiert
 
 TO DO:
 
diff -r 469d927b9ca7 -r 5df60f24e997 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexDE.java
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexDE.java	Tue Apr 19 16:51:59 2011 +0200
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexDE.java	Mon Aug 29 17:40:02 2011 +0200
@@ -1,12 +1,11 @@
-/* The following code was generated by JFlex 1.4.3 on 22.02.11 12:03 */
+/* The following code was generated by JFlex 1.4.3 on 03.08.11 18:24 */
 
 /*
  * Normalization rules for German text
  * [this is a JFlex specification]
  *
  * Wolfgang Schmidle 
- * version 0.96
- * 2011-02-21
+ * version 2011-07-12
  *
  */
 
@@ -16,7 +15,7 @@
 /**
  * This class is a scanner generated by 
  * <a href="http://www.jflex.de/">JFlex</a> 1.4.3
- * on 22.02.11 12:03 from the specification file
+ * on 03.08.11 18:24 from the specification file
  * <tt>MpdlNormalizerLexDE.lex</tt>
  */
 public class MpdlNormalizerLexDE {
@@ -42,17 +41,18 @@
    * l is of the form l = 2*k, k a non negative integer
    */
   private static final int ZZ_LEXSTATE[] = { 
-     0,  0,  1,  1,  2,  2,  1,  1,  3,  3,  4, 4
+     0,  0,  1,  1,  2,  2,  3,  3,  4,  4,  5, 5
   };
 
   /** 
    * Translates characters to character classes
    */
   private static final String ZZ_CMAP_PACKED = 
-    "\12\0\1\1\65\0\1\15\32\2\6\0\1\6\15\2\1\10\5\2"+
-    "\1\4\5\2\111\0\1\11\21\0\1\12\5\0\1\13\2\0\1\14"+
-    "\4\0\1\11\21\0\1\12\5\0\1\13\202\0\1\3\u01e4\0\1\7"+
-    "\1\0\1\5\ufc99\0";
+    "\12\0\1\3\25\0\1\2\14\0\1\1\2\0\1\1\17\0\1\20"+
+    "\32\4\6\0\1\11\2\4\1\5\12\4\1\13\5\4\1\7\5\4"+
+    "\1\1\1\0\1\1\106\0\1\14\21\0\1\15\5\0\1\16\2\0"+
+    "\1\17\4\0\1\14\21\0\1\15\5\0\1\16\202\0\1\6\u01e4\0"+
+    "\1\12\1\0\1\10\ufc99\0";
 
   /** 
    * Translates characters to character classes
@@ -65,12 +65,12 @@
   private static final int [] ZZ_ACTION = zzUnpackAction();
 
   private static final String ZZ_ACTION_PACKED_0 =
-    "\5\0\1\1\1\2\1\3\1\4\3\1\1\5\3\1"+
-    "\1\6\1\7\1\10\1\11\1\12\1\13\1\14\1\15"+
-    "\1\16";
+    "\6\0\2\1\1\2\1\3\1\4\3\1\1\5\1\6"+
+    "\1\3\3\1\1\7\1\10\1\11\1\12\1\13\1\14"+
+    "\1\15\1\16\1\17";
 
   private static int [] zzUnpackAction() {
-    int [] result = new int[25];
+    int [] result = new int[29];
     int offset = 0;
     offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
     return result;
@@ -95,13 +95,13 @@
   private static final int [] ZZ_ROWMAP = zzUnpackRowMap();
 
   private static final String ZZ_ROWMAP_PACKED_0 =
-    "\0\0\0\16\0\34\0\52\0\70\0\106\0\106\0\106"+
-    "\0\106\0\124\0\142\0\160\0\106\0\176\0\214\0\232"+
-    "\0\106\0\106\0\106\0\106\0\106\0\106\0\106\0\106"+
-    "\0\106";
+    "\0\0\0\21\0\42\0\63\0\104\0\125\0\146\0\167"+
+    "\0\146\0\146\0\146\0\210\0\231\0\252\0\146\0\146"+
+    "\0\167\0\273\0\314\0\335\0\146\0\146\0\146\0\146"+
+    "\0\146\0\146\0\146\0\146\0\146";
 
   private static int [] zzUnpackRowMap() {
-    int [] result = new int[25];
+    int [] result = new int[29];
     int offset = 0;
     offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
     return result;
@@ -124,18 +124,23 @@
   private static final int [] ZZ_TRANS = zzUnpackTrans();
 
   private static final String ZZ_TRANS_PACKED_0 =
-    "\1\6\1\0\1\6\1\7\11\6\1\10\1\6\1\11"+
-    "\1\6\1\7\1\12\1\6\1\13\1\6\1\14\4\6"+
-    "\1\10\1\6\1\15\1\6\1\7\1\12\1\6\1\13"+
-    "\1\6\1\14\4\6\2\10\1\15\1\6\1\7\1\16"+
-    "\1\10\1\17\1\10\1\20\1\21\1\22\1\23\1\24"+
-    "\1\10\1\6\1\15\1\6\1\7\1\12\1\6\1\13"+
-    "\1\6\1\14\3\6\1\25\1\10\23\0\1\26\1\0"+
-    "\1\27\15\0\1\30\15\0\1\31\13\0\1\26\1\0"+
-    "\1\23\15\0\1\21\15\0\1\22\6\0";
+    "\1\7\1\10\1\7\1\0\1\7\1\10\1\11\1\10"+
+    "\1\7\1\10\6\7\1\12\1\7\1\10\1\7\1\13"+
+    "\1\7\1\10\1\11\1\14\1\7\1\15\1\7\1\16"+
+    "\4\7\1\12\1\7\1\10\1\7\1\17\1\7\1\10"+
+    "\1\11\1\14\1\7\1\15\1\7\1\16\4\7\1\12"+
+    "\1\7\1\10\1\7\1\20\1\7\1\10\1\11\1\14"+
+    "\1\7\1\15\1\7\1\16\4\7\2\12\1\21\1\12"+
+    "\1\17\1\7\1\10\1\11\1\22\1\12\1\23\1\12"+
+    "\1\24\1\25\1\26\1\27\1\30\1\12\1\7\1\10"+
+    "\1\7\1\17\1\7\1\10\1\11\1\14\1\7\1\15"+
+    "\1\7\1\16\3\7\1\31\1\12\23\0\1\7\20\0"+
+    "\1\7\5\0\1\32\1\0\1\33\10\0\1\7\7\0"+
+    "\1\34\20\0\1\35\10\0\1\7\5\0\1\32\1\0"+
+    "\1\27\10\0\1\7\7\0\1\25\20\0\1\26\6\0";
 
   private static int [] zzUnpackTrans() {
-    int [] result = new int[168];
+    int [] result = new int[238];
     int offset = 0;
     offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
     return result;
@@ -173,10 +178,10 @@
   private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
 
   private static final String ZZ_ATTRIBUTE_PACKED_0 =
-    "\5\0\4\11\3\1\1\11\3\1\11\11";
+    "\6\0\1\11\1\1\3\11\3\1\2\11\4\1\11\11";
 
   private static int [] zzUnpackAttribute() {
-    int [] result = new int[25];
+    int [] result = new int[29];
     int offset = 0;
     offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
     return result;
@@ -253,6 +258,8 @@
 		normalized += norm;
 	}
 
+	private static final String LB = "[\u002d\u00ad] ";
+
 
   /**
    * Creates a new scanner
@@ -284,7 +291,7 @@
     char [] map = new char[0x10000];
     int i = 0;  /* index in packed string  */
     int j = 0;  /* index in unpacked array */
-    while (i < 66) {
+    while (i < 88) {
       int  count = packed.charAt(i++);
       char value = packed.charAt(i++);
       do map[j++] = value; while (--count > 0);
@@ -551,68 +558,75 @@
       zzMarkedPos = zzMarkedPosL;
 
       switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
-        case 10: 
+        case 11: 
           { add("sz");
           }
-        case 15: break;
+        case 16: break;
         case 3: 
           { problem = 1; add(yytext());
           }
-        case 16: break;
-        case 6: 
+        case 17: break;
+        case 7: 
           { add("ae");
           }
-        case 17: break;
+        case 18: break;
         case 2: 
           { add("s");
           }
-        case 18: break;
+        case 19: break;
         case 4: 
           { switch (problem) {
 			case 1: return original;
 			default: return normalized;
 		}
           }
-        case 19: break;
-        case 12: 
+        case 20: break;
+        case 13: 
           { add("ü");
           }
-        case 20: break;
-        case 8: 
-          { add("ue");
-          }
         case 21: break;
-        case 11: 
-          { add("u");
+        case 9: 
+          { add("ue");
           }
         case 22: break;
-        case 13: 
+        case 6: 
+          { switch (problem) {
+			case 1: return original;
+			default: return normalized.replaceAll(LB, "").toLowerCase();
+		}
+          }
+        case 23: break;
+        case 12: 
+          { add("u");
+          }
+        case 24: break;
+        case 14: 
           { add("ä");
           }
-        case 23: break;
+        case 25: break;
         case 1: 
           { add(yytext());
           }
-        case 24: break;
-        case 9: 
+        case 26: break;
+        case 10: 
           { add("ss");
           }
-        case 25: break;
-        case 7: 
+        case 27: break;
+        case 8: 
           { add("oe");
           }
-        case 26: break;
-        case 14: 
+        case 28: break;
+        case 15: 
           { add("ö");
           }
-        case 27: break;
+        case 29: break;
         case 5: 
           { switch (problem) {
 			case 1: return "";
-			default: return normalized;
+			default: return normalized.replaceAll(LB, "");
 		}
           }
-        case 28: break;
+        case 30: break;
         default: 
           if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
             zzAtEOF = true;
diff -r 469d927b9ca7 -r 5df60f24e997 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexDE.lex
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexDE.lex	Tue Apr 19 16:51:59 2011 +0200
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexDE.lex	Mon Aug 29 17:40:02 2011 +0200
@@ -3,8 +3,7 @@
  * [this is a JFlex specification]
  *
  * Wolfgang Schmidle 
- * version 0.96
- * 2011-02-21
+ * version 2011-07-12
  *
  */
 
@@ -31,8 +30,14 @@
 		original += yytext(); 
 		normalized += norm;
 	}
+
+	private static final String LB = "[\u002d\u00ad] ";
 %}
 
+hyphen = [-\u00ad]  // hyphen and soft hyphen (4-digit escape: JFlex 1.4.3 does not support the braced form)
+LB = {hyphen} \u0020
+// lb = ({hyphen} \u0020)?
+
 END = \n
 
 Alphabet = [abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ]
@@ -79,10 +84,11 @@
 // default
 
 @ { problem = 1; add(yytext()); }
+{LB} { add(yytext()); }
 . { add(yytext()); }
 
 
-<DISP, SEARCH> {
+<DISP> {
 
 {END} {
 		switch (problem) {
@@ -97,17 +103,28 @@
 {END} {
 		switch (problem) {
 			case 1: return "";
-			default: return normalized;
+			default: return normalized.replaceAll(LB, "");
 		}
 	}
 }
 
+<SEARCH> {
+
+{END} {
+		switch (problem) {
+			case 1: return original;
+			default: return normalized.replaceAll(LB, "").toLowerCase();
+		}
+	}
+}
+
+
 
 /*
 
 Annahmen:
 - die Routine wird wortweise aufgerufen, mit einem \n am Ende des Strings
-- Wörter mit Zeilenumbrüchen wurden bereits wieder zusammengesetzt
+- Zeilenumbrüche innerhalb des Wortes sind als hyphen/soft hyphen plus space markiert
 
 TO DO:
 
diff -r 469d927b9ca7 -r 5df60f24e997 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexEL.java
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexEL.java	Tue Apr 19 16:51:59 2011 +0200
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexEL.java	Mon Aug 29 17:40:02 2011 +0200
@@ -1,12 +1,11 @@
-/* The following code was generated by JFlex 1.4.3 on 22.02.11 12:03 */
+/* The following code was generated by JFlex 1.4.3 on 03.08.11 18:23 */
 
 /*
  * Normalization rules for Greek text
  * [this is a JFlex specification]
  *
  * Wolfgang Schmidle 
- * version 0.96
- * 2011-02-21
+ * version 2011-08-03
  *
  */
 
@@ -16,7 +15,7 @@
 /**
  * This class is a scanner generated by 
  * <a href="http://www.jflex.de/">JFlex</a> 1.4.3
- * on 22.02.11 12:03 from the specification file
+ * on 03.08.11 18:23 from the specification file
  * <tt>MpdlNormalizerLexEL.lex</tt>
  */
 public class MpdlNormalizerLexEL {
@@ -31,6 +30,7 @@
   public static final int SEARCH = 6;
   public static final int DICT = 4;
   public static final int YYINITIAL = 0;
+  public static final int SIGMA = 8;
   public static final int DISP = 2;
 
   /**
@@ -40,18 +40,19 @@
    * l is of the form l = 2*k, k a non negative integer
    */
   private static final int ZZ_LEXSTATE[] = { 
-     0,  0,  1,  1,  2,  2,  3, 3
+     0,  0,  1,  1,  2,  2,  3,  3,  4, 4
   };
 
   /** 
    * Translates characters to character classes
    */
   private static final String ZZ_CMAP_PACKED = 
-    "\12\0\1\1\65\0\1\3\32\3\6\0\32\3\u0331\0\1\4\1\5"+
-    "\1\6\1\7\15\0\1\2\3\0\2\2\11\0\1\10\1\11\1\12"+
-    "\u1ba1\0\1\13\1\0\1\15\1\0\1\16\1\0\1\20\1\0\1\21"+
-    "\1\0\1\22\1\0\1\23\65\0\1\14\17\0\1\17\57\0\1\24"+
-    "\ue00d\0";
+    "\12\0\1\3\25\0\1\2\14\0\1\1\2\0\1\1\17\0\1\5"+
+    "\32\5\6\0\1\6\2\5\1\6\20\5\1\6\5\5\1\1\1\0"+
+    "\1\1\u032e\0\1\7\1\10\1\11\1\12\15\0\1\4\3\0\1\4"+
+    "\1\30\11\0\1\13\1\14\1\15\u1ba1\0\1\16\1\0\1\20\1\0"+
+    "\1\21\1\0\1\23\1\0\1\24\1\0\1\25\1\0\1\26\65\0"+
+    "\1\17\17\0\1\22\57\0\1\27\ue00d\0";
 
   /** 
    * Translates characters to character classes
@@ -64,14 +65,14 @@
   private static final int [] ZZ_ACTION = zzUnpackAction();
 
   private static final String ZZ_ACTION_PACKED_0 =
-    "\4\0\1\1\1\2\1\3\1\4\1\5\1\6\1\7"+
-    "\1\10\1\11\1\12\1\13\12\1\1\14\1\0\1\15"+
-    "\1\0\1\16\1\0\1\17\1\0\1\20\1\0\1\21"+
-    "\1\0\1\22\1\0\1\23\1\0\1\24\1\0\1\25"+
-    "\1\0";
+    "\5\0\2\1\2\2\1\3\1\4\1\5\1\6\1\7"+
+    "\1\10\1\11\1\12\1\13\12\1\1\14\1\15\1\16"+
+    "\1\0\1\17\1\0\1\20\1\0\1\21\1\0\1\22"+
+    "\1\0\1\23\1\0\1\24\1\0\1\25\1\0\1\26"+
+    "\1\0\1\27\1\0";
 
   private static int [] zzUnpackAction() {
-    int [] result = new int[45];
+    int [] result = new int[50];
     int offset = 0;
     offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
     return result;
@@ -96,15 +97,16 @@
   private static final int [] ZZ_ROWMAP = zzUnpackRowMap();
 
   private static final String ZZ_ROWMAP_PACKED_0 =
-    "\0\0\0\25\0\52\0\77\0\124\0\124\0\124\0\124"+
-    "\0\124\0\124\0\124\0\124\0\124\0\124\0\124\0\151"+
-    "\0\176\0\223\0\250\0\275\0\322\0\347\0\374\0\u0111"+
-    "\0\u0126\0\124\0\u013b\0\124\0\u0150\0\124\0\u0165\0\124"+
-    "\0\u017a\0\124\0\u018f\0\124\0\u01a4\0\124\0\u01b9\0\124"+
-    "\0\u01ce\0\124\0\u01e3\0\124\0\u01f8";
+    "\0\0\0\31\0\62\0\113\0\144\0\175\0\226\0\175"+
+    "\0\226\0\175\0\175\0\175\0\175\0\175\0\175\0\175"+
+    "\0\175\0\175\0\257\0\310\0\341\0\372\0\u0113\0\u012c"+
+    "\0\u0145\0\u015e\0\u0177\0\u0190\0\175\0\175\0\175\0\u01a9"+
+    "\0\175\0\u01c2\0\175\0\u01db\0\175\0\u01f4\0\175\0\u020d"+
+    "\0\175\0\u0226\0\175\0\u023f\0\175\0\u0258\0\175\0\u0271"+
+    "\0\175\0\u028a";
 
   private static int [] zzUnpackRowMap() {
-    int [] result = new int[45];
+    int [] result = new int[50];
     int offset = 0;
     offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
     return result;
@@ -127,23 +129,31 @@
   private static final int [] ZZ_TRANS = zzUnpackTrans();
 
   private static final String ZZ_TRANS_PACKED_0 =
-    "\1\5\1\0\24\5\1\6\1\5\1\7\1\10\1\11"+
-    "\1\12\1\13\1\14\1\15\1\16\13\5\1\17\1\5"+
-    "\1\7\1\10\1\11\1\12\1\13\1\14\1\15\1\16"+
-    "\1\20\1\21\1\22\1\23\1\24\1\25\1\26\1\27"+
-    "\1\30\1\31\1\5\1\6\1\5\1\7\1\10\1\11"+
-    "\1\12\1\13\1\14\1\15\1\16\1\20\1\21\1\22"+
-    "\1\23\1\24\1\25\1\26\1\27\1\30\1\31\26\0"+
-    "\1\32\1\33\23\0\1\34\1\35\23\0\1\36\1\37"+
-    "\23\0\1\40\1\41\23\0\1\42\1\43\23\0\1\44"+
-    "\1\45\23\0\1\46\1\47\23\0\1\50\1\51\23\0"+
-    "\1\52\1\53\23\0\1\54\1\55\23\0\1\32\24\0"+
-    "\1\34\24\0\1\36\24\0\1\40\24\0\1\42\24\0"+
-    "\1\44\24\0\1\46\24\0\1\50\24\0\1\52\24\0"+
-    "\1\54\23\0";
+    "\1\6\1\7\1\6\1\0\1\6\1\10\1\11\1\12"+
+    "\1\13\1\14\1\15\1\16\1\17\1\20\14\6\1\7"+
+    "\1\6\1\21\1\6\1\10\1\11\1\12\1\13\1\14"+
+    "\1\15\1\16\1\17\1\20\14\6\1\7\1\6\1\22"+
+    "\1\6\1\10\1\11\1\12\1\13\1\14\1\15\1\16"+
+    "\1\17\1\20\1\23\1\24\1\25\1\26\1\27\1\30"+
+    "\1\31\1\32\1\33\1\34\2\6\1\7\1\6\1\35"+
+    "\1\6\1\10\1\11\1\12\1\13\1\14\1\15\1\16"+
+    "\1\17\1\20\1\23\1\24\1\25\1\26\1\27\1\30"+
+    "\1\31\1\32\1\33\1\34\2\6\1\7\1\6\1\22"+
+    "\1\6\1\10\1\11\1\12\1\13\1\14\1\15\1\16"+
+    "\1\17\1\20\1\23\1\24\1\25\1\26\1\27\1\30"+
+    "\1\31\1\32\1\33\1\34\1\36\33\0\1\6\31\0"+
+    "\1\37\1\40\23\0\1\40\3\0\1\41\1\42\23\0"+
+    "\1\42\3\0\1\43\1\44\23\0\1\44\3\0\1\45"+
+    "\1\46\23\0\1\46\3\0\1\47\1\50\23\0\1\50"+
+    "\3\0\1\51\1\52\23\0\1\52\3\0\1\53\1\54"+
+    "\23\0\1\54\3\0\1\55\1\56\23\0\1\56\3\0"+
+    "\1\57\1\60\23\0\1\60\3\0\1\61\1\62\23\0"+
+    "\1\62\3\0\1\37\30\0\1\41\30\0\1\43\30\0"+
+    "\1\45\30\0\1\47\30\0\1\51\30\0\1\53\30\0"+
+    "\1\55\30\0\1\57\30\0\1\61\25\0";
 
   private static int [] zzUnpackTrans() {
-    int [] result = new int[525];
+    int [] result = new int[675];
     int offset = 0;
     offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
     return result;
@@ -181,12 +191,13 @@
   private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
 
   private static final String ZZ_ATTRIBUTE_PACKED_0 =
-    "\4\0\13\11\12\1\1\11\1\0\1\11\1\0\1\11"+
+    "\5\0\1\11\1\1\1\11\1\1\11\11\12\1\3\11"+
     "\1\0\1\11\1\0\1\11\1\0\1\11\1\0\1\11"+
-    "\1\0\1\11\1\0\1\11\1\0\1\11\1\0";
+    "\1\0\1\11\1\0\1\11\1\0\1\11\1\0\1\11"+
+    "\1\0\1\11\1\0";
 
   private static int [] zzUnpackAttribute() {
-    int [] result = new int[45];
+    int [] result = new int[50];
     int offset = 0;
     offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
     return result;
@@ -263,6 +274,8 @@
 		normalized += norm;
 	}
 
+	private static final String LB = "[\u002d\u00ad] ";  // regex for an in-word line break: hyphen or soft hyphen followed by a space
+
 
   /**
    * Creates a new scanner
@@ -294,7 +307,7 @@
     char [] map = new char[0x10000];
     int i = 0;  /* index in packed string  */
     int j = 0;  /* index in unpacked array */
-    while (i < 82) {
+    while (i < 112) {
       int  count = packed.charAt(i++);
       char value = packed.charAt(i++);
       do map[j++] = value; while (--count > 0);
@@ -561,116 +574,127 @@
       zzMarkedPos = zzMarkedPosL;
 
       switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
-        case 21: 
+        case 23: 
           // lookahead expression with fixed base length
           zzMarkedPos = zzStartRead + 1;
           { add("ῴ");
           }
-        case 22: break;
-        case 6: 
+        case 24: break;
+        case 5: 
           { add("ή");
           }
-        case 23: break;
-        case 15: 
+        case 25: break;
+        case 17: 
           // lookahead expression with fixed base length
           zzMarkedPos = zzStartRead + 1;
           { add("ή");
           }
-        case 24: break;
-        case 7: 
+        case 26: break;
+        case 13: 
+          { add("σ");
+          }
+        case 27: break;
+        case 6: 
           { add("ί");
           }
-        case 25: break;
+        case 28: break;
         case 1: 
           { add(yytext());
           }
-        case 26: break;
-        case 20: 
+        case 29: break;
+        case 22: 
           // lookahead expression with fixed base length
           zzMarkedPos = zzStartRead + 1;
           { add("ώ");
           }
-        case 27: break;
-        case 17: 
+        case 30: break;
+        case 11: 
+          { switch (problem) {
+			case 1: return "";
+			default: return normalized.replaceAll(LB, "");
+		}
+          }
+        case 31: break;
+        case 19: 
           // lookahead expression with fixed base length
           zzMarkedPos = zzStartRead + 1;
           { add("ί");
           }
-        case 28: break;
-        case 13: 
+        case 32: break;
+        case 15: 
           // lookahead expression with fixed base length
           zzMarkedPos = zzStartRead + 1;
           { add("ᾴ");
           }
-        case 29: break;
-        case 8: 
+        case 33: break;
+        case 7: 
           { add("ό");
           }
-        case 30: break;
-        case 12: 
+        case 34: break;
+        case 14: 
           // lookahead expression with fixed base length
           zzMarkedPos = zzStartRead + 1;
           { add("ά");
           }
-        case 31: break;
-        case 9: 
+        case 35: break;
+        case 12: 
+          { switch (problem) {
+			case 1: return original;
+			default: return normalized.replaceAll(LB, "").toLowerCase();
+		}
+          }
+        case 36: break;
+        case 8: 
           { add("ύ");
           }
-        case 32: break;
-        case 3: 
+        case 37: break;
+        case 2: 
           { problem = 1; add(yytext());
           }
-        case 33: break;
-        case 18: 
+        case 38: break;
+        case 20: 
           // lookahead expression with fixed base length
           zzMarkedPos = zzStartRead + 1;
           { add("ό");
           }
-        case 34: break;
-        case 4: 
+        case 39: break;
+        case 3: 
           { add("ά");
           }
-        case 35: break;
-        case 2: 
+        case 40: break;
+        case 10: 
           { switch (problem) {
 			case 1: return original;
 			default: return normalized;
 		}
           }
-        case 36: break;
-        case 10: 
+        case 41: break;
+        case 9: 
           { add("ώ");
           }
-        case 37: break;
-        case 14: 
+        case 42: break;
+        case 16: 
           // lookahead expression with fixed base length
           zzMarkedPos = zzStartRead + 1;
           { add("έ");
           }
-        case 38: break;
-        case 16: 
+        case 43: break;
+        case 18: 
           // lookahead expression with fixed base length
           zzMarkedPos = zzStartRead + 1;
           { add("ῄ");
           }
-        case 39: break;
-        case 5: 
+        case 44: break;
+        case 4: 
           { add("έ");
           }
-        case 40: break;
-        case 11: 
-          { switch (problem) {
-			case 1: return "";
-			default: return normalized;
-		}
-          }
-        case 41: break;
-        case 19: 
+        case 45: break;
+        case 21: 
           // lookahead expression with fixed base length
           zzMarkedPos = zzStartRead + 1;
           { add("ύ");
           }
-        case 42: break;
+        case 46: break;
         default: 
           if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
             zzAtEOF = true;
diff -r 469d927b9ca7 -r 5df60f24e997 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexEL.lex
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexEL.lex	Tue Apr 19 16:51:59 2011 +0200
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexEL.lex	Mon Aug 29 17:40:02 2011 +0200
@@ -3,8 +3,7 @@
  * [this is a JFlex specification]
  *
  * Wolfgang Schmidle 
- * version 0.96
- * 2011-02-21
+ * version 2011-08-03
  *
  */
 
@@ -20,6 +19,7 @@
 // Greek: el, grc
 
 %states DISP, DICT, SEARCH
+%state SIGMA
 
 %{
 	private String original = "";
@@ -30,8 +30,14 @@
 		original += yytext(); 
 		normalized += norm;
 	}
+
+	private static final String LB = "[\u002d\u00ad] ";  // regex for an in-word line break: hyphen or soft hyphen followed by a space
 %}
 
+hyphen = [-\u00ad]  // hyphen and soft hyphen (4-digit escape: JFlex 1.4.3 reads the braced form as literal characters)
+LB = {hyphen} \u0020
+// lb = ({hyphen} \u0020)?
+
 END = \n
 
 wordend =  [νρς]? {END}
@@ -41,9 +47,8 @@
 
 %%
 
-<DISP, DICT, SEARCH> {
 
-// replace tonos by oxia 
+// always replace tonos by oxia 
 // (although this should really be corrected in the text rather than normalized)
 ά { add("ά"); }
 έ { add("έ"); }
@@ -53,9 +58,8 @@
 ύ { add("ύ"); }
 ώ { add("ώ"); }
 
-}
 
-<DICT, SEARCH> {
+<DICT, SEARCH, SIGMA> {
 
 ὰ / {wordend} { add("ά"); }
 ᾲ / {wordend} { add("ᾴ"); }
@@ -72,20 +76,22 @@
 
 }
 
-<DISP, DICT, SEARCH> {
+<SIGMA> {
+
+ς { add("σ"); }
+
+}
+
+// default
 
 @ { problem = 1; add(yytext()); }
 {Latin} { problem = 1; add(yytext()); }
 
-}
-
-
-// default
-
+{LB} { add(yytext()); }
 . { add(yytext()); }
 
 
-<DISP, SEARCH> {
+<DISP> {
 
 {END} {
 		switch (problem) {
@@ -95,12 +101,22 @@
 	}
 }
 
-<DICT> {
+<DICT, SIGMA> {
 
 {END} {
 		switch (problem) {
 			case 1: return "";
-			default: return normalized;
+			default: return normalized.replaceAll(LB, "");
+		}
+	}
+}
+
+<SEARCH> {
+
+{END} {
+		switch (problem) {
+			case 1: return original;
+			default: return normalized.replaceAll(LB, "").toLowerCase();
 		}
 	}
 }
@@ -110,7 +126,7 @@
 
 Annahmen:
 - die Routine wird wortweise aufgerufen, mit einem \n am Ende des Strings
-- Wörter mit Zeilenumbrüchen wurden bereits wieder zusammengesetzt
+- Zeilenumbrüche innerhalb des Wortes sind als hyphen/soft hyphen plus space markiert
 
 TO DO:
 
diff -r 469d927b9ca7 -r 5df60f24e997 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexEN.java
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexEN.java	Tue Apr 19 16:51:59 2011 +0200
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexEN.java	Mon Aug 29 17:40:02 2011 +0200
@@ -1,12 +1,11 @@
-/* The following code was generated by JFlex 1.4.3 on 22.02.11 12:03 */
+/* The following code was generated by JFlex 1.4.3 on 21.07.11 11:22 */
 
 /*
  * Normalization rules for English text
  * [this is a JFlex specification]
  *
  * Wolfgang Schmidle 
- * version 0.96
- * 2011-02-21
+ * version 2011-07-12
  *
  */
 
@@ -16,7 +15,7 @@
 /**
  * This class is a scanner generated by 
  * <a href="http://www.jflex.de/">JFlex</a> 1.4.3
- * on 22.02.11 12:03 from the specification file
+ * on 21.07.11 11:22 from the specification file
  * <tt>MpdlNormalizerLexEN.lex</tt>
  */
 public class MpdlNormalizerLexEN {
@@ -40,14 +39,16 @@
    * l is of the form l = 2*k, k a non negative integer
    */
   private static final int ZZ_LEXSTATE[] = { 
-     0,  0,  1,  1,  2,  2,  1, 1
+     0,  0,  1,  1,  2,  2,  3, 3
   };
 
   /** 
    * Translates characters to character classes
    */
   private static final String ZZ_CMAP_PACKED = 
-    "\12\0\1\1\65\0\1\3\u013e\0\1\2\ufe80\0";
+    "\12\0\1\3\25\0\1\2\14\0\1\1\2\0\1\1\17\0\1\5"+
+    "\40\0\1\1\2\0\1\1\20\0\1\1\5\0\1\1\1\0\1\1"+
+    "\u0101\0\1\4\ufe80\0";
 
   /** 
    * Translates characters to character classes
@@ -60,10 +61,10 @@
   private static final int [] ZZ_ACTION = zzUnpackAction();
 
   private static final String ZZ_ACTION_PACKED_0 =
-    "\3\0\1\1\1\2\1\3\1\4\1\5";
+    "\4\0\2\1\1\2\1\3\1\4\1\5\1\6";
 
   private static int [] zzUnpackAction() {
-    int [] result = new int[8];
+    int [] result = new int[11];
     int offset = 0;
     offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
     return result;
@@ -88,10 +89,11 @@
   private static final int [] ZZ_ROWMAP = zzUnpackRowMap();
 
   private static final String ZZ_ROWMAP_PACKED_0 =
-    "\0\0\0\4\0\10\0\14\0\14\0\14\0\14\0\14";
+    "\0\0\0\6\0\14\0\22\0\30\0\36\0\30\0\30"+
+    "\0\30\0\30\0\30";
 
   private static int [] zzUnpackRowMap() {
-    int [] result = new int[8];
+    int [] result = new int[11];
     int offset = 0;
     offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
     return result;
@@ -114,11 +116,13 @@
   private static final int [] ZZ_TRANS = zzUnpackTrans();
 
   private static final String ZZ_TRANS_PACKED_0 =
-    "\1\4\1\0\1\4\1\5\1\4\1\6\1\7\1\5"+
-    "\1\4\1\10\1\7\1\5\4\0";
+    "\1\5\1\6\1\5\1\0\1\5\1\7\1\5\1\6"+
+    "\1\5\1\10\1\11\1\7\1\5\1\6\1\5\1\12"+
+    "\1\11\1\7\1\5\1\6\1\5\1\13\1\11\1\7"+
+    "\10\0\1\5\3\0";
 
   private static int [] zzUnpackTrans() {
-    int [] result = new int[16];
+    int [] result = new int[36];
     int offset = 0;
     offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
     return result;
@@ -156,10 +160,10 @@
   private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
 
   private static final String ZZ_ATTRIBUTE_PACKED_0 =
-    "\3\0\5\11";
+    "\4\0\1\11\1\1\5\11";
 
   private static int [] zzUnpackAttribute() {
-    int [] result = new int[8];
+    int [] result = new int[11];
     int offset = 0;
     offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
     return result;
@@ -236,6 +240,8 @@
 		normalized += norm;
 	}
 
+	private static final String LB = "[\u002d\u00ad] ";  // regex for an in-word line break: hyphen or soft hyphen followed by a space
+
 
   /**
    * Creates a new scanner
@@ -267,7 +273,7 @@
     char [] map = new char[0x10000];
     int i = 0;  /* index in packed string  */
     int j = 0;  /* index in unpacked array */
-    while (i < 14) {
+    while (i < 46) {
       int  count = packed.charAt(i++);
       char value = packed.charAt(i++);
       do map[j++] = value; while (--count > 0);
@@ -537,29 +543,36 @@
         case 5: 
           { switch (problem) {
 			case 1: return "";
-			default: return normalized;
+			default: return normalized.replaceAll(LB, "");
 		}
           }
-        case 6: break;
+        case 7: break;
         case 2: 
           { problem = 1; add(yytext());
           }
-        case 7: break;
+        case 8: break;
         case 4: 
           { add("s");
           }
-        case 8: break;
+        case 9: break;
         case 3: 
           { switch (problem) {
 			case 1: return original;
 			default: return normalized;
 		}
           }
-        case 9: break;
+        case 10: break;
+        case 6: 
+          { switch (problem) {
+			case 1: return original;
+			default: return normalized.replaceAll(LB, "").toLowerCase();
+		}
+          }
+        case 11: break;
         case 1: 
           { add(yytext());
           }
-        case 10: break;
+        case 12: break;
         default: 
           if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
             zzAtEOF = true;
diff -r 469d927b9ca7 -r 5df60f24e997 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexEN.lex
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexEN.lex	Tue Apr 19 16:51:59 2011 +0200
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexEN.lex	Mon Aug 29 17:40:02 2011 +0200
@@ -3,8 +3,7 @@
  * [this is a JFlex specification]
  *
  * Wolfgang Schmidle 
- * version 0.96
- * 2011-02-21
+ * version 2011-07-12
  *
  */
 
@@ -30,8 +29,14 @@
 		original += yytext(); 
 		normalized += norm;
 	}
+
+	private static final String LB = "[\u002d\u00ad] ";  // regex for an in-word line break: hyphen or soft hyphen followed by a space
 %}
 
+hyphen = [-\u00ad]  // hyphen and soft hyphen (4-digit escape: JFlex 1.4.3 reads the braced form as literal characters)
+LB = {hyphen} \u0020
+// lb = ({hyphen} \u0020)?
+
 END = \n
 
 %%
@@ -46,10 +51,11 @@
 // default
 
 @ { problem = 1; add(yytext()); }
+{LB} { add(yytext()); }
 . { add(yytext()); }
 
 
-<DISP, SEARCH> {
+<DISP> {
 
 {END} {
 		switch (problem) {
@@ -64,7 +70,17 @@
 {END} {
 		switch (problem) {
 			case 1: return "";
-			default: return normalized;
+			default: return normalized.replaceAll(LB, "");
+		}
+	}
+}
+
+<SEARCH> {
+
+{END} {
+		switch (problem) {
+			case 1: return original;
+			default: return normalized.replaceAll(LB, "").toLowerCase();
 		}
 	}
 }
@@ -74,7 +90,7 @@
 
 Annahmen:
 - die Routine wird wortweise aufgerufen, mit einem \n am Ende des Strings
-- Wörter mit Zeilenumbrüchen wurden bereits wieder zusammengesetzt
+- Zeilenumbrüche innerhalb des Wortes sind als hyphen/soft hyphen plus space markiert
 
 TO DO:
 
diff -r 469d927b9ca7 -r 5df60f24e997 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexFR.java
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexFR.java	Tue Apr 19 16:51:59 2011 +0200
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexFR.java	Mon Aug 29 17:40:02 2011 +0200
@@ -1,12 +1,11 @@
-/* The following code was generated by JFlex 1.4.3 on 22.02.11 12:03 */
+/* The following code was generated by JFlex 1.4.3 on 03.08.11 18:24 */
 
 /*
  * Normalization rules for French text
  * [this is a JFlex specification]
  *
  * Wolfgang Schmidle 
- * version 0.96
- * 2011-02-21
+ * version 2011-07-12
  *
  */
 
@@ -16,7 +15,7 @@
 /**
  * This class is a scanner generated by 
  * <a href="http://www.jflex.de/">JFlex</a> 1.4.3
- * on 22.02.11 12:03 from the specification file
+ * on 03.08.11 18:24 from the specification file
  * <tt>MpdlNormalizerLexFR.lex</tt>
  */
 public class MpdlNormalizerLexFR {
@@ -41,16 +40,18 @@
    * l is of the form l = 2*k, k a non negative integer
    */
   private static final int ZZ_LEXSTATE[] = { 
-     0,  0,  1,  1,  2,  2,  1,  1,  3, 3
+     0,  0,  1,  1,  2,  2,  3,  3,  4, 4
   };
 
   /** 
    * Translates characters to character classes
    */
   private static final String ZZ_CMAP_PACKED = 
-    "\12\0\1\1\65\0\1\15\32\2\6\0\32\2\144\0\1\4\3\7"+
-    "\3\0\1\5\1\0\3\10\1\0\3\11\3\0\3\12\4\0\3\13"+
-    "\126\0\2\6\53\0\1\3\u1e99\0\1\14\udfe6\0";
+    "\12\0\1\3\25\0\1\2\14\0\1\1\2\0\1\1\17\0\1\20"+
+    "\32\4\6\0\1\5\2\4\1\5\20\4\1\5\5\4\1\1\1\0"+
+    "\1\1\141\0\1\7\3\12\3\0\1\10\1\0\3\13\1\0\3\14"+
+    "\3\0\3\15\4\0\3\16\126\0\2\11\53\0\1\6\u1e99\0\1\17"+
+    "\udfe6\0";
 
   /** 
    * Translates characters to character classes
@@ -63,11 +64,12 @@
   private static final int [] ZZ_ACTION = zzUnpackAction();
 
   private static final String ZZ_ACTION_PACKED_0 =
-    "\4\0\1\1\1\2\1\3\1\4\1\5\1\6\1\7"+
-    "\1\10\1\11\1\12\1\13\1\14\1\15\1\16";
+    "\5\0\2\1\1\2\1\3\1\4\1\5\1\6\1\7"+
+    "\1\10\1\2\1\11\1\12\1\13\1\14\1\15\1\16"+
+    "\1\17";
 
   private static int [] zzUnpackAction() {
-    int [] result = new int[18];
+    int [] result = new int[22];
     int offset = 0;
     offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
     return result;
@@ -92,12 +94,12 @@
   private static final int [] ZZ_ROWMAP = zzUnpackRowMap();
 
   private static final String ZZ_ROWMAP_PACKED_0 =
-    "\0\0\0\16\0\34\0\52\0\70\0\70\0\70\0\70"+
-    "\0\70\0\70\0\70\0\70\0\70\0\70\0\70\0\70"+
-    "\0\70\0\70";
+    "\0\0\0\21\0\42\0\63\0\104\0\125\0\146\0\125"+
+    "\0\125\0\125\0\125\0\125\0\125\0\125\0\146\0\125"+
+    "\0\125\0\125\0\125\0\125\0\125\0\125";
 
   private static int [] zzUnpackRowMap() {
-    int [] result = new int[18];
+    int [] result = new int[22];
     int offset = 0;
     offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
     return result;
@@ -120,14 +122,17 @@
   private static final int [] ZZ_TRANS = zzUnpackTrans();
 
   private static final String ZZ_TRANS_PACKED_0 =
-    "\1\5\1\0\13\5\1\6\1\5\1\7\1\5\1\10"+
-    "\1\11\1\12\7\5\1\6\1\5\1\13\1\5\1\10"+
-    "\1\11\1\12\7\5\2\6\1\13\1\5\1\10\1\11"+
-    "\1\12\1\14\1\15\1\16\1\17\1\20\1\21\1\22"+
-    "\1\6\16\0";
+    "\1\6\1\7\1\6\1\0\1\6\1\7\12\6\1\10"+
+    "\1\6\1\7\1\6\1\11\1\6\1\7\1\12\1\13"+
+    "\1\14\7\6\1\10\1\6\1\7\1\6\1\15\1\6"+
+    "\1\7\1\12\1\13\1\14\7\6\1\10\1\6\1\7"+
+    "\1\6\1\16\1\6\1\7\1\12\1\13\1\14\7\6"+
+    "\2\10\1\17\1\10\1\15\1\6\1\7\1\12\1\13"+
+    "\1\14\1\20\1\21\1\22\1\23\1\24\1\25\1\26"+
+    "\1\10\23\0\1\6\16\0";
 
   private static int [] zzUnpackTrans() {
-    int [] result = new int[70];
+    int [] result = new int[119];
     int offset = 0;
     offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
     return result;
@@ -165,10 +170,10 @@
   private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
 
   private static final String ZZ_ATTRIBUTE_PACKED_0 =
-    "\4\0\16\11";
+    "\5\0\1\11\1\1\7\11\1\1\7\11";
 
   private static int [] zzUnpackAttribute() {
-    int [] result = new int[18];
+    int [] result = new int[22];
     int offset = 0;
     offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
     return result;
@@ -245,6 +250,8 @@
 		normalized += norm;
 	}
 
+	private static final String LB = "[\u002d\u00ad] ";  // regex for an in-word line break: hyphen or soft hyphen followed by a space
+
 
   /**
    * Creates a new scanner
@@ -276,7 +283,7 @@
     char [] map = new char[0x10000];
     int i = 0;  /* index in packed string  */
     int j = 0;  /* index in unpacked array */
-    while (i < 54) {
+    while (i < 82) {
       int  count = packed.charAt(i++);
       char value = packed.charAt(i++);
       do map[j++] = value; while (--count > 0);
@@ -546,65 +553,72 @@
         case 2: 
           { problem = 1; add(yytext());
           }
-        case 15: break;
+        case 16: break;
         case 6: 
           { add("ae");
           }
-        case 16: break;
+        case 17: break;
         case 4: 
           { add("s");
           }
-        case 17: break;
-        case 12: 
+        case 18: break;
+        case 13: 
           { add("o");
           }
-        case 18: break;
+        case 19: break;
         case 3: 
           { switch (problem) {
 			case 1: return original;
 			default: return normalized;
 		}
           }
-        case 19: break;
-        case 13: 
+        case 20: break;
+        case 8: 
+          { switch (problem) {
+			case 1: return original;
+			default: return normalized.replaceAll(LB, "").toLowerCase();
+		}
+          }
+        case 21: break;
+        case 14: 
           { add("u");
           }
-        case 20: break;
+        case 22: break;
         case 1: 
           { add(yytext());
           }
-        case 21: break;
-        case 11: 
+        case 23: break;
+        case 12: 
           { add("i");
           }
-        case 22: break;
-        case 14: 
+        case 24: break;
+        case 15: 
           { add("");
           }
-        case 23: break;
-        case 10: 
+        case 25: break;
+        case 11: 
           { add("e");
           }
-        case 24: break;
-        case 9: 
+        case 26: break;
+        case 10: 
           { add("a");
           }
-        case 25: break;
+        case 27: break;
+        case 9: 
+          { add("oe");
+          }
+        case 28: break;
         case 5: 
           { add("ss");
           }
-        case 26: break;
-        case 8: 
-          { add("oe");
-          }
-        case 27: break;
+        case 29: break;
         case 7: 
           { switch (problem) {
 			case 1: return "";
-			default: return normalized;
+			default: return normalized.replaceAll(LB, "");
 		}
           }
-        case 28: break;
+        case 30: break;
         default: 
           if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
             zzAtEOF = true;
diff -r 469d927b9ca7 -r 5df60f24e997 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexFR.lex
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexFR.lex	Tue Apr 19 16:51:59 2011 +0200
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexFR.lex	Mon Aug 29 17:40:02 2011 +0200
@@ -3,8 +3,7 @@
  * [this is a JFlex specification]
  *
  * Wolfgang Schmidle 
- * version 0.96
- * 2011-02-21
+ * version 2011-07-12
  *
  */
 
@@ -31,8 +30,14 @@
 		original += yytext(); 
 		normalized += norm;
 	}
+
+	private static final String LB = "[\u002d\u00ad] ";  // regex for an in-word line break: hyphen or soft hyphen followed by a space
 %}
 
+hyphen = [-\u00ad]  // hyphen and soft hyphen (4-digit escape: JFlex 1.4.3 reads the braced form as literal characters)
+LB = {hyphen} \u0020
+// lb = ({hyphen} \u0020)?
+
 END = \n
 
 Alphabet = [abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ]
@@ -66,10 +71,11 @@
 // default
 
 @ { problem = 1; add(yytext()); }
+{LB} { add(yytext()); }
 . { add(yytext()); }
 
 
-<DISP, SEARCH> {
+<DISP> {
 
 {END} {
 		switch (problem) {
@@ -84,18 +90,27 @@
 {END} {
 		switch (problem) {
 			case 1: return "";
-			default: return normalized;
+			default: return normalized.replaceAll(LB, "");
 		}
 	}
 }
 
+<SEARCH> {
+
+{END} {
+		switch (problem) {
+			case 1: return original;
+			default: return normalized.replaceAll(LB, "").toLowerCase();
+		}
+	}
+}
 
 
 /*
 
 Annahmen:
 - die Routine wird wortweise aufgerufen, mit einem \n am Ende des Strings
-- Wörter mit Zeilenumbrüchen wurden bereits wieder zusammengesetzt
+- Zeilenumbrüche innerhalb des Wortes sind als hyphen/soft hyphen plus space markiert
 
 TO DO:
 
diff -r 469d927b9ca7 -r 5df60f24e997 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexIT.java
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexIT.java	Tue Apr 19 16:51:59 2011 +0200
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexIT.java	Mon Aug 29 17:40:02 2011 +0200
@@ -1,12 +1,11 @@
-/* The following code was generated by JFlex 1.4.3 on 22.02.11 12:03 */
+/* The following code was generated by JFlex 1.4.3 on 21.07.11 11:22 */
 
 /*
  * Normalization rules for Italian text
  * [this is a JFlex specification]
  *
  * Wolfgang Schmidle 
- * version 0.96
- * 2011-02-21
+ * version 2011-07-12
  *
  */
 
@@ -16,7 +15,7 @@
 /**
  * This class is a scanner generated by 
  * <a href="http://www.jflex.de/">JFlex</a> 1.4.3
- * on 22.02.11 12:03 from the specification file
+ * on 21.07.11 11:22 from the specification file
  * <tt>MpdlNormalizerLexIT.lex</tt>
  */
 public class MpdlNormalizerLexIT {
@@ -47,15 +46,15 @@
    * Translates characters to character classes
    */
   private static final String ZZ_CMAP_PACKED = 
-    "\12\0\1\5\42\0\1\4\22\0\1\51\1\1\3\2\1\1\3\2"+
-    "\1\40\1\0\1\2\1\3\2\2\1\41\1\2\1\47\1\3\1\2"+
-    "\1\37\1\44\1\50\2\2\1\0\1\2\6\0\1\43\3\2\1\11"+
-    "\2\2\1\42\1\6\1\35\1\2\1\3\1\2\1\7\1\36\1\13"+
-    "\1\45\1\12\1\2\1\10\1\15\1\46\2\2\1\0\1\2\62\0"+
-    "\1\4\22\0\1\16\5\0\1\32\1\0\1\17\3\0\1\20\5\0"+
-    "\1\21\6\0\1\22\5\0\1\30\1\23\5\0\1\31\1\0\1\24"+
-    "\3\0\1\25\5\0\1\26\6\0\1\27\37\0\1\1\70\0\1\34"+
-    "\1\33\53\0\1\14\ufe80\0";
+    "\12\0\1\6\25\0\1\5\14\0\1\4\22\0\1\52\1\1\3\2"+
+    "\1\1\3\2\1\41\1\0\1\2\1\3\2\2\1\42\1\2\1\50"+
+    "\1\3\1\2\1\40\1\45\1\51\2\2\1\0\1\2\6\0\1\44"+
+    "\3\2\1\12\2\2\1\43\1\7\1\36\1\2\1\3\1\2\1\10"+
+    "\1\37\1\14\1\46\1\13\1\2\1\11\1\16\1\47\2\2\1\0"+
+    "\1\2\62\0\1\4\22\0\1\17\5\0\1\33\1\0\1\20\3\0"+
+    "\1\21\5\0\1\22\6\0\1\23\5\0\1\31\1\24\5\0\1\32"+
+    "\1\0\1\25\3\0\1\26\5\0\1\27\6\0\1\30\37\0\1\1"+
+    "\70\0\1\35\1\34\53\0\1\15\ufe80\0";
 
   /** 
    * Translates characters to character classes
@@ -68,17 +67,17 @@
   private static final int [] ZZ_ACTION = zzUnpackAction();
 
   private static final String ZZ_ACTION_PACKED_0 =
-    "\11\0\1\1\1\2\2\3\1\4\1\5\1\2\1\3"+
-    "\1\6\1\2\1\7\1\10\1\11\1\12\1\13\5\3"+
-    "\1\14\1\2\1\3\1\6\1\2\1\15\1\16\1\17"+
-    "\1\20\1\21\1\22\1\23\1\24\1\25\1\26\1\27"+
-    "\1\30\4\0\1\31\1\32\1\0\1\33\1\0\1\34"+
-    "\1\35\1\0\1\36\1\37\1\40\4\0\1\41\5\0"+
-    "\1\42\1\43\2\0\1\44\1\0\1\45\5\0\1\44"+
-    "\1\46\3\0\1\47";
+    "\11\0\1\1\1\2\2\3\1\1\1\4\1\2\1\3"+
+    "\1\5\1\2\1\6\1\7\1\10\1\11\1\12\5\3"+
+    "\1\13\1\2\1\3\1\5\1\2\1\14\1\15\1\16"+
+    "\1\17\1\20\1\21\1\22\1\23\1\24\1\25\1\26"+
+    "\1\27\1\30\4\0\1\31\1\32\1\33\1\0\1\34"+
+    "\1\0\1\35\1\36\1\0\1\37\1\40\1\41\4\0"+
+    "\1\42\6\0\1\43\1\44\4\0\1\45\1\0\1\46"+
+    "\10\0\1\47\4\0\1\45\2\0\1\50";
 
   private static int [] zzUnpackAction() {
-    int [] result = new int[89];
+    int [] result = new int[100];
     int offset = 0;
     offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
     return result;
@@ -103,21 +102,22 @@
   private static final int [] ZZ_ROWMAP = zzUnpackRowMap();
 
   private static final String ZZ_ROWMAP_PACKED_0 =
-    "\0\0\0\52\0\124\0\176\0\250\0\322\0\374\0\u0126"+
-    "\0\u0150\0\0\0\0\0\0\0\u017a\0\0\0\0\0\u01a4"+
-    "\0\u01ce\0\0\0\u01f8\0\0\0\0\0\0\0\0\0\0"+
-    "\0\u0222\0\u024c\0\u0276\0\u02a0\0\u02ca\0\0\0\u02f4\0\u031e"+
-    "\0\u0348\0\u0372\0\u039c\0\0\0\0\0\0\0\0\0\0"+
-    "\0\0\0\0\0\0\0\0\0\0\0\0\0\u03c6\0\u03f0"+
-    "\0\u041a\0\0\0\0\0\0\0\u0444\0\0\0\u046e\0\0"+
-    "\0\0\0\u0498\0\0\0\0\0\0\0\u04c2\0\u04ec\0\u0516"+
-    "\0\u0540\0\0\0\u056a\0\u0594\0\u05be\0\u05e8\0\u0612\0\0"+
-    "\0\0\0\u063c\0\u031e\0\u0666\0\u0690\0\0\0\u06ba\0\u06e4"+
-    "\0\u070e\0\0\0\u0738\0\0\0\0\0\u0762\0\u078c\0\u07b6"+
-    "\0\0";
+    "\0\0\0\53\0\126\0\201\0\254\0\327\0\u0102\0\u012d"+
+    "\0\u0158\0\0\0\0\0\0\0\u0183\0\u01ae\0\0\0\u01d9"+
+    "\0\u0204\0\0\0\u022f\0\0\0\0\0\0\0\0\0\0"+
+    "\0\u025a\0\u0285\0\u02b0\0\u02db\0\u0306\0\0\0\u0331\0\u035c"+
+    "\0\u0387\0\u03b2\0\u03dd\0\0\0\0\0\0\0\0\0\0"+
+    "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\u0408"+
+    "\0\u0433\0\u045e\0\u0489\0\0\0\0\0\0\0\u04b4\0\0"+
+    "\0\u04df\0\0\0\0\0\u050a\0\0\0\0\0\0\0\u0535"+
+    "\0\u0560\0\u058b\0\u05b6\0\0\0\u05e1\0\u060c\0\u0637\0\u0662"+
+    "\0\u068d\0\0\0\0\0\0\0\u06b8\0\u06e3\0\u070e\0\u035c"+
+    "\0\u0739\0\u0764\0\0\0\u078f\0\u07ba\0\u07e5\0\0\0\u0810"+
+    "\0\u083b\0\u0866\0\u0891\0\0\0\u08bc\0\u08e7\0\u0912\0\u093d"+
+    "\0\0\0\u0968\0\u0993\0\0";
 
   private static int [] zzUnpackRowMap() {
-    int [] result = new int[89];
+    int [] result = new int[100];
     int offset = 0;
     offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
     return result;
@@ -140,63 +140,67 @@
   private static final int [] ZZ_TRANS = zzUnpackTrans();
 
   private static final String ZZ_TRANS_PACKED_0 =
-    "\52\0\1\12\1\13\1\14\1\15\1\16\1\17\1\20"+
-    "\1\14\1\21\1\13\1\15\1\14\1\22\1\23\5\12"+
-    "\2\13\1\12\2\13\1\24\1\25\1\26\1\27\1\30"+
-    "\1\12\1\13\1\31\2\13\1\14\1\13\1\23\1\32"+
-    "\1\33\1\34\1\35\1\36\1\12\1\13\1\14\1\15"+
-    "\1\16\1\17\1\37\1\14\1\21\1\13\1\15\1\40"+
-    "\1\41\1\42\5\12\2\13\1\12\2\13\1\24\1\25"+
-    "\1\26\1\27\1\30\1\12\1\13\1\31\2\13\1\43"+
-    "\1\13\1\42\1\32\1\33\1\34\1\35\1\36\1\12"+
-    "\1\13\1\14\1\15\1\16\1\44\1\20\1\14\1\21"+
-    "\1\13\1\15\1\14\1\22\1\23\1\45\1\46\1\47"+
-    "\1\50\1\51\1\52\1\53\1\54\1\55\1\56\1\24"+
-    "\1\25\1\26\1\27\1\30\1\12\1\13\1\31\2\13"+
-    "\1\14\1\13\1\23\1\32\1\33\1\34\1\35\1\36"+
-    "\1\12\1\13\1\14\1\15\1\16\1\44\1\37\1\14"+
+    "\53\0\1\12\1\13\1\14\1\15\1\16\1\12\1\17"+
+    "\1\20\1\14\1\21\1\13\1\15\1\14\1\22\1\23"+
+    "\5\12\2\13\1\12\2\13\1\24\1\25\1\26\1\27"+
+    "\1\30\1\12\1\13\1\31\2\13\1\14\1\13\1\23"+
+    "\1\32\1\33\1\34\1\35\1\36\1\12\1\13\1\14"+
+    "\1\15\1\16\1\12\1\17\1\37\1\14\1\21\1\13"+
+    "\1\15\1\40\1\41\1\42\5\12\2\13\1\12\2\13"+
+    "\1\24\1\25\1\26\1\27\1\30\1\12\1\13\1\31"+
+    "\2\13\1\43\1\13\1\42\1\32\1\33\1\34\1\35"+
+    "\1\36\1\12\1\13\1\14\1\15\1\16\1\12\1\44"+
+    "\1\20\1\14\1\21\1\13\1\15\1\14\1\22\1\23"+
+    "\1\45\1\46\1\47\1\50\1\51\1\52\1\53\1\54"+
+    "\1\55\1\56\1\24\1\25\1\26\1\27\1\30\1\12"+
+    "\1\13\1\31\2\13\1\14\1\13\1\23\1\32\1\33"+
+    "\1\34\1\35\1\36\1\12\1\13\1\14\1\15\1\16"+
+    "\1\12\1\44\1\37\1\14\1\21\1\13\1\15\1\40"+
+    "\1\41\1\42\1\45\1\46\1\47\1\50\1\51\1\52"+
+    "\1\53\1\54\1\55\1\56\1\24\1\25\1\26\1\27"+
+    "\1\30\1\12\1\13\1\31\2\13\1\43\1\13\1\42"+
+    "\1\32\1\33\1\34\1\35\1\36\1\12\1\13\1\14"+
+    "\1\15\1\16\1\12\1\57\1\20\1\14\1\21\1\13"+
+    "\1\15\1\14\1\22\1\23\1\45\1\46\1\47\1\50"+
+    "\1\51\1\52\1\53\1\54\1\55\1\56\1\24\1\25"+
+    "\1\26\1\27\1\30\1\12\1\13\1\31\2\13\1\14"+
+    "\1\13\1\23\1\32\1\33\1\34\1\35\1\36\1\12"+
+    "\1\13\1\14\1\15\1\16\1\12\1\57\1\37\1\14"+
     "\1\21\1\13\1\15\1\40\1\41\1\42\1\45\1\46"+
     "\1\47\1\50\1\51\1\52\1\53\1\54\1\55\1\56"+
     "\1\24\1\25\1\26\1\27\1\30\1\12\1\13\1\31"+
     "\2\13\1\43\1\13\1\42\1\32\1\33\1\34\1\35"+
-    "\1\36\1\12\1\13\1\14\1\15\1\16\1\17\1\20"+
-    "\1\14\1\21\1\13\1\15\1\14\1\22\1\23\1\45"+
-    "\1\46\1\47\1\50\1\51\1\52\1\53\1\54\1\55"+
-    "\1\56\1\24\1\25\1\26\1\27\1\30\1\12\1\13"+
-    "\1\31\2\13\1\14\1\13\1\23\1\32\1\33\1\34"+
-    "\1\35\1\36\1\12\1\13\1\14\1\15\1\16\1\17"+
-    "\1\37\1\14\1\21\1\13\1\15\1\40\1\41\1\42"+
-    "\1\45\1\46\1\47\1\50\1\51\1\52\1\53\1\54"+
-    "\1\55\1\56\1\24\1\25\1\26\1\27\1\30\1\12"+
-    "\1\13\1\31\2\13\1\43\1\13\1\42\1\32\1\33"+
-    "\1\34\1\35\1\36\6\0\1\57\4\0\1\60\1\61"+
-    "\41\0\1\62\113\0\1\63\1\0\1\63\36\0\1\64"+
-    "\22\0\1\65\44\0\1\66\4\0\1\66\2\0\1\66"+
-    "\3\0\1\66\5\0\2\66\1\0\2\66\1\0\3\66"+
-    "\2\0\1\66\1\0\2\66\1\0\2\66\45\0\1\67"+
-    "\57\0\1\70\5\0\2\71\1\72\2\0\2\71\1\0"+
-    "\3\71\13\0\1\71\6\0\1\71\2\0\1\71\2\0"+
-    "\4\71\47\0\1\73\1\0\1\74\3\0\2\75\1\76"+
-    "\2\0\2\75\1\0\3\75\13\0\1\75\6\0\1\75"+
-    "\2\0\1\75\2\0\4\75\10\0\1\77\25\0\1\64"+
-    "\25\0\1\100\51\0\1\100\3\0\1\101\35\0\1\102"+
-    "\4\0\1\102\2\0\1\102\3\0\1\102\5\0\2\102"+
-    "\1\0\2\102\1\0\3\102\2\0\1\102\1\0\2\102"+
-    "\1\0\2\102\43\0\1\103\4\0\1\104\15\0\1\105"+
-    "\53\0\1\106\51\0\1\106\3\0\1\107\72\0\1\110"+
-    "\54\0\1\111\12\0\2\71\3\0\2\71\1\0\3\71"+
-    "\13\0\1\71\6\0\1\71\2\0\1\71\2\0\4\71"+
-    "\3\0\2\75\3\0\2\75\1\0\3\75\13\0\1\75"+
-    "\6\0\1\75\2\0\1\75\2\0\4\75\5\0\1\112"+
-    "\3\0\1\113\53\0\1\114\43\0\1\115\6\0\1\113"+
-    "\43\0\1\116\51\0\1\116\1\117\1\120\46\0\1\121"+
-    "\3\0\1\60\53\0\1\122\43\0\1\123\6\0\1\60"+
-    "\46\0\1\113\45\0\1\124\60\0\1\113\43\0\1\125"+
-    "\50\0\1\126\2\0\1\127\52\0\1\60\54\0\1\60"+
-    "\45\0\1\127\100\0\1\130\20\0\1\131\44\0";
+    "\1\36\7\0\1\60\4\0\1\61\1\62\42\0\1\63"+
+    "\114\0\1\64\1\0\1\64\6\0\1\65\103\0\1\66"+
+    "\23\0\1\67\44\0\1\70\5\0\1\70\2\0\1\70"+
+    "\3\0\1\70\5\0\2\70\1\0\2\70\1\0\3\70"+
+    "\2\0\1\70\1\0\2\70\1\0\2\70\46\0\1\71"+
+    "\60\0\1\72\5\0\2\73\1\74\3\0\2\73\1\0"+
+    "\3\73\13\0\1\73\6\0\1\73\2\0\1\73\2\0"+
+    "\4\73\50\0\1\75\1\0\1\76\3\0\2\77\1\100"+
+    "\3\0\2\77\1\0\3\77\13\0\1\77\6\0\1\77"+
+    "\2\0\1\77\2\0\4\77\11\0\1\101\25\0\1\66"+
+    "\26\0\1\102\52\0\1\102\3\0\1\103\35\0\1\104"+
+    "\5\0\1\104\2\0\1\104\3\0\1\104\5\0\2\104"+
+    "\1\0\2\104\1\0\3\104\2\0\1\104\1\0\2\104"+
+    "\1\0\2\104\44\0\1\105\4\0\1\106\16\0\1\107"+
+    "\54\0\1\110\52\0\1\110\3\0\1\111\40\0\1\112"+
+    "\105\0\1\113\55\0\1\114\15\0\1\115\52\0\1\116"+
+    "\51\0\1\117\4\0\1\120\54\0\1\121\43\0\1\122"+
+    "\7\0\1\120\44\0\1\123\52\0\1\123\1\124\1\125"+
+    "\46\0\1\126\4\0\1\61\54\0\1\127\43\0\1\130"+
+    "\7\0\1\61\40\0\2\73\4\0\2\73\1\0\3\73"+
+    "\13\0\1\73\6\0\1\73\2\0\1\73\2\0\4\73"+
+    "\3\0\2\77\4\0\2\77\1\0\3\77\13\0\1\77"+
+    "\6\0\1\77\2\0\1\77\2\0\4\77\6\0\1\131"+
+    "\51\0\1\132\53\0\1\133\53\0\1\134\50\0\1\135"+
+    "\3\0\1\136\47\0\1\137\52\0\1\140\56\0\1\120"+
+    "\46\0\1\141\61\0\1\120\43\0\1\142\104\0\1\143"+
+    "\24\0\1\61\55\0\1\61\46\0\1\136\50\0\1\144"+
+    "\44\0";
 
   private static int [] zzUnpackTrans() {
-    int [] result = new int[2016];
+    int [] result = new int[2494];
     int offset = 0;
     offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
     return result;
@@ -234,14 +238,14 @@
   private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
 
   private static final String ZZ_ATTRIBUTE_PACKED_0 =
-    "\1\10\7\0\1\1\3\11\1\1\2\11\2\1\1\11"+
-    "\1\1\5\11\5\1\1\11\5\1\13\11\3\0\3\11"+
+    "\1\10\7\0\1\1\3\11\2\1\1\11\2\1\1\11"+
+    "\1\1\5\11\5\1\1\11\5\1\14\11\4\0\3\11"+
     "\1\0\1\11\1\0\2\11\1\0\3\11\4\0\1\11"+
-    "\5\0\2\11\2\0\1\1\1\0\1\11\3\0\1\11"+
-    "\1\0\2\11\3\0\1\11";
+    "\5\0\3\11\4\0\1\1\1\0\1\11\3\0\1\11"+
+    "\4\0\1\11\4\0\1\11\2\0\1\11";
 
   private static int [] zzUnpackAttribute() {
-    int [] result = new int[89];
+    int [] result = new int[100];
     int offset = 0;
     offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
     return result;
@@ -325,6 +329,8 @@
 		normalized += norm;
 	}
 
+	private static final String LB = "[\u002d\u00ad] ";
+
 
   /**
    * Creates a new scanner
@@ -356,7 +362,7 @@
     char [] map = new char[0x10000];
     int i = 0;  /* index in packed string  */
     int j = 0;  /* index in unpacked array */
-    while (i < 168) {
+    while (i < 172) {
       int  count = packed.charAt(i++);
       char value = packed.charAt(i++);
       do map[j++] = value; while (--count > 0);
@@ -656,86 +662,97 @@
       zzMarkedPos = zzMarkedPosL;
 
       switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
-        case 32: 
+        case 33: 
           // lookahead expression with fixed base length
           zzMarkedPos = zzStartRead + 1;
           { cv = CONS; add("U");
           }
-        case 40: break;
-        case 15: 
+        case 41: break;
+        case 14: 
           { add("Á");
           }
-        case 41: break;
-        case 39: 
+        case 42: break;
+        case 40: 
           // lookahead expression with fixed lookahead length
           yypushback(1);
           { add(yytext());
           }
-        case 42: break;
-        case 38: 
+        case 43: break;
+        case 39: 
           // lookahead expression with fixed base length
           zzMarkedPos = zzStartRead + 3;
           { add(yytext());
           }
-        case 43: break;
-        case 37: 
+        case 44: break;
+        case 38: 
           // lookahead expression with fixed base length
           zzMarkedPos = zzStartRead + 2;
           { add(yytext());
           }
-        case 44: break;
-        case 4: 
+        case 45: break;
+        case 26: 
           { add(yytext());
           }
-        case 45: break;
-        case 22: 
+        case 46: break;
+        case 21: 
           { add("í");
           }
-        case 46: break;
-        case 9: 
+        case 47: break;
+        case 8: 
           { cv = VOWEL; add("AE");
           }
-        case 47: break;
-        case 5: 
+        case 48: break;
+        case 11: 
+          { problem = 1; cv = 0; add(yytext());
+          }
+        case 49: break;
+        case 4: 
           { switch (problem) {
 			case 1: return original;
 			default: return normalized;
 		}
           }
-        case 48: break;
-        case 29: 
+        case 50: break;
+        case 30: 
           // lookahead expression with fixed base length
           zzMarkedPos = zzStartRead + 1;
           { cv = CONS; add("u");
           }
-        case 49: break;
-        case 20: 
+        case 51: break;
+        case 19: 
           { add("á");
           }
-        case 50: break;
+        case 52: break;
         case 1: 
           { cv = 0; add(yytext());
           }
-        case 51: break;
-        case 33: 
+        case 53: break;
+        case 24: 
+          { switch (problem) {
+			case 1: return original;
+			default: return normalized.replaceAll(LB, "").toLowerCase();
+		}
+          }
+        case 54: break;
+        case 34: 
           // lookahead expression with fixed base length
           zzMarkedPos = zzStartRead + 1;
           { cv = VOWEL; add(yytext().replaceAll("u", "v").replaceAll("U", "V"));
           }
-        case 52: break;
-        case 34: 
+        case 55: break;
+        case 35: 
           { cv = VOWEL; add("zio");
           }
-        case 53: break;
-        case 11: 
+        case 56: break;
+        case 10: 
           { cv = VOWEL; add("OE");
           }
-        case 54: break;
-        case 19: 
+        case 57: break;
+        case 18: 
           { add("Ú");
           }
-        case 55: break;
-        case 36: 
+        case 58: break;
+        case 37: 
           // general lookahead, find correct zzMarkedPos
           { int zzFState = 7;
             int zzFPos = zzStartRead;
@@ -758,20 +775,20 @@
           }
           { cv = VOWEL; add(yytext().replace("ſ", "s"));
           }
-        case 56: break;
+        case 59: break;
         case 3: 
           { cv = CONS; add(yytext());
           }
-        case 57: break;
-        case 31: 
+        case 60: break;
+        case 32: 
           { cv = CONS; add("QU");
           }
-        case 58: break;
-        case 16: 
+        case 61: break;
+        case 15: 
           { add("É");
           }
-        case 59: break;
-        case 27: 
+        case 62: break;
+        case 28: 
           // lookahead expression with fixed base length
           zzMarkedPos = zzStartRead + 1;
           { switch(cv) {
@@ -779,85 +796,81 @@
 			default: cv = VOWEL; add(yytext()); break;
 		}
           }
-        case 60: break;
-        case 7: 
+        case 63: break;
+        case 6: 
           { cv = CONS; add("ss");
           }
-        case 61: break;
-        case 6: 
+        case 64: break;
+        case 5: 
           { cv = CONS; add("s");
           }
-        case 62: break;
-        case 35: 
+        case 65: break;
+        case 13: 
+          { switch (problem) {
+			case 1: return "";
+			default: return normalized.replaceAll(LB, "");
+		}
+          }
+        case 66: break;
+        case 36: 
           { cv = VOWEL; add("ZIO");
           }
-        case 63: break;
+        case 67: break;
         case 2: 
           { cv = VOWEL; add(yytext());
           }
-        case 64: break;
-        case 18: 
+        case 68: break;
+        case 17: 
           { add("Ó");
           }
-        case 65: break;
-        case 24: 
-          { add("ú");
-          }
-        case 66: break;
-        case 30: 
-          { cv = CONS; add("Qu");
-          }
-        case 67: break;
-        case 21: 
-          { add("é");
-          }
-        case 68: break;
-        case 8: 
-          { cv = VOWEL; add("ae");
-          }
         case 69: break;
-        case 14: 
-          { switch (problem) {
-			case 1: return "";
-			default: return normalized;
-		}
+        case 23: 
+          { add("ú");
           }
         case 70: break;
-        case 13: 
+        case 31: 
+          { cv = CONS; add("Qu");
+          }
+        case 71: break;
+        case 20: 
+          { add("é");
+          }
+        case 72: break;
+        case 7: 
+          { cv = VOWEL; add("ae");
+          }
+        case 73: break;
+        case 12: 
           { add("");
           }
-        case 71: break;
-        case 23: 
+        case 74: break;
+        case 22: 
           { add("ó");
           }
-        case 72: break;
-        case 10: 
+        case 75: break;
+        case 9: 
           { cv = VOWEL; add("oe");
           }
-        case 73: break;
-        case 28: 
+        case 76: break;
+        case 29: 
           { cv = CONS; add("qu");
           }
-        case 74: break;
-        case 12: 
-          { problem = 1; add(yytext());
-          }
-        case 75: break;
+        case 77: break;
         case 25: 
           { switch(cv) {
 			case CONS: add(yytext().replace("v", "u").replace("V", "U")); break;
 			default: cv = CONS; add(yytext()); break;
 		}
           }
-        case 76: break;
-        case 26: 
+        case 78: break;
+        case 27: 
           { cv = VOWEL; add("ii");
           }
-        case 77: break;
-        case 17: 
+        case 79: break;
+        case 16: 
           { add("Í");
           }
-        case 78: break;
+        case 80: break;
         default: 
           if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
             zzAtEOF = true;
diff -r 469d927b9ca7 -r 5df60f24e997 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexIT.lex
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexIT.lex	Tue Apr 19 16:51:59 2011 +0200
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexIT.lex	Mon Aug 29 17:40:02 2011 +0200
@@ -3,8 +3,7 @@
  * [this is a JFlex specification]
  *
  * Wolfgang Schmidle 
- * version 0.96
- * 2011-02-21
+ * version 2011-07-12
  *
  */
 
@@ -34,6 +33,8 @@
 		original += yytext(); 
 		normalized += norm;
 	}
+
+	private static final String LB = "[\u002d\u00ad] ";
 %}
 
 Vowel = [AEIOUaeiouÆæęàèòùœ]
@@ -42,11 +43,12 @@
 
 
 hyphen = [\u002d\u00ad]  // hyphen and soft hyphen
-X = {hyphen}?
+LB = {hyphen} \u0020
+lb = ({hyphen} \u0020)?
 
 END = \n
 
-prefixCons = (in{X}ter | per | ſu{X}per | ſer)
+prefixCons = (in{lb}ter | per | ſu{lb}per | ſer)
 
 %%
 
@@ -82,7 +84,7 @@
 // h-Regeln aus Arboreal:
 ^ ha / {END} { add(yytext()); }
 ^ hai / {END} { add(yytext()); }
-^ han{X}no / {END} { add(yytext()); }
+^ han{lb}no / {END} { add(yytext()); }
 ^ ho / {END} { add(yytext()); }
 ^ h { add(""); }
 
@@ -91,7 +93,7 @@
 
 // 1. rules for u --> v
 
-^ {prefixCons} / {X} { cv = VOWEL; add(yytext().replace("ſ", "s")); }
+^ {prefixCons} / {lb} { cv = VOWEL; add(yytext().replace("ſ", "s")); }
 
 ^ [uU] / {Vowel} { cv = VOWEL; add(yytext().replaceAll("u", "v").replaceAll("U", "V")); }
 
@@ -116,21 +118,21 @@
 		}
 	}
 
-v / {X} {Cons} { cv = CONS; add("u"); }
-V / {X} {Cons} { cv = CONS; add("U"); }
+v / {lb} {Cons} { cv = CONS; add("u"); }
+V / {lb} {Cons} { cv = CONS; add("U"); }
 
 // 3. override default rule for .
 
 {Vowel} { cv = VOWEL; add(yytext()); }
 {Cons} { cv = CONS; add(yytext()); }
-{hyphen} { add(yytext()); }
-@ { problem = 1; add(yytext()); }
+@ { problem = 1; cv = 0; add(yytext()); }
+{LB} { add(yytext()); }
 . { cv = 0; add(yytext()); } 
 
 }
 
 
-<DISP, SEARCH> {
+<DISP> {
 
 {END} {
 		switch (problem) {
@@ -145,7 +147,17 @@
 {END} {
 		switch (problem) {
 			case 1: return "";
-			default: return normalized;
+			default: return normalized.replaceAll(LB, "");
+		}
+	}
+}
+
+<SEARCH> {
+
+{END} {
+		switch (problem) {
+			case 1: return original;
+			default: return normalized.replaceAll(LB, "").toLowerCase();
 		}
 	}
 }
@@ -155,7 +167,7 @@
 
 Annahmen:
 - die Routine wird wortweise aufgerufen, mit einem \n am Ende des Strings
-- Wörter mit Zeilenumbrüchen wurden bereits wieder zusammengesetzt
+- Zeilenumbrüche innerhalb des Wortes sind als hyphen/soft hyphen plus space markiert
 
 TO DO:
 
diff -r 469d927b9ca7 -r 5df60f24e997 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexLA.java
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexLA.java	Tue Apr 19 16:51:59 2011 +0200
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexLA.java	Mon Aug 29 17:40:02 2011 +0200
@@ -1,12 +1,11 @@
-/* The following code was generated by JFlex 1.4.3 on 22.02.11 12:04 */
+/* The following code was generated by JFlex 1.4.3 on 21.07.11 11:22 */
 
 /*
  * Normalization rules for Latin text
  * [this is a JFlex specification]
  *
  * Wolfgang Schmidle 
- * version 0.96
- * 2011-02-21
+ * version 2011-07-12
  *
  */
 
@@ -16,7 +15,7 @@
 /**
  * This class is a scanner generated by 
  * <a href="http://www.jflex.de/">JFlex</a> 1.4.3
- * on 22.02.11 12:04 from the specification file
+ * on 21.07.11 11:22 from the specification file
  * <tt>MpdlNormalizerLexLA.lex</tt>
  */
 public class MpdlNormalizerLexLA {
@@ -43,23 +42,23 @@
    * l is of the form l = 2*k, k a non negative integer
    */
   private static final int ZZ_LEXSTATE[] = { 
-     0,  0,  1,  2,  3,  4,  1,  2,  1,  2,  3,  4,  1, 2
+     0,  0,  1,  2,  3,  4,  5,  6,  1,  2,  3,  4,  5, 6
   };
 
   /** 
    * Translates characters to character classes
    */
   private static final String ZZ_CMAP_PACKED = 
-    "\12\0\1\5\42\0\1\4\23\0\1\1\3\2\1\1\2\2\1\52"+
-    "\1\1\1\0\1\2\1\3\2\2\1\1\1\2\1\45\1\3\2\2"+
-    "\1\63\1\64\2\2\1\0\1\2\6\0\1\56\1\2\1\46\1\42"+
-    "\1\10\2\2\1\50\1\13\1\26\1\2\1\47\1\37\1\12\1\60"+
-    "\1\16\1\6\1\15\1\31\1\14\1\7\1\11\2\2\1\0\1\2"+
-    "\62\0\1\4\30\0\1\24\30\0\1\22\1\36\1\30\1\54\3\0"+
-    "\1\23\1\0\1\40\1\32\1\0\1\57\1\44\1\33\1\51\1\61"+
-    "\2\0\1\41\1\34\1\53\4\0\1\43\1\35\1\55\1\62\34\0"+
-    "\1\23\71\0\1\25\53\0\1\17\u0181\0\1\27\ud4fe\0\1\20\u0590\0"+
-    "\1\21\u226e\0";
+    "\12\0\1\6\25\0\1\5\14\0\1\4\22\0\1\0\1\1\3\2"+
+    "\1\1\2\2\1\53\1\1\1\0\1\2\1\3\2\2\1\1\1\2"+
+    "\1\46\1\3\2\2\1\64\1\65\2\2\1\66\1\2\6\0\1\57"+
+    "\1\2\1\47\1\43\1\11\2\2\1\51\1\14\1\27\1\2\1\50"+
+    "\1\40\1\13\1\61\1\17\1\7\1\16\1\32\1\15\1\10\1\12"+
+    "\2\2\1\66\1\2\62\0\1\4\30\0\1\25\30\0\1\23\1\37"+
+    "\1\31\1\55\3\0\1\24\1\0\1\41\1\33\1\0\1\60\1\45"+
+    "\1\34\1\52\1\62\2\0\1\42\1\35\1\54\4\0\1\44\1\36"+
+    "\1\56\1\63\34\0\1\24\71\0\1\26\53\0\1\20\u0181\0\1\30"+
+    "\ud4fe\0\1\21\u0590\0\1\22\u226e\0";
 
   /** 
    * Translates characters to character classes
@@ -72,20 +71,21 @@
   private static final int [] ZZ_ACTION = zzUnpackAction();
 
   private static final String ZZ_ACTION_PACKED_0 =
-    "\10\0\1\1\1\2\2\3\1\4\1\5\1\3\1\2"+
-    "\1\3\1\2\1\6\1\1\1\7\1\10\1\11\1\12"+
-    "\11\1\1\3\2\1\3\2\2\3\2\2\1\3\1\6"+
-    "\3\3\1\1\1\2\1\13\4\0\1\14\1\15\1\16"+
-    "\1\0\1\17\1\20\1\21\1\22\1\0\1\23\20\0"+
-    "\1\24\3\0\1\25\3\0\1\26\1\0\1\27\3\0"+
-    "\1\30\1\31\1\32\1\0\1\33\1\34\2\0\1\35"+
-    "\16\0\1\36\1\0\1\37\1\0\1\40\1\0\1\41"+
-    "\1\42\1\43\1\44\1\0\1\45\1\0\1\46\1\0"+
-    "\1\47\1\0\1\50\3\0\1\51\10\0\1\52\6\0"+
-    "\1\53\1\51\1\54\1\55\1\56\1\57\5\0";
+    "\12\0\1\1\1\2\2\3\1\1\1\4\1\3\1\2"+
+    "\1\3\1\2\1\5\1\1\1\6\1\7\1\10\1\11"+
+    "\11\1\1\3\2\1\3\2\1\3\1\12\1\3\2\2"+
+    "\1\3\1\5\3\3\1\1\1\2\1\13\1\14\4\0"+
+    "\1\15\1\16\1\17\1\20\1\0\1\21\1\22\1\23"+
+    "\1\24\1\0\1\25\20\0\1\26\3\0\1\27\3\0"+
+    "\1\30\1\0\1\31\3\0\1\32\1\33\1\34\1\0"+
+    "\1\35\1\36\2\0\1\37\20\0\1\40\1\0\1\41"+
+    "\1\0\1\42\1\0\1\43\1\44\1\45\1\46\1\0"+
+    "\1\47\1\0\1\50\1\0\1\51\1\0\1\52\4\0"+
+    "\1\53\10\0\1\54\6\0\1\55\3\0\1\56\1\57"+
+    "\1\60\2\0\1\61\5\0\1\53";
 
   private static int [] zzUnpackAction() {
-    int [] result = new int[166];
+    int [] result = new int[179];
     int offset = 0;
     offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
     return result;
@@ -110,30 +110,32 @@
   private static final int [] ZZ_ROWMAP = zzUnpackRowMap();
 
   private static final String ZZ_ROWMAP_PACKED_0 =
-    "\0\0\0\65\0\152\0\237\0\324\0\u0109\0\u013e\0\u0173"+
-    "\0\u01a8\0\u01a8\0\u01a8\0\u01dd\0\u01a8\0\u01a8\0\u0212\0\u0247"+
-    "\0\u027c\0\u02b1\0\u01a8\0\u0173\0\u01a8\0\u01a8\0\u01a8\0\u01a8"+
-    "\0\u02e6\0\u031b\0\u0350\0\u0385\0\u03ba\0\u03ef\0\u0424\0\u0459"+
-    "\0\u048e\0\u04c3\0\u04f8\0\u052d\0\u0562\0\u0597\0\u05cc\0\u0601"+
-    "\0\u0636\0\u066b\0\u06a0\0\u06d5\0\u070a\0\u073f\0\u0774\0\u07a9"+
-    "\0\u07de\0\u0813\0\u01a8\0\u0848\0\u087d\0\u08b2\0\u01a8\0\u01a8"+
-    "\0\u01a8\0\u01a8\0\u08e7\0\u01a8\0\u01a8\0\u01a8\0\u01a8\0\u091c"+
-    "\0\u01a8\0\u0951\0\u0986\0\u09bb\0\u09f0\0\u0a25\0\u0a5a\0\u0a8f"+
-    "\0\u0ac4\0\u0af9\0\u0b2e\0\u0b63\0\u0b98\0\u0bcd\0\u0c02\0\u0c37"+
-    "\0\u0c6c\0\u01a8\0\u0ca1\0\u0cd6\0\u0d0b\0\u01a8\0\u0d40\0\u0d75"+
-    "\0\u0daa\0\u01a8\0\u0ddf\0\u01a8\0\u0e14\0\u0e49\0\u0e7e\0\u01a8"+
-    "\0\u01a8\0\u01a8\0\u0eb3\0\u01a8\0\u01a8\0\u0ee8\0\u0f1d\0\u01a8"+
-    "\0\u0f52\0\u0f87\0\u0fbc\0\u0ff1\0\u1026\0\u105b\0\u1090\0\u10c5"+
-    "\0\u10fa\0\u112f\0\u1164\0\u1199\0\u11ce\0\u07de\0\u01a8\0\u1203"+
-    "\0\u01a8\0\u1238\0\u01a8\0\u126d\0\u01a8\0\u01a8\0\u01a8\0\u01a8"+
-    "\0\u12a2\0\u01a8\0\u12d7\0\u01a8\0\u130c\0\u01a8\0\u1341\0\u01a8"+
-    "\0\u1376\0\u13ab\0\u06d5\0\u13e0\0\u1415\0\u144a\0\u147f\0\u14b4"+
-    "\0\u14e9\0\u01a8\0\u151e\0\u1553\0\u01a8\0\u1588\0\u15bd\0\u15f2"+
-    "\0\u1627\0\u165c\0\u1691\0\u01a8\0\u01a8\0\u01a8\0\u01a8\0\u01a8"+
-    "\0\u01a8\0\u16c6\0\u16fb\0\u1730\0\u1765\0\u179a";
+    "\0\0\0\67\0\156\0\245\0\334\0\u0113\0\u014a\0\u0181"+
+    "\0\u01b8\0\u01ef\0\u0226\0\u0226\0\u0226\0\u025d\0\u0294\0\u0226"+
+    "\0\u02cb\0\u0302\0\u0339\0\u0370\0\u0226\0\u01ef\0\u0226\0\u0226"+
+    "\0\u0226\0\u0226\0\u03a7\0\u03de\0\u0415\0\u044c\0\u0483\0\u04ba"+
+    "\0\u04f1\0\u0528\0\u055f\0\u0596\0\u05cd\0\u0604\0\u063b\0\u0672"+
+    "\0\u06a9\0\u06e0\0\u0226\0\u0717\0\u074e\0\u0785\0\u07bc\0\u07f3"+
+    "\0\u082a\0\u0861\0\u0898\0\u08cf\0\u0906\0\u0226\0\u0226\0\u093d"+
+    "\0\u0974\0\u09ab\0\u09e2\0\u0226\0\u0226\0\u0226\0\u0226\0\u0a19"+
+    "\0\u0226\0\u0226\0\u0226\0\u0226\0\u0a50\0\u0226\0\u0a87\0\u0abe"+
+    "\0\u0af5\0\u0b2c\0\u0b63\0\u0b9a\0\u0bd1\0\u0c08\0\u0c3f\0\u0c76"+
+    "\0\u0cad\0\u0ce4\0\u0d1b\0\u0d52\0\u0d89\0\u0dc0\0\u0226\0\u0df7"+
+    "\0\u0e2e\0\u0e65\0\u0226\0\u0e9c\0\u0ed3\0\u0f0a\0\u0226\0\u0f41"+
+    "\0\u0226\0\u0f78\0\u0faf\0\u0fe6\0\u0226\0\u0226\0\u0226\0\u101d"+
+    "\0\u0226\0\u0226\0\u1054\0\u108b\0\u0226\0\u10c2\0\u10f9\0\u1130"+
+    "\0\u1167\0\u119e\0\u11d5\0\u120c\0\u1243\0\u127a\0\u0226\0\u12b1"+
+    "\0\u12e8\0\u131f\0\u1356\0\u138d\0\u08cf\0\u0226\0\u13c4\0\u0226"+
+    "\0\u13fb\0\u0226\0\u1432\0\u0226\0\u0226\0\u0226\0\u0226\0\u1469"+
+    "\0\u0226\0\u14a0\0\u0226\0\u14d7\0\u0226\0\u150e\0\u0226\0\u1545"+
+    "\0\u157c\0\u15b3\0\u07bc\0\u15ea\0\u1621\0\u1658\0\u168f\0\u16c6"+
+    "\0\u16fd\0\u0226\0\u1734\0\u176b\0\u0226\0\u17a2\0\u17d9\0\u1810"+
+    "\0\u1847\0\u187e\0\u18b5\0\u0226\0\u18ec\0\u1923\0\u195a\0\u0226"+
+    "\0\u0226\0\u0226\0\u1991\0\u19c8\0\u0226\0\u19ff\0\u1a36\0\u1a6d"+
+    "\0\u1aa4\0\u1adb\0\u0226";
 
   private static int [] zzUnpackRowMap() {
-    int [] result = new int[166];
+    int [] result = new int[179];
     int offset = 0;
     offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
     return result;
@@ -156,92 +158,110 @@
   private static final int [] ZZ_TRANS = zzUnpackTrans();
 
   private static final String ZZ_TRANS_PACKED_0 =
-    "\20\0\1\10\44\0\1\11\1\12\1\13\1\14\1\15"+
-    "\1\16\1\17\1\20\1\12\1\21\1\13\1\22\1\13"+
-    "\1\14\1\13\1\23\1\24\1\11\1\25\1\26\1\27"+
-    "\1\30\2\11\1\31\1\13\1\32\1\33\1\34\1\35"+
-    "\1\36\1\13\1\37\1\40\1\13\1\41\1\11\1\42"+
-    "\1\13\1\14\1\13\1\11\1\13\1\11\1\43\1\44"+
-    "\1\45\1\11\1\46\2\11\1\47\1\50\1\11\1\12"+
-    "\1\13\1\14\1\15\1\16\1\51\1\52\1\12\1\21"+
-    "\1\13\1\53\1\13\1\14\1\54\1\55\1\24\1\11"+
-    "\1\25\1\26\1\27\1\30\2\11\1\31\1\13\1\32"+
-    "\1\33\1\34\1\35\1\36\1\13\1\37\1\40\1\13"+
-    "\1\41\1\11\1\56\1\13\1\14\1\57\1\11\1\60"+
-    "\1\61\1\43\1\44\1\45\1\11\1\46\2\11\1\62"+
-    "\1\50\1\11\1\12\1\13\1\14\1\15\1\63\1\17"+
-    "\1\20\1\12\1\21\1\13\1\22\1\13\1\14\1\13"+
-    "\1\23\1\24\1\11\1\25\1\26\1\27\1\30\2\11"+
-    "\1\31\1\13\1\32\1\33\1\34\1\35\1\36\1\13"+
-    "\1\37\1\40\1\13\1\41\1\11\1\42\1\13\1\14"+
-    "\1\13\1\11\1\13\1\11\1\43\1\44\1\45\1\11"+
-    "\1\46\2\11\1\47\1\50\1\11\1\12\1\13\1\14"+
-    "\1\15\1\63\1\51\1\52\1\12\1\21\1\13\1\53"+
-    "\1\13\1\14\1\54\1\55\1\24\1\11\1\25\1\26"+
-    "\1\27\1\30\2\11\1\31\1\13\1\32\1\33\1\34"+
-    "\1\35\1\36\1\13\1\37\1\40\1\13\1\41\1\11"+
-    "\1\56\1\13\1\14\1\57\1\11\1\60\1\61\1\43"+
-    "\1\44\1\45\1\11\1\46\2\11\1\62\1\50\13\0"+
-    "\1\64\2\0\1\65\1\66\51\0\1\67\101\0\1\70"+
-    "\141\0\1\71\52\0\1\71\11\0\1\72\15\0\1\73"+
-    "\36\0\1\74\5\0\2\74\2\0\1\74\42\0\1\74"+
-    "\1\0\1\74\1\75\1\76\1\74\3\0\2\77\1\100"+
-    "\1\0\1\77\2\0\2\77\1\0\4\77\2\0\1\77"+
-    "\6\0\1\77\5\0\1\77\2\0\1\77\2\0\4\77"+
-    "\1\0\1\77\11\0\1\77\26\0\1\101\44\0\1\102"+
-    "\2\0\2\103\1\0\2\104\13\0\1\104\5\0\1\104"+
-    "\33\0\1\105\2\0\2\106\1\0\2\107\13\0\1\107"+
-    "\5\0\1\107\33\0\1\110\2\0\2\111\1\0\2\112"+
-    "\13\0\1\112\5\0\1\112\33\0\1\113\2\0\2\114"+
-    "\1\0\2\115\13\0\1\115\5\0\1\115\33\0\1\116"+
-    "\1\0\1\117\2\120\1\0\2\121\13\0\1\121\5\0"+
-    "\1\121\32\0\1\122\1\102\22\0\1\123\5\0\1\124"+
-    "\6\0\1\125\23\0\1\126\1\105\5\0\1\127\1\130"+
-    "\13\0\1\131\40\0\1\132\1\113\33\0\1\133\27\0"+
-    "\1\134\23\0\1\135\5\0\1\136\7\0\1\137\26\0"+
-    "\1\140\52\0\1\141\5\0\1\122\1\102\6\0\1\142"+
-    "\100\0\1\143\112\0\1\26\64\0\1\30\1\0\1\144"+
-    "\4\0\1\74\5\0\2\74\2\0\1\74\42\0\1\74"+
-    "\1\0\1\74\2\0\1\74\3\0\2\145\1\146\1\0"+
-    "\1\145\2\0\2\145\1\0\4\145\2\0\1\145\6\0"+
-    "\1\145\5\0\1\145\2\0\1\145\2\0\4\145\1\0"+
-    "\1\145\11\0\1\145\7\0\1\147\1\0\1\72\15\0"+
-    "\1\73\36\0\1\150\5\0\2\150\2\0\1\150\42\0"+
-    "\1\150\1\0\1\150\1\75\1\76\1\150\13\0\1\151"+
-    "\13\0\1\101\46\0\1\152\63\0\1\153\1\152\63\0"+
-    "\1\154\1\0\1\140\52\0\1\141\51\0\1\155\64\0"+
-    "\1\156\20\0\1\132\60\0\1\150\5\0\2\150\2\0"+
-    "\1\150\42\0\1\150\1\0\1\150\2\0\1\150\13\0"+
-    "\1\157\62\0\1\160\63\0\1\161\1\160\63\0\1\162"+
-    "\57\0\2\77\2\0\1\77\2\0\2\77\1\0\4\77"+
-    "\2\0\1\77\6\0\1\77\5\0\1\77\2\0\1\77"+
-    "\2\0\4\77\1\0\1\77\11\0\1\77\7\0\1\103"+
-    "\65\0\1\163\62\0\1\102\2\0\2\103\61\0\1\106"+
-    "\65\0\1\164\62\0\1\105\2\0\2\106\61\0\1\111"+
-    "\65\0\1\165\62\0\1\110\2\0\2\111\61\0\1\114"+
-    "\65\0\1\166\62\0\1\113\2\0\2\114\61\0\1\120"+
-    "\62\0\1\167\67\0\1\170\62\0\1\116\2\0\2\120"+
-    "\57\0\1\171\1\172\63\0\1\173\1\174\63\0\1\175"+
-    "\64\0\1\176\64\0\1\177\64\0\1\200\1\201\63\0"+
-    "\1\202\1\203\63\0\1\204\1\205\63\0\1\206\1\207"+
-    "\63\0\1\210\64\0\1\204\61\0\2\145\2\0\1\145"+
-    "\2\0\2\145\1\0\4\145\2\0\1\145\6\0\1\145"+
-    "\5\0\1\145\2\0\1\145\2\0\4\145\1\0\1\145"+
-    "\11\0\1\145\44\0\1\211\24\0\1\212\7\0\1\213"+
-    "\65\0\1\214\53\0\1\215\11\0\1\213\112\0\1\216"+
-    "\66\0\1\217\64\0\1\220\22\0\1\221\7\0\1\65"+
-    "\65\0\1\222\53\0\1\223\11\0\1\65\56\0\1\224"+
-    "\61\0\1\122\64\0\1\126\64\0\1\225\64\0\1\134"+
-    "\66\0\1\226\64\0\1\227\64\0\1\230\64\0\1\231"+
-    "\64\0\1\232\64\0\1\233\62\0\1\234\73\0\1\213"+
-    "\54\0\1\235\76\0\1\213\53\0\1\236\64\0\1\237"+
-    "\64\0\1\240\73\0\1\65\66\0\1\65\53\0\1\241"+
-    "\67\0\1\242\64\0\1\243\64\0\1\244\64\0\1\245"+
-    "\64\0\1\143\64\0\1\246\61\0\1\171\64\0\1\173"+
-    "\64\0\1\200\64\0\1\202\64\0\1\206\57\0";
+    "\21\0\1\12\45\0\1\13\1\14\1\15\1\16\1\17"+
+    "\1\13\1\20\1\21\1\22\1\14\1\23\1\15\1\24"+
+    "\1\15\1\16\1\15\1\25\1\26\1\13\1\27\1\30"+
+    "\1\31\1\32\2\13\1\33\1\15\1\34\1\35\1\36"+
+    "\1\37\1\40\1\15\1\41\1\42\1\15\1\43\1\13"+
+    "\1\44\1\15\1\16\1\15\1\13\1\15\1\13\1\45"+
+    "\1\46\1\47\1\13\1\50\2\13\1\51\1\52\1\53"+
+    "\1\13\1\14\1\15\1\16\1\17\1\13\1\20\1\54"+
+    "\1\55\1\14\1\23\1\15\1\56\1\15\1\16\1\57"+
+    "\1\60\1\26\1\13\1\27\1\30\1\31\1\32\2\13"+
+    "\1\33\1\15\1\34\1\35\1\36\1\37\1\40\1\15"+
+    "\1\41\1\42\1\15\1\43\1\13\1\61\1\15\1\16"+
+    "\1\62\1\13\1\63\1\64\1\45\1\46\1\47\1\13"+
+    "\1\50\2\13\1\65\1\52\1\53\1\13\1\14\1\15"+
+    "\1\16\1\17\1\13\1\66\1\21\1\22\1\14\1\23"+
+    "\1\15\1\24\1\15\1\16\1\15\1\25\1\26\1\13"+
+    "\1\27\1\30\1\31\1\32\2\13\1\33\1\15\1\34"+
+    "\1\35\1\36\1\37\1\40\1\15\1\41\1\42\1\15"+
+    "\1\43\1\13\1\44\1\15\1\16\1\15\1\13\1\15"+
+    "\1\13\1\45\1\46\1\47\1\13\1\50\2\13\1\51"+
+    "\1\52\1\53\1\13\1\14\1\15\1\16\1\17\1\13"+
+    "\1\66\1\54\1\55\1\14\1\23\1\15\1\56\1\15"+
+    "\1\16\1\57\1\60\1\26\1\13\1\27\1\30\1\31"+
+    "\1\32\2\13\1\33\1\15\1\34\1\35\1\36\1\37"+
+    "\1\40\1\15\1\41\1\42\1\15\1\43\1\13\1\61"+
+    "\1\15\1\16\1\62\1\13\1\63\1\64\1\45\1\46"+
+    "\1\47\1\13\1\50\2\13\1\65\1\52\1\53\1\13"+
+    "\1\14\1\15\1\16\1\17\1\13\1\67\1\21\1\22"+
+    "\1\14\1\23\1\15\1\24\1\15\1\16\1\15\1\25"+
+    "\1\26\1\13\1\27\1\30\1\31\1\32\2\13\1\33"+
+    "\1\15\1\34\1\35\1\36\1\37\1\40\1\15\1\41"+
+    "\1\42\1\15\1\43\1\13\1\44\1\15\1\16\1\15"+
+    "\1\13\1\15\1\13\1\45\1\46\1\47\1\13\1\50"+
+    "\2\13\1\51\1\52\1\53\1\13\1\14\1\15\1\16"+
+    "\1\17\1\13\1\67\1\54\1\55\1\14\1\23\1\15"+
+    "\1\56\1\15\1\16\1\57\1\60\1\26\1\13\1\27"+
+    "\1\30\1\31\1\32\2\13\1\33\1\15\1\34\1\35"+
+    "\1\36\1\37\1\40\1\15\1\41\1\42\1\15\1\43"+
+    "\1\13\1\61\1\15\1\16\1\62\1\13\1\63\1\64"+
+    "\1\45\1\46\1\47\1\13\1\50\2\13\1\65\1\52"+
+    "\1\53\14\0\1\70\2\0\1\71\1\72\53\0\1\73"+
+    "\103\0\1\74\145\0\1\75\52\0\1\75\6\0\1\76"+
+    "\73\0\1\77\15\0\1\100\37\0\1\101\6\0\2\101"+
+    "\2\0\1\101\7\0\3\101\30\0\1\101\1\0\1\101"+
+    "\1\102\1\103\1\101\4\0\2\104\1\105\2\0\1\104"+
+    "\2\0\2\104\1\0\4\104\2\0\1\104\6\0\1\104"+
+    "\5\0\1\104\2\0\1\104\2\0\4\104\1\0\1\104"+
+    "\11\0\1\104\30\0\1\106\46\0\1\107\2\0\2\110"+
+    "\1\0\2\111\13\0\1\111\5\0\1\111\35\0\1\112"+
+    "\2\0\2\113\1\0\2\114\13\0\1\114\5\0\1\114"+
+    "\35\0\1\115\2\0\2\116\1\0\2\117\13\0\1\117"+
+    "\5\0\1\117\35\0\1\120\2\0\2\121\1\0\2\122"+
+    "\13\0\1\122\5\0\1\122\35\0\1\123\1\0\1\124"+
+    "\2\125\1\0\2\126\13\0\1\126\5\0\1\126\34\0"+
+    "\1\127\1\107\22\0\1\130\5\0\1\131\6\0\1\132"+
+    "\25\0\1\133\1\112\5\0\1\134\1\135\13\0\1\136"+
+    "\42\0\1\137\1\120\33\0\1\140\31\0\1\141\23\0"+
+    "\1\142\5\0\1\143\7\0\1\144\30\0\1\145\52\0"+
+    "\1\146\7\0\1\127\1\107\6\0\1\147\102\0\1\150"+
+    "\114\0\1\30\66\0\1\32\1\0\1\151\5\0\1\101"+
+    "\6\0\2\101\2\0\1\101\7\0\3\101\30\0\1\101"+
+    "\1\0\1\101\2\0\1\101\4\0\2\152\1\153\2\0"+
+    "\1\152\2\0\2\152\1\0\4\152\2\0\1\152\6\0"+
+    "\1\152\5\0\1\152\2\0\1\152\2\0\4\152\1\0"+
+    "\1\152\11\0\1\152\11\0\1\154\1\0\1\77\15\0"+
+    "\1\100\37\0\1\155\6\0\2\155\2\0\1\155\7\0"+
+    "\3\155\30\0\1\155\1\0\1\155\1\102\1\103\1\155"+
+    "\15\0\1\156\13\0\1\106\50\0\1\157\65\0\1\160"+
+    "\1\157\65\0\1\161\1\0\1\145\52\0\1\146\53\0"+
+    "\1\162\66\0\1\163\22\0\1\137\61\0\1\155\6\0"+
+    "\2\155\2\0\1\155\7\0\3\155\30\0\1\155\1\0"+
+    "\1\155\2\0\1\155\15\0\1\164\64\0\1\165\65\0"+
+    "\1\166\1\165\61\0\1\167\72\0\1\170\63\0\1\171"+
+    "\71\0\1\110\67\0\1\172\64\0\1\107\2\0\2\110"+
+    "\63\0\1\113\67\0\1\173\64\0\1\112\2\0\2\113"+
+    "\63\0\1\116\67\0\1\174\64\0\1\115\2\0\2\116"+
+    "\63\0\1\121\67\0\1\175\64\0\1\120\2\0\2\121"+
+    "\63\0\1\125\64\0\1\176\71\0\1\177\64\0\1\123"+
+    "\2\0\2\125\61\0\1\200\1\201\65\0\1\202\1\203"+
+    "\65\0\1\204\66\0\1\205\66\0\1\206\66\0\1\207"+
+    "\1\210\65\0\1\211\1\212\65\0\1\213\1\214\65\0"+
+    "\1\215\1\216\65\0\1\217\66\0\1\213\65\0\1\220"+
+    "\126\0\1\221\25\0\1\222\10\0\1\223\67\0\1\224"+
+    "\54\0\1\225\12\0\1\223\114\0\1\226\70\0\1\227"+
+    "\66\0\1\230\23\0\1\231\10\0\1\71\67\0\1\232"+
+    "\54\0\1\233\12\0\1\71\60\0\1\234\57\0\2\104"+
+    "\3\0\1\104\2\0\2\104\1\0\4\104\2\0\1\104"+
+    "\6\0\1\104\5\0\1\104\2\0\1\104\2\0\4\104"+
+    "\1\0\1\104\11\0\1\104\7\0\1\127\66\0\1\133"+
+    "\66\0\1\235\66\0\1\141\70\0\1\236\66\0\1\237"+
+    "\66\0\1\240\66\0\1\241\66\0\1\242\66\0\1\243"+
+    "\60\0\2\152\3\0\1\152\2\0\2\152\1\0\4\152"+
+    "\2\0\1\152\6\0\1\152\5\0\1\152\2\0\1\152"+
+    "\2\0\4\152\1\0\1\152\11\0\1\152\7\0\1\244"+
+    "\65\0\1\245\65\0\1\246\67\0\1\247\67\0\1\250"+
+    "\66\0\1\251\66\0\1\252\65\0\1\253\66\0\1\254"+
+    "\67\0\1\255\71\0\1\256\66\0\1\257\66\0\1\260"+
+    "\66\0\1\261\66\0\1\150\66\0\1\262\72\0\1\223"+
+    "\56\0\1\263\100\0\1\223\64\0\1\71\70\0\1\71"+
+    "\55\0\1\200\66\0\1\202\66\0\1\207\66\0\1\211"+
+    "\66\0\1\215\60\0";
 
   private static int [] zzUnpackTrans() {
-    int [] result = new int[6095];
+    int [] result = new int[6930];
     int offset = 0;
     offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
     return result;
@@ -279,17 +299,18 @@
   private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
 
   private static final String ZZ_ATTRIBUTE_PACKED_0 =
-    "\6\0\1\1\1\0\3\11\1\1\2\11\4\1\1\11"+
-    "\1\1\4\11\32\1\1\11\3\0\4\11\1\0\4\11"+
-    "\1\0\1\11\20\0\1\11\3\0\1\11\3\0\1\11"+
-    "\1\0\1\11\3\0\3\11\1\0\2\11\2\0\1\11"+
-    "\16\0\1\11\1\0\1\11\1\0\1\11\1\0\4\11"+
-    "\1\0\1\11\1\0\1\11\1\0\1\11\1\0\1\11"+
-    "\3\0\1\1\5\0\1\11\2\0\1\11\6\0\6\11"+
-    "\5\0";
+    "\10\0\1\1\1\0\3\11\2\1\1\11\4\1\1\11"+
+    "\1\1\4\11\20\1\1\11\12\1\2\11\4\0\4\11"+
+    "\1\0\4\11\1\0\1\11\20\0\1\11\3\0\1\11"+
+    "\3\0\1\11\1\0\1\11\3\0\3\11\1\0\2\11"+
+    "\2\0\1\11\11\0\1\11\6\0\1\11\1\0\1\11"+
+    "\1\0\1\11\1\0\4\11\1\0\1\11\1\0\1\11"+
+    "\1\0\1\11\1\0\1\11\4\0\1\1\5\0\1\11"+
+    "\2\0\1\11\6\0\1\11\3\0\3\11\2\0\1\11"+
+    "\5\0\1\11";
 
   private static int [] zzUnpackAttribute() {
-    int [] result = new int[166];
+    int [] result = new int[179];
     int offset = 0;
     offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
     return result;
@@ -363,7 +384,7 @@
 	private static final int CONS = 1;
 	private static final int VOWEL = 2;
 	private int cv = 0;  // consonant = 1, vowel = 2, everything else = 0
-
+	
 	private String original = "";
 	private String normalized = "";
 	private int problem = 0;
@@ -373,6 +394,8 @@
 		normalized += norm;
 	}
 
+	private static final String LB = "[\u002d\u00ad] ";
+
 
   /**
    * Creates a new scanner
@@ -404,7 +427,7 @@
     char [] map = new char[0x10000];
     int i = 0;  /* index in packed string  */
     int j = 0;  /* index in unpacked array */
-    while (i < 184) {
+    while (i < 190) {
       int  count = packed.charAt(i++);
       char value = packed.charAt(i++);
       do map[j++] = value; while (--count > 0);
@@ -704,70 +727,81 @@
       zzMarkedPos = zzMarkedPosL;
 
       switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
-        case 39: 
+        case 41: 
           // lookahead expression with fixed base length
           zzMarkedPos = zzStartRead + 2;
           { add("um");
           }
-        case 48: break;
-        case 28: 
+        case 50: break;
+        case 30: 
           // lookahead expression with fixed base length
           zzMarkedPos = zzStartRead + 1;
           { cv = CONS; add("U");
           }
-        case 49: break;
-        case 4: 
+        case 51: break;
+        case 15: 
           { add(yytext());
           }
-        case 50: break;
-        case 46: 
+        case 52: break;
+        case 48: 
           // lookahead expression with fixed base length
           zzMarkedPos = zzStartRead + 3;
           { add("Hic");
           }
-        case 51: break;
-        case 9: 
+        case 53: break;
+        case 8: 
           { cv = VOWEL; add("AE");
           }
-        case 52: break;
+        case 54: break;
         case 1: 
           { problem = 1; cv = 0; add(yytext());
           }
-        case 53: break;
-        case 5: 
+        case 55: break;
+        case 4: 
           { switch (problem) {
 			case 1: return original;
 			default: return normalized;
 		}
           }
-        case 54: break;
-        case 18: 
+        case 56: break;
+        case 20: 
           // lookahead expression with fixed base length
           zzMarkedPos = zzStartRead + 1;
           { cv = CONS; add("u");
           }
-        case 55: break;
-        case 21: 
+        case 57: break;
+        case 10: 
+          { cv = 0; add(yytext());
+          }
+        case 58: break;
+        case 12: 
+          { switch (problem) {
+			case 1: return original;
+			default: return normalized.replaceAll(LB, "").toLowerCase();
+		}
+          }
+        case 59: break;
+        case 36: 
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 2;
+          { add("et");
+          }
+        case 60: break;
+        case 23: 
           // lookahead expression with fixed base length
           zzMarkedPos = zzStartRead + 1;
           { add("e");
           }
-        case 56: break;
-        case 29: 
+        case 61: break;
+        case 31: 
           // lookahead expression with fixed base length
           zzMarkedPos = zzStartRead + 1;
           { cv = VOWEL; add(yytext().replaceAll("u", "v").replaceAll("U", "V"));
           }
-        case 57: break;
-        case 34: 
-          // lookahead expression with fixed base length
-          zzMarkedPos = zzStartRead + 2;
-          { add("et");
-          }
-        case 58: break;
-        case 41: 
+        case 62: break;
+        case 43: 
           // general lookahead, find correct zzMarkedPos
-          { int zzFState = 5;
+          { int zzFState = 7;
             int zzFPos = zzStartRead;
             if (zzFin.length <= zzBufferL.length) { zzFin = new boolean[zzBufferL.length+1]; }
             boolean zzFinL[] = zzFin;
@@ -778,7 +812,7 @@
             }
             if (zzFState != -1 && (zzAttrL[zzFState] & 1) == 1) { zzFinL[zzFPos] = true; } 
 
-            zzFState = 6;
+            zzFState = 8;
             zzFPos = zzMarkedPos;
             while (!zzFinL[zzFPos] || (zzAttrL[zzFState] & 1) != 1) {
               zzInput = zzBufferL[--zzFPos];
@@ -788,20 +822,20 @@
           }
           { cv = VOWEL; add(yytext().replace("ſ", "s"));
           }
-        case 59: break;
+        case 63: break;
         case 3: 
           { cv = CONS; add(yytext());
           }
-        case 60: break;
-        case 27: 
+        case 64: break;
+        case 29: 
           { cv = VOWEL; add("oi");
           }
-        case 61: break;
-        case 25: 
+        case 65: break;
+        case 27: 
           { cv = CONS; add("QU");
           }
-        case 62: break;
-        case 15: 
+        case 66: break;
+        case 17: 
           // lookahead expression with fixed base length
           zzMarkedPos = zzStartRead + 1;
           { switch(cv) {
@@ -809,171 +843,171 @@
 			default: cv = VOWEL; add(yytext()); break;
 		}
           }
-        case 63: break;
-        case 7: 
+        case 67: break;
+        case 6: 
           { cv = CONS; add("ss");
           }
-        case 64: break;
-        case 6: 
+        case 68: break;
+        case 5: 
           { cv = CONS; add("s");
           }
-        case 65: break;
-        case 22: 
+        case 69: break;
+        case 11: 
+          { switch (problem) {
+			case 1: return "";
+			default: return normalized.replaceAll(LB, "");
+		}
+          }
+        case 70: break;
+        case 24: 
           // lookahead expression with fixed base length
           zzMarkedPos = zzStartRead + 1;
           { add("o");
           }
-        case 66: break;
-        case 33: 
+        case 71: break;
+        case 35: 
           // lookahead expression with fixed base length
           zzMarkedPos = zzStartRead + 2;
           { add("ac");
           }
-        case 67: break;
+        case 72: break;
         case 2: 
           { cv = VOWEL; add(yytext());
           }
-        case 68: break;
-        case 43: 
+        case 73: break;
+        case 45: 
           // lookahead expression with fixed base length
           zzMarkedPos = zzStartRead + 3;
           { add("qui");
           }
-        case 69: break;
-        case 35: 
+        case 74: break;
+        case 37: 
           // lookahead expression with fixed base length
           zzMarkedPos = zzStartRead + 2;
           { add("er");
           }
-        case 70: break;
-        case 24: 
+        case 75: break;
+        case 26: 
           { cv = CONS; add("Qu");
           }
-        case 71: break;
-        case 30: 
+        case 76: break;
+        case 32: 
           // lookahead expression with fixed base length
           zzMarkedPos = zzStartRead + 2;
           { add("ve");
           }
-        case 72: break;
-        case 38: 
+        case 77: break;
+        case 40: 
           // lookahead expression with fixed base length
           zzMarkedPos = zzStartRead + 2;
           { add("us");
           }
-        case 73: break;
-        case 32: 
+        case 78: break;
+        case 34: 
           // lookahead expression with fixed base length
           zzMarkedPos = zzStartRead + 2;
           { add("am");
           }
-        case 74: break;
-        case 8: 
+        case 79: break;
+        case 7: 
           { cv = VOWEL; add("ae");
           }
-        case 75: break;
-        case 11: 
-          { switch (problem) {
-			case 1: return "";
-			default: return normalized;
-		}
-          }
-        case 76: break;
-        case 26: 
+        case 80: break;
+        case 28: 
           { add("ar");
           }
-        case 77: break;
-        case 45: 
+        case 81: break;
+        case 47: 
           // lookahead expression with fixed base length
           zzMarkedPos = zzStartRead + 3;
           { add("hic");
           }
-        case 78: break;
-        case 17: 
+        case 82: break;
+        case 19: 
           { cv = VOWEL; add("uu");
           }
-        case 79: break;
-        case 40: 
+        case 83: break;
+        case 42: 
           // lookahead expression with fixed base length
           zzMarkedPos = zzStartRead + 2;
           { add("ul");
           }
-        case 80: break;
-        case 20: 
+        case 84: break;
+        case 22: 
           // lookahead expression with fixed base length
           zzMarkedPos = zzStartRead + 1;
           { add("a");
           }
-        case 81: break;
-        case 10: 
+        case 85: break;
+        case 9: 
           { cv = VOWEL; add("oe");
           }
-        case 82: break;
-        case 16: 
+        case 86: break;
+        case 18: 
           { cv = VOWEL; add("ui");
           }
-        case 83: break;
-        case 14: 
+        case 87: break;
+        case 16: 
           { cv = CONS; add("qu");
           }
-        case 84: break;
-        case 47: 
+        case 88: break;
+        case 49: 
           // lookahead expression with fixed base length
           zzMarkedPos = zzStartRead + 4;
           { add("que");
           }
-        case 85: break;
-        case 23: 
+        case 89: break;
+        case 25: 
           // lookahead expression with fixed base length
           zzMarkedPos = zzStartRead + 1;
           { add("u");
           }
-        case 86: break;
-        case 36: 
+        case 90: break;
+        case 38: 
           // lookahead expression with fixed base length
           zzMarkedPos = zzStartRead + 2;
           { add("es");
           }
-        case 87: break;
-        case 44: 
+        case 91: break;
+        case 46: 
           // lookahead expression with fixed base length
           zzMarkedPos = zzStartRead + 3;
           { add("Qui");
           }
-        case 88: break;
-        case 42: 
+        case 92: break;
+        case 44: 
           // lookahead expression with fixed base length
           zzMarkedPos = zzStartRead + 1;
           { add("i");
           }
-        case 89: break;
-        case 12: 
+        case 93: break;
+        case 13: 
           { add("X");
           }
-        case 90: break;
-        case 13: 
+        case 94: break;
+        case 14: 
           { switch(cv) {
 			case CONS: add(yytext().replace("v", "u").replace("V", "U")); break;
 			default: cv = CONS; add(yytext()); break;
 		}
           }
-        case 91: break;
-        case 19: 
+        case 95: break;
+        case 21: 
           { cv = VOWEL; add("ii");
           }
-        case 92: break;
-        case 31: 
+        case 96: break;
+        case 33: 
           // lookahead expression with fixed base length
           zzMarkedPos = zzStartRead + 2;
           { add("as");
           }
-        case 93: break;
-        case 37: 
+        case 97: break;
+        case 39: 
           // lookahead expression with fixed base length
           zzMarkedPos = zzStartRead + 2;
           { add("od");
           }
-        case 94: break;
+        case 98: break;
         default: 
           if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
             zzAtEOF = true;
diff -r 469d927b9ca7 -r 5df60f24e997 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexLA.lex
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexLA.lex	Tue Apr 19 16:51:59 2011 +0200
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexLA.lex	Mon Aug 29 17:40:02 2011 +0200
@@ -3,8 +3,7 @@
  * [this is a JFlex specification]
  *
  * Wolfgang Schmidle 
- * version 0.96
- * 2011-02-21
+ * version 2011-07-12
  *
  */
 
@@ -26,7 +25,7 @@
 	private static final int CONS = 1;
 	private static final int VOWEL = 2;
 	private int cv = 0;  // consonant = 1, vowel = 2, everything else = 0
-
+	
 	private String original = "";
 	private String normalized = "";
 	private int problem = 0;
@@ -35,20 +34,25 @@
 		original += yytext(); 
 		normalized += norm;
 	}
+
+	private static final String LB = "[\u002d\u00ad] ";
 %}
 
-Vowel = [AEIOUaeiou] // without Ææęàèòùœ
+Vowel = [AEIOUaeiouÆæęœ] // without àèòù etc.
 Cons = [BCDFGHKLMNPQRSTVWXZbcdfghklmnpqrstvwxzſß]
+// y counts neither as Vowel nor as Cons, see the default rule below: [yY] { cv = 0; add(yytext()); }
+
 LR = [lLrR]
 
 hyphen = [\u002d\u00ad]  // hyphen and soft hyphen
-X = {hyphen}?
+LB = {hyphen} \u0020
+lb = ({hyphen} \u0020)?
 
 END = \n
 
 que = (que)?  // optional -que
 enclitic = (que | ve | ne)
-prefixCons = (in{X}ter | per | ſu{X}per | ſer) // "ſer" for forms of ſervare
+prefixCons = (in{lb}ter | per | ſu{lb}per | ſer) // "ſer" for forms of ſervare
 
 %%
 
@@ -127,7 +131,7 @@
 // 3.1 rules for u --> v
 
 // peruenias --> pervenias, interuallum --> intervallum
-^ {prefixCons} / {X} { cv = VOWEL; add(yytext().replace("ſ", "s")); }  // not cv = CONS !
+^ {prefixCons} / {lb} { cv = VOWEL; add(yytext().replace("ſ", "s")); }  // not cv = CONS !
 
 // uellet --> vellet
 ^ [uU] / {Vowel} { cv = VOWEL; add(yytext().replaceAll("u", "v").replaceAll("U", "V")); }
@@ -159,22 +163,23 @@
 	}
 
 // februarivs --> februarius
-v / {X} {Cons} { cv = CONS; add("u"); }
-V / {X} {Cons} { cv = CONS; add("U"); }
+v / {lb} {Cons} { cv = CONS; add("u"); }
+V / {lb} {Cons} { cv = CONS; add("U"); }
 
 // 3.3 override default rule for .
 
 {Vowel} { cv = VOWEL; add(yytext()); }
 {Cons} { cv = CONS; add(yytext()); }
-{hyphen} { add(yytext()); }
+[yY] { cv = 0; add(yytext()); }
 
-. { problem = 1; cv = 0; add(yytext()); }  // in particular "@", and from Arboreal: "〈" (2329), "〉" (232A), Ç, ç
+@ { problem = 1; cv = 0; add(yytext()); }
+{LB} { add(yytext()); }
+. { problem = 1; cv = 0; add(yytext()); }  // in particular from Arboreal: "〈" (2329), "〉" (232A), Ç, ç
 
 }
 
 
-<DISP, SEARCH, 
-RENAISSANCE_DISP, RENAISSANCE_SEARCH> {
+<DISP, RENAISSANCE_DISP> {
 
 {END} {
 		switch (problem) {
@@ -184,13 +189,22 @@
 	}
 }
 
-<DICT, 
-RENAISSANCE_DICT> {
+<DICT,  RENAISSANCE_DICT> {
 
 {END} {
 		switch (problem) {
 			case 1: return "";
-			default: return normalized;
+			default: return normalized.replaceAll(LB, "");
+		}
+	}
+}
+
+<SEARCH, RENAISSANCE_SEARCH> {
+
+{END} {
+		switch (problem) {
+			case 1: return original;
+			default: return normalized.replaceAll(LB, "").toLowerCase();
 		}
 	}
 }
@@ -200,7 +214,7 @@
 
 Annahmen:
 - die Routine wird wortweise aufgerufen, mit einem \n am Ende des Strings
-- Wörter mit Zeilenumbrüchen wurden bereits wieder zusammengesetzt
+- Zeilenumbrüche innerhalb des Wortes sind als hyphen/soft hyphen plus space markiert
 
 
 TO DO:
diff -r 469d927b9ca7 -r 5df60f24e997 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexNL.java
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexNL.java	Tue Apr 19 16:51:59 2011 +0200
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexNL.java	Mon Aug 29 17:40:02 2011 +0200
@@ -1,12 +1,11 @@
-/* The following code was generated by JFlex 1.4.3 on 22.02.11 12:04 */
+/* The following code was generated by JFlex 1.4.3 on 21.07.11 11:22 */
 
 /*
  * Normalization rules for Dutch text
  * [this is a JFlex specification]
  *
  * Wolfgang Schmidle 
- * version 0.96
- * 2011-02-21
+ * version 2011-07-12
  *
  */
 
@@ -16,7 +15,7 @@
 /**
  * This class is a scanner generated by 
  * <a href="http://www.jflex.de/">JFlex</a> 1.4.3
- * on 22.02.11 12:04 from the specification file
+ * on 21.07.11 11:22 from the specification file
  * <tt>MpdlNormalizerLexNL.lex</tt>
  */
 public class MpdlNormalizerLexNL {
@@ -40,14 +39,16 @@
    * l is of the form l = 2*k, k a non negative integer
    */
   private static final int ZZ_LEXSTATE[] = { 
-     0,  0,  1,  1,  2,  2,  1, 1
+     0,  0,  1,  1,  2,  2,  3, 3
   };
 
   /** 
    * Translates characters to character classes
    */
   private static final String ZZ_CMAP_PACKED = 
-    "\12\0\1\1\65\0\1\3\u013e\0\1\2\ufe80\0";
+    "\12\0\1\3\25\0\1\2\14\0\1\1\2\0\1\1\17\0\1\5"+
+    "\40\0\1\1\2\0\1\1\20\0\1\1\5\0\1\1\1\0\1\1"+
+    "\u0101\0\1\4\ufe80\0";
 
   /** 
    * Translates characters to character classes
@@ -60,10 +61,10 @@
   private static final int [] ZZ_ACTION = zzUnpackAction();
 
   private static final String ZZ_ACTION_PACKED_0 =
-    "\3\0\1\1\1\2\1\3\1\4\1\5";
+    "\4\0\2\1\1\2\1\3\1\4\1\5\1\6";
 
   private static int [] zzUnpackAction() {
-    int [] result = new int[8];
+    int [] result = new int[11];
     int offset = 0;
     offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
     return result;
@@ -88,10 +89,11 @@
   private static final int [] ZZ_ROWMAP = zzUnpackRowMap();
 
   private static final String ZZ_ROWMAP_PACKED_0 =
-    "\0\0\0\4\0\10\0\14\0\14\0\14\0\14\0\14";
+    "\0\0\0\6\0\14\0\22\0\30\0\36\0\30\0\30"+
+    "\0\30\0\30\0\30";
 
   private static int [] zzUnpackRowMap() {
-    int [] result = new int[8];
+    int [] result = new int[11];
     int offset = 0;
     offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
     return result;
@@ -114,11 +116,13 @@
   private static final int [] ZZ_TRANS = zzUnpackTrans();
 
   private static final String ZZ_TRANS_PACKED_0 =
-    "\1\4\1\0\1\4\1\5\1\4\1\6\1\7\1\5"+
-    "\1\4\1\10\1\7\1\5\4\0";
+    "\1\5\1\6\1\5\1\0\1\5\1\7\1\5\1\6"+
+    "\1\5\1\10\1\11\1\7\1\5\1\6\1\5\1\12"+
+    "\1\11\1\7\1\5\1\6\1\5\1\13\1\11\1\7"+
+    "\10\0\1\5\3\0";
 
   private static int [] zzUnpackTrans() {
-    int [] result = new int[16];
+    int [] result = new int[36];
     int offset = 0;
     offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
     return result;
@@ -156,10 +160,10 @@
   private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
 
   private static final String ZZ_ATTRIBUTE_PACKED_0 =
-    "\3\0\5\11";
+    "\4\0\1\11\1\1\5\11";
 
   private static int [] zzUnpackAttribute() {
-    int [] result = new int[8];
+    int [] result = new int[11];
     int offset = 0;
     offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
     return result;
@@ -236,6 +240,8 @@
 		normalized += norm;
 	}
 
+	private static final String LB = "[\u002d\u00ad] ";
+
 
   /**
    * Creates a new scanner
@@ -267,7 +273,7 @@
     char [] map = new char[0x10000];
     int i = 0;  /* index in packed string  */
     int j = 0;  /* index in unpacked array */
-    while (i < 14) {
+    while (i < 46) {
       int  count = packed.charAt(i++);
       char value = packed.charAt(i++);
       do map[j++] = value; while (--count > 0);
@@ -537,29 +543,36 @@
         case 5: 
           { switch (problem) {
 			case 1: return "";
-			default: return normalized;
+			default: return normalized.replaceAll(LB, "");
 		}
           }
-        case 6: break;
+        case 7: break;
         case 2: 
           { problem = 1; add(yytext());
           }
-        case 7: break;
+        case 8: break;
         case 4: 
           { add("s");
           }
-        case 8: break;
+        case 9: break;
         case 3: 
           { switch (problem) {
 			case 1: return original;
 			default: return normalized;
 		}
           }
-        case 9: break;
+        case 10: break;
+        case 6: 
+          { switch (problem) {
+			case 1: return original;
+			default: return normalized.replaceAll(LB, "").toLowerCase();
+		}
+          }
+        case 11: break;
         case 1: 
           { add(yytext());
           }
-        case 10: break;
+        case 12: break;
         default: 
           if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
             zzAtEOF = true;
diff -r 469d927b9ca7 -r 5df60f24e997 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexNL.lex
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexNL.lex	Tue Apr 19 16:51:59 2011 +0200
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexNL.lex	Mon Aug 29 17:40:02 2011 +0200
@@ -3,8 +3,7 @@
  * [this is a JFlex specification]
  *
  * Wolfgang Schmidle 
- * version 0.96
- * 2011-02-21
+ * version 2011-07-12
  *
  */
 
@@ -30,8 +29,14 @@
 		original += yytext(); 
 		normalized += norm;
 	}
+
+	private static final String LB = "[\u002d\u00ad] ";
 %}
 
+hyphen = [\u002d\u00ad]  // hyphen and soft hyphen (4-digit escapes: JFlex 1.4.3 has no braced code point escape)
+LB = {hyphen} \u0020
+// lb = ({hyphen} \u0020)?
+
 END = \n
 
 %%
@@ -46,10 +51,11 @@
 // default
 
 @ { problem = 1; add(yytext()); }
-. { add(yytext()); }
+{LB} { add(yytext()); }
+. { add(yytext()); } 
 
 
-<DISP, SEARCH> {
+<DISP> {
 
 {END} {
 		switch (problem) {
@@ -64,7 +70,17 @@
 {END} {
 		switch (problem) {
 			case 1: return "";
-			default: return normalized;
+			default: return normalized.replaceAll(LB, "");
+		}
+	}
+}
+
+<SEARCH> {
+
+{END} {
+		switch (problem) {
+			case 1: return original;
+			default: return normalized.replaceAll(LB, "").toLowerCase();
 		}
 	}
 }
@@ -74,7 +90,7 @@
 
 Annahmen:
 - die Routine wird wortweise aufgerufen, mit einem \n am Ende des Strings
-- Wörter mit Zeilenumbrüchen wurden bereits wieder zusammengesetzt
+- Zeilenumbrüche innerhalb des Wortes sind als hyphen/soft hyphen plus space markiert
 
 TO DO:
 
diff -r 469d927b9ca7 -r 5df60f24e997 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexTemplate.lex
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexTemplate.lex	Mon Aug 29 17:40:02 2011 +0200
@@ -0,0 +1,89 @@
+/*
+ * Template for normalization rules
+ * [this is a JFlex specification]
+ *
+ * Wolfgang Schmidle 
+ * version 2011-07-12
+ *
+ */
+
+package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang;
+
+%%
+
+%public
+%class MpdlNormalizerLexTemplate
+%type java.lang.String
+%unicode
+
+// Language: list of ISO codes
+
+%states DISP, DICT, SEARCH
+
+%{
+	private String original = "";
+	private String normalized = "";
+	private int problem = 0;
+	
+	private void add (String norm) {
+		original += yytext(); 
+		normalized += norm;
+	}	
+
+	private static final String LB = "[\u002d\u00ad] ";
+%}
+
+hyphen = [\u002d\u00ad]  // hyphen and soft hyphen (4-digit escapes: JFlex 1.4.3 has no braced code point escape)
+LB = {hyphen} \u0020
+// lb = ({hyphen} \u0020)?
+
+END = \n
+
+%%
+
+<DISP, DICT, SEARCH> {
+
+ſ { add("s"); } // sample rule
+
+}
+
+
+// default rules
+
+@ { problem = 1; add(yytext()); }
+{LB} { add(yytext()); }
+. { add(yytext()); }
+
+
+// at the end, determine which string to return
+
+<DISP> {
+
+{END} {
+		switch (problem) {
+			case 1: return original;
+			default: return normalized;
+		}
+	}
+}
+
+<DICT> {
+
+{END} {
+		switch (problem) {
+			case 1: return "";
+			default: return normalized.replaceAll(LB, "");
+		}
+	}
+}
+
+<SEARCH> {
+
+{END} {
+		switch (problem) {
+			case 1: return original;
+			default: return normalized.replaceAll(LB, "").toLowerCase();
+		}
+	}
+}
+
diff -r 469d927b9ca7 -r 5df60f24e997 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexZH.java
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexZH.java	Tue Apr 19 16:51:59 2011 +0200
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexZH.java	Mon Aug 29 17:40:02 2011 +0200
@@ -1,12 +1,11 @@
-/* The following code was generated by JFlex 1.4.3 on 22.02.11 12:04 */
+/* The following code was generated by JFlex 1.4.3 on 21.07.11 11:22 */
 
 /*
  * Normalization rules for Chinese text
  * [this is a JFlex specification]
  *
  * Wolfgang Schmidle 
- * version 0.96
- * 2011-02-21
+ * version 2011-02-28
  *
  */
 
@@ -16,7 +15,7 @@
 /**
  * This class is a scanner generated by 
  * <a href="http://www.jflex.de/">JFlex</a> 1.4.3
- * on 22.02.11 12:04 from the specification file
+ * on 21.07.11 11:22 from the specification file
  * <tt>MpdlNormalizerLexZH.lex</tt>
  */
 public class MpdlNormalizerLexZH {
diff -r 469d927b9ca7 -r 5df60f24e997 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexZH.lex
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexZH.lex	Tue Apr 19 16:51:59 2011 +0200
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexZH.lex	Mon Aug 29 17:40:02 2011 +0200
@@ -3,8 +3,7 @@
  * [this is a JFlex specification]
  *
  * Wolfgang Schmidle 
- * version 0.96
- * 2011-02-21
+ * version 2011-02-28
  *
  */
 
@@ -107,13 +106,15 @@
 /*
 
 Annahmen:
-- die Routine wird wortweise aufgerufen, mit einem \n am Ende des Strings
-- Wörter mit Zeilenumbrüchen wurden bereits wieder zusammengesetzt
+- die Routine wird zeichenweise (oder mit mehr als einem Zeichen) aufgerufen, mit einem \n am Ende des Strings
+- es gibt keine Zeilenumbrüche
 
 TO DO:
 
 ZH: Liste ergänzen
 ZH: was ist, wenn man wirklich die Variante, die im Text steht, nachschlagen will? Dann muss man das Zeichen wohl selbst rauskopieren.
 ZH: sollen lateinische Buchstaben bewirken, dass problem = 1 ist?
+ZH: sollen Zeilenumbrüche rausgenommen werden, auch wenn sie in korrekt markiertem Text nicht vorkommen?
+ZH: was ist, wenn beijing übergeben wird und einen Zeilenumbruch enthält? Verlässt sich der Wrapper darauf, dass die Zeichenzahl gleich bleibt, oder macht er ein hyphen rein? was macht <place> oder <reg>?
 
 */
diff -r 469d927b9ca7 -r 5df60f24e997 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/DictionarizerContentHandler.java
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/DictionarizerContentHandler.java	Tue Apr 19 16:51:59 2011 +0200
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/DictionarizerContentHandler.java	Mon Aug 29 17:40:02 2011 +0200
@@ -6,6 +6,7 @@
 import org.xml.sax.*;
 
 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
+import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.MpdlNormalizer;
 import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.MpdlTokenizerAnalyzer;
 import de.mpg.mpiwg.berlin.mpdl.lt.lex.db.LexHandler;
 import de.mpg.mpiwg.berlin.mpdl.util.StringUtilEscapeChars;
@@ -122,6 +123,12 @@
     }
   }
 
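+  /**
+   * Determines a character position within the dictionarized string for the given complex element index.
+   * @param compositesCharsDictionarized contains the dictionarized characters, e.g. {@code <w lang="de" form="bla">bla</w><w lang="de" form="blabla">blabla</w>}
+   * @param indexComplexElemCompositesCharsWithMarks index of the first complex element in the string
+   * @return -1 if indexComplexElemCompositesCharsWithMarks is 0; otherwise a position within compositesCharsDictionarized
+   */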
   public int getCharIndex(String compositesCharsDictionarized, int indexComplexElemCompositesCharsWithMarks) {
     if (indexComplexElemCompositesCharsWithMarks == 0)
       return -1;
@@ -146,6 +153,10 @@
         isInTag = false;
       counter++;
     }
+    // small hack: if a closing </w> directly follows the counter position, include it in the result string as well
+    String tail = compositesCharsDictionarized.substring(counter);
+    if (tail.startsWith("</w>"))
+      counter = counter + 4;
     return counter + 1;
   }
   
@@ -185,7 +196,7 @@
      */
     private boolean isWordDelimiterElement() {
       boolean isWordDelimiterElement = true;
-      if (name.equals("lb") || name.equals("cb") || name.equals("gap") || name.equals("figure") || name.equals("image") || name.equals("note") || name.equals("handwritten") || name.equals("anchor"))
+      if (name.equals("lb") || name.equals("cb") || name.equals("figure") || name.equals("image") || name.equals("note") || name.equals("handwritten") || name.equals("anchor"))
         isWordDelimiterElement = false;
       return isWordDelimiterElement;
     }
@@ -256,7 +267,11 @@
       String charactersStr = StringUtilEscapeChars.resolveXmlEntities(charactersStrDeresolved);
       String retStr = "";
       try {
-        MpdlTokenizerAnalyzer dictionarizerAnalyzer = new MpdlTokenizerAnalyzer(language);
+        MpdlNormalizer mpdlDictNormalizer = new MpdlNormalizer(language);
+        mpdlDictNormalizer.setNormMode(MpdlNormalizer.DICTIONARY);
+        MpdlNormalizer mpdlDisplayNormalizer = new MpdlNormalizer(language);
+        mpdlDisplayNormalizer.setNormMode(MpdlNormalizer.DISPLAY);
+        MpdlTokenizerAnalyzer dictionarizerAnalyzer = new MpdlTokenizerAnalyzer(mpdlDictNormalizer, language);
         ArrayList<Token> wordTokens = dictionarizerAnalyzer.getToken(charactersStr);
         int endPos = 0;
         for (int i=0; i < wordTokens.size(); i++) {
diff -r 469d927b9ca7 -r 5df60f24e997 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/NormDictContentHandler.java
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/NormDictContentHandler.java	Mon Aug 29 17:40:02 2011 +0200
@@ -0,0 +1,352 @@
+package de.mpg.mpiwg.berlin.mpdl.lt.doc;
+
+import java.util.ArrayList;
+
+import org.apache.lucene.analysis.Token;
+import org.xml.sax.*;
+
+import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
+import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.MpdlNormalizer;
+import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.MpdlTokenizerAnalyzer;
+import de.mpg.mpiwg.berlin.mpdl.lt.lex.db.LexHandler;
+import de.mpg.mpiwg.berlin.mpdl.util.StringUtilEscapeChars;
+
+public class NormDictContentHandler implements ContentHandler {
+  private static final String COMPLEX_ELEMENT_MARK = "\u2425";  // placeholder for a word delimiting element
+  private static final String COMPLEX_ELEMENT_NWD_MARK = "\u2424";  // placeholder for a not word delimiting element
+  private static final int COMPLEX_ELEMENT_MARK_SIZE = COMPLEX_ELEMENT_MARK.length();  // number of chars cut off per mark
+  private static final int ELEMENT_TYPE_CHARACTERS = 1;  // type of an Element created for character data
+  private static final int ELEMENT_TYPE_COMPLEX = 2;  // type of an Element created for an XML start tag
+  private String[] normalizeFunctions = {};  // default: without normalize functions
+  private boolean dictMode = false;  // default: not in dictionary mode
+  private String xmlnsString = "";
+  private String language;
+  private String outputXmlFragment = "";
+  private Element rootElement;
+  private Element currentElement;
+  private ArrayList<Element> elementQueue;
+  
+  /**
+   * @param normalizeFunctions names of the normalize functions handed to the normalizer; null means none
+   * @param language default language, used for elements without an own or inherited lang value
+   */
+  public NormDictContentHandler(String[] normalizeFunctions, String language) throws ApplicationException {
+    boolean withoutFunctions = (normalizeFunctions == null);
+    this.normalizeFunctions = withoutFunctions ? new String[0] : normalizeFunctions;
+    this.language = language;
+  }
+
+  public void setDictMode(boolean dictMode) {
+    this.dictMode = dictMode;
+  }
+  
+  public String getXmlFragment() {
+    return outputXmlFragment;  
+  }
+  
+  public void startDocument() throws SAXException {
+  }
+
+  public void endDocument() throws SAXException {
+    // the element tree is complete now: serialize it starting at the root and finish the fragment with a newline
+    write(rootElement.toXmlString());
+    write("\n");
+  }
+  
+  /**
+   * Stores a run of character data as a "characters" child of the element that is currently open.
+   * Empty runs and character data outside of any element are dropped.
+   */
+  public void characters(char[] c, int start, int length) throws SAXException {
+    String text = new String(c, start, length);
+    if (text.isEmpty() || currentElement == null)
+      return;
+    Element textElement = new Element("characters", ELEMENT_TYPE_CHARACTERS);
+    textElement.value = StringUtilEscapeChars.deresolveXmlEntities(text);
+    if (currentElement.composites == null)
+      currentElement.composites = new ArrayList<Element>();
+    currentElement.composites.add(textElement);
+  }
+
+  public void ignorableWhitespace(char[] c, int start, int length) throws SAXException {
+  }
+
+  public void processingInstruction(String target, String data) throws SAXException {
+  }
+
+  public void setDocumentLocator(Locator locator) {
+  }
+
+  public void startPrefixMapping(String prefix, String uri) throws SAXException {
+    // collect every declaration for the next startElement; the default namespace (empty prefix) is written as plain "xmlns"
+    String nsAttrName = (prefix == null || prefix.equals("")) ? "xmlns" : "xmlns:" + prefix;
+    xmlnsString += nsAttrName + "=\"" + uri + "\" ";
+  }
+  
+  public void endPrefixMapping(String prefix) throws SAXException {
+  }
+
+  public void skippedEntity(String name) throws SAXException {
+  }
+
+  public void startElement(String uri, String localName, String name, Attributes attrs) throws SAXException {  // builds the Element for this start tag and makes it the current element
+    if (elementQueue == null)
+      elementQueue = new ArrayList<Element>();
+    Element newElement = new Element(name); // element of type: complex
+    if (currentElement != null) {
+      if (currentElement.composites == null)
+        currentElement.composites = new ArrayList<Element>();
+      if (currentElement.lang != null)
+        newElement.lang = currentElement.lang;  // the language is inherited by the children
+      currentElement.composites.add(newElement);
+    }
+    currentElement = newElement;
+    int attrSize = attrs.getLength();
+    String attrString = "";
+    for (int i=0; i<attrSize; i++) {
+      String attrQName = attrs.getQName(i);
+      String attrValue = attrs.getValue(i);
+      attrValue = StringUtilEscapeChars.forXML(attrValue);
+      attrString = attrString + " " + attrQName + "=\"" + attrValue + "\"";  // attributes are kept as one ready-to-write string
+      if (attrQName != null && (attrQName.toLowerCase().equals("xml:lang") || attrQName.toLowerCase().equals("lang")))
+        currentElement.lang = attrValue;  // an own xml:lang (or lang) attribute overwrites the value inherited from the parent
+    }
+    currentElement.attrString = attrString;
+    if (! xmlnsString.equals("")) {
+      currentElement.xmlnsString = xmlnsString;  // namespace declarations collected by startPrefixMapping since the last start tag
+    }
+    xmlnsString = "";  // reset, so that the declarations are not written again on the next element
+    elementQueue.add(currentElement);  // the queue holds the currently open elements, innermost last
+    // only the first element is the root element
+    if(rootElement == null)
+      rootElement = currentElement;
+  }
+
+  /**
+   * Closes the innermost open element: it is removed from the queue of open elements
+   * and its parent (if there is one) becomes the current element again.
+   */
+  public void endElement(String uri, String localName, String name) throws SAXException {
+    currentElement = null;
+    if (elementQueue == null || elementQueue.isEmpty())
+      return;
+    elementQueue.remove(elementQueue.size() - 1);
+    if (! elementQueue.isEmpty())
+      currentElement = elementQueue.get(elementQueue.size() - 1);
+  }
+
+  private void write(String outStr) throws SAXException {
+    outputXmlFragment += outStr;
+  }
+  
+  private class Element {
+    private int type;
+    private String name;
+    private String xmlnsString;
+    private String attrString;
+    private String value;
+    private String lang;  // normally value of attribute xml:lang or the inherited xml:lang value of the father node
+    private ArrayList<Element> composites;
+    
+    private Element(String name) {
+      this.type = ELEMENT_TYPE_COMPLEX;
+      this.name = name;
+    }
+
+    private Element(String name, int type) {
+      this.type = type;
+      this.name = name;
+    }
+
+    /**
+     * @return true if the type of this element is ELEMENT_TYPE_COMPLEX, false for every other type
+     */
+    private boolean isComplex() {
+      return type == ELEMENT_TYPE_COMPLEX;
+    }
+    
+    /**
+     * Tells whether this element separates words. Feel free to add/remove element names;
+     * the content of the listed elements must be empty.
+     * @return false for the listed elements (e.g. lb), true for all other elements
+     */
+    private boolean isWordDelimiterElement() {
+      // "note" causes problems: word after the note is not recognized
+      // "emph" causes problems: e.g. "Natur<emph>ereignis</emph> enthüllte" is replaced by "Natur<emph><w>ereignis</w></emph>enthüllte"
+      boolean isListedElement = name.equals("lb") || name.equals("cb") || name.equals("figure")
+          || name.equals("image") || name.equals("handwritten") || name.equals("anchor");
+      return ! isListedElement;
+    }
+    
+    private String toXmlString() throws SAXException {  // serializes this element and, recursively, all of its composites
+      String retString = "";
+      String elemLanguage = language;  // default value for the document/page
+      if (lang != null)
+        elemLanguage = lang;  // value of the element if available
+      // character elements deliver their value as it is; complex elements are written as start tag, content, end tag
+      if (! isComplex()) {
+        retString += value;
+      } else {
+        String xmlNsString = this.xmlnsString;
+        if (xmlNsString == null || xmlNsString.equals("")) {
+          retString = retString + "<" + name + attrString + ">";
+        } else { 
+          retString = retString + "<" + name + " " + xmlNsString + attrString + ">";
+        }
+        if (composites != null) {
+          String compositesCharsWithMarks = "";  // all character data of the children, with one mark symbol per complex child
+          ArrayList<Element> complexElements = new ArrayList<Element>();  // the complex children in document order
+          for (int i=0; i<composites.size(); i++) {
+            Element composite = composites.get(i);
+            if (! composite.isComplex()) {
+              if (composite.value != null && ! composite.value.equals("")) {
+                String compositeValueStr = composite.value;
+                compositeValueStr = compositeValueStr.replaceAll("\n", ""); // remove all newlines: they do not separate words
+                compositeValueStr = compositeValueStr.replaceAll(" +", " "); // collapse runs of Blanks into one Blank
+                compositesCharsWithMarks = compositesCharsWithMarks + compositeValueStr;
+              }
+            } else {
+              if (! composite.isWordDelimiterElement()) {
+                compositesCharsWithMarks = compositesCharsWithMarks + COMPLEX_ELEMENT_NWD_MARK;  // add a special mark symbol at the position of the "not word delimiter element" (e.g. <lb>)
+              } else {
+                compositesCharsWithMarks = compositesCharsWithMarks + COMPLEX_ELEMENT_MARK;  // add a special mark symbol at the position of the "word delimiter element" (e.g. <var>)
+              }
+              complexElements.add(composite);
+            }
+          }
+          compositesCharsWithMarks = compositesCharsWithMarks.replaceAll(COMPLEX_ELEMENT_NWD_MARK + " +", COMPLEX_ELEMENT_NWD_MARK);  // remove Blanks after the non word breaking mark (e.g. "praebi<lb/> ta" is changed to "praebi<lb/>ta")
+          String compositesCharsWithMarksNormalized = normalizeWords(compositesCharsWithMarks, elemLanguage);
+          compositesCharsWithMarksNormalized = compositesCharsWithMarksNormalized.replaceAll(COMPLEX_ELEMENT_NWD_MARK, COMPLEX_ELEMENT_MARK);  // both mark symbols are unified to COMPLEX_ELEMENT_MARK, so that each can be replaced by its serialized element
+          if (complexElements.size() > 0) {
+            for (int i=0; i<complexElements.size(); i++) {
+              int indexComplexElemCompositesCharsWithMarks = compositesCharsWithMarksNormalized.indexOf(COMPLEX_ELEMENT_MARK);  // position of the next mark in the rest of the text
+              Element complexElem = complexElements.get(i);
+              String complexElementStr = complexElem.toXmlString();
+              String firstPiece = "";
+              if (indexComplexElemCompositesCharsWithMarks > 0) {  // NOTE(review): -1 (mark lost during normalization) also skips this branch, then the substring below cuts off a regular char - confirm that marks always survive normalizeWords
+                firstPiece = compositesCharsWithMarksNormalized.substring(0, indexComplexElemCompositesCharsWithMarks);
+                compositesCharsWithMarksNormalized = compositesCharsWithMarksNormalized.substring(indexComplexElemCompositesCharsWithMarks);
+              }
+              retString = retString + firstPiece + complexElementStr;
+              compositesCharsWithMarksNormalized = compositesCharsWithMarksNormalized.substring(COMPLEX_ELEMENT_MARK_SIZE);  // cut off the mark itself
+            }
+            retString = retString + compositesCharsWithMarksNormalized; // the text behind the last complex element must also be added
+          } else {
+            retString = retString + compositesCharsWithMarksNormalized; // no complex children: the normalized text is the whole content
+          }
+        }
+        retString = retString + "</" + name + ">";
+      } 
+      return retString;
+    }
+    
+    /**
+     * Tokenizes the given text and replaces every word token: in display mode by its normalized form,
+     * in dictionary mode by the result of getLexWord. The text between the tokens is kept.
+     * @param charactersStrDeresolved text with deresolved XML entities; may contain the element mark symbols
+     * @param language language handed to the normalizer and to the tokenizer
+     * @return the processed text, XML entities deresolved again
+     * @throws SAXException wrapping an ApplicationException raised while tokenizing or normalizing
+     */
+    private String normalizeWords(String charactersStrDeresolved, String language) throws SAXException {
+      String text = StringUtilEscapeChars.resolveXmlEntities(charactersStrDeresolved);
+      StringBuilder result = new StringBuilder();
+      try {
+        MpdlNormalizer mpdlNormalizer = new MpdlNormalizer(normalizeFunctions, language);
+        mpdlNormalizer.setNormMode(dictMode ? MpdlNormalizer.DICTIONARY : MpdlNormalizer.DISPLAY);
+        MpdlTokenizerAnalyzer tokenAnalyzer = new MpdlTokenizerAnalyzer(mpdlNormalizer, language);
+        tokenAnalyzer.setRegWithoutSemicolon(true);  // hack: feel free to remove it later
+        ArrayList<Token> wordTokens = tokenAnalyzer.getToken(text);
+        int consumedUpTo = 0;  // end offset of the previous token
+        for (Token wordToken : wordTokens) {
+          int tokenStart = wordToken.startOffset();
+          String gap = text.substring(consumedUpTo, tokenStart);
+          int tokenEnd = wordToken.endOffset();
+          String displayWord = text.substring(tokenStart, tokenEnd);
+          String replacement;
+          if (dictMode)
+            replacement = getLexWord(mpdlNormalizer, displayWord);  // normalizer in DICTIONARY mode
+          else
+            replacement = StringUtilEscapeChars.deresolveXmlEntities(normalize(mpdlNormalizer, displayWord));  // normalizer in DISPLAY mode
+          result.append(StringUtilEscapeChars.deresolveXmlEntities(gap)).append(replacement);
+          consumedUpTo = tokenEnd;
+        }
+        // text behind the last token
+        result.append(StringUtilEscapeChars.deresolveXmlEntities(text.substring(consumedUpTo)));
+      } catch (ApplicationException e) {
+        throw new SAXException(e);
+      }
+      return result.toString();
+    }
+
+    /**
+     * Normalizes one word. A "not word delimiting symbol" inside the word (e.g. for a line break) is replaced
+     * by a Blank before the normalizer is called and is put back afterwards, e.g. the normalizer gets
+     * "præbi- ta" and delivers "praebi- ta". A word consisting only of whitespace is returned unchanged.
+     * @param mpdlNormalizer normalizer which is applied to the word
+     * @param word word, possibly containing "not word delimiting" mark symbols
+     * @return normalized word with the mark symbols put back
+     * @throws ApplicationException declared for the call of the normalizer
+     */
+    private String normalize(MpdlNormalizer mpdlNormalizer, String word) throws ApplicationException {
+      if (word.trim().isEmpty())
+        return word;
+      String cleanedWord = word;
+      // a leading nwd mark and repeated nwd marks are removed before normalization; after normalization they are added again
+      boolean startsWithNWDMark = cleanedWord.startsWith(COMPLEX_ELEMENT_NWD_MARK);   // remembered, so that the mark can be put back in front
+      if (startsWithNWDMark)
+        cleanedWord = cleanedWord.substring(1);
+      int countNWDMarks = cleanedWord.length() - cleanedWord.replaceAll(COMPLEX_ELEMENT_NWD_MARK, "").length();  // number of marks left in the word
+      if (countNWDMarks > 1)
+        cleanedWord = cleanedWord.replaceAll(COMPLEX_ELEMENT_NWD_MARK + "+", COMPLEX_ELEMENT_NWD_MARK);  // each run of marks shrinks to one mark
+      // Disabled variant, kept as description only: a mark which was not preceded by a hyphen got a hyphen
+      // inserted in front of it before normalization (e.g. "praebi ta" was handed over as "praebi- ta") and
+      // the hyphen was removed again afterwards. This is not active: marks are only replaced by a Blank.
+      String inputWord = cleanedWord.replaceAll(COMPLEX_ELEMENT_NWD_MARK, " "); // the normalizer gets a Blank in place of each mark
+      String normalizedWordStr = mpdlNormalizer.normalize(inputWord);
+      normalizedWordStr = normalizedWordStr.replaceAll(" ", COMPLEX_ELEMENT_NWD_MARK);
+      // (the disabled variant removed its inserted hyphen again at this point)
+      // NOTE(review): every Blank of the normalized word is turned into a mark here, not only the Blanks inserted above
+      if (countNWDMarks > 1) {
+        String nwdStr = "";
+        for (int i=0; i<countNWDMarks; i++)
+          nwdStr += COMPLEX_ELEMENT_NWD_MARK;
+        normalizedWordStr = normalizedWordStr.replaceAll(COMPLEX_ELEMENT_NWD_MARK, nwdStr);  // NOTE(review): expands every mark to the total count, which restores the input only if all marks formed one run - confirm
+      }
+      if (startsWithNWDMark)
+        normalizedWordStr = COMPLEX_ELEMENT_NWD_MARK + normalizedWordStr;
+      return normalizedWordStr;
+    }
+    
+    /**
+     * Wraps a word into a w element carrying its normalized form and its lexicon entry keys; a word without lexicon entry is returned unwrapped.
+     */
+    private String getLexWord(MpdlNormalizer mpdlNormalizer, String displayWord) throws ApplicationException {
+      String wordForm = removeSpecialSymbols(displayWord);
+      wordForm = wordForm.toLowerCase();
+      wordForm = normalize(mpdlNormalizer, wordForm);
+      // delivers lex entries by help of the morphology component (lex entry of the stem of the normalized word form)
+      LexHandler lexHandler = LexHandler.getInstance();
+      String lang = mpdlNormalizer.getLanguage();  // NOTE(review): the w element below is tagged with the handler's language field, not with this value - confirm
+      ArrayList<String> lexEntryKeys = lexHandler.getLexEntryKeys(wordForm, lang, false);
+      String displayWordDeresolved = StringUtilEscapeChars.deresolveXmlEntities(displayWord);
+      // an empty key list is handled like "no entry": cutting the trailing Blank off an empty string threw an exception
+      if (lexEntryKeys == null || lexEntryKeys.isEmpty())
+        return displayWordDeresolved;
+      StringBuilder lexForms = new StringBuilder();  // the keys separated by single Blanks
+      for (int j=0; j<lexEntryKeys.size(); j++) {
+        if (j > 0)
+          lexForms.append(" ");
+        lexForms.append(lexEntryKeys.get(j));
+      }
+      return "<w lang=\"" + language + "\"" + " form=\"" + wordForm + "\"" + " lexForms=\"" + lexForms + "\">" + displayWordDeresolved + "</w>";
+    }
+    
+    private String removeSpecialSymbols(String inputStr) {
+      // strips Blanks, newlines, tabs, hyphens and both element mark symbols
+      return inputStr.replaceAll(" |\n|\t|-|\u2424|\u2425", "");
+    }
+    
+  }
+
+}
diff -r 469d927b9ca7 -r 5df60f24e997 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/NormalizeCharsContentHandler.java
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/NormalizeCharsContentHandler.java	Tue Apr 19 16:51:59 2011 +0200
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/NormalizeCharsContentHandler.java	Mon Aug 29 17:40:02 2011 +0200
@@ -115,10 +115,9 @@
         endPos = wordToken.endOffset();
         String wordStr = charactersStr.substring(startPos, endPos);
         MpdlNormalizer mpdlNormalizer = new MpdlNormalizer(normalizeFunctions, language);
-        mpdlNormalizer.setNormMode(MpdlNormalizer.MODE_4HUMAN_READERS);
+        mpdlNormalizer.setNormMode(MpdlNormalizer.DISPLAY);
         String normalizedWordStr = mpdlNormalizer.normalize(wordStr);
         String normalizedWordStrDeresolved = StringUtilEscapeChars.deresolveXmlEntities(normalizedWordStr);
-        // String wordTokenText = wordToken.termText();
         retStr = retStr + beforeStrDeresolved + normalizedWordStrDeresolved;
       }
       String lastAfterStr = charactersStr.substring(endPos);
diff -r 469d927b9ca7 -r 5df60f24e997 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Language.java
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Language.java	Tue Apr 19 16:51:59 2011 +0200
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Language.java	Mon Aug 29 17:40:02 2011 +0200
@@ -39,63 +39,90 @@
   }
   
   public boolean isLatin(String language) {
-    if (getLanguageId(language).equals("la"))
+    String langId = getLanguageId(language);
+    if (langId == null)
+      return false;
+    if (langId.equals("la"))
       return true;
     else 
       return false;
   }
 
   public boolean isGerman(String language) {
-    if (getLanguageId(language).equals("de"))
+    String langId = getLanguageId(language);
+    if (langId == null)
+      return false;
+    if (langId.equals("de"))
       return true;
     else 
       return false;
   }
 
   public boolean isFrench(String language) {
-    if (getLanguageId(language).equals("fr"))
+    String langId = getLanguageId(language);
+    if (langId == null)
+      return false;
+    if (langId.equals("fr"))
       return true;
     else 
       return false;
   }
 
   public boolean isEnglish(String language) {
-    if (getLanguageId(language).equals("en"))
+    String langId = getLanguageId(language);
+    if (langId == null)
+      return false;
+    if (langId.equals("en"))
       return true;
     else 
       return false;
   }
 
   public boolean isDutch(String language) {
-    if (getLanguageId(language).equals("nl"))
+    String langId = getLanguageId(language);
+    if (langId == null)
+      return false;
+    if (langId.equals("nl"))
       return true;
     else 
       return false;
   }
 
   public boolean isGreek(String language) {
-    if (getLanguageId(language).equals("el"))
+    String langId = getLanguageId(language);
+    if (langId == null)
+      return false;
+    if (langId.equals("el"))
       return true;
     else 
       return false;
   }
 
   public boolean isArabic(String language) {
-    if (getLanguageId(language).equals("ar"))
+    String langId = getLanguageId(language);
+    if (langId == null)
+      return false;
+    if (langId.equals("ar"))
       return true;
     else 
       return false;
   }
 
   public boolean isItalian(String language) {
-    if (getLanguageId(language).equals("it"))
+    String langId = getLanguageId(language);
+    if (langId == null)
+      return false;
+    if (langId.equals("it"))
       return true;
     else 
       return false;
   }
 
   public boolean isChinese(String language) {
-    if (getLanguageId(language).equals("zh"))
+    String langId = getLanguageId(language);
+    if (langId == null)
+      return false;
+    if (langId.equals("zh"))
       return true;
     else 
       return false;
diff -r 469d927b9ca7 -r 5df60f24e997 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/schedule/MpdlDocJob.java
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/schedule/MpdlDocJob.java	Tue Apr 19 16:51:59 2011 +0200
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/schedule/MpdlDocJob.java	Mon Aug 29 17:40:02 2011 +0200
@@ -13,6 +13,7 @@
 import de.mpg.mpiwg.berlin.mpdl.escidoc.ESciDocIngestor;
 import de.mpg.mpiwg.berlin.mpdl.escidoc.ESciDocRestSession;
 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
+import de.mpg.mpiwg.berlin.mpdl.general.MpdlConstants;
 import de.mpg.mpiwg.berlin.mpdl.xmlrpc.MpdlXmlRpcDocHandler;
 
 public class MpdlDocJob implements Job {
@@ -23,14 +24,19 @@
   public void execute(JobExecutionContext context) throws JobExecutionException {
     this.currentExecutedContext = context;
     MpdlDocOperation docOperation = getDocOperation();
-    docOperation.setIncludePdf(true); // default is true: handle also Pdf/Html version of the document
+    boolean generatePDF = MpdlConstants.MPDL_GENERATE_PDF; 
+    docOperation.setIncludePdf(generatePDF); // default is true: handle also Pdf/Html version of the document
     try {
       docOperation.setStatus(STATUS_BEGIN);
       String operationName = docOperation.getName();   
       String cookieId = docOperation.getESciDocCookieId();
       MpdlXmlRpcDocHandler mpdlXmlRpcDocHandler = MpdlXmlRpcDocHandler.getInstance();
-      ESciDocRestSession eSciDocSession = ESciDocRestSession.getInstance(cookieId);
-      ESciDocIngestor eSciDocIngestor = new ESciDocIngestor(eSciDocSession);
+      ESciDocRestSession eSciDocSession = null;
+      ESciDocIngestor eSciDocIngestor = null;
+      if (docOperation.isESciDocOperation()) {
+        eSciDocSession = ESciDocRestSession.getInstance(cookieId);
+        eSciDocIngestor = new ESciDocIngestor(eSciDocSession);
+      }
       if (operationName.equals("create") || operationName.equals("update")) {
         DocumentHandler docHandler = new DocumentHandler(mpdlXmlRpcDocHandler, eSciDocIngestor);
         docHandler.doOperation(docOperation);
diff -r 469d927b9ca7 -r 5df60f24e997 software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/schedule/MpdlDocOperation.java
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/schedule/MpdlDocOperation.java	Tue Apr 19 16:51:59 2011 +0200
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/schedule/MpdlDocOperation.java	Mon Aug 29 17:40:02 2011 +0200
@@ -46,6 +46,13 @@
       return false;
   }
   
+  /**
+   * @return true if the name of this operation is "create", "update" or "delete", else false
+   */
+  public boolean isESciDocOperation() {
+    return name.equals("create") || name.equals("update") || name.equals("delete");
+  }
+
   public boolean isError() {
     if (errorMessage != null && errorMessage.length() > 0)
       return true;
diff -r 469d927b9ca7 -r 5df60f24e997 software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdltext/Dictionarize.java
--- a/software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdltext/Dictionarize.java	Tue Apr 19 16:51:59 2011 +0200
+++ b/software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdltext/Dictionarize.java	Mon Aug 29 17:40:02 2011 +0200
@@ -44,7 +44,7 @@
 import com.sun.org.apache.xerces.internal.parsers.SAXParser;
 
 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
-import de.mpg.mpiwg.berlin.mpdl.lt.doc.DictionarizerContentHandler;
+import de.mpg.mpiwg.berlin.mpdl.lt.doc.NormDictContentHandler;
 
 /**
  * @author Josef Willenborg (jwillenborg@mpiwg-berlin.mpg.de)
@@ -76,7 +76,9 @@
 		language = languageSeq.getStringValue();
 		String outputXmlFragment = null;
 		try { 
-		  DictionarizerContentHandler dictContentHandler = new DictionarizerContentHandler(language);
+		  String[] normFunctions = {"reg", "norm"};
+		  NormDictContentHandler dictContentHandler = new NormDictContentHandler(normFunctions, language);
+		  dictContentHandler.setDictMode(true);
       XMLReader xmlParser = new SAXParser();
       xmlParser.setContentHandler(dictContentHandler);
       Reader stringReaderXmlFragment = new StringReader(xmlFragment);
diff -r 469d927b9ca7 -r 5df60f24e997 software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdltext/ExternalObject.java
--- a/software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdltext/ExternalObject.java	Tue Apr 19 16:51:59 2011 +0200
+++ b/software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdltext/ExternalObject.java	Mon Aug 29 17:40:02 2011 +0200
@@ -23,6 +23,7 @@
 package org.exist.xquery.modules.mpdltext;
 
 import java.util.ArrayList;
+import java.util.Date;
 
 import org.exist.dom.QName;
 import org.exist.xquery.BasicFunction;
diff -r 469d927b9ca7 -r 5df60f24e997 software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdltext/NormalizeChars.java
--- a/software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdltext/NormalizeChars.java	Tue Apr 19 16:51:59 2011 +0200
+++ b/software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdltext/NormalizeChars.java	Mon Aug 29 17:40:02 2011 +0200
@@ -44,7 +44,7 @@
 import com.sun.org.apache.xerces.internal.parsers.SAXParser;
 
 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
-import de.mpg.mpiwg.berlin.mpdl.lt.doc.NormalizeCharsContentHandler;
+import de.mpg.mpiwg.berlin.mpdl.lt.doc.NormDictContentHandler;
 
 /**
  * @author Josef Willenborg (jwillenborg@mpiwg-berlin.mpg.de)
@@ -80,7 +80,7 @@
 		xmlFragment = xmlFragmentSeq.getStringValue();
 		String outputXmlFragment = null;
 		try { 
-		  NormalizeCharsContentHandler normCharsContentHandler = new NormalizeCharsContentHandler(normalizeFunctionsArray, language);
+		  NormDictContentHandler normCharsContentHandler = new NormDictContentHandler(normalizeFunctionsArray, language);
       XMLReader xmlParser = new SAXParser();
       xmlParser.setContentHandler(normCharsContentHandler);
       Reader stringReaderXmlFragment = new StringReader(xmlFragment);