Mercurial > hg > mpdl-group

--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/externalObjects/app/ExtElement.java	Thu Feb 10 14:02:05 2011 +0100
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/externalObjects/app/ExtElement.java	Tue Feb 22 16:03:45 2011 +0100
@@ -8,12 +8,17 @@
 public class ExtElement extends ExtObject {
   private String pageNumber;
   private String xmlNodeId;
-  private String before;
+  private String before = "false";
   private String charPos;
   private String xpath;

+  public ExtElement() { // marks the object kind; getXmlString() emits this value as the type attribute
+    this.type = "element";
+  }
+
   public static ExtElement parseXmlStr(String xmlStr) throws ApplicationException {
     XmlUtil xmlUtil = XmlUtil.getInstance();
+    xmlUtil.setNsContext("general");
     String uid = xmlUtil.evaluateToString(xmlStr, "/object/@uid", null);
     String dateStr = xmlUtil.evaluateToString(xmlStr, "/object/@modificationDate", null);
     String docId = xmlUtil.evaluateToString(xmlStr, "/object/@documentId", null);
@@ -24,8 +29,6 @@
     String xpath = xmlUtil.evaluateToString(xmlStr, "/object/@xpath", null);
     String content = xmlUtil.evaluateToXmlString(xmlStr, "/object/content/*", null);
     Date modDate = xmlUtil.toDate(dateStr);
-    if (uid == null || docId == null || pageNumber == null)
-      throw new ApplicationException("one of the required fields could not be read in: " + xmlStr);
     ExtElement e = new ExtElement();
     e.setUid(uid);
     e.setModificationDate(modDate);
@@ -44,7 +47,13 @@
   }

   public String getXmlString() {
-    String xmlString = "<object";
+    String xmlString = "<object xmlns:xlink=\"http://www.w3.org/1999/xlink\"";
+    xmlString = xmlString + " type=\"" + type + "\"";
+    if (modificationDate != null) {
+      XmlUtil xmlUtil = XmlUtil.getInstance();
+      String dateStr = xmlUtil.toXsDate(modificationDate);
+      xmlString = xmlString + " modificationDate=\"" + dateStr + "\"";
+    }
     if (uid != null)
       xmlString = xmlString + " uid=\"" + uid + "\"";
     if (documentId != null)
@@ -59,11 +68,6 @@
       xmlString = xmlString + " charPos=\"" + charPos + "\"";
     if (xpath != null)
       xmlString = xmlString + " xpath=\"" + xpath + "\"";
-    if (modificationDate != null) {
-      XmlUtil xmlUtil = XmlUtil.getInstance();
-      String dateStr = xmlUtil.toXsDate(modificationDate);
-      xmlString = xmlString + " modificationDate=\"" + dateStr + "\"";
-    }
     xmlString = xmlString + ">";
     if (content != null) {
       // write the uid and modificationDate into the content node
@@ -110,6 +114,13 @@
     this.pageNumber = pageNumber;
   }

+  public boolean isBefore() { // boolean view of the String field "before"
+    if (before != null && before.equals("true")) // only the exact, case-sensitive text "true" counts
+      return true;
+    else
+      return false; // null or any other text
+  }
+
   public String getBefore() {
     return before;
   }
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/externalObjects/app/ExtObject.java	Thu Feb 10 14:02:05 2011 +0100
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/externalObjects/app/ExtObject.java	Tue Feb 22 16:03:45 2011 +0100
@@ -2,12 +2,27 @@

 import java.util.Date;

+import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
+
 public class ExtObject {
+  protected String type; // is set by subclass: element, query, ...
   protected String uid;
   protected Date modificationDate;
   protected String documentId;
   protected String content;

+  public String getXmlString() { // serialization hook; this base version produces no XML
+    return null; // always handled in subclass (ExtElement and ExtQuery each define getXmlString)
+  }
+
+  public ExtObject getInstance(String xmlStr) throws ApplicationException { // parse hook: XML string -> object
+    return null; // always handled in subclass; a caller gets null when the subclass does not override this
+  }
+
+  public String getType() { // kind tag assigned by the subclass constructor, e.g. "element" or "query"
+    return type;
+  }
+
   public String getUid() {
     return uid;
   }
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/externalObjects/app/ExtQuery.java	Thu Feb 10 14:02:05 2011 +0100
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/externalObjects/app/ExtQuery.java	Tue Feb 22 16:03:45 2011 +0100
@@ -9,8 +9,13 @@
   private String queryType;  // url, fulltext or fulltextMorph
   private String queryName;  // optional: name of the query

+  public ExtQuery() { // marks the object kind; getXmlString() emits this value as the type attribute
+    this.type = "query";
+  }
+
   public static ExtQuery parseXmlStr(String xmlStr) throws ApplicationException {
     XmlUtil xmlUtil = XmlUtil.getInstance();
+    xmlUtil.setNsContext("general");
     String uid = xmlUtil.evaluateToString(xmlStr, "/object/@uid", null);
     String dateStr = xmlUtil.evaluateToString(xmlStr, "/object/@modificationDate", null);
     String docId = xmlUtil.evaluateToString(xmlStr, "/object/@documentId", null);
@@ -18,46 +23,43 @@
     String queryName = xmlUtil.evaluateToString(xmlStr, "/object/@queryName", null);
     String content = xmlUtil.evaluateToXmlString(xmlStr, "/object/content/*", null);
     Date modDate = xmlUtil.toDate(dateStr);
-    if (uid == null || docId == null || queryType == null || content == null)
-      throw new ApplicationException("one of the required fields could not be read in: " + xmlStr);
     ExtQuery e = new ExtQuery();
     e.setUid(uid);
     e.setModificationDate(modDate);
     e.setDocumentId(docId);
     e.setQueryType(queryType);
     e.setQueryName(queryName);
-    e.setContent(content);
+    if (content != null && ! content.isEmpty())
+      e.setContent(content);
     return e;
   }

+  public ExtQuery getInstance(String xmlStr) throws ApplicationException { // instance-level entry to the static parser
+    return parseXmlStr(xmlStr); // returns a newly parsed ExtQuery; this object's own fields are not used
+  }
+
   public String toString() {
     return getXmlString();
   }

   public String getXmlString() {
     String xmlString = "<object";
-    xmlString = xmlString + " type=\"" + "query" + "\"";
-    if (uid != null)
-      xmlString = xmlString + " uid=\"" + uid + "\"";
-    if (queryType != null)
-      xmlString = xmlString + " queryType=\"" + queryType + "\"";
-    if (queryName != null)
-      xmlString = xmlString + " queryName=\"" + queryName + "\"";
+    xmlString = xmlString + " type=\"" + type + "\"";
     if (modificationDate != null) {
       XmlUtil xmlUtil = XmlUtil.getInstance();
       String dateStr = xmlUtil.toXsDate(modificationDate);
       xmlString = xmlString + " modificationDate=\"" + dateStr + "\"";
     }
+    if (uid != null)
+      xmlString = xmlString + " uid=\"" + uid + "\"";
     if (documentId != null)
       xmlString = xmlString + " documentId=\"" + documentId + "\"";
+    if (queryType != null)
+      xmlString = xmlString + " queryType=\"" + queryType + "\"";
+    if (queryName != null)
+      xmlString = xmlString + " queryName=\"" + queryName + "\"";
     xmlString = xmlString + ">";
     if (content != null) {
-      // write the uid and modificationDate into the content node
-      if (! content.contains("uid")) {
-        int firstClose = content.indexOf(">");
-        if (firstClose != -1)
-          content = content.substring(0, firstClose) + " uid=\"" + uid + "\" modificationDate=\"" + modificationDate + "\" " + content.substring(firstClose);
-      }
       xmlString = xmlString + "<content>" + content + "</content>";
     }
     xmlString = xmlString + "</object>";
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/externalObjects/app/ExternalObjectsHandler.java	Thu Feb 10 14:02:05 2011 +0100
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/externalObjects/app/ExternalObjectsHandler.java	Tue Feb 22 16:03:45 2011 +0100
@@ -10,7 +10,9 @@
 import com.sleepycat.je.DatabaseException;
 import com.sleepycat.je.LockMode;
 import com.sleepycat.je.OperationStatus;
+import com.sleepycat.je.Transaction;

+import de.mpg.mpiwg.berlin.mpdl.util.StringUtilEscapeChars;
 import de.mpg.mpiwg.berlin.mpdl.util.Util;
 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
 import de.mpg.mpiwg.berlin.mpdl.externalObjects.db.DbEnvExternalObjects;
@@ -32,22 +34,50 @@
     return instance;
   }

-  public ArrayList<ExtElement> readExternalElements(String documentId, String pageNumber) throws ApplicationException {
-    return readDBExternalElements(documentId, pageNumber);
+  public void createExternalElement(ExtElement element) throws ApplicationException {
+    createDBExternalElement(element);
   }

-  public void writeExternalElement(ExtElement element) throws ApplicationException {
-    writeDBExternalElement(element);
+  public void updateExternalElement(ExtElement element) throws ApplicationException {
+    updateDBExternalElement(element);
   }

   public void deleteExternalElement(ExtElement element) throws ApplicationException {
     deleteDBExternalElement(element);
   }

-  private void writeDBExternalElement(ExtElement element) throws ApplicationException {
+  public ArrayList<ExtElement> readExternalElements(ExtElement element) throws ApplicationException { // element acts as the query: its documentId and pageNumber form the lookup key
+    return readDBExternalElements(element);
+  }
+
+  public void createExternalObject(ExtObject object) throws ApplicationException { // public entry point; delegates unchanged to the DB-level method
+    createDBExternalObject(object);
+  }
+
+  public void updateExternalObject(ExtObject object) throws ApplicationException { // public entry point; delegates unchanged to the DB-level method
+    updateDBExternalObject(object);
+  }
+
+  public void deleteExternalObject(ExtObject object) throws ApplicationException { // public entry point; delegates unchanged to the DB-level method
+    deleteDBExternalObject(object);
+  }
+
+  public ArrayList<ExtObject> readExternalObjects(ExtObject object) throws ApplicationException { // object acts as the query: its type, uid and documentId form the lookup key
+    return readDBExternalObjects(object);
+  }
+
+  private void createDBExternalElement(ExtElement element) throws ApplicationException {
     try {
-      String keyStr = element.getDocumentId() + "###" + element.getPageNumber();
+      test(element);
+      String content = element.getContent();
       String valueStr = element.getXmlString();
+      if (content == null)
+        throw new ApplicationException("External object: no content element specified in: " + valueStr);
+      Date now = new Date();
+      element.setModificationDate(now);
+      String docId = element.getDocumentId();
+      String pageNumber = element.getPageNumber();
+      String keyStr = docId + "###" + pageNumber;
       DatabaseEntry dbEntryKey = new DatabaseEntry(keyStr.getBytes("utf-8"));
       DatabaseEntry dbEntryValue = new DatabaseEntry(valueStr.getBytes("utf-8"));
       Database elementDB = dbEnvExternalObjects.getElementDB();
@@ -59,21 +89,92 @@
     }
   }

-  private void deleteDBExternalElement(ExtElement element) throws ApplicationException {
+  private void updateDBExternalElement(ExtElement element) throws ApplicationException {
+    test(element);
+    String content = element.getContent();
+    String elementXmlStr = element.getXmlString();
+    if (content == null)
+      throw new ApplicationException("External object: no content element specified in: " + elementXmlStr);
+    Date now = new Date();
+    element.setModificationDate(now);
+    String docId = element.getDocumentId();
+    String pageNumber = element.getPageNumber();
+    String uid = element.getUid();
+    String xmlNodeId = element.getXmlNodeId();
+    String hashKey = docId + "###" + pageNumber;
+    boolean updated = false;
     try {
-      String keyStr = element.getDocumentId() + "###" + element.getPageNumber();
-      DatabaseEntry dbEntryKey = new DatabaseEntry(keyStr.getBytes("utf-8"));
       Database elementDB = dbEnvExternalObjects.getElementDB();
-      elementDB.delete(null, dbEntryKey);
+      Transaction t = dbEnvExternalObjects.getEnv().beginTransaction(null, null);
+      Cursor cursor = elementDB.openCursor(t, null);
+      byte[] bHashKey = hashKey.getBytes("utf-8");
+      DatabaseEntry dbEntryKey = new DatabaseEntry(bHashKey);
+      DatabaseEntry foundValue = new DatabaseEntry();
+      OperationStatus operationStatus = cursor.getSearchKey(dbEntryKey, foundValue, LockMode.DEFAULT);
+      while (operationStatus == OperationStatus.SUCCESS && ! updated) {
+        byte[] foundValueBytes = foundValue.getData();
+        String foundValueStr = new String(foundValueBytes, "utf-8");
+        ExtElement elem = ExtElement.parseXmlStr(foundValueStr);
+        String elemUid = elem.getUid();
+        String elemXmlNodeId = elem.getXmlNodeId();
+        if (uid.equals(elemUid) && xmlNodeId.equals(elemXmlNodeId)) {
+          cursor.delete();
+          byte[] elementXmlStrBytes = elementXmlStr.getBytes("utf-8");
+          DatabaseEntry dbEntryValue = new DatabaseEntry(elementXmlStrBytes);
+          cursor.put(dbEntryKey, dbEntryValue);
+          updated = true;
+          break;
+        }
+        operationStatus = cursor.getNextDup(dbEntryKey, foundValue, LockMode.DEFAULT);
+      }
+      cursor.close();
+      t.commit();
     } catch (DatabaseException e) {
       throw new ApplicationException(e);
     } catch (UnsupportedEncodingException e) {
       throw new ApplicationException(e);
     }
-  }
-
-  private ArrayList<ExtElement> readDBExternalElements(String documentId, String pageNumber) throws ApplicationException {
+  }
+
+  private void deleteDBExternalElement(ExtElement element) throws ApplicationException { // removes the stored records that match element's uid and xmlNodeId
+    test(element); // throws unless uid, documentId, xmlNodeId and pageNumber are all set
+    String docId = element.getDocumentId();
+    String pageNumber = element.getPageNumber();
+    String uid = element.getUid();
+    String xmlNodeId = element.getXmlNodeId();
+    String hashKey = docId + "###" + pageNumber; // same key layout that createDBExternalElement writes
+    try {
+      Database elementDB = dbEnvExternalObjects.getElementDB();
+      Transaction t = dbEnvExternalObjects.getEnv().beginTransaction(null, null);
+      Cursor cursor = elementDB.openCursor(t, null);
+      byte[] bHashKey = hashKey.getBytes("utf-8");
+      DatabaseEntry dbEntryKey = new DatabaseEntry(bHashKey);
+      DatabaseEntry foundValue = new DatabaseEntry();
+      OperationStatus operationStatus = cursor.getSearchKey(dbEntryKey, foundValue, LockMode.DEFAULT);
+      while (operationStatus == OperationStatus.SUCCESS) { // visit every record stored under the key
+        byte[] foundValueBytes = foundValue.getData();
+        String foundValueStr = new String(foundValueBytes, "utf-8");
+        ExtElement elem = ExtElement.parseXmlStr(foundValueStr); // stored values are the elements' XML strings
+        String elemUid = elem.getUid();
+        String elemXmlNodeId = elem.getXmlNodeId();
+        if (uid.equals(elemUid) && xmlNodeId.equals(elemXmlNodeId)) { // same owner and same target node
+          cursor.delete(); // no break: every matching duplicate is removed
+        }
+        operationStatus = cursor.getNextDup(dbEntryKey, foundValue, LockMode.DEFAULT);
+      }
+      cursor.close(); // NOTE(review): not in a finally block, so not reached when the loop throws
+      t.commit(); // NOTE(review): the failure paths below neither commit nor abort t
+    } catch (DatabaseException e) {
+      throw new ApplicationException(e);
+    } catch (UnsupportedEncodingException e) {
+      throw new ApplicationException(e);
+    }
+  }
+
+  private ArrayList<ExtElement> readDBExternalElements(ExtElement element) throws ApplicationException {
     ArrayList<ExtElement> retElements = new ArrayList<ExtElement>();
+    String documentId = element.getDocumentId();
+    String pageNumber = element.getPageNumber();
     String hashKey = documentId + "###" + pageNumber;
     try {
       Database elementDB = dbEnvExternalObjects.getElementDB();
@@ -98,6 +199,165 @@
     return retElements;
   }

+  private void test(ExtElement element) throws ApplicationException { // validation: rejects elements missing a required attribute
+    String uid = element.getUid();
+    String docId = element.getDocumentId();
+    String xmlNodeId = element.getXmlNodeId();
+    String pageNumber = element.getPageNumber();
+    String xmlStr = element.getXmlString(); // built up front for the error messages, also when every check passes
+    if (uid == null) // checks run in a fixed order; the first missing attribute is the one reported
+      throw new ApplicationException("External object: no attribute \"uid\" specified in: " + xmlStr);
+    if (docId == null)
+      throw new ApplicationException("External object: no attribute \"documentId\" specified in: " + xmlStr);
+    if (xmlNodeId == null)
+      throw new ApplicationException("External object: no attribute \"xmlNodeId\" specified in: " + xmlStr);
+    if (pageNumber == null)
+      throw new ApplicationException("External object: no attribute \"pageNumber\" specified in: " + xmlStr);
+  }
+
+  private void createDBExternalObject(ExtObject extObject) throws ApplicationException { // stores extObject's XML as a record in the object DB
+    try {
+      test(extObject); // throws unless a uid is set
+      Date now = new Date();
+      extObject.setModificationDate(now); // stamped before getXmlString() is called below
+      String type = extObject.getType();
+      String uid = extObject.getUid();
+      String docId = extObject.getDocumentId();
+      if (docId == null || docId.isEmpty())
+        docId = "-1"; // placeholder key part for objects that carry no document id
+      String keyStr = type + "###" + uid + "###" + docId;
+      String valueStr = extObject.getXmlString();
+      DatabaseEntry dbEntryKey = new DatabaseEntry(keyStr.getBytes("utf-8"));
+      DatabaseEntry dbEntryValue = new DatabaseEntry(valueStr.getBytes("utf-8"));
+      Database objectDB = dbEnvExternalObjects.getObjectDB();
+      objectDB.put(null, dbEntryKey, dbEntryValue); // null transaction argument, whereas update/delete open an explicit Transaction
+    } catch (DatabaseException e) {
+      throw new ApplicationException(e);
+    } catch (UnsupportedEncodingException e) {
+      throw new ApplicationException(e);
+    }
+  }
+
+  private void updateDBExternalObject(ExtObject object) throws ApplicationException { // replaces the stored record whose modificationDate equals the argument's
+    test(object); // throws unless a uid is set
+    String content = object.getContent();
+    String elementXmlStr = object.getXmlString(); // NOTE(review): serialized before setModificationDate(now) below, so this string does not carry the new date
+    if (content == null)
+      throw new ApplicationException("External object: no content element specified in: " + elementXmlStr);
+    Date modificationDate = object.getModificationDate(); // date supplied by the caller; selects the record to replace
+    Date now = new Date();
+    object.setModificationDate(now);
+    String type = object.getType();
+    String uid = object.getUid();
+    String docId = object.getDocumentId(); // NOTE(review): no "-1" fallback here, while createDBExternalObject keys null/empty ids as "-1"
+    String hashKey = type + "###" + uid + "###" + docId;
+    boolean updated = false;
+    try {
+      Database objectDB = dbEnvExternalObjects.getObjectDB();
+      Transaction t = dbEnvExternalObjects.getEnv().beginTransaction(null, null);
+      Cursor cursor = objectDB.openCursor(t, null);
+      byte[] bHashKey = hashKey.getBytes("utf-8");
+      DatabaseEntry dbEntryKey = new DatabaseEntry(bHashKey);
+      DatabaseEntry foundValue = new DatabaseEntry();
+      OperationStatus operationStatus = cursor.getSearchKey(dbEntryKey, foundValue, LockMode.DEFAULT);
+      while (operationStatus == OperationStatus.SUCCESS && ! updated) { // the updated flag duplicates the break below
+        byte[] foundValueBytes = foundValue.getData();
+        String foundValueStr = new String(foundValueBytes, "utf-8");
+        ExtObject obj = object.getInstance(foundValueStr); // parsed by the argument's own class; ExtObject's base version returns null
+        Date objModificationDate = obj.getModificationDate();
+        if (modificationDate.equals(objModificationDate)) { // NOTE(review): NullPointerException when the argument carried no modificationDate
+          cursor.delete(); // replace = delete the record under the cursor, then put the new value under the same key
+          byte[] elementXmlStrBytes = elementXmlStr.getBytes("utf-8");
+          DatabaseEntry dbEntryValue = new DatabaseEntry(elementXmlStrBytes);
+          cursor.put(dbEntryKey, dbEntryValue);
+          updated = true;
+          break; // only the first matching record is replaced
+        }
+        operationStatus = cursor.getNextDup(dbEntryKey, foundValue, LockMode.DEFAULT);
+      }
+      cursor.close(); // NOTE(review): not in a finally block, so not reached when the loop throws
+      t.commit(); // also runs when nothing matched; the method gives no signal in that case
+    } catch (DatabaseException e) {
+      throw new ApplicationException(e);
+    } catch (UnsupportedEncodingException e) {
+      throw new ApplicationException(e);
+    }
+  }
+
+  private void deleteDBExternalObject(ExtObject object) throws ApplicationException { // removes stored records under the argument's key, filtered by modificationDate
+    test(object); // throws unless a uid is set
+    Date modificationDate = object.getModificationDate(); // read before the argument is re-stamped below
+    Date now = new Date();
+    object.setModificationDate(now); // NOTE(review): changes the caller's object; the new date is not used again in this method
+    String type = object.getType();
+    String uid = object.getUid();
+    String docId = object.getDocumentId(); // NOTE(review): no "-1" fallback here, while createDBExternalObject keys null/empty ids as "-1"
+    String hashKey = type + "###" + uid + "###" + docId;
+    try {
+      Database objectDB = dbEnvExternalObjects.getObjectDB();
+      Transaction t = dbEnvExternalObjects.getEnv().beginTransaction(null, null);
+      Cursor cursor = objectDB.openCursor(t, null);
+      byte[] bHashKey = hashKey.getBytes("utf-8");
+      DatabaseEntry dbEntryKey = new DatabaseEntry(bHashKey);
+      DatabaseEntry foundValue = new DatabaseEntry();
+      OperationStatus operationStatus = cursor.getSearchKey(dbEntryKey, foundValue, LockMode.DEFAULT);
+      while (operationStatus == OperationStatus.SUCCESS) { // visit every record stored under the key
+        byte[] foundValueBytes = foundValue.getData();
+        String foundValueStr = new String(foundValueBytes, "utf-8");
+        ExtObject obj = object.getInstance(foundValueStr); // parsed by the argument's own class; ExtObject's base version returns null
+        Date objModificationDate = obj.getModificationDate();
+        if (modificationDate == null || modificationDate.equals(objModificationDate)) { // a null date on the argument matches every record under the key
+          cursor.delete(); // no break: every matching duplicate is removed
+        }
+        operationStatus = cursor.getNextDup(dbEntryKey, foundValue, LockMode.DEFAULT);
+      }
+      cursor.close(); // NOTE(review): not in a finally block, so not reached when the loop throws
+      t.commit(); // NOTE(review): the failure paths below neither commit nor abort t
+    } catch (DatabaseException e) {
+      throw new ApplicationException(e);
+    } catch (UnsupportedEncodingException e) {
+      throw new ApplicationException(e);
+    }
+  }
+
+  private ArrayList<ExtObject> readDBExternalObjects(ExtObject object) throws ApplicationException { // returns every record stored under the argument's type/uid/documentId key
+    ArrayList<ExtObject> retElements = new ArrayList<ExtObject>(); // stays empty when the key is not found
+    String type = object.getType();
+    String uid = object.getUid(); // not validated here, unlike create/update/delete which call test(object)
+    String docId = object.getDocumentId(); // NOTE(review): no "-1" fallback here, while createDBExternalObject keys null/empty ids as "-1"
+    String hashKey = type + "###" + uid + "###" + docId;
+    try {
+      Database objectDB = dbEnvExternalObjects.getObjectDB();
+      Cursor cursor = objectDB.openCursor(null, null); // no explicit transaction for the read
+      byte[] bHashKey = hashKey.getBytes("utf-8");
+      DatabaseEntry dbEntryKey = new DatabaseEntry(bHashKey);
+      DatabaseEntry foundValue = new DatabaseEntry();
+      OperationStatus operationStatus = cursor.getSearchKey(dbEntryKey, foundValue, LockMode.DEFAULT);
+      while (operationStatus == OperationStatus.SUCCESS) { // visit every record stored under the key
+        byte[] foundValueBytes = foundValue.getData();
+        String foundValueStr = new String(foundValueBytes, "utf-8");
+        ExtObject obj = object.getInstance(foundValueStr); // parsed by the argument's own class; ExtObject's base version returns null
+        retElements.add(obj);
+        operationStatus = cursor.getNextDup(dbEntryKey, foundValue, LockMode.DEFAULT);
+      }
+      cursor.close(); // NOTE(review): not in a finally block, so not reached when the loop throws
+    } catch (DatabaseException e) {
+      throw new ApplicationException(e);
+    } catch (UnsupportedEncodingException e) {
+      throw new ApplicationException(e);
+    }
+    return retElements;
+  }
+
+  private void test(ExtObject object) throws ApplicationException { // validation: uid is the only attribute required of a generic object
+    String uid = object.getUid();
+    String xmlStr = object.getXmlString(); // built up front for the error message, also when the check passes
+    if (uid == null)
+      throw new ApplicationException("External object: no attribute \"uid\" specified in: " + xmlStr);
+  }
+
+
+
   private void init() throws ApplicationException {
     dbEnvExternalObjects = new DbEnvExternalObjects();
     dbEnvExternalObjects.setDataDir(DB_DIR_EXTERNAL_OBJECTS);
@@ -109,8 +369,9 @@
     getInstance();
     instance.beginOperation();
     System.out.print("Start ...");
-    // instance.deleteSampleData();
-    // instance.writeSampleData();
+    instance.deleteSampleData();
+    instance.createSampleData();
+    // instance.updateSampleData();
     instance.readSampleData();
     instance.end();
     instance.endOperation();
@@ -120,53 +381,125 @@
   }

   private void deleteSampleData() throws ApplicationException {
-    ExtElement e = new ExtElement();
-    e.setUid("joe");
-    e.setDocumentId("/archimedes/it/l223.xml");
-    e.setPageNumber("17");
-    deleteExternalElement(e);
+    String xmlNodeId1 = "/archimedes[1]/text[1]/body[1]/chap[1]/p[1]/s[2]";
+    String objectXmlStr1 =
+      "<object type=\"" + "element" + "\" " +
+              "uid=\"" + "joe" + "\" " +
+              "documentId=\"" + "/archimedes/it/l223.xml" + "\" " +
+              "pageNumber=\"" + "17" + "\" " +
+              "xmlNodeId=\"" + xmlNodeId1 + "\"" +
+              ">" +
+       "</object>";
+    ExtElement e1 = ExtElement.parseXmlStr(objectXmlStr1);
+    String xmlNodeId2 = "/archimedes[1]/text[1]/body[1]/chap[1]/p[1]/s[4]";
+    String objectXmlStr2 =
+      "<object type=\"" + "element" + "\" " +
+              "uid=\"" + "michael" + "\" " +
+              "documentId=\"" + "/archimedes/it/l223.xml" + "\" " +
+              "pageNumber=\"" + "17" + "\" " +
+              "xmlNodeId=\"" + xmlNodeId2 + "\"" +
+              ">" +
+       "</object>";
+    ExtElement e2 = ExtElement.parseXmlStr(objectXmlStr2);
+    deleteExternalElement(e1);
+    deleteExternalElement(e2);
+
+    ExtQuery q = new ExtQuery();
+    q.setUid("joe");
+    q.setDocumentId("/archimedes/it/l223.xml");
+    ArrayList<ExtObject> objects = readExternalObjects(q);
+    for (int i=0; i<objects.size(); i++) {
+      ExtObject o = objects.get(i);
+      deleteExternalObject(o);
+    }
+
+    ExtQuery q2 = new ExtQuery();
+    q2.setUid("michael");
+    q2.setDocumentId("/archimedes/it/l223.xml");
+    objects = readExternalObjects(q2);
+    for (int i=0; i<objects.size(); i++) {
+      ExtObject o = objects.get(i);
+      deleteExternalObject(o);
+    }
   }

-  private void writeSampleData() throws ApplicationException {
+  private void createSampleData() throws ApplicationException {
     Date now = new Date();

-    String sId = "1.2.2.2.2.5";
+    String sId = "/archimedes[1]/text[1]/body[1]/chap[1]/p[1]/s[2]";
     ExtElement e = new ExtElement();
     e.setUid("joe");
     e.setModificationDate(now);
     e.setDocumentId("/archimedes/it/l223.xml");
     e.setPageNumber("17");
     e.setXmlNodeId(sId);
-    e.setContent("<note>This is a test note to sentence " + sId + "</note>");
-    writeExternalElement(e);
+    e.setContent("<note>This is a test note to element " + sId + " with <seg xlink:href=\"http://slime.de\">this external link</seg>" + "</note>");
+    createExternalElement(e);

     ExtElement e2 = new ExtElement();
-    String sId2 = "1.2.2.2.2.7";
+    String sId2 = "/archimedes[1]/text[1]/body[1]/chap[1]/p[1]/s[4]";
     e2.setUid("michael");
     e2.setModificationDate(now);
     e2.setDocumentId("/archimedes/it/l223.xml");
     e2.setPageNumber("17");
     e2.setXmlNodeId(sId2);
     e2.setCharPos("18");
-    e2.setContent("<note>This is a test note to sentence " + sId2 + "</note>");
-    writeExternalElement(e2);
+    e2.setContent("<note>This is a test note to element " + sId2 + "</note>");
+    createExternalElement(e2);

-    /*
-    String sId3 = "1.2.2.2.2.8.15.3.3";
-    e3.setUid("joe");
-    e3.setModificationDate(now);
-    e3.setDocumentId("/archimedes/it/l223.xml");
-    e3.setPageNumber("17");
-    e3.setXmlNodeId(sId3);
-    e2.setContent("<note>This is an external test note to sentence " + sId3 + "</note>");
-    writeExternalElement(e3);
-     */
+    ExtQuery q1 = new ExtQuery();
+    q1.setUid("joe");
+    q1.setDocumentId("/archimedes/it/l223.xml");
+    q1.setQueryType("fulltext");
+    q1.setQueryName("seminario");
+    createExternalObject(q1);
+
+    ExtQuery q2 = new ExtQuery();
+    q2.setUid("michael");
+    q2.setDocumentId("/archimedes/it/l223.xml");
+    q2.setQueryType("url");
+    String url = "http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/page-query-result.xql?document=/archimedes/it/l223.xml&pn=17&mode=text&query-type=fulltextMorph&query=seminario&query-result-pn=1";
+    String urlDeresolved = StringUtilEscapeChars.deresolveXmlEntities(url);
+    q2.setQueryName(urlDeresolved);
+    createExternalObject(q2);

   }

+  private void updateSampleData() throws ApplicationException { // sample data: submits a note element for uid "joe" through updateExternalElement
+    Date now = new Date();
+    String xmlNodeId = "/archimedes[1]/text[1]/body[1]/chap[1]/p[1]/s[2]";
+    String objectXmlStr =
+      "<object type=\"" + "element" + "\" " +
+              "uid=\"" + "joe" + "\" " +
+              "documentId=\"" + "/archimedes/it/l223.xml" + "\" " +
+              "pageNumber=\"" + "17" + "\" " +
+              "xmlNodeId=\"" + xmlNodeId + "\"" +
+              ">" +
+          "<content>" + "<note>This is a test note to element " + xmlNodeId + " with <seg xlink:href=\"http://slime.de\">this external link</seg>" + "</note>" + "</content>" +
+       "</object>";
+    ExtElement e = ExtElement.parseXmlStr(objectXmlStr); // goes through the XML parser rather than the setters
+    e.setModificationDate(now); // updateDBExternalElement stamps its own current date again
+    updateExternalElement(e);
+  }
+
   private void readSampleData() throws ApplicationException {
-    ArrayList<ExtElement> elements = readExternalElements("/archimedes/it/l223.xml", "17");
+    ExtElement elem = new ExtElement();
+    elem.setDocumentId("/archimedes/it/l223.xml");
+    elem.setPageNumber("17");
+    ArrayList<ExtElement> elements = readExternalElements(elem);
     System.out.println(elements);
+
+    ExtQuery q1 = new ExtQuery();
+    q1.setUid("joe");
+    q1.setDocumentId("/archimedes/it/l223.xml");
+    ArrayList<ExtObject> objects = readExternalObjects(q1);
+    System.out.println(objects);
+
+    ExtQuery q2 = new ExtQuery();
+    q2.setUid("michael");
+    q2.setDocumentId("/archimedes/it/l223.xml");
+    objects = readExternalObjects(q2);
+    System.out.println(objects);
   }

   private void end() throws ApplicationException {
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlNormalizer.java	Thu Feb 10 14:02:05 2011 +0100
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlNormalizer.java	Tue Feb 22 16:03:45 2011 +0100
@@ -5,7 +5,15 @@
 import java.util.ArrayList;

 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
-import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang.MpdlNormalizerLexAll;
+import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang.MpdlNormalizerLexAR;
+import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang.MpdlNormalizerLexDE;
+import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang.MpdlNormalizerLexEL;
+import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang.MpdlNormalizerLexEN;
+import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang.MpdlNormalizerLexFR;
+import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang.MpdlNormalizerLexIT;
+import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang.MpdlNormalizerLexLA;
+import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang.MpdlNormalizerLexNL;
+import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang.MpdlNormalizerLexZH;
 import de.mpg.mpiwg.berlin.mpdl.lt.doc.regularization.Regularization;
 import de.mpg.mpiwg.berlin.mpdl.lt.doc.regularization.RegularizationManager;
 import de.mpg.mpiwg.berlin.mpdl.lt.general.Language;
@@ -1014,30 +1022,89 @@
   }

   private String normalize4HumanReaders(String s) {
-    String normStr = s;
-    StringReader strReader = new StringReader(normStr + "\n");
-    MpdlNormalizerLexAll mpdlNormalizerLexAll = new MpdlNormalizerLexAll(strReader);
-    if (Language.getInstance().isLatin(language)) {
-      mpdlNormalizerLexAll.yybegin(MpdlNormalizerLexAll.LA);
-    } else if (Language.getInstance().isChinese(language)) {
-      mpdlNormalizerLexAll.yybegin(MpdlNormalizerLexAll.ZH);
-    } else {
-      // TODO normalization for all languages
-      return normalize4Lexica(s, null);  // old function
-    }
+    StringReader strReader = new StringReader(s + "\n");
     String retStr = "";
     String token = "";
-    while (token != null) {
-      try {
-        token = mpdlNormalizerLexAll.yylex();
-        if (token != null)
-          retStr += token;
-      } catch (IOException e ) {
-        // nothing cause IOException is not needed for a StringReader
+    try {
+      if (Language.getInstance().isLatin(language)) {
+        MpdlNormalizerLexLA mpdlNormalizerLex = new MpdlNormalizerLexLA(strReader);
+        mpdlNormalizerLex.yybegin(MpdlNormalizerLexLA.DISP);
+        while (token != null) {
+          token = mpdlNormalizerLex.yylex();
+          if (token != null)
+            retStr += token;
+        }
+      } else if (Language.getInstance().isArabic(language)) {
+        MpdlNormalizerLexAR mpdlNormalizerLex = new MpdlNormalizerLexAR(strReader);
+        mpdlNormalizerLex.yybegin(MpdlNormalizerLexAR.DISP);
+        while (token != null) {
+          token = mpdlNormalizerLex.yylex();
+          if (token != null)
+            retStr += token;
+        }
+      } else if (Language.getInstance().isGerman(language)) {
+        MpdlNormalizerLexDE mpdlNormalizerLex = new MpdlNormalizerLexDE(strReader);
+        mpdlNormalizerLex.yybegin(MpdlNormalizerLexDE.DISP);
+        while (token != null) {
+          token = mpdlNormalizerLex.yylex();
+          if (token != null)
+            retStr += token;
+        }
+      } else if (Language.getInstance().isGreek(language)) {
+        MpdlNormalizerLexEL mpdlNormalizerLex = new MpdlNormalizerLexEL(strReader);
+        mpdlNormalizerLex.yybegin(MpdlNormalizerLexEL.DISP);
+        while (token != null) {
+          token = mpdlNormalizerLex.yylex();
+          if (token != null)
+            retStr += token;
+        }
+      } else if (Language.getInstance().isEnglish(language)) {
+        MpdlNormalizerLexEN mpdlNormalizerLex = new MpdlNormalizerLexEN(strReader);
+        mpdlNormalizerLex.yybegin(MpdlNormalizerLexEN.DISP);
+        while (token != null) {
+          token = mpdlNormalizerLex.yylex();
+          if (token != null)
+            retStr += token;
+        }
+      } else if (Language.getInstance().isFrench(language)) {
+        MpdlNormalizerLexFR mpdlNormalizerLex = new MpdlNormalizerLexFR(strReader);
+        mpdlNormalizerLex.yybegin(MpdlNormalizerLexFR.DISP);
+        while (token != null) {
+          token = mpdlNormalizerLex.yylex();
+          if (token != null)
+            retStr += token;
+        }
+      } else if (Language.getInstance().isItalian(language)) {
+        MpdlNormalizerLexIT mpdlNormalizerLex = new MpdlNormalizerLexIT(strReader);
+        mpdlNormalizerLex.yybegin(MpdlNormalizerLexIT.DISP);
+        while (token != null) {
+          token = mpdlNormalizerLex.yylex();
+          if (token != null)
+            retStr += token;
+        }
+      } else if (Language.getInstance().isDutch(language)) {
+        MpdlNormalizerLexNL mpdlNormalizerLex = new MpdlNormalizerLexNL(strReader);
+        mpdlNormalizerLex.yybegin(MpdlNormalizerLexNL.DISP);
+        while (token != null) {
+          token = mpdlNormalizerLex.yylex();
+          if (token != null)
+            retStr += token;
+        }
+      } else if (Language.getInstance().isChinese(language)) {
+        MpdlNormalizerLexZH mpdlNormalizerLex = new MpdlNormalizerLexZH(strReader);
+        mpdlNormalizerLex.yybegin(MpdlNormalizerLexZH.DISP);
+        while (token != null) {
+          token = mpdlNormalizerLex.yylex();
+          if (token != null)
+            retStr += token;
+        }
+      } else {
+        return normalize4Lexica(s, null);  // old function
       }
+    } catch (IOException e ) {
+      // nothing to do: reading from an open StringReader does not raise an IOException
     }
-    normStr = retStr;
-    return normStr;
+    return retStr;
   }

   /*
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexAR.java	Tue Feb 22 16:03:45 2011 +0100
@@ -0,0 +1,572 @@
+/* The following code was generated by JFlex 1.4.3 on 22.02.11 12:02 */
+
+/*
+ * Normalization rules for Arabic text
+ * [this is a JFlex specification]
+ *
+ * Wolfgang Schmidle
+ * version 0.96
+ * 2011-02-21
+ *
+ */
+
+package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang;
+
+
+/**
+ * This class is a scanner generated by
+ * <a href="http://www.jflex.de/">JFlex</a> 1.4.3
+ * on 22.02.11 12:02 from the specification file
+ * <tt>MpdlNormalizerLexAR.lex</tt>
+ */
+public class MpdlNormalizerLexAR {
+
+  /** This character denotes the end of file */
+  public static final int YYEOF = -1;
+
+  /** initial size of the lookahead buffer */
+  private static final int ZZ_BUFFERSIZE = 16384;
+
+  /** lexical states */
+  public static final int SEARCH = 6;
+  public static final int DICT = 4;
+  public static final int YYINITIAL = 0;
+  public static final int DISP = 2;
+
+  /**
+   * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l
+   * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l
+   *                  at the beginning of a line
+   * l is of the form l = 2*k, k a non negative integer
+   */
+  private static final int ZZ_LEXSTATE[] = {
+     0,  0,  1,  1,  2,  2,  1, 1
+  };
+
+  /**
+   * Translates characters to character classes
+   */
+  private static final String ZZ_CMAP_PACKED =
+    "\12\0\1\1\65\0\1\2\uffbf\0";
+
+  /**
+   * Translates characters to character classes
+   */
+  private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED);
+
+  /**
+   * Translates DFA states to action switch labels.
+   */
+  private static final int [] ZZ_ACTION = zzUnpackAction();
+
+  private static final String ZZ_ACTION_PACKED_0 =
+    "\3\0\1\1\1\2\1\3\1\4";
+
+  private static int [] zzUnpackAction() {
+    int [] result = new int[7];
+    int offset = 0;
+    offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
+    return result;
+  }
+
+  private static int zzUnpackAction(String packed, int offset, int [] result) {  // run-length decoder for the action table
+    int i = 0;       /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {
+      int count = packed.charAt(i++);  // first char of each pair: how many times the value repeats
+      int value = packed.charAt(i++);  // second char of each pair: the value itself
+      do result[j++] = value; while (--count > 0);
+    }
+    return j;  // index just past the last entry written
+  }
+
+
+  /**
+   * Translates a state to a row index in the transition table
+   */
+  private static final int [] ZZ_ROWMAP = zzUnpackRowMap();
+
+  private static final String ZZ_ROWMAP_PACKED_0 =
+    "\0\0\0\3\0\6\0\11\0\11\0\11\0\11";
+
+  private static int [] zzUnpackRowMap() {
+    int [] result = new int[7];
+    int offset = 0;
+    offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
+    return result;
+  }
+
+  private static int zzUnpackRowMap(String packed, int offset, int [] result) {  // decodes one int per pair of chars
+    int i = 0;  /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {
+      int high = packed.charAt(i++) << 16;  // first char of the pair: upper 16 bits
+      result[j++] = high | packed.charAt(i++);  // second char of the pair: lower 16 bits
+    }
+    return j;  // index just past the last entry written
+  }
+
+  /**
+   * The transition table of the DFA
+   */
+  private static final int [] ZZ_TRANS = zzUnpackTrans();
+
+  private static final String ZZ_TRANS_PACKED_0 =
+    "\1\4\1\0\1\5\1\4\1\6\1\5\1\4\1\7"+
+    "\1\5\3\0";
+
+  private static int [] zzUnpackTrans() {
+    int [] result = new int[12];
+    int offset = 0;
+    offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
+    return result;
+  }
+
+  private static int zzUnpackTrans(String packed, int offset, int [] result) {  // run-length decoder for the transition table
+    int i = 0;       /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {
+      int count = packed.charAt(i++);  // run length
+      int value = packed.charAt(i++);  // target state, stored shifted up by one
+      value--;  // undo the shift: a packed 0 becomes -1, which yylex() treats as "no transition"
+      do result[j++] = value; while (--count > 0);
+    }
+    return j;  // index just past the last entry written
+  }
+
+
+  /* error codes */
+  private static final int ZZ_UNKNOWN_ERROR = 0;
+  private static final int ZZ_NO_MATCH = 1;
+  private static final int ZZ_PUSHBACK_2BIG = 2;
+
+  /* error messages for the codes above */
+  private static final String ZZ_ERROR_MSG[] = {
+    "Unkown internal scanner error",
+    "Error: could not match input",
+    "Error: pushback value was too large"
+  };
+
+  /**
+   * ZZ_ATTRIBUTE[aState] contains the attributes of state <code>aState</code>
+   */
+  private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
+
+  private static final String ZZ_ATTRIBUTE_PACKED_0 =
+    "\3\0\4\11";
+
+  private static int [] zzUnpackAttribute() {
+    int [] result = new int[7];
+    int offset = 0;
+    offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
+    return result;
+  }
+
+  private static int zzUnpackAttribute(String packed, int offset, int [] result) {
+    int i = 0;       /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {
+      int count = packed.charAt(i++);
+      int value = packed.charAt(i++);
+      do result[j++] = value; while (--count > 0);
+    }
+    return j;
+  }
+
+  /** the input device */
+  private java.io.Reader zzReader;
+
+  /** the current state of the DFA */
+  private int zzState;
+
+  /** the current lexical state */
+  private int zzLexicalState = YYINITIAL;
+
+  /** this buffer contains the current text to be matched and is
+      the source of the yytext() string */
+  private char zzBuffer[] = new char[ZZ_BUFFERSIZE];
+
+  /** the textposition at the last accepting state */
+  private int zzMarkedPos;
+
+  /** the current text position in the buffer */
+  private int zzCurrentPos;
+
+  /** startRead marks the beginning of the yytext() string in the buffer */
+  private int zzStartRead;
+
+  /** endRead marks the last character in the buffer, that has been read
+      from input */
+  private int zzEndRead;
+
+  /** number of newlines encountered up to the start of the matched text */
+  private int yyline;
+
+  /** the number of characters up to the start of the matched text */
+  private int yychar;
+
+  /**
+   * the number of characters from the last newline up to the start of the
+   * matched text
+   */
+  private int yycolumn;
+
+  /**
+   * zzAtBOL == true <=> the scanner is currently at the beginning of a line
+   */
+  private boolean zzAtBOL = true;
+
+  /** zzAtEOF == true <=> the scanner is at the EOF */
+  private boolean zzAtEOF;
+
+  /** denotes if the user-EOF-code has already been executed */
+  private boolean zzEOFDone;
+
+  /* user code: */
+	private String original = "";
+	private String normalized = "";
+	private int problem = 0;
+
+	private void add (String norm) {  // records the current match: raw text into original, norm into normalized
+		original += yytext();  // the text exactly as it was matched
+		normalized += norm;  // the replacement chosen by the calling rule
+	}
+
+
+  /**
+   * Creates a new scanner
+   * There is also a java.io.InputStream version of this constructor.
+   *
+   * @param   in  the java.io.Reader to read input from.
+   */
+  public MpdlNormalizerLexAR(java.io.Reader in) {
+    this.zzReader = in;
+  }
+
+  /**
+   * Creates a new scanner.
+   * There is also a java.io.Reader version of this constructor.
+   *
+   * @param   in  the java.io.InputStream to read input from.
+   */
+  public MpdlNormalizerLexAR(java.io.InputStream in) {
+    this(new java.io.InputStreamReader(in));  // no charset given: bytes are decoded with the platform default charset
+  }
+
+  /**
+   * Unpacks the run-length encoded character translation table.
+   *
+   * @param packed   the packed character translation table (count/value pairs)
+   * @return         the unpacked table, indexed by character code
+   */
+  private static char [] zzUnpackCMap(String packed) {
+    char [] map = new char[0x10000];  // one class entry per 16-bit char value
+    int i = 0;  /* index in packed string  */
+    int j = 0;  /* index in unpacked array */
+    while (i < 10) {  // 10 = length of ZZ_CMAP_PACKED, i.e. 5 count/value pairs
+      int  count = packed.charAt(i++);
+      char value = packed.charAt(i++);
+      do map[j++] = value; while (--count > 0);
+    }
+    return map;
+  }
+
+
+  /**
+   * Refills the input buffer.
+   *
+   * @return      <code>false</code>, iff there was new input.
+   *
+   * @exception   java.io.IOException  if any I/O-Error occurs
+   */
+  private boolean zzRefill() throws java.io.IOException {
+
+    /* first: make room (if you can) */
+    if (zzStartRead > 0) {
+      System.arraycopy(zzBuffer, zzStartRead,
+                       zzBuffer, 0,
+                       zzEndRead-zzStartRead);
+
+      /* translate stored positions */
+      zzEndRead-= zzStartRead;
+      zzCurrentPos-= zzStartRead;
+      zzMarkedPos-= zzStartRead;
+      zzStartRead = 0;
+    }
+
+    /* is the buffer big enough? */
+    if (zzCurrentPos >= zzBuffer.length) {
+      /* if not: blow it up */
+      char newBuffer[] = new char[zzCurrentPos*2];
+      System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length);
+      zzBuffer = newBuffer;
+    }
+
+    /* finally: fill the buffer with new input */
+    int numRead = zzReader.read(zzBuffer, zzEndRead,
+                                            zzBuffer.length-zzEndRead);
+
+    if (numRead > 0) {
+      zzEndRead+= numRead;
+      return false;
+    }
+    // unlikely but not impossible: read 0 characters, but not at end of stream
+    if (numRead == 0) {
+      int c = zzReader.read();
+      if (c == -1) {
+        return true;
+      } else {
+        zzBuffer[zzEndRead++] = (char) c;
+        return false;
+      }
+    }
+
+	// numRead < 0
+    return true;
+  }
+
+
+  /**
+   * Closes the input stream.
+   */
+  public final void yyclose() throws java.io.IOException {
+    zzAtEOF = true;            /* indicate end of file */
+    zzEndRead = zzStartRead;  /* invalidate buffer    */
+
+    if (zzReader != null)
+      zzReader.close();
+  }
+
+
+  /**
+   * Resets the scanner to read from a new input stream.
+   * Does not close the old reader.
+   *
+   * All scanner-internal positions and flags are reset, the old input stream
+   * <b>cannot</b> be reused (internal buffer is discarded and lost).
+   * Lexical state is set to <tt>YYINITIAL</tt>.
+   *
+   * @param reader   the new input stream
+   */
+  public final void yyreset(java.io.Reader reader) {
+    zzReader = reader;
+    zzAtBOL  = true;
+    zzAtEOF  = false;
+    zzEOFDone = false;
+    zzEndRead = zzStartRead = 0;
+    zzCurrentPos = zzMarkedPos = 0;
+    yyline = yychar = yycolumn = 0;
+    zzLexicalState = YYINITIAL;  // note: the user fields original, normalized and problem are not reset here
+  }
+
+
+  /**
+   * Returns the current lexical state.
+   */
+  public final int yystate() {
+    return zzLexicalState;
+  }
+
+
+  /**
+   * Enters a new lexical state
+   *
+   * @param newState the new lexical state
+   */
+  public final void yybegin(int newState) {
+    zzLexicalState = newState;
+  }
+
+
+  /**
+   * Returns the text matched by the current regular expression.
+   */
+  public final String yytext() {
+    return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead );
+  }
+
+
+  /**
+   * Returns the character at position <tt>pos</tt> from the
+   * matched text.
+   *
+   * It is equivalent to yytext().charAt(pos), but faster
+   *
+   * @param pos the position of the character to fetch.
+   *            A value from 0 to yylength()-1.
+   *
+   * @return the character at position pos
+   */
+  public final char yycharat(int pos) {
+    return zzBuffer[zzStartRead+pos];
+  }
+
+
+  /**
+   * Returns the length of the matched text region.
+   */
+  public final int yylength() {
+    return zzMarkedPos-zzStartRead;
+  }
+
+
+  /**
+   * Reports an error that occurred while scanning.
+   *
+   * In a well-formed scanner (no or only correct usage of
+   * yypushback(int) and a match-all fallback rule) this method
+   * will only be called with things that "Can't Possibly Happen".
+   * If this method is called, something is seriously wrong
+   * (e.g. a JFlex bug producing a faulty scanner etc.).
+   *
+   * Usual syntax/scanner level error handling should be done
+   * in error fallback rules.
+   *
+   * @param   errorCode  the code of the error message to display
+   */
+  private void zzScanError(int errorCode) {
+    String message;
+    try {
+      message = ZZ_ERROR_MSG[errorCode];
+    }
+    catch (ArrayIndexOutOfBoundsException e) {
+      message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR];  // unknown code: fall back to the generic message
+    }
+
+    throw new Error(message);  // always throws; this method never returns normally
+  }
+
+
+  /**
+   * Pushes the specified amount of characters back into the input stream.
+   *
+   * They will be read again by the next call of the scanning method.
+   *
+   * @param number  the number of characters to be read again.
+   *                This number must not be greater than yylength()!
+   */
+  public void yypushback(int number)  {
+    if ( number > yylength() )
+      zzScanError(ZZ_PUSHBACK_2BIG);  // throws an Error, so the line below is not reached
+
+    zzMarkedPos -= number;  // shrink the current match; the next scan restarts from here
+  }
+
+
+  /**
+   * Resumes scanning until the next regular expression is matched,
+   * the end of input is encountered or an I/O-Error occurs.
+   *
+   * @return      the accumulated word once a newline rule fires; <code>null</code> at end of input
+   * @exception   java.io.IOException  if any I/O-Error occurs
+   */
+  public java.lang.String yylex() throws java.io.IOException {
+    int zzInput;
+    int zzAction;
+
+    // cached fields:
+    int zzCurrentPosL;
+    int zzMarkedPosL;
+    int zzEndReadL = zzEndRead;
+    char [] zzBufferL = zzBuffer;
+    char [] zzCMapL = ZZ_CMAP;
+
+    int [] zzTransL = ZZ_TRANS;
+    int [] zzRowMapL = ZZ_ROWMAP;
+    int [] zzAttrL = ZZ_ATTRIBUTE;
+
+    while (true) {
+      zzMarkedPosL = zzMarkedPos;
+
+      zzAction = -1;
+
+      zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL;
+
+      zzState = ZZ_LEXSTATE[zzLexicalState];
+
+
+      zzForAction: {
+        while (true) {
+
+          if (zzCurrentPosL < zzEndReadL)
+            zzInput = zzBufferL[zzCurrentPosL++];
+          else if (zzAtEOF) {
+            zzInput = YYEOF;
+            break zzForAction;
+          }
+          else {
+            // store back cached positions
+            zzCurrentPos  = zzCurrentPosL;
+            zzMarkedPos   = zzMarkedPosL;
+            boolean eof = zzRefill();
+            // get translated positions and possibly new buffer
+            zzCurrentPosL  = zzCurrentPos;
+            zzMarkedPosL   = zzMarkedPos;
+            zzBufferL      = zzBuffer;
+            zzEndReadL     = zzEndRead;
+            if (eof) {
+              zzInput = YYEOF;
+              break zzForAction;
+            }
+            else {
+              zzInput = zzBufferL[zzCurrentPosL++];
+            }
+          }
+          int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ];
+          if (zzNext == -1) break zzForAction;  // no transition for this character class
+          zzState = zzNext;
+
+          int zzAttributes = zzAttrL[zzState];
+          if ( (zzAttributes & 1) == 1 ) {  // bit 0 set: accepting state, remember it as the match so far
+            zzAction = zzState;
+            zzMarkedPosL = zzCurrentPosL;
+            if ( (zzAttributes & 8) == 8 ) break zzForAction;  // bit 3 set: stop scanning right here
+          }
+
+        }
+      }
+
+      // store back cached position
+      zzMarkedPos = zzMarkedPosL;
+
+      switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
+        case 4:  // newline in DICT: empty string if an '@' was seen, else the normalized text
+          { switch (problem) {
+			case 1: return "";
+			default: return normalized;
+		}
+          }
+        case 5: break;
+        case 2:  // '@': flag the word as problematic and keep the character
+          { problem = 1; add(yytext());
+          }
+        case 6: break;  // reached by fall-through from case 2; the outer loop scans the next character
+        case 3:  // newline in DISP/SEARCH: original text if an '@' was seen, else the normalized text
+          { switch (problem) {
+			case 1: return original;
+			default: return normalized;
+		}
+          }
+        case 7: break;
+        case 1:  // any other non-newline character: copied unchanged
+          { add(yytext());
+          }
+        case 8: break;  // reached by fall-through from case 1; the outer loop scans the next character
+        default:
+          if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
+            zzAtEOF = true;
+            return null;
+          }
+          else {
+            zzScanError(ZZ_NO_MATCH);  // e.g. a newline while still in YYINITIAL, which has no END rule
+          }
+      }
+    }
+  }
+
+
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexAR.lex	Tue Feb 22 16:03:45 2011 +0100
@@ -0,0 +1,74 @@
+/*
+ * Normalization rules for Arabic text
+ * [this is a JFlex specification]
+ *
+ * Wolfgang Schmidle
+ * version 0.96
+ * 2011-02-21
+ *
+ */
+
+package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang;
+
+%%
+
+%public
+%class MpdlNormalizerLexAR
+%type java.lang.String
+%unicode
+
+// Arabic: ar
+
+%states DISP, DICT, SEARCH
+
+%{
+	private String original = "";
+	private String normalized = "";
+	private int problem = 0;
+
+	private void add (String norm) {
+		original += yytext();
+		normalized += norm;
+	}
+%}
+
+END = \n
+
+%%
+
+@ { problem = 1; add(yytext()); }
+. { add(yytext()); }
+
+
+<DISP, SEARCH> {
+
+{END} {
+		switch (problem) {
+			case 1: return original;
+			default: return normalized;
+		}
+	}
+}
+
+<DICT> {
+
+{END} {
+		switch (problem) {
+			case 1: return "";
+			default: return normalized;
+		}
+	}
+}
+
+
+/*
+
+Assumptions:
+- the routine is called one word at a time, with a \n at the end of the string
+- words split across line breaks have already been rejoined
+
+TO DO:
+
+AR: still missing (no Arabic-specific rules yet)
+
+*/
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexDE.java	Tue Feb 22 16:03:45 2011 +0100
@@ -0,0 +1,629 @@
+/* The following code was generated by JFlex 1.4.3 on 22.02.11 12:03 */
+
+/*
+ * Normalization rules for German text
+ * [this is a JFlex specification]
+ *
+ * Wolfgang Schmidle
+ * version 0.96
+ * 2011-02-21
+ *
+ */
+
+package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang;
+
+
+/**
+ * This class is a scanner generated by
+ * <a href="http://www.jflex.de/">JFlex</a> 1.4.3
+ * on 22.02.11 12:03 from the specification file
+ * <tt>MpdlNormalizerLexDE.lex</tt>
+ */
+public class MpdlNormalizerLexDE {
+
+  /** This character denotes the end of file */
+  public static final int YYEOF = -1;
+
+  /** initial size of the lookahead buffer */
+  private static final int ZZ_BUFFERSIZE = 16384;
+
+  /** lexical states */
+  public static final int SEARCH = 6;
+  public static final int DICT = 4;
+  public static final int YYINITIAL = 0;
+  public static final int CELEX = 8;
+  public static final int DISP = 2;
+  public static final int GRIMM = 10;
+
+  /**
+   * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l
+   * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l
+   *                  at the beginning of a line
+   * l is of the form l = 2*k, k a non negative integer
+   */
+  private static final int ZZ_LEXSTATE[] = {
+     0,  0,  1,  1,  2,  2,  1,  1,  3,  3,  4, 4
+  };
+
+  /**
+   * Translates characters to character classes
+   */
+  private static final String ZZ_CMAP_PACKED =
+    "\12\0\1\1\65\0\1\15\32\2\6\0\1\6\15\2\1\10\5\2"+
+    "\1\4\5\2\111\0\1\11\21\0\1\12\5\0\1\13\2\0\1\14"+
+    "\4\0\1\11\21\0\1\12\5\0\1\13\202\0\1\3\u01e4\0\1\7"+
+    "\1\0\1\5\ufc99\0";
+
+  /**
+   * Translates characters to character classes
+   */
+  private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED);
+
+  /**
+   * Translates DFA states to action switch labels.
+   */
+  private static final int [] ZZ_ACTION = zzUnpackAction();
+
+  private static final String ZZ_ACTION_PACKED_0 =
+    "\5\0\1\1\1\2\1\3\1\4\3\1\1\5\3\1"+
+    "\1\6\1\7\1\10\1\11\1\12\1\13\1\14\1\15"+
+    "\1\16";
+
+  private static int [] zzUnpackAction() {
+    int [] result = new int[25];
+    int offset = 0;
+    offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
+    return result;
+  }
+
+  private static int zzUnpackAction(String packed, int offset, int [] result) {
+    int i = 0;       /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {
+      int count = packed.charAt(i++);
+      int value = packed.charAt(i++);
+      do result[j++] = value; while (--count > 0);
+    }
+    return j;
+  }
+
+
+  /**
+   * Translates a state to a row index in the transition table
+   */
+  private static final int [] ZZ_ROWMAP = zzUnpackRowMap();
+
+  private static final String ZZ_ROWMAP_PACKED_0 =
+    "\0\0\0\16\0\34\0\52\0\70\0\106\0\106\0\106"+
+    "\0\106\0\124\0\142\0\160\0\106\0\176\0\214\0\232"+
+    "\0\106\0\106\0\106\0\106\0\106\0\106\0\106\0\106"+
+    "\0\106";
+
+  private static int [] zzUnpackRowMap() {
+    int [] result = new int[25];
+    int offset = 0;
+    offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
+    return result;
+  }
+
+  private static int zzUnpackRowMap(String packed, int offset, int [] result) {
+    int i = 0;  /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {
+      int high = packed.charAt(i++) << 16;
+      result[j++] = high | packed.charAt(i++);
+    }
+    return j;
+  }
+
+  /**
+   * The transition table of the DFA
+   */
+  private static final int [] ZZ_TRANS = zzUnpackTrans();
+
+  private static final String ZZ_TRANS_PACKED_0 =
+    "\1\6\1\0\1\6\1\7\11\6\1\10\1\6\1\11"+
+    "\1\6\1\7\1\12\1\6\1\13\1\6\1\14\4\6"+
+    "\1\10\1\6\1\15\1\6\1\7\1\12\1\6\1\13"+
+    "\1\6\1\14\4\6\2\10\1\15\1\6\1\7\1\16"+
+    "\1\10\1\17\1\10\1\20\1\21\1\22\1\23\1\24"+
+    "\1\10\1\6\1\15\1\6\1\7\1\12\1\6\1\13"+
+    "\1\6\1\14\3\6\1\25\1\10\23\0\1\26\1\0"+
+    "\1\27\15\0\1\30\15\0\1\31\13\0\1\26\1\0"+
+    "\1\23\15\0\1\21\15\0\1\22\6\0";
+
+  private static int [] zzUnpackTrans() {
+    int [] result = new int[168];
+    int offset = 0;
+    offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
+    return result;
+  }
+
+  private static int zzUnpackTrans(String packed, int offset, int [] result) {
+    int i = 0;       /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {
+      int count = packed.charAt(i++);
+      int value = packed.charAt(i++);
+      value--;
+      do result[j++] = value; while (--count > 0);
+    }
+    return j;
+  }
+
+
+  /* error codes */
+  private static final int ZZ_UNKNOWN_ERROR = 0;
+  private static final int ZZ_NO_MATCH = 1;
+  private static final int ZZ_PUSHBACK_2BIG = 2;
+
+  /* error messages for the codes above */
+  private static final String ZZ_ERROR_MSG[] = {
+    "Unkown internal scanner error",
+    "Error: could not match input",
+    "Error: pushback value was too large"
+  };
+
+  /**
+   * ZZ_ATTRIBUTE[aState] contains the attributes of state <code>aState</code>
+   */
+  private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
+
+  private static final String ZZ_ATTRIBUTE_PACKED_0 =
+    "\5\0\4\11\3\1\1\11\3\1\11\11";
+
+  private static int [] zzUnpackAttribute() {
+    int [] result = new int[25];
+    int offset = 0;
+    offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
+    return result;
+  }
+
+  private static int zzUnpackAttribute(String packed, int offset, int [] result) {
+    int i = 0;       /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {
+      int count = packed.charAt(i++);
+      int value = packed.charAt(i++);
+      do result[j++] = value; while (--count > 0);
+    }
+    return j;
+  }
+
+  /** the input device */
+  private java.io.Reader zzReader;
+
+  /** the current state of the DFA */
+  private int zzState;
+
+  /** the current lexical state */
+  private int zzLexicalState = YYINITIAL;
+
+  /** this buffer contains the current text to be matched and is
+      the source of the yytext() string */
+  private char zzBuffer[] = new char[ZZ_BUFFERSIZE];
+
+  /** the textposition at the last accepting state */
+  private int zzMarkedPos;
+
+  /** the current text position in the buffer */
+  private int zzCurrentPos;
+
+  /** startRead marks the beginning of the yytext() string in the buffer */
+  private int zzStartRead;
+
+  /** endRead marks the last character in the buffer, that has been read
+      from input */
+  private int zzEndRead;
+
+  /** number of newlines encountered up to the start of the matched text */
+  private int yyline;
+
+  /** the number of characters up to the start of the matched text */
+  private int yychar;
+
+  /**
+   * the number of characters from the last newline up to the start of the
+   * matched text
+   */
+  private int yycolumn;
+
+  /**
+   * zzAtBOL == true <=> the scanner is currently at the beginning of a line
+   */
+  private boolean zzAtBOL = true;
+
+  /** zzAtEOF == true <=> the scanner is at the EOF */
+  private boolean zzAtEOF;
+
+  /** denotes if the user-EOF-code has already been executed */
+  private boolean zzEOFDone;
+
+  /* user code: */
+	private String original = "";
+	private String normalized = "";
+	private int problem = 0;
+
+	private void add (String norm) {
+		original += yytext();
+		normalized += norm;
+	}
+
+
+  /**
+   * Creates a new scanner
+   * There is also a java.io.InputStream version of this constructor.
+   *
+   * @param   in  the java.io.Reader to read input from.
+   */
+  public MpdlNormalizerLexDE(java.io.Reader in) {
+    this.zzReader = in;
+  }
+
+  /**
+   * Creates a new scanner.
+   * There is also a java.io.Reader version of this constructor.
+   *
+   * @param   in  the java.io.InputStream to read input from.
+   */
+  public MpdlNormalizerLexDE(java.io.InputStream in) {
+    this(new java.io.InputStreamReader(in));
+  }
+
+  /**
+   * Unpacks the compressed character translation table.
+   *
+   * @param packed   the packed character translation table
+   * @return         the unpacked character translation table
+   */
+  private static char [] zzUnpackCMap(String packed) {
+    char [] map = new char[0x10000];
+    int i = 0;  /* index in packed string  */
+    int j = 0;  /* index in unpacked array */
+    while (i < 66) {
+      int  count = packed.charAt(i++);
+      char value = packed.charAt(i++);
+      do map[j++] = value; while (--count > 0);
+    }
+    return map;
+  }
+
+
+  /**
+   * Refills the input buffer.
+   *
+   * @return      <code>false</code>, iff there was new input.
+   *
+   * @exception   java.io.IOException  if any I/O-Error occurs
+   */
+  private boolean zzRefill() throws java.io.IOException {
+
+    /* first: make room (if you can) */
+    if (zzStartRead > 0) {
+      System.arraycopy(zzBuffer, zzStartRead,
+                       zzBuffer, 0,
+                       zzEndRead-zzStartRead);
+
+      /* translate stored positions */
+      zzEndRead-= zzStartRead;
+      zzCurrentPos-= zzStartRead;
+      zzMarkedPos-= zzStartRead;
+      zzStartRead = 0;
+    }
+
+    /* is the buffer big enough? */
+    if (zzCurrentPos >= zzBuffer.length) {
+      /* if not: blow it up */
+      char newBuffer[] = new char[zzCurrentPos*2];
+      System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length);
+      zzBuffer = newBuffer;
+    }
+
+    /* finally: fill the buffer with new input */
+    int numRead = zzReader.read(zzBuffer, zzEndRead,
+                                            zzBuffer.length-zzEndRead);
+
+    if (numRead > 0) {
+      zzEndRead+= numRead;
+      return false;
+    }
+    // unlikely but not impossible: read 0 characters, but not at end of stream
+    if (numRead == 0) {
+      int c = zzReader.read();
+      if (c == -1) {
+        return true;
+      } else {
+        zzBuffer[zzEndRead++] = (char) c;
+        return false;
+      }
+    }
+
+	// numRead < 0
+    return true;
+  }
+
+
+  /**
+   * Closes the input stream.
+   */
+  public final void yyclose() throws java.io.IOException {
+    zzAtEOF = true;            /* indicate end of file */
+    zzEndRead = zzStartRead;  /* invalidate buffer: no chars left between start and end */
+
+    if (zzReader != null)  /* a scanner without a reader is tolerated */
+      zzReader.close();
+  }
+
+
+  /**
+   * Resets the scanner to read from a new input stream.
+   * Does not close the old reader.
+   *
+   * All internal variables are reset, the old input stream
+   * <b>cannot</b> be reused (internal buffer is discarded and lost).
+   * Lexical state is set to <tt>YYINITIAL</tt>.
+   *
+   * @param reader   the new input stream
+   */
+  public final void yyreset(java.io.Reader reader) {
+    zzReader = reader;  /* the previous reader is only replaced, not closed */
+    zzAtBOL  = true;
+    zzAtEOF  = false;
+    zzEOFDone = false;
+    zzEndRead = zzStartRead = 0;  /* buffer is treated as empty; the array itself is kept */
+    zzCurrentPos = zzMarkedPos = 0;
+    yyline = yychar = yycolumn = 0;
+    zzLexicalState = YYINITIAL;  // NOTE(review): the fields used by the yylex() actions (original, normalized, problem) are not reset here - confirm callers never reuse a scanner
+  }
+
+
+  /**
+   * Returns the current lexical state.
+   */
+  public final int yystate() {
+    return zzLexicalState;  /* value last stored by yybegin() or yyreset() */
+  }
+
+
+  /**
+   * Enters a new lexical state
+   *
+   * @param newState the new lexical state
+   */
+  public final void yybegin(int newState) {
+    zzLexicalState = newState;  /* stored unchecked; yylex() uses it as an index into ZZ_LEXSTATE */
+  }
+
+
+  /**
+   * Returns the text matched by the current regular expression.
+   */
+  public final String yytext() {
+    return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead );  /* fresh copy of buffer chars [zzStartRead, zzMarkedPos) */
+  }
+
+
+  /**
+   * Returns the character at position <tt>pos</tt> from the
+   * matched text.
+   *
+   * It is equivalent to yytext().charAt(pos), but faster
+   *
+   * @param pos the position of the character to fetch.
+   *            A value from 0 to yylength()-1.
+   *
+   * @return the character at position pos
+   */
+  public final char yycharat(int pos) {
+    return zzBuffer[zzStartRead+pos];  /* pos is not range-checked against yylength() */
+  }
+
+
+  /**
+   * Returns the length of the matched text region.
+   */
+  public final int yylength() {
+    return zzMarkedPos-zzStartRead;  /* same span that yytext() copies */
+  }
+
+
+  /**
+   * Reports an error that occurred while scanning.
+   *
+   * In a wellformed scanner (no or only correct usage of
+   * yypushback(int) and a match-all fallback rule) this method
+   * will only be called with things that "Can't Possibly Happen".
+   * If this method is called, something is seriously wrong
+   * (e.g. a JFlex bug producing a faulty scanner etc.).
+   *
+   * Usual syntax/scanner level error handling should be done
+   * in error fallback rules.
+   *
+   * @param   errorCode  the code of the errormessage to display
+   */
+  private void zzScanError(int errorCode) {
+    String message;
+    try {
+      message = ZZ_ERROR_MSG[errorCode];
+    }
+    catch (ArrayIndexOutOfBoundsException e) {  /* unknown code: fall back to the generic message */
+      message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR];
+    }
+
+    throw new Error(message);  /* always throws java.lang.Error, which catch (Exception) handlers do not see */
+  }
+
+
+  /**
+   * Pushes the specified amount of characters back into the input stream.
+   *
+   * They will be read again by the next call of the scanning method
+   *
+   * @param number  the number of characters to be read again.
+   *                This number must not be greater than yylength()!
+   */
+  public void yypushback(int number)  {
+    if ( number > yylength() )
+      zzScanError(ZZ_PUSHBACK_2BIG);  /* zzScanError always throws, so the line below is not reached */
+
+    zzMarkedPos -= number;  /* shortens the match; negative numbers are not rejected */
+  }
+
+
+  /**
+   * Resumes scanning until the next regular expression is matched,
+   * the end of input is encountered or an I/O-Error occurs.
+   *
+   * @return      the next token
+   * @exception   java.io.IOException  if any I/O-Error occurs
+   */
+  public java.lang.String yylex() throws java.io.IOException {
+    int zzInput;
+    int zzAction;
+
+    // cached fields: local copies of the scanner positions and tables used in the loop below
+    int zzCurrentPosL;
+    int zzMarkedPosL;
+    int zzEndReadL = zzEndRead;
+    char [] zzBufferL = zzBuffer;
+    char [] zzCMapL = ZZ_CMAP;
+
+    int [] zzTransL = ZZ_TRANS;
+    int [] zzRowMapL = ZZ_ROWMAP;
+    int [] zzAttrL = ZZ_ATTRIBUTE;
+
+    while (true) {  // one iteration per matched rule; left only by return or by the Error from zzScanError
+      zzMarkedPosL = zzMarkedPos;
+
+      zzAction = -1;  // no accepting state seen yet
+
+      zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL;  // the next match starts where the previous one ended
+
+      zzState = ZZ_LEXSTATE[zzLexicalState];  // DFA start state for the current lexical state
+
+
+      zzForAction: {
+        while (true) {
+
+          if (zzCurrentPosL < zzEndReadL)
+            zzInput = zzBufferL[zzCurrentPosL++];
+          else if (zzAtEOF) {
+            zzInput = YYEOF;
+            break zzForAction;
+          }
+          else {
+            // store back cached positions
+            zzCurrentPos  = zzCurrentPosL;
+            zzMarkedPos   = zzMarkedPosL;
+            boolean eof = zzRefill();
+            // get translated positions and possibly new buffer
+            zzCurrentPosL  = zzCurrentPos;
+            zzMarkedPosL   = zzMarkedPos;
+            zzBufferL      = zzBuffer;
+            zzEndReadL     = zzEndRead;
+            if (eof) {
+              zzInput = YYEOF;
+              break zzForAction;
+            }
+            else {
+              zzInput = zzBufferL[zzCurrentPosL++];
+            }
+          }
+          int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ];  // row of the state plus class of the character
+          if (zzNext == -1) break zzForAction;  // no transition: stop with the last recorded match, if any
+          zzState = zzNext;
+
+          int zzAttributes = zzAttrL[zzState];
+          if ( (zzAttributes & 1) == 1 ) {  // bit 1 set: record this state and position as the current match
+            zzAction = zzState;
+            zzMarkedPosL = zzCurrentPosL;
+            if ( (zzAttributes & 8) == 8 ) break zzForAction;  // bit 8 set: stop scanning right away
+          }
+
+        }
+      }
+
+      // store back cached position
+      zzMarkedPos = zzMarkedPosL;
+
+      switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {  // each action falls through into the "case N: break;" label after it
+        case 10:
+          { add("sz");
+          }
+        case 15: break;
+        case 3:
+          { problem = 1; add(yytext());
+          }
+        case 16: break;
+        case 6:
+          { add("ae");
+          }
+        case 17: break;
+        case 2:
+          { add("s");
+          }
+        case 18: break;
+        case 4:
+          { switch (problem) {
+			case 1: return original;
+			default: return normalized;
+		}
+          }
+        case 19: break;
+        case 12:
+          { add("ü");
+          }
+        case 20: break;
+        case 8:
+          { add("ue");
+          }
+        case 21: break;
+        case 11:
+          { add("u");
+          }
+        case 22: break;
+        case 13:
+          { add("ä");
+          }
+        case 23: break;
+        case 1:
+          { add(yytext());
+          }
+        case 24: break;
+        case 9:
+          { add("ss");
+          }
+        case 25: break;
+        case 7:
+          { add("oe");
+          }
+        case 26: break;
+        case 14:
+          { add("ö");
+          }
+        case 27: break;
+        case 5:
+          { switch (problem) {
+			case 1: return "";
+			default: return normalized;
+		}
+          }
+        case 28: break;
+        default:
+          if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {  // end of input with no pending characters
+            zzAtEOF = true;
+            return null;
+          }
+          else {
+            zzScanError(ZZ_NO_MATCH);
+          }
+      }
+    }
+  }
+
+
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexDE.lex	Tue Feb 22 16:03:45 2011 +0100
@@ -0,0 +1,117 @@
+/*
+ * Normalization rules for German text
+ * [this is a JFlex specification]
+ *
+ * Wolfgang Schmidle
+ * version 0.96
+ * 2011-02-21
+ *
+ */
+
+package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang;
+
+%%
+
+%public
+%class MpdlNormalizerLexDE
+%type java.lang.String
+%unicode
+
+// German: de, deu, ger
+
+%states DISP, DICT, SEARCH
+%state CELEX, GRIMM
+
+%{
+	private String original = "";
+	private String normalized = "";
+	private int problem = 0;
+
+	private void add (String norm) {
+		original += yytext();
+		normalized += norm;
+	}
+%}
+
+END = \n
+
+Alphabet = [abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ]
+
+%%
+
+ſ { add("s"); }
+
+// Fraktur
+
+<DISP, DICT, SEARCH,
+GRIMM> {
+
+uͦ {add("u"); }
+aͤ {add("ä"); }
+oͤ {add("ö"); }
+uͤ {add("ü"); }
+
+}
+
+<CELEX> {
+
+// normalize ä ö ü ß only for Celex!
+
+ä | Ä | aͤ { add("ae"); }
+ö | Ö | oͤ { add("oe"); }
+ü | Ü | uͤ { add("ue"); }
+uͦ {add("u"); }
+ß { add("ss"); }
+
+{Alphabet} { add(yytext()); }
+
+. { problem = 1; add(yytext()); }
+
+}
+
+<GRIMM> {
+
+ß { add("sz"); }
+
+}
+
+
+// default
+
+@ { problem = 1; add(yytext()); }
+. { add(yytext()); }
+
+
+<DISP, SEARCH> {
+
+{END} {
+		switch (problem) {
+			case 1: return original;
+			default: return normalized;
+		}
+	}
+}
+
+<DICT, CELEX, GRIMM> {
+
+{END} {
+		switch (problem) {
+			case 1: return "";
+			default: return normalized;
+		}
+	}
+}
+
+
+/*
+
+Assumptions:
+- the routine is called word by word, with a \n at the end of the string
+- words split across line breaks have already been rejoined
+
+TO DO:
+
+DE: separate German and Fraktur?
+DE: Celex: remove hyphens?
+
+*/
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexEL.java	Tue Feb 22 16:03:45 2011 +0100
@@ -0,0 +1,687 @@
+/* The following code was generated by JFlex 1.4.3 on 22.02.11 12:03 */
+
+/*
+ * Normalization rules for Greek text
+ * [this is a JFlex specification]
+ *
+ * Wolfgang Schmidle
+ * version 0.96
+ * 2011-02-21
+ *
+ */
+
+package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang;
+
+
+/**
+ * This class is a scanner generated by
+ * <a href="http://www.jflex.de/">JFlex</a> 1.4.3
+ * on 22.02.11 12:03 from the specification file
+ * <tt>MpdlNormalizerLexEL.lex</tt>
+ */
+public class MpdlNormalizerLexEL {
+
+  /** This character denotes the end of file */
+  public static final int YYEOF = -1;
+
+  /** initial size of the lookahead buffer */
+  private static final int ZZ_BUFFERSIZE = 16384;
+
+  /** lexical states */
+  public static final int SEARCH = 6;
+  public static final int DICT = 4;
+  public static final int YYINITIAL = 0;
+  public static final int DISP = 2;
+
+  /**
+   * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l
+   * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l
+   *                  at the beginning of a line
+   * l is of the form l = 2*k, k a non negative integer
+   */
+  private static final int ZZ_LEXSTATE[] = {
+     0,  0,  1,  1,  2,  2,  3, 3
+  };
+
+  /**
+   * Packed (count, value) pairs of the character-to-class table; expanded by zzUnpackCMap
+   */
+  private static final String ZZ_CMAP_PACKED =
+    "\12\0\1\1\65\0\1\3\32\3\6\0\32\3\u0331\0\1\4\1\5"+
+    "\1\6\1\7\15\0\1\2\3\0\2\2\11\0\1\10\1\11\1\12"+
+    "\u1ba1\0\1\13\1\0\1\15\1\0\1\16\1\0\1\20\1\0\1\21"+
+    "\1\0\1\22\1\0\1\23\65\0\1\14\17\0\1\17\57\0\1\24"+
+    "\ue00d\0";
+
+  /**
+   * Translates characters to character classes
+   */
+  private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED);
+
+  /**
+   * Translates DFA states to action switch labels.
+   */
+  private static final int [] ZZ_ACTION = zzUnpackAction();
+
+  private static final String ZZ_ACTION_PACKED_0 =
+    "\4\0\1\1\1\2\1\3\1\4\1\5\1\6\1\7"+
+    "\1\10\1\11\1\12\1\13\12\1\1\14\1\0\1\15"+
+    "\1\0\1\16\1\0\1\17\1\0\1\20\1\0\1\21"+
+    "\1\0\1\22\1\0\1\23\1\0\1\24\1\0\1\25"+
+    "\1\0";
+
+  private static int [] zzUnpackAction() {
+    int [] result = new int[45];  /* indexed by DFA state in yylex(); presumably one entry per state */
+    int offset = 0;
+    offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);  /* returned end offset is not used further */
+    return result;
+  }
+
+  private static int zzUnpackAction(String packed, int offset, int [] result) {
+    int i = 0;       /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {  /* packed holds (count, value) char pairs */
+      int count = packed.charAt(i++);
+      int value = packed.charAt(i++);
+      do result[j++] = value; while (--count > 0);  /* do-while: writes once even when count is 0 */
+    }
+    return j;  /* first index after the unpacked data */
+  }
+
+
+  /**
+   * Translates a state to a row index in the transition table
+   */
+  private static final int [] ZZ_ROWMAP = zzUnpackRowMap();
+
+  private static final String ZZ_ROWMAP_PACKED_0 =
+    "\0\0\0\25\0\52\0\77\0\124\0\124\0\124\0\124"+
+    "\0\124\0\124\0\124\0\124\0\124\0\124\0\124\0\151"+
+    "\0\176\0\223\0\250\0\275\0\322\0\347\0\374\0\u0111"+
+    "\0\u0126\0\124\0\u013b\0\124\0\u0150\0\124\0\u0165\0\124"+
+    "\0\u017a\0\124\0\u018f\0\124\0\u01a4\0\124\0\u01b9\0\124"+
+    "\0\u01ce\0\124\0\u01e3\0\124\0\u01f8";
+
+  private static int [] zzUnpackRowMap() {
+    int [] result = new int[45];  /* indexed by DFA state in yylex() */
+    int offset = 0;
+    offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);  /* returned end offset is not used further */
+    return result;
+  }
+
+  private static int zzUnpackRowMap(String packed, int offset, int [] result) {
+    int i = 0;  /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {  /* no run lengths here: every two chars form one int */
+      int high = packed.charAt(i++) << 16;  /* first char: upper 16 bits */
+      result[j++] = high | packed.charAt(i++);  /* second char: lower 16 bits */
+    }
+    return j;  /* first index after the unpacked data */
+  }
+
+  /**
+   * The transition table of the DFA
+   */
+  private static final int [] ZZ_TRANS = zzUnpackTrans();
+
+  private static final String ZZ_TRANS_PACKED_0 =
+    "\1\5\1\0\24\5\1\6\1\5\1\7\1\10\1\11"+
+    "\1\12\1\13\1\14\1\15\1\16\13\5\1\17\1\5"+
+    "\1\7\1\10\1\11\1\12\1\13\1\14\1\15\1\16"+
+    "\1\20\1\21\1\22\1\23\1\24\1\25\1\26\1\27"+
+    "\1\30\1\31\1\5\1\6\1\5\1\7\1\10\1\11"+
+    "\1\12\1\13\1\14\1\15\1\16\1\20\1\21\1\22"+
+    "\1\23\1\24\1\25\1\26\1\27\1\30\1\31\26\0"+
+    "\1\32\1\33\23\0\1\34\1\35\23\0\1\36\1\37"+
+    "\23\0\1\40\1\41\23\0\1\42\1\43\23\0\1\44"+
+    "\1\45\23\0\1\46\1\47\23\0\1\50\1\51\23\0"+
+    "\1\52\1\53\23\0\1\54\1\55\23\0\1\32\24\0"+
+    "\1\34\24\0\1\36\24\0\1\40\24\0\1\42\24\0"+
+    "\1\44\24\0\1\46\24\0\1\50\24\0\1\52\24\0"+
+    "\1\54\23\0";
+
+  private static int [] zzUnpackTrans() {
+    int [] result = new int[525];  /* addressed in yylex() as ZZ_ROWMAP[state] + character class */
+    int offset = 0;
+    offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);  /* returned end offset is not used further */
+    return result;
+  }
+
+  private static int zzUnpackTrans(String packed, int offset, int [] result) {
+    int i = 0;       /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {  /* packed holds (count, value) char pairs */
+      int count = packed.charAt(i++);
+      int value = packed.charAt(i++);
+      value--;  /* stored values are shifted by one: char 0 becomes -1, which yylex() treats as "no transition" */
+      do result[j++] = value; while (--count > 0);
+    }
+    return j;  /* first index after the unpacked data */
+  }
+
+
+  /* error codes */
+  private static final int ZZ_UNKNOWN_ERROR = 0;
+  private static final int ZZ_NO_MATCH = 1;
+  private static final int ZZ_PUSHBACK_2BIG = 2;
+
+  /* error messages for the codes above */
+  private static final String ZZ_ERROR_MSG[] = {
+    "Unkown internal scanner error",
+    "Error: could not match input",
+    "Error: pushback value was too large"
+  };
+
+  /**
+   * ZZ_ATTRIBUTE[aState] contains the attributes of state <code>aState</code>
+   */
+  private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
+
+  private static final String ZZ_ATTRIBUTE_PACKED_0 =
+    "\4\0\13\11\12\1\1\11\1\0\1\11\1\0\1\11"+
+    "\1\0\1\11\1\0\1\11\1\0\1\11\1\0\1\11"+
+    "\1\0\1\11\1\0\1\11\1\0\1\11\1\0";
+
+  private static int [] zzUnpackAttribute() {
+    int [] result = new int[45];  /* indexed by DFA state in yylex(), which tests bits 1 and 8 */
+    int offset = 0;
+    offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);  /* returned end offset is not used further */
+    return result;
+  }
+
+  private static int zzUnpackAttribute(String packed, int offset, int [] result) {
+    int i = 0;       /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {  /* packed holds (count, value) char pairs */
+      int count = packed.charAt(i++);
+      int value = packed.charAt(i++);
+      do result[j++] = value; while (--count > 0);  /* do-while: writes once even when count is 0 */
+    }
+    return j;  /* first index after the unpacked data */
+  }
+
+  /** the input device */
+  private java.io.Reader zzReader;
+
+  /** the current state of the DFA */
+  private int zzState;
+
+  /** the current lexical state */
+  private int zzLexicalState = YYINITIAL;
+
+  /** this buffer contains the current text to be matched and is
+      the source of the yytext() string */
+  private char zzBuffer[] = new char[ZZ_BUFFERSIZE];
+
+  /** the textposition at the last accepting state */
+  private int zzMarkedPos;
+
+  /** the current text position in the buffer */
+  private int zzCurrentPos;
+
+  /** startRead marks the beginning of the yytext() string in the buffer */
+  private int zzStartRead;
+
+  /** endRead marks the last character in the buffer, that has been read
+      from input */
+  private int zzEndRead;
+
+  /** number of newlines encountered up to the start of the matched text */
+  private int yyline;
+
+  /** the number of characters up to the start of the matched text */
+  private int yychar;
+
+  /**
+   * the number of characters from the last newline up to the start of the
+   * matched text
+   */
+  private int yycolumn;
+
+  /**
+   * zzAtBOL == true <=> the scanner is currently at the beginning of a line
+   */
+  private boolean zzAtBOL = true;
+
+  /** zzAtEOF == true <=> the scanner is at the EOF */
+  private boolean zzAtEOF;
+
+  /** denotes if the user-EOF-code has already been executed */
+  private boolean zzEOFDone;
+
+  /* user code: */
+	private String original = "";
+	private String normalized = "";
+	private int problem = 0;
+
+	private void add (String norm) {  // called by the rule actions in yylex() for each matched piece of input
+		original += yytext();  // the text exactly as matched
+		normalized += norm;  // the replacement chosen by the rule
+	}
+
+
+  /**
+   * Creates a new scanner
+   * There is also a java.io.InputStream version of this constructor.
+   *
+   * @param   in  the java.io.Reader to read input from.
+   */
+  public MpdlNormalizerLexEL(java.io.Reader in) {
+    this.zzReader = in;  /* stored as is; a null reader is not rejected here */
+  }
+
+  /**
+   * Creates a new scanner.
+   * There is also java.io.Reader version of this constructor.
+   *
+   * @param   in  the java.io.InputStream to read input from.
+   */
+  public MpdlNormalizerLexEL(java.io.InputStream in) {
+    this(new java.io.InputStreamReader(in));  // NOTE(review): no charset is given, so the platform default decodes the bytes - confirm this is intended for Greek input
+  }
+
+  /**
+   * Unpacks the compressed character translation table.
+   *
+   * @param packed   the packed character translation table
+   * @return         the unpacked character translation table
+   */
+  private static char [] zzUnpackCMap(String packed) {
+    char [] map = new char[0x10000];  /* one entry per possible 16-bit char value */
+    int i = 0;  /* index in packed string  */
+    int j = 0;  /* index in unpacked array */
+    while (i < 82) {  /* hard-coded bound; presumably the length of the packed string - TODO confirm */
+      int  count = packed.charAt(i++);  /* run length */
+      char value = packed.charAt(i++);  /* value written for the whole run */
+      do map[j++] = value; while (--count > 0);  /* do-while: writes once even when count is 0 */
+    }
+    return map;
+  }
+
+
+  /**
+   * Refills the input buffer.
+   *
+   * @return      <code>false</code>, iff there was new input.
+   *
+   * @exception   java.io.IOException  if any I/O-Error occurs
+   */
+  private boolean zzRefill() throws java.io.IOException {
+
+    /* first: make room by moving the unread chars [zzStartRead, zzEndRead) to the buffer start */
+    if (zzStartRead > 0) {
+      System.arraycopy(zzBuffer, zzStartRead,
+                       zzBuffer, 0,
+                       zzEndRead-zzStartRead);
+
+      /* translate stored positions by the same offset */
+      zzEndRead-= zzStartRead;
+      zzCurrentPos-= zzStartRead;
+      zzMarkedPos-= zzStartRead;
+      zzStartRead = 0;
+    }
+
+    /* is the buffer big enough? */
+    if (zzCurrentPos >= zzBuffer.length) {
+      /* if not: replace it by one of twice the current position, keeping the old contents */
+      char newBuffer[] = new char[zzCurrentPos*2];
+      System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length);
+      zzBuffer = newBuffer;
+    }
+
+    /* finally: fill the free tail of the buffer with new input */
+    int numRead = zzReader.read(zzBuffer, zzEndRead,
+                                            zzBuffer.length-zzEndRead);
+
+    if (numRead > 0) {
+      zzEndRead+= numRead;
+      return false;  /* new input is available */
+    }
+    // unlikely but not impossible: read 0 characters, but not at end of stream
+    if (numRead == 0) {
+      int c = zzReader.read();  /* fall back to a single-character read */
+      if (c == -1) {
+        return true;  /* end of stream */
+      } else {
+        zzBuffer[zzEndRead++] = (char) c;
+        return false;
+      }
+    }
+
+	// numRead < 0: the reader reported end of stream
+    return true;
+  }
+
+
+  /**
+   * Closes the input stream.
+   */
+  public final void yyclose() throws java.io.IOException {
+    zzAtEOF = true;            /* indicate end of file */
+    zzEndRead = zzStartRead;  /* invalidate buffer: no chars left between start and end */
+
+    if (zzReader != null)  /* a scanner without a reader is tolerated */
+      zzReader.close();
+  }
+
+
+  /**
+   * Resets the scanner to read from a new input stream.
+   * Does not close the old reader.
+   *
+   * All internal variables are reset, the old input stream
+   * <b>cannot</b> be reused (internal buffer is discarded and lost).
+   * Lexical state is set to <tt>YYINITIAL</tt>.
+   *
+   * @param reader   the new input stream
+   */
+  public final void yyreset(java.io.Reader reader) {
+    zzReader = reader;  /* the previous reader is only replaced, not closed */
+    zzAtBOL  = true;
+    zzAtEOF  = false;
+    zzEOFDone = false;
+    zzEndRead = zzStartRead = 0;  /* buffer is treated as empty; the array itself is kept */
+    zzCurrentPos = zzMarkedPos = 0;
+    yyline = yychar = yycolumn = 0;
+    zzLexicalState = YYINITIAL;  // NOTE(review): the user fields original, normalized and problem are not reset here - confirm callers never reuse a scanner
+  }
+
+
+  /**
+   * Returns the current lexical state.
+   */
+  public final int yystate() {
+    return zzLexicalState;  /* YYINITIAL unless changed by yybegin() */
+  }
+
+
+  /**
+   * Enters a new lexical state
+   *
+   * @param newState the new lexical state
+   */
+  public final void yybegin(int newState) {
+    zzLexicalState = newState;  /* stored unchecked; yylex() uses it as an index into ZZ_LEXSTATE */
+  }
+
+
+  /**
+   * Returns the text matched by the current regular expression.
+   */
+  public final String yytext() {
+    return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead );  /* fresh copy of buffer chars [zzStartRead, zzMarkedPos) */
+  }
+
+
+  /**
+   * Returns the character at position <tt>pos</tt> from the
+   * matched text.
+   *
+   * It is equivalent to yytext().charAt(pos), but faster
+   *
+   * @param pos the position of the character to fetch.
+   *            A value from 0 to yylength()-1.
+   *
+   * @return the character at position pos
+   */
+  public final char yycharat(int pos) {
+    return zzBuffer[zzStartRead+pos];  /* pos is not range-checked against yylength() */
+  }
+
+
+  /**
+   * Returns the length of the matched text region.
+   */
+  public final int yylength() {
+    return zzMarkedPos-zzStartRead;  /* same span that yytext() copies */
+  }
+
+
+  /**
+   * Reports an error that occurred while scanning.
+   *
+   * In a wellformed scanner (no or only correct usage of
+   * yypushback(int) and a match-all fallback rule) this method
+   * will only be called with things that "Can't Possibly Happen".
+   * If this method is called, something is seriously wrong
+   * (e.g. a JFlex bug producing a faulty scanner etc.).
+   *
+   * Usual syntax/scanner level error handling should be done
+   * in error fallback rules.
+   *
+   * @param   errorCode  the code of the errormessage to display
+   */
+  private void zzScanError(int errorCode) {
+    String message;
+    try {
+      message = ZZ_ERROR_MSG[errorCode];
+    }
+    catch (ArrayIndexOutOfBoundsException e) {  /* unknown code: fall back to the generic message */
+      message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR];
+    }
+
+    throw new Error(message);  /* always throws java.lang.Error, which catch (Exception) handlers do not see */
+  }
+
+
+  /**
+   * Pushes the specified amount of characters back into the input stream.
+   *
+   * They will be read again by the next call of the scanning method
+   *
+   * @param number  the number of characters to be read again.
+   *                This number must not be greater than yylength()!
+   */
+  public void yypushback(int number)  {
+    if ( number > yylength() )
+      zzScanError(ZZ_PUSHBACK_2BIG);  /* zzScanError always throws, so the line below is not reached */
+
+    zzMarkedPos -= number;  /* shortens the match; negative numbers are not rejected */
+  }
+
+
+  /**
+   * Resumes scanning until the next regular expression is matched,
+   * the end of input is encountered or an I/O-Error occurs.
+   *
+   * @return      the next token
+   * @exception   java.io.IOException  if any I/O-Error occurs
+   */
+  public java.lang.String yylex() throws java.io.IOException {
+    int zzInput;
+    int zzAction;
+
+    // cached fields:
+    int zzCurrentPosL;
+    int zzMarkedPosL;
+    int zzEndReadL = zzEndRead;
+    char [] zzBufferL = zzBuffer;
+    char [] zzCMapL = ZZ_CMAP;
+
+    int [] zzTransL = ZZ_TRANS;
+    int [] zzRowMapL = ZZ_ROWMAP;
+    int [] zzAttrL = ZZ_ATTRIBUTE;
+
+    while (true) {
+      zzMarkedPosL = zzMarkedPos;
+
+      zzAction = -1;
+
+      zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL;
+
+      zzState = ZZ_LEXSTATE[zzLexicalState];
+
+
+      zzForAction: {
+        while (true) {
+
+          if (zzCurrentPosL < zzEndReadL)
+            zzInput = zzBufferL[zzCurrentPosL++];
+          else if (zzAtEOF) {
+            zzInput = YYEOF;
+            break zzForAction;
+          }
+          else {
+            // store back cached positions
+            zzCurrentPos  = zzCurrentPosL;
+            zzMarkedPos   = zzMarkedPosL;
+            boolean eof = zzRefill();
+            // get translated positions and possibly new buffer
+            zzCurrentPosL  = zzCurrentPos;
+            zzMarkedPosL   = zzMarkedPos;
+            zzBufferL      = zzBuffer;
+            zzEndReadL     = zzEndRead;
+            if (eof) {
+              zzInput = YYEOF;
+              break zzForAction;
+            }
+            else {
+              zzInput = zzBufferL[zzCurrentPosL++];
+            }
+          }
+          int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ];
+          if (zzNext == -1) break zzForAction;
+          zzState = zzNext;
+
+          int zzAttributes = zzAttrL[zzState];
+          if ( (zzAttributes & 1) == 1 ) {
+            zzAction = zzState;
+            zzMarkedPosL = zzCurrentPosL;
+            if ( (zzAttributes & 8) == 8 ) break zzForAction;
+          }
+
+        }
+      }
+
+      // store back cached position
+      zzMarkedPos = zzMarkedPosL;
+
+      switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
+        case 21:
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 1;
+          { add("ῴ");
+          }
+        case 22: break;
+        case 6:
+          { add("ή");
+          }
+        case 23: break;
+        case 15:
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 1;
+          { add("ή");
+          }
+        case 24: break;
+        case 7:
+          { add("ί");
+          }
+        case 25: break;
+        case 1:
+          { add(yytext());
+          }
+        case 26: break;
+        case 20:
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 1;
+          { add("ώ");
+          }
+        case 27: break;
+        case 17:
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 1;
+          { add("ί");
+          }
+        case 28: break;
+        case 13:
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 1;
+          { add("ᾴ");
+          }
+        case 29: break;
+        case 8:
+          { add("ό");
+          }
+        case 30: break;
+        case 12:
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 1;
+          { add("ά");
+          }
+        case 31: break;
+        case 9:
+          { add("ύ");
+          }
+        case 32: break;
+        case 3:
+          { problem = 1; add(yytext());
+          }
+        case 33: break;
+        case 18:
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 1;
+          { add("ό");
+          }
+        case 34: break;
+        case 4:
+          { add("ά");
+          }
+        case 35: break;
+        case 2:
+          { switch (problem) {
+			case 1: return original;
+			default: return normalized;
+		}
+          }
+        case 36: break;
+        case 10:
+          { add("ώ");
+          }
+        case 37: break;
+        case 14:
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 1;
+          { add("έ");
+          }
+        case 38: break;
+        case 16:
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 1;
+          { add("ῄ");
+          }
+        case 39: break;
+        case 5:
+          { add("έ");
+          }
+        case 40: break;
+        case 11:
+          { switch (problem) {
+			case 1: return "";
+			default: return normalized;
+		}
+          }
+        case 41: break;
+        case 19:
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 1;
+          { add("ύ");
+          }
+        case 42: break;
+        default:
+          if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
+            zzAtEOF = true;
+            return null;
+          }
+          else {
+            zzScanError(ZZ_NO_MATCH);
+          }
+      }
+    }
+  }
+
+
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexEL.lex	Tue Feb 22 16:03:45 2011 +0100
@@ -0,0 +1,123 @@
+/*
+ * Normalization rules for Greek text
+ * [this is a JFlex specification]
+ *
+ * Wolfgang Schmidle
+ * version 0.96
+ * 2011-02-21
+ *
+ */
+
+package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang;
+
+%%
+
+%public
+%class MpdlNormalizerLexEL
+%type java.lang.String
+%unicode
+
+// Greek: el, grc
+
+%states DISP, DICT, SEARCH
+
+%{
+	private String original = "";
+	private String normalized = "";
+	private int problem = 0;
+
+	private void add (String norm) {
+		original += yytext();
+		normalized += norm;
+	}
+%}
+
+END = \n
+
+wordend =  [νρς]? {END}
+
+Latin = [abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ]
+
+
+%%
+
+<DISP, DICT, SEARCH> {
+
+// replace tonos by oxia
+// (although this should really be corrected in the text rather than normalized)
+ά { add("ά"); }
+έ { add("έ"); }
+ή { add("ή"); }
+ί { add("ί"); }
+ό { add("ό"); }
+ύ { add("ύ"); }
+ώ { add("ώ"); }
+
+}
+
+<DICT, SEARCH> {
+
+ὰ / {wordend} { add("ά"); }
+ᾲ / {wordend} { add("ᾴ"); }
+ὲ / {wordend} { add("έ"); }
+ὴ / {wordend} { add("ή"); }
+ῂ / {wordend} { add("ῄ"); }
+ὶ / {wordend} { add("ί"); }
+ὸ / {wordend} { add("ό"); }
+ὺ / {wordend} { add("ύ"); }
+ὼ / {wordend} { add("ώ"); }
+ῲ / {wordend} { add("ῴ"); }
+
+// other candidates: Ὰ Ὲ Ὴ Ὶ Ὺ Ὸ Ὼ
+
+}
+
+<DISP, DICT, SEARCH> {
+
+@ { problem = 1; add(yytext()); }
+{Latin} { problem = 1; add(yytext()); }
+
+}
+
+
+// default
+
+. { add(yytext()); }
+
+
+<DISP, SEARCH> {
+
+{END} {
+		switch (problem) {
+			case 1: return original;
+			default: return normalized;
+		}
+	}
+}
+
+<DICT> {
+
+{END} {
+		switch (problem) {
+			case 1: return "";
+			default: return normalized;
+		}
+	}
+}
+
+
+/*
+
+Assumptions:
+- the routine is called word by word, with a \n at the end of the string
+- words split across line breaks have already been rejoined
+
+TO DO:
+
+EL: remove tonos --> oxia again, since it has to be changed in the text itself?
+EL: are there further cases in which a grave accent can legitimately occur?
+EL: do capital letters with a grave accent ever occur in our texts, and should they be normalized?
+EL: new state BETACODE ?
+EL: define the allowed characters rather than the wrong ones
+
+*/
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexEN.java	Tue Feb 22 16:03:45 2011 +0100
@@ -0,0 +1,576 @@
+/* The following code was generated by JFlex 1.4.3 on 22.02.11 12:03 */
+
+/*
+ * Normalization rules for English text
+ * [this is a JFlex specification]
+ *
+ * Wolfgang Schmidle
+ * version 0.96
+ * 2011-02-21
+ *
+ */
+
+package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang;
+
+
+/**
+ * This class is a scanner generated by
+ * <a href="http://www.jflex.de/">JFlex</a> 1.4.3
+ * on 22.02.11 12:03 from the specification file
+ * <tt>MpdlNormalizerLexEN.lex</tt>
+ */
+public class MpdlNormalizerLexEN {
+
+  /** This character denotes the end of file */
+  public static final int YYEOF = -1;
+
+  /** initial size of the lookahead buffer */
+  private static final int ZZ_BUFFERSIZE = 16384;
+
+  /** lexical states */
+  public static final int SEARCH = 6;
+  public static final int DICT = 4;
+  public static final int YYINITIAL = 0;
+  public static final int DISP = 2;
+
+  /**
+   * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l
+   * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l
+   *                  at the beginning of a line
+   * l is of the form l = 2*k, k a non negative integer
+   */
+  private static final int ZZ_LEXSTATE[] = {
+     0,  0,  1,  1,  2,  2,  1, 1
+  };
+
+  /**
+   * Translates characters to character classes
+   */
+  private static final String ZZ_CMAP_PACKED =
+    "\12\0\1\1\65\0\1\3\u013e\0\1\2\ufe80\0";
+
+  /**
+   * Translates characters to character classes
+   */
+  private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED);
+
+  /**
+   * Translates DFA states to action switch labels.
+   */
+  private static final int [] ZZ_ACTION = zzUnpackAction();
+
+  private static final String ZZ_ACTION_PACKED_0 =
+    "\3\0\1\1\1\2\1\3\1\4\1\5";
+
+  private static int [] zzUnpackAction() {
+    int [] result = new int[8];
+    int offset = 0;
+    offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
+    return result;
+  }
+
+  private static int zzUnpackAction(String packed, int offset, int [] result) {
+    int i = 0;       /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {
+      int count = packed.charAt(i++);
+      int value = packed.charAt(i++);
+      do result[j++] = value; while (--count > 0);
+    }
+    return j;
+  }
+
+
+  /**
+   * Translates a state to a row index in the transition table
+   */
+  private static final int [] ZZ_ROWMAP = zzUnpackRowMap();
+
+  private static final String ZZ_ROWMAP_PACKED_0 =
+    "\0\0\0\4\0\10\0\14\0\14\0\14\0\14\0\14";
+
+  private static int [] zzUnpackRowMap() {
+    int [] result = new int[8];
+    int offset = 0;
+    offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
+    return result;
+  }
+
+  private static int zzUnpackRowMap(String packed, int offset, int [] result) {
+    int i = 0;  /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {
+      int high = packed.charAt(i++) << 16;
+      result[j++] = high | packed.charAt(i++);
+    }
+    return j;
+  }
+
+  /**
+   * The transition table of the DFA
+   */
+  private static final int [] ZZ_TRANS = zzUnpackTrans();
+
+  private static final String ZZ_TRANS_PACKED_0 =
+    "\1\4\1\0\1\4\1\5\1\4\1\6\1\7\1\5"+
+    "\1\4\1\10\1\7\1\5\4\0";
+
+  private static int [] zzUnpackTrans() {
+    int [] result = new int[16];
+    int offset = 0;
+    offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
+    return result;
+  }
+
+  private static int zzUnpackTrans(String packed, int offset, int [] result) {
+    int i = 0;       /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {
+      int count = packed.charAt(i++);
+      int value = packed.charAt(i++);
+      value--;
+      do result[j++] = value; while (--count > 0);
+    }
+    return j;
+  }
+
+
+  /* error codes */
+  private static final int ZZ_UNKNOWN_ERROR = 0;
+  private static final int ZZ_NO_MATCH = 1;
+  private static final int ZZ_PUSHBACK_2BIG = 2;
+
+  /* error messages for the codes above */
+  private static final String ZZ_ERROR_MSG[] = {
+    "Unkown internal scanner error",
+    "Error: could not match input",
+    "Error: pushback value was too large"
+  };
+
+  /**
+   * ZZ_ATTRIBUTE[aState] contains the attributes of state <code>aState</code>
+   */
+  private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
+
+  private static final String ZZ_ATTRIBUTE_PACKED_0 =
+    "\3\0\5\11";
+
+  private static int [] zzUnpackAttribute() {
+    int [] result = new int[8];
+    int offset = 0;
+    offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
+    return result;
+  }
+
+  private static int zzUnpackAttribute(String packed, int offset, int [] result) {
+    int i = 0;       /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {
+      int count = packed.charAt(i++);
+      int value = packed.charAt(i++);
+      do result[j++] = value; while (--count > 0);
+    }
+    return j;
+  }
+
+  /** the input device */
+  private java.io.Reader zzReader;
+
+  /** the current state of the DFA */
+  private int zzState;
+
+  /** the current lexical state */
+  private int zzLexicalState = YYINITIAL;
+
+  /** this buffer contains the current text to be matched and is
+      the source of the yytext() string */
+  private char zzBuffer[] = new char[ZZ_BUFFERSIZE];
+
+  /** the textposition at the last accepting state */
+  private int zzMarkedPos;
+
+  /** the current text position in the buffer */
+  private int zzCurrentPos;
+
+  /** startRead marks the beginning of the yytext() string in the buffer */
+  private int zzStartRead;
+
+  /** endRead marks the last character in the buffer, that has been read
+      from input */
+  private int zzEndRead;
+
+  /** number of newlines encountered up to the start of the matched text */
+  private int yyline;
+
+  /** the number of characters up to the start of the matched text */
+  private int yychar;
+
+  /**
+   * the number of characters from the last newline up to the start of the
+   * matched text
+   */
+  private int yycolumn;
+
+  /**
+   * zzAtBOL == true <=> the scanner is currently at the beginning of a line
+   */
+  private boolean zzAtBOL = true;
+
+  /** zzAtEOF == true <=> the scanner is at the EOF */
+  private boolean zzAtEOF;
+
+  /** denotes if the user-EOF-code has already been executed */
+  private boolean zzEOFDone;
+
+  /* user code: */
+	private String original = "";
+	private String normalized = "";
+	private int problem = 0;
+
+	/**
+	 * Records one scanned token: the matched text is appended to
+	 * {@code original}, its replacement to {@code normalized}.
+	 *
+	 * @param norm replacement text for the token just matched
+	 */
+	private void add (String norm) {
+		original = original + yytext();
+		normalized = normalized + norm;
+	}
+
+
+  /**
+   * Creates a new scanner
+   * There is also a java.io.InputStream version of this constructor.
+   *
+   * @param   in  the java.io.Reader to read input from.
+   */
+  public MpdlNormalizerLexEN(java.io.Reader in) {
+    this.zzReader = in;
+  }
+
+  /**
+   * Creates a new scanner.
+   * There is also java.io.Reader version of this constructor.
+   *
+   * @param   in  the java.io.Inputstream to read input from.
+   */
+  public MpdlNormalizerLexEN(java.io.InputStream in) {
+    // NOTE(review): InputStreamReader without an explicit charset decodes
+    // with the platform default encoding - confirm callers that pass bytes
+    // (e.g. UTF-8 text containing the long s) do not depend on that default.
+    this(new java.io.InputStreamReader(in));
+  }
+
+  /**
+   * Unpacks the compressed character translation table.
+   *
+   * @param packed   the packed character translation table
+   * @return         the unpacked character translation table
+   */
+  private static char [] zzUnpackCMap(String packed) {
+    char [] map = new char[0x10000];
+    int i = 0;  /* index in packed string  */
+    int j = 0;  /* index in unpacked array */
+    while (i < 14) {
+      int  count = packed.charAt(i++);
+      char value = packed.charAt(i++);
+      do map[j++] = value; while (--count > 0);
+    }
+    return map;
+  }
+
+
+  /**
+   * Refills the input buffer.
+   *
+   * @return      <code>false</code>, iff there was new input.
+   *
+   * @exception   java.io.IOException  if any I/O-Error occurs
+   */
+  private boolean zzRefill() throws java.io.IOException {
+    // Returns true when no further character could be obtained from the
+    // reader, false when at least one character was appended to zzBuffer.
+
+    /* first: make room (if you can) */
+    if (zzStartRead > 0) {
+      // slide the still-needed region [zzStartRead, zzEndRead) to index 0
+      System.arraycopy(zzBuffer, zzStartRead,
+                       zzBuffer, 0,
+                       zzEndRead-zzStartRead);
+
+      /* translate stored positions */
+      zzEndRead-= zzStartRead;
+      zzCurrentPos-= zzStartRead;
+      zzMarkedPos-= zzStartRead;
+      zzStartRead = 0;
+    }
+
+    /* is the buffer big enough? */
+    if (zzCurrentPos >= zzBuffer.length) {
+      /* if not: blow it up */
+      // new capacity is twice the current read position
+      char newBuffer[] = new char[zzCurrentPos*2];
+      System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length);
+      zzBuffer = newBuffer;
+    }
+
+    /* finally: fill the buffer with new input */
+    int numRead = zzReader.read(zzBuffer, zzEndRead,
+                                            zzBuffer.length-zzEndRead);
+
+    if (numRead > 0) {
+      zzEndRead+= numRead;
+      return false;
+    }
+    // unlikely but not impossible: read 0 characters, but not at end of stream
+    if (numRead == 0) {
+      // a single-character read tells "nothing yet" apart from end of stream
+      int c = zzReader.read();
+      if (c == -1) {
+        return true;
+      } else {
+        zzBuffer[zzEndRead++] = (char) c;
+        return false;
+      }
+    }
+
+	// numRead < 0
+    return true;
+  }
+
+
+  /**
+   * Closes the input stream.
+   */
+  public final void yyclose() throws java.io.IOException {
+    // drop whatever is still buffered and mark the scanner as finished
+    zzEndRead = zzStartRead;
+    zzAtEOF = true;
+
+    if (zzReader == null) {
+      return;
+    }
+    zzReader.close();
+  }
+
+
+  /**
+   * Resets the scanner to read from a new input stream.
+   * Does not close the old reader.
+   *
+   * All internal variables are reset, the old input stream
+   * <b>cannot</b> be reused (internal buffer is discarded and lost).
+   * Lexical state is set to <tt>YYINITIAL</tt>.
+   *
+   * @param reader   the new input stream
+   */
+  public final void yyreset(java.io.Reader reader) {
+    zzReader = reader;
+
+    // scanner flags
+    zzAtBOL = true;
+    zzAtEOF = false;
+    zzEOFDone = false;
+
+    // buffer positions: the old buffer content is abandoned
+    zzStartRead = 0;
+    zzEndRead = 0;
+    zzMarkedPos = 0;
+    zzCurrentPos = 0;
+
+    // position counters
+    yycolumn = 0;
+    yychar = 0;
+    yyline = 0;
+
+    zzLexicalState = YYINITIAL;
+    // NOTE(review): the user-code accumulators (original, normalized,
+    // problem) are not cleared here - confirm that a scanner is never
+    // reused for a second word after yyreset().
+  }
+
+
+  /**
+   * Returns the current lexical state.
+   */
+  public final int yystate() {
+    return zzLexicalState;
+  }
+
+
+  /**
+   * Enters a new lexical state
+   *
+   * @param newState the new lexical state
+   */
+  public final void yybegin(int newState) {
+    zzLexicalState = newState;
+  }
+
+
+  /**
+   * Returns the text matched by the current regular expression.
+   */
+  public final String yytext() {
+    return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead );
+  }
+
+
+  /**
+   * Returns the character at position <tt>pos</tt> from the
+   * matched text.
+   *
+   * It is equivalent to yytext().charAt(pos), but faster
+   *
+   * @param pos the position of the character to fetch.
+   *            A value from 0 to yylength()-1.
+   *
+   * @return the character at position pos
+   */
+  public final char yycharat(int pos) {
+    return zzBuffer[zzStartRead+pos];
+  }
+
+
+  /**
+   * Returns the length of the matched text region.
+   */
+  public final int yylength() {
+    return zzMarkedPos-zzStartRead;
+  }
+
+
+  /**
+   * Reports an error that occurred while scanning.
+   *
+   * In a wellformed scanner (no or only correct usage of
+   * yypushback(int) and a match-all fallback rule) this method
+   * will only be called with things that "Can't Possibly Happen".
+   * If this method is called, something is seriously wrong
+   * (e.g. a JFlex bug producing a faulty scanner etc.).
+   *
+   * Usual syntax/scanner level error handling should be done
+   * in error fallback rules.
+   *
+   * @param   errorCode  the code of the errormessage to display
+   */
+  private void zzScanError(int errorCode) {
+    String message;
+    try {
+      message = ZZ_ERROR_MSG[errorCode];
+    }
+    catch (ArrayIndexOutOfBoundsException e) {
+      message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR];
+    }
+
+    throw new Error(message);
+  }
+
+
+  /**
+   * Pushes the specified amount of characters back into the input stream.
+   *
+   * They will be read again by the next call of the scanning method
+   *
+   * @param number  the number of characters to be read again.
+   *                This number must not be greater than yylength()!
+   */
+  public void yypushback(int number)  {
+    // length of the current match, i.e. the most that can be given back
+    final int matched = zzMarkedPos - zzStartRead;
+    if (number > matched) {
+      zzScanError(ZZ_PUSHBACK_2BIG);  // always throws
+    }
+
+    zzMarkedPos = zzMarkedPos - number;
+  }
+
+
+  /**
+   * Resumes scanning until the next regular expression is matched,
+   * the end of input is encountered or an I/O-Error occurs.
+   *
+   * @return      the next token
+   * @exception   java.io.IOException  if any I/O-Error occurs
+   */
+  public java.lang.String yylex() throws java.io.IOException {
+    // Runs the DFA one token at a time. Most actions only accumulate text
+    // via add() and loop; only the two end-of-word actions (cases 3 and 5)
+    // return a value. null is returned at end of input.
+    int zzInput;
+    int zzAction;
+
+    // cached fields:
+    int zzCurrentPosL;
+    int zzMarkedPosL;
+    int zzEndReadL = zzEndRead;
+    char [] zzBufferL = zzBuffer;
+    char [] zzCMapL = ZZ_CMAP;
+
+    int [] zzTransL = ZZ_TRANS;
+    int [] zzRowMapL = ZZ_ROWMAP;
+    int [] zzAttrL = ZZ_ATTRIBUTE;
+
+    while (true) {
+      zzMarkedPosL = zzMarkedPos;
+
+      // -1 = no accepting state reached yet for this token
+      zzAction = -1;
+
+      // the new token starts where the previous match ended
+      zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL;
+
+      // DFA start state for the lexical state chosen with yybegin()
+      zzState = ZZ_LEXSTATE[zzLexicalState];
+
+
+      zzForAction: {
+        while (true) {
+
+          if (zzCurrentPosL < zzEndReadL)
+            zzInput = zzBufferL[zzCurrentPosL++];
+          else if (zzAtEOF) {
+            zzInput = YYEOF;
+            break zzForAction;
+          }
+          else {
+            // store back cached positions
+            zzCurrentPos  = zzCurrentPosL;
+            zzMarkedPos   = zzMarkedPosL;
+            boolean eof = zzRefill();
+            // get translated positions and possibly new buffer
+            zzCurrentPosL  = zzCurrentPos;
+            zzMarkedPosL   = zzMarkedPos;
+            zzBufferL      = zzBuffer;
+            zzEndReadL     = zzEndRead;
+            if (eof) {
+              zzInput = YYEOF;
+              break zzForAction;
+            }
+            else {
+              zzInput = zzBufferL[zzCurrentPosL++];
+            }
+          }
+          // row of the current state + character class of the input char
+          int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ];
+          if (zzNext == -1) break zzForAction;
+          zzState = zzNext;
+
+          int zzAttributes = zzAttrL[zzState];
+          // bit 1: accepting state - remember it and the position after it
+          if ( (zzAttributes & 1) == 1 ) {
+            zzAction = zzState;
+            zzMarkedPosL = zzCurrentPosL;
+            // bit 8: stop scanning for a longer match right away
+            if ( (zzAttributes & 8) == 8 ) break zzForAction;
+          }
+
+        }
+      }
+
+      // store back cached position
+      zzMarkedPos = zzMarkedPosL;
+
+      // Each action block without a return falls through into the following
+      // "case N: break;", which leaves the switch and scans the next token.
+      switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
+        // end of word, action text of the <DICT> {END} rule:
+        // flagged words yield the empty string
+        case 5:
+          { switch (problem) {
+			case 1: return "";
+			default: return normalized;
+		}
+          }
+        case 6: break;
+        // '@': flag the word as problematic, copy the character unchanged
+        case 2:
+          { problem = 1; add(yytext());
+          }
+        case 7: break;
+        // long s is replaced by a plain s
+        case 4:
+          { add("s");
+          }
+        case 8: break;
+        // end of word, action text of the <DISP, SEARCH> {END} rule:
+        // flagged words are returned untouched
+        case 3:
+          { switch (problem) {
+			case 1: return original;
+			default: return normalized;
+		}
+          }
+        case 9: break;
+        // any other character is copied unchanged
+        case 1:
+          { add(yytext());
+          }
+        case 10: break;
+        default:
+          // no rule matched: either clean end of input (nothing consumed),
+          // reported as null, or unmatched input, reported via zzScanError
+          if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
+            zzAtEOF = true;
+            return null;
+          }
+          else {
+            zzScanError(ZZ_NO_MATCH);
+          }
+      }
+    }
+  }
+
+
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexEN.lex	Tue Feb 22 16:03:45 2011 +0100
@@ -0,0 +1,83 @@
+/*
+ * Normalization rules for English text
+ * [this is a JFlex specification]
+ *
+ * Wolfgang Schmidle
+ * version 0.96
+ * 2011-02-21
+ *
+ */
+
+package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang;
+
+%%
+
+%public
+%class MpdlNormalizerLexEN
+%type java.lang.String
+%unicode
+
+// 1.5 English: en
+
+%states DISP, DICT, SEARCH
+
+%{
+	private String original = "";
+	private String normalized = "";
+	private int problem = 0;
+
+	// Records one scanned token: the matched text goes to 'original',
+	// its replacement goes to 'normalized'.
+	// NOTE(review): nothing in this spec clears the accumulators after
+	// {END} - confirm each scanner instance handles a single word.
+	private void add (String norm) {
+		original = original + yytext();
+		normalized = normalized + norm;
+	}
+%}
+
+END = \n
+
+%%
+
+<DISP, DICT, SEARCH> {
+
+// long s (U+017F) is written as a plain "s" in all three modes
+ſ { add("s"); }
+
+}
+
+
+// default
+
+@ { problem = 1; add(yytext()); }
+. { add(yytext()); }
+
+
+<DISP, SEARCH> {
+
+// end of word: a flagged word is handed back untouched,
+// any other word in its normalized form
+{END} {
+		if (problem == 1) {
+			return original;
+		}
+		return normalized;
+	}
+}
+
+<DICT> {
+
+// end of word: a flagged word yields the empty string,
+// any other word its normalized form
+{END} {
+		if (problem == 1) {
+			return "";
+		}
+		return normalized;
+	}
+}
+
+
+/*
+
+Assumptions:
+- the routine is called word by word, with a \n at the end of the string
+- words broken across lines have already been joined back together
+
+TO DO:
+
+EN: complete?
+
+*/
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexFR.java	Tue Feb 22 16:03:45 2011 +0100
@@ -0,0 +1,621 @@
+/* The following code was generated by JFlex 1.4.3 on 22.02.11 12:03 */
+
+/*
+ * Normalization rules for French text
+ * [this is a JFlex specification]
+ *
+ * Wolfgang Schmidle
+ * version 0.96
+ * 2011-02-21
+ *
+ */
+
+package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang;
+
+
+/**
+ * This class is a scanner generated by
+ * <a href="http://www.jflex.de/">JFlex</a> 1.4.3
+ * on 22.02.11 12:03 from the specification file
+ * <tt>MpdlNormalizerLexFR.lex</tt>
+ */
+public class MpdlNormalizerLexFR {
+
+  /** This character denotes the end of file */
+  public static final int YYEOF = -1;
+
+  /** initial size of the lookahead buffer */
+  private static final int ZZ_BUFFERSIZE = 16384;
+
+  /** lexical states */
+  public static final int SEARCH = 6;
+  public static final int DICT = 4;
+  public static final int YYINITIAL = 0;
+  public static final int CELEX = 8;
+  public static final int DISP = 2;
+
+  /**
+   * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l
+   * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l
+   *                  at the beginning of a line
+   * l is of the form l = 2*k, k a non negative integer
+   */
+  private static final int ZZ_LEXSTATE[] = {
+     0,  0,  1,  1,  2,  2,  1,  1,  3, 3
+  };
+
+  /**
+   * Translates characters to character classes
+   */
+  private static final String ZZ_CMAP_PACKED =
+    "\12\0\1\1\65\0\1\15\32\2\6\0\32\2\144\0\1\4\3\7"+
+    "\3\0\1\5\1\0\3\10\1\0\3\11\3\0\3\12\4\0\3\13"+
+    "\126\0\2\6\53\0\1\3\u1e99\0\1\14\udfe6\0";
+
+  /**
+   * Translates characters to character classes
+   */
+  private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED);
+
+  /**
+   * Translates DFA states to action switch labels.
+   */
+  private static final int [] ZZ_ACTION = zzUnpackAction();
+
+  private static final String ZZ_ACTION_PACKED_0 =
+    "\4\0\1\1\1\2\1\3\1\4\1\5\1\6\1\7"+
+    "\1\10\1\11\1\12\1\13\1\14\1\15\1\16";
+
+  private static int [] zzUnpackAction() {
+    int [] result = new int[18];
+    int offset = 0;
+    offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
+    return result;
+  }
+
+  private static int zzUnpackAction(String packed, int offset, int [] result) {
+    int i = 0;       /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {
+      int count = packed.charAt(i++);
+      int value = packed.charAt(i++);
+      do result[j++] = value; while (--count > 0);
+    }
+    return j;
+  }
+
+
+  /**
+   * Translates a state to a row index in the transition table
+   */
+  private static final int [] ZZ_ROWMAP = zzUnpackRowMap();
+
+  private static final String ZZ_ROWMAP_PACKED_0 =
+    "\0\0\0\16\0\34\0\52\0\70\0\70\0\70\0\70"+
+    "\0\70\0\70\0\70\0\70\0\70\0\70\0\70\0\70"+
+    "\0\70\0\70";
+
+  private static int [] zzUnpackRowMap() {
+    int [] result = new int[18];
+    int offset = 0;
+    offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
+    return result;
+  }
+
+  private static int zzUnpackRowMap(String packed, int offset, int [] result) {
+    int i = 0;  /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {
+      int high = packed.charAt(i++) << 16;
+      result[j++] = high | packed.charAt(i++);
+    }
+    return j;
+  }
+
+  /**
+   * The transition table of the DFA
+   */
+  private static final int [] ZZ_TRANS = zzUnpackTrans();
+
+  private static final String ZZ_TRANS_PACKED_0 =
+    "\1\5\1\0\13\5\1\6\1\5\1\7\1\5\1\10"+
+    "\1\11\1\12\7\5\1\6\1\5\1\13\1\5\1\10"+
+    "\1\11\1\12\7\5\2\6\1\13\1\5\1\10\1\11"+
+    "\1\12\1\14\1\15\1\16\1\17\1\20\1\21\1\22"+
+    "\1\6\16\0";
+
+  private static int [] zzUnpackTrans() {
+    int [] result = new int[70];
+    int offset = 0;
+    offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
+    return result;
+  }
+
+  private static int zzUnpackTrans(String packed, int offset, int [] result) {
+    int i = 0;       /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {
+      int count = packed.charAt(i++);
+      int value = packed.charAt(i++);
+      value--;
+      do result[j++] = value; while (--count > 0);
+    }
+    return j;
+  }
+
+
+  /* error codes */
+  private static final int ZZ_UNKNOWN_ERROR = 0;
+  private static final int ZZ_NO_MATCH = 1;
+  private static final int ZZ_PUSHBACK_2BIG = 2;
+
+  /* error messages for the codes above */
+  private static final String ZZ_ERROR_MSG[] = {
+    "Unkown internal scanner error",
+    "Error: could not match input",
+    "Error: pushback value was too large"
+  };
+
+  /**
+   * ZZ_ATTRIBUTE[aState] contains the attributes of state <code>aState</code>
+   */
+  private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
+
+  private static final String ZZ_ATTRIBUTE_PACKED_0 =
+    "\4\0\16\11";
+
+  private static int [] zzUnpackAttribute() {
+    int [] result = new int[18];
+    int offset = 0;
+    offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
+    return result;
+  }
+
+  private static int zzUnpackAttribute(String packed, int offset, int [] result) {
+    int i = 0;       /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {
+      int count = packed.charAt(i++);
+      int value = packed.charAt(i++);
+      do result[j++] = value; while (--count > 0);
+    }
+    return j;
+  }
+
+  /** the input device */
+  private java.io.Reader zzReader;
+
+  /** the current state of the DFA */
+  private int zzState;
+
+  /** the current lexical state */
+  private int zzLexicalState = YYINITIAL;
+
+  /** this buffer contains the current text to be matched and is
+      the source of the yytext() string */
+  private char zzBuffer[] = new char[ZZ_BUFFERSIZE];
+
+  /** the textposition at the last accepting state */
+  private int zzMarkedPos;
+
+  /** the current text position in the buffer */
+  private int zzCurrentPos;
+
+  /** startRead marks the beginning of the yytext() string in the buffer */
+  private int zzStartRead;
+
+  /** endRead marks the last character in the buffer, that has been read
+      from input */
+  private int zzEndRead;
+
+  /** number of newlines encountered up to the start of the matched text */
+  private int yyline;
+
+  /** the number of characters up to the start of the matched text */
+  private int yychar;
+
+  /**
+   * the number of characters from the last newline up to the start of the
+   * matched text
+   */
+  private int yycolumn;
+
+  /**
+   * zzAtBOL == true <=> the scanner is currently at the beginning of a line
+   */
+  private boolean zzAtBOL = true;
+
+  /** zzAtEOF == true <=> the scanner is at the EOF */
+  private boolean zzAtEOF;
+
+  /** denotes if the user-EOF-code has already been executed */
+  private boolean zzEOFDone;
+
+  /* user code: */
+	private String original = "";
+	private String normalized = "";
+	private int problem = 0;
+
+	/**
+	 * Records one scanned token: the matched text is appended to
+	 * {@code original}, its replacement to {@code normalized}.
+	 *
+	 * @param norm replacement text for the token just matched
+	 */
+	private void add (String norm) {
+		original = original + yytext();
+		normalized = normalized + norm;
+	}
+
+
+  /**
+   * Creates a new scanner
+   * There is also a java.io.InputStream version of this constructor.
+   *
+   * @param   in  the java.io.Reader to read input from.
+   */
+  public MpdlNormalizerLexFR(java.io.Reader in) {
+    this.zzReader = in;
+  }
+
+  /**
+   * Creates a new scanner.
+   * There is also java.io.Reader version of this constructor.
+   *
+   * @param   in  the java.io.Inputstream to read input from.
+   */
+  public MpdlNormalizerLexFR(java.io.InputStream in) {
+    // NOTE(review): InputStreamReader without an explicit charset decodes
+    // with the platform default encoding - confirm callers that pass bytes
+    // (e.g. UTF-8 text with accented letters) do not depend on that default.
+    this(new java.io.InputStreamReader(in));
+  }
+
+  /**
+   * Unpacks the compressed character translation table.
+   *
+   * @param packed   the packed character translation table
+   * @return         the unpacked character translation table
+   */
+  private static char [] zzUnpackCMap(String packed) {
+    char [] map = new char[0x10000];
+    int i = 0;  /* index in packed string  */
+    int j = 0;  /* index in unpacked array */
+    while (i < 54) {
+      int  count = packed.charAt(i++);
+      char value = packed.charAt(i++);
+      do map[j++] = value; while (--count > 0);
+    }
+    return map;
+  }
+
+
+  /**
+   * Refills the input buffer.
+   *
+   * @return      <code>false</code>, iff there was new input.
+   *
+   * @exception   java.io.IOException  if any I/O-Error occurs
+   */
+  private boolean zzRefill() throws java.io.IOException {
+    // Returns true when no further character could be obtained from the
+    // reader, false when at least one character was appended to zzBuffer.
+
+    /* first: make room (if you can) */
+    if (zzStartRead > 0) {
+      // slide the still-needed region [zzStartRead, zzEndRead) to index 0
+      System.arraycopy(zzBuffer, zzStartRead,
+                       zzBuffer, 0,
+                       zzEndRead-zzStartRead);
+
+      /* translate stored positions */
+      zzEndRead-= zzStartRead;
+      zzCurrentPos-= zzStartRead;
+      zzMarkedPos-= zzStartRead;
+      zzStartRead = 0;
+    }
+
+    /* is the buffer big enough? */
+    if (zzCurrentPos >= zzBuffer.length) {
+      /* if not: blow it up */
+      // new capacity is twice the current read position
+      char newBuffer[] = new char[zzCurrentPos*2];
+      System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length);
+      zzBuffer = newBuffer;
+    }
+
+    /* finally: fill the buffer with new input */
+    int numRead = zzReader.read(zzBuffer, zzEndRead,
+                                            zzBuffer.length-zzEndRead);
+
+    if (numRead > 0) {
+      zzEndRead+= numRead;
+      return false;
+    }
+    // unlikely but not impossible: read 0 characters, but not at end of stream
+    if (numRead == 0) {
+      // a single-character read tells "nothing yet" apart from end of stream
+      int c = zzReader.read();
+      if (c == -1) {
+        return true;
+      } else {
+        zzBuffer[zzEndRead++] = (char) c;
+        return false;
+      }
+    }
+
+	// numRead < 0
+    return true;
+  }
+
+
+  /**
+   * Closes the input stream.
+   */
+  public final void yyclose() throws java.io.IOException {
+    // drop whatever is still buffered and mark the scanner as finished
+    zzEndRead = zzStartRead;
+    zzAtEOF = true;
+
+    if (zzReader == null) {
+      return;
+    }
+    zzReader.close();
+  }
+
+
+  /**
+   * Resets the scanner to read from a new input stream.
+   * Does not close the old reader.
+   *
+   * All internal variables are reset, the old input stream
+   * <b>cannot</b> be reused (internal buffer is discarded and lost).
+   * Lexical state is set to <tt>YYINITIAL</tt>.
+   *
+   * @param reader   the new input stream
+   */
+  public final void yyreset(java.io.Reader reader) {
+    zzReader = reader;
+
+    // scanner flags
+    zzAtBOL = true;
+    zzAtEOF = false;
+    zzEOFDone = false;
+
+    // buffer positions: the old buffer content is abandoned
+    zzStartRead = 0;
+    zzEndRead = 0;
+    zzMarkedPos = 0;
+    zzCurrentPos = 0;
+
+    // position counters
+    yycolumn = 0;
+    yychar = 0;
+    yyline = 0;
+
+    zzLexicalState = YYINITIAL;
+    // NOTE(review): the user-code accumulators (original, normalized,
+    // problem) are not cleared here - confirm that a scanner is never
+    // reused for a second word after yyreset().
+  }
+
+
+  /**
+   * Returns the current lexical state.
+   */
+  public final int yystate() {
+    return zzLexicalState;
+  }
+
+
+  /**
+   * Enters a new lexical state
+   *
+   * @param newState the new lexical state
+   */
+  public final void yybegin(int newState) {
+    zzLexicalState = newState;
+  }
+
+
+  /**
+   * Returns the text matched by the current regular expression.
+   */
+  public final String yytext() {
+    return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead );
+  }
+
+
+  /**
+   * Returns the character at position <tt>pos</tt> from the
+   * matched text.
+   *
+   * It is equivalent to yytext().charAt(pos), but faster
+   *
+   * @param pos the position of the character to fetch.
+   *            A value from 0 to yylength()-1.
+   *
+   * @return the character at position pos
+   */
+  public final char yycharat(int pos) {
+    return zzBuffer[zzStartRead+pos];
+  }
+
+
+  /**
+   * Returns the length of the matched text region.
+   */
+  public final int yylength() {
+    return zzMarkedPos-zzStartRead;
+  }
+
+
+  /**
+   * Reports an error that occurred while scanning.
+   *
+   * In a wellformed scanner (no or only correct usage of
+   * yypushback(int) and a match-all fallback rule) this method
+   * will only be called with things that "Can't Possibly Happen".
+   * If this method is called, something is seriously wrong
+   * (e.g. a JFlex bug producing a faulty scanner etc.).
+   *
+   * Usual syntax/scanner level error handling should be done
+   * in error fallback rules.
+   *
+   * @param   errorCode  the code of the errormessage to display
+   */
+  private void zzScanError(int errorCode) {
+    String message;
+    try {
+      message = ZZ_ERROR_MSG[errorCode];
+    }
+    catch (ArrayIndexOutOfBoundsException e) {
+      message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR];
+    }
+
+    throw new Error(message);
+  }
+
+
+  /**
+   * Pushes the specified amount of characters back into the input stream.
+   *
+   * They will be read again by the next call of the scanning method
+   *
+   * @param number  the number of characters to be read again.
+   *                This number must not be greater than yylength()!
+   */
+  public void yypushback(int number)  {
+    // length of the current match, i.e. the most that can be given back
+    final int matched = zzMarkedPos - zzStartRead;
+    if (number > matched) {
+      zzScanError(ZZ_PUSHBACK_2BIG);  // always throws
+    }
+
+    zzMarkedPos = zzMarkedPos - number;
+  }
+
+
+  /**
+   * Resumes scanning until the next regular expression is matched,
+   * the end of input is encountered or an I/O-Error occurs.
+   *
+   * @return      the next token
+   * @exception   java.io.IOException  if any I/O-Error occurs
+   */
+  public java.lang.String yylex() throws java.io.IOException {
+    // Runs the DFA one token at a time. Most actions only accumulate text
+    // via add() and loop; only the two end-of-word actions (cases 3 and 7)
+    // return a value. null is returned at end of input.
+    int zzInput;
+    int zzAction;
+
+    // cached fields:
+    int zzCurrentPosL;
+    int zzMarkedPosL;
+    int zzEndReadL = zzEndRead;
+    char [] zzBufferL = zzBuffer;
+    char [] zzCMapL = ZZ_CMAP;
+
+    int [] zzTransL = ZZ_TRANS;
+    int [] zzRowMapL = ZZ_ROWMAP;
+    int [] zzAttrL = ZZ_ATTRIBUTE;
+
+    while (true) {
+      zzMarkedPosL = zzMarkedPos;
+
+      // -1 = no accepting state reached yet for this token
+      zzAction = -1;
+
+      // the new token starts where the previous match ended
+      zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL;
+
+      // DFA start state for the lexical state chosen with yybegin()
+      zzState = ZZ_LEXSTATE[zzLexicalState];
+
+
+      zzForAction: {
+        while (true) {
+
+          if (zzCurrentPosL < zzEndReadL)
+            zzInput = zzBufferL[zzCurrentPosL++];
+          else if (zzAtEOF) {
+            zzInput = YYEOF;
+            break zzForAction;
+          }
+          else {
+            // store back cached positions
+            zzCurrentPos  = zzCurrentPosL;
+            zzMarkedPos   = zzMarkedPosL;
+            boolean eof = zzRefill();
+            // get translated positions and possibly new buffer
+            zzCurrentPosL  = zzCurrentPos;
+            zzMarkedPosL   = zzMarkedPos;
+            zzBufferL      = zzBuffer;
+            zzEndReadL     = zzEndRead;
+            if (eof) {
+              zzInput = YYEOF;
+              break zzForAction;
+            }
+            else {
+              zzInput = zzBufferL[zzCurrentPosL++];
+            }
+          }
+          // row of the current state + character class of the input char
+          int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ];
+          if (zzNext == -1) break zzForAction;
+          zzState = zzNext;
+
+          int zzAttributes = zzAttrL[zzState];
+          // bit 1: accepting state - remember it and the position after it
+          if ( (zzAttributes & 1) == 1 ) {
+            zzAction = zzState;
+            zzMarkedPosL = zzCurrentPosL;
+            // bit 8: stop scanning for a longer match right away
+            if ( (zzAttributes & 8) == 8 ) break zzForAction;
+          }
+
+        }
+      }
+
+      // store back cached position
+      zzMarkedPos = zzMarkedPosL;
+
+      // Each action block without a return falls through into the following
+      // "case N: break;", which leaves the switch and scans the next token.
+      switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
+        // flag the word as problematic, copy the character unchanged
+        // (action text of the '@' rule and of the CELEX catch-all rule)
+        case 2:
+          { problem = 1; add(yytext());
+          }
+        case 15: break;
+        // ae ligature is expanded
+        case 6:
+          { add("ae");
+          }
+        case 16: break;
+        // long s is replaced by a plain s
+        case 4:
+          { add("s");
+          }
+        case 17: break;
+        // CELEX: accented o loses its accent
+        case 12:
+          { add("o");
+          }
+        case 18: break;
+        // end of word, action text of the <DISP, SEARCH> {END} rule:
+        // flagged words are returned untouched
+        case 3:
+          { switch (problem) {
+			case 1: return original;
+			default: return normalized;
+		}
+          }
+        case 19: break;
+        // CELEX: accented u loses its accent
+        case 13:
+          { add("u");
+          }
+        case 20: break;
+        // character is copied unchanged
+        case 1:
+          { add(yytext());
+          }
+        case 21: break;
+        // CELEX: accented i loses its accent
+        case 11:
+          { add("i");
+          }
+        case 22: break;
+        // CELEX: the typographic apostrophe is dropped from the normalized form
+        case 14:
+          { add("");
+          }
+        case 23: break;
+        // CELEX: accented e loses its accent
+        case 10:
+          { add("e");
+          }
+        case 24: break;
+        // CELEX: accented a loses its accent
+        case 9:
+          { add("a");
+          }
+        case 25: break;
+        // sharp s is expanded
+        case 5:
+          { add("ss");
+          }
+        case 26: break;
+        // CELEX: oe ligature (either case) is expanded to lowercase "oe"
+        case 8:
+          { add("oe");
+          }
+        case 27: break;
+        // end of word, action text of the <DICT, CELEX> {END} rule:
+        // flagged words yield the empty string
+        case 7:
+          { switch (problem) {
+			case 1: return "";
+			default: return normalized;
+		}
+          }
+        case 28: break;
+        default:
+          // no rule matched: either clean end of input (nothing consumed),
+          // reported as null, or unmatched input, reported via zzScanError
+          if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
+            zzAtEOF = true;
+            return null;
+          }
+          else {
+            zzScanError(ZZ_NO_MATCH);
+          }
+      }
+    }
+  }
+
+
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexFR.lex	Tue Feb 22 16:03:45 2011 +0100
@@ -0,0 +1,104 @@
+/*
+ * Normalization rules for French text
+ * [this is a JFlex specification]
+ *
+ * Wolfgang Schmidle
+ * version 0.96
+ * 2011-02-21
+ *
+ */
+
+package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang;
+
+%%
+
+%public
+%class MpdlNormalizerLexFR
+%type java.lang.String
+%unicode
+
+// French: fr
+
+%states DISP, DICT, SEARCH
+%state CELEX
+// The code between %{ and %} is user code for the generated scanner class.
+%{
+	private String original = "";    // concatenation of all matched input so far
+	private String normalized = "";  // concatenation of the replacements for that input
+	private int problem = 0;         // set to 1 by the rules that flag a character
+	// Appends the current match to 'original' and its replacement 'norm' to 'normalized'.
+	private void add (String norm) {
+		original += yytext();
+		normalized += norm;
+	}
+%}
+// END terminates the input; the {END} rules at the bottom return the result.
+END = \n
+
+Alphabet = [abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ]
+
+%%
+// Long s, sharp s and the ae ligature are expanded in all four listed states.
+<DISP, DICT, SEARCH, CELEX> {
+
+ſ { add("s"); }
+ß { add("ss"); }
+æ { add("ae"); }
+
+}
+// CELEX only: accents are stripped, the apostrophe is dropped, plain letters pass through.
+<CELEX> {
+
+[œŒ] { add("oe"); }
+[áàâ] { add("a"); }
+[éèê] { add("e"); }
+[íìî] { add("i"); }
+[óòô] { add("o"); }
+[úùû] { add("u"); }
+’ { add(""); }
+
+{Alphabet} { add(yytext()); }
+
+. { problem = 1; add(yytext()); } // in particular "@"
+
+}
+
+// default: "@" flags a problem, any other character is copied unchanged
+
+@ { problem = 1; add(yytext()); }
+. { add(yytext()); }
+
+// DISP/SEARCH: when a problem was flagged return the unmodified input, otherwise the normalized text.
+<DISP, SEARCH> {
+
+{END} {
+		switch (problem) {
+			case 1: return original;
+			default: return normalized;
+		}
+	}
+}
+// DICT/CELEX: when a problem was flagged return the empty string, otherwise the normalized text.
+<DICT, CELEX> {
+
+{END} {
+		switch (problem) {
+			case 1: return "";
+			default: return normalized;
+		}
+	}
+}
+
+
+
+/*
+
+Assumptions:
+- the routine is called word by word, with a \n at the end of the string
+- words broken across line breaks have already been rejoined
+
+TO DO:
+
+FR: correct? complete?
+
+*/
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexIT.java	Tue Feb 22 16:03:45 2011 +0100
@@ -0,0 +1,874 @@
+/* The following code was generated by JFlex 1.4.3 on 22.02.11 12:03 */
+
+/*
+ * Normalization rules for Italian text
+ * [this is a JFlex specification]
+ *
+ * Wolfgang Schmidle
+ * version 0.96
+ * 2011-02-21
+ *
+ */
+
+package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang;
+
+
+/**
+ * This class is a scanner generated by
+ * <a href="http://www.jflex.de/">JFlex</a> 1.4.3
+ * on 22.02.11 12:03 from the specification file
+ * <tt>MpdlNormalizerLexIT.lex</tt>
+ */
+public class MpdlNormalizerLexIT {
+
+  /** This character denotes the end of file */
+  public static final int YYEOF = -1;
+
+  /** initial size of the lookahead buffer */
+  private static final int ZZ_BUFFERSIZE = 16384;
+
+  /** lexical states */
+  public static final int SEARCH = 6;
+  public static final int DICT = 4;
+  public static final int YYINITIAL = 0;
+  public static final int DISP = 2;
+
+  /**
+   * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l
+   * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l
+   *                  at the beginning of a line
+   * l is of the form l = 2*k, k a non negative integer
+   */
+  private static final int ZZ_LEXSTATE[] = {
+     0,  0,  1,  2,  3,  4,  5, 6
+  };
+
+  /**
+   * Translates characters to character classes
+   */
+  private static final String ZZ_CMAP_PACKED =
+    "\12\0\1\5\42\0\1\4\22\0\1\51\1\1\3\2\1\1\3\2"+
+    "\1\40\1\0\1\2\1\3\2\2\1\41\1\2\1\47\1\3\1\2"+
+    "\1\37\1\44\1\50\2\2\1\0\1\2\6\0\1\43\3\2\1\11"+
+    "\2\2\1\42\1\6\1\35\1\2\1\3\1\2\1\7\1\36\1\13"+
+    "\1\45\1\12\1\2\1\10\1\15\1\46\2\2\1\0\1\2\62\0"+
+    "\1\4\22\0\1\16\5\0\1\32\1\0\1\17\3\0\1\20\5\0"+
+    "\1\21\6\0\1\22\5\0\1\30\1\23\5\0\1\31\1\0\1\24"+
+    "\3\0\1\25\5\0\1\26\6\0\1\27\37\0\1\1\70\0\1\34"+
+    "\1\33\53\0\1\14\ufe80\0";
+
+  /**
+   * Translates characters to character classes
+   */
+  private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED);
+
+  /**
+   * Translates DFA states to action switch labels.
+   */
+  private static final int [] ZZ_ACTION = zzUnpackAction();
+
+  private static final String ZZ_ACTION_PACKED_0 =
+    "\11\0\1\1\1\2\2\3\1\4\1\5\1\2\1\3"+
+    "\1\6\1\2\1\7\1\10\1\11\1\12\1\13\5\3"+
+    "\1\14\1\2\1\3\1\6\1\2\1\15\1\16\1\17"+
+    "\1\20\1\21\1\22\1\23\1\24\1\25\1\26\1\27"+
+    "\1\30\4\0\1\31\1\32\1\0\1\33\1\0\1\34"+
+    "\1\35\1\0\1\36\1\37\1\40\4\0\1\41\5\0"+
+    "\1\42\1\43\2\0\1\44\1\0\1\45\5\0\1\44"+
+    "\1\46\3\0\1\47";
+
+  private static int [] zzUnpackAction() {
+    int [] result = new int[89];
+    int offset = 0;
+    offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
+    return result;
+  }
+
+  private static int zzUnpackAction(String packed, int offset, int [] result) {
+    int i = 0;       /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {
+      int count = packed.charAt(i++);
+      int value = packed.charAt(i++);
+      do result[j++] = value; while (--count > 0);
+    }
+    return j;
+  }
+
+
+  /**
+   * Translates a state to a row index in the transition table
+   */
+  private static final int [] ZZ_ROWMAP = zzUnpackRowMap();
+
+  private static final String ZZ_ROWMAP_PACKED_0 =
+    "\0\0\0\52\0\124\0\176\0\250\0\322\0\374\0\u0126"+
+    "\0\u0150\0\0\0\0\0\0\0\u017a\0\0\0\0\0\u01a4"+
+    "\0\u01ce\0\0\0\u01f8\0\0\0\0\0\0\0\0\0\0"+
+    "\0\u0222\0\u024c\0\u0276\0\u02a0\0\u02ca\0\0\0\u02f4\0\u031e"+
+    "\0\u0348\0\u0372\0\u039c\0\0\0\0\0\0\0\0\0\0"+
+    "\0\0\0\0\0\0\0\0\0\0\0\0\0\u03c6\0\u03f0"+
+    "\0\u041a\0\0\0\0\0\0\0\u0444\0\0\0\u046e\0\0"+
+    "\0\0\0\u0498\0\0\0\0\0\0\0\u04c2\0\u04ec\0\u0516"+
+    "\0\u0540\0\0\0\u056a\0\u0594\0\u05be\0\u05e8\0\u0612\0\0"+
+    "\0\0\0\u063c\0\u031e\0\u0666\0\u0690\0\0\0\u06ba\0\u06e4"+
+    "\0\u070e\0\0\0\u0738\0\0\0\0\0\u0762\0\u078c\0\u07b6"+
+    "\0\0";
+
+  private static int [] zzUnpackRowMap() {
+    int [] result = new int[89];
+    int offset = 0;
+    offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
+    return result;
+  }
+
+  private static int zzUnpackRowMap(String packed, int offset, int [] result) {
+    int i = 0;  /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {
+      int high = packed.charAt(i++) << 16;
+      result[j++] = high | packed.charAt(i++);
+    }
+    return j;
+  }
+
+  /**
+   * The transition table of the DFA
+   */
+  private static final int [] ZZ_TRANS = zzUnpackTrans();
+
+  private static final String ZZ_TRANS_PACKED_0 =
+    "\52\0\1\12\1\13\1\14\1\15\1\16\1\17\1\20"+
+    "\1\14\1\21\1\13\1\15\1\14\1\22\1\23\5\12"+
+    "\2\13\1\12\2\13\1\24\1\25\1\26\1\27\1\30"+
+    "\1\12\1\13\1\31\2\13\1\14\1\13\1\23\1\32"+
+    "\1\33\1\34\1\35\1\36\1\12\1\13\1\14\1\15"+
+    "\1\16\1\17\1\37\1\14\1\21\1\13\1\15\1\40"+
+    "\1\41\1\42\5\12\2\13\1\12\2\13\1\24\1\25"+
+    "\1\26\1\27\1\30\1\12\1\13\1\31\2\13\1\43"+
+    "\1\13\1\42\1\32\1\33\1\34\1\35\1\36\1\12"+
+    "\1\13\1\14\1\15\1\16\1\44\1\20\1\14\1\21"+
+    "\1\13\1\15\1\14\1\22\1\23\1\45\1\46\1\47"+
+    "\1\50\1\51\1\52\1\53\1\54\1\55\1\56\1\24"+
+    "\1\25\1\26\1\27\1\30\1\12\1\13\1\31\2\13"+
+    "\1\14\1\13\1\23\1\32\1\33\1\34\1\35\1\36"+
+    "\1\12\1\13\1\14\1\15\1\16\1\44\1\37\1\14"+
+    "\1\21\1\13\1\15\1\40\1\41\1\42\1\45\1\46"+
+    "\1\47\1\50\1\51\1\52\1\53\1\54\1\55\1\56"+
+    "\1\24\1\25\1\26\1\27\1\30\1\12\1\13\1\31"+
+    "\2\13\1\43\1\13\1\42\1\32\1\33\1\34\1\35"+
+    "\1\36\1\12\1\13\1\14\1\15\1\16\1\17\1\20"+
+    "\1\14\1\21\1\13\1\15\1\14\1\22\1\23\1\45"+
+    "\1\46\1\47\1\50\1\51\1\52\1\53\1\54\1\55"+
+    "\1\56\1\24\1\25\1\26\1\27\1\30\1\12\1\13"+
+    "\1\31\2\13\1\14\1\13\1\23\1\32\1\33\1\34"+
+    "\1\35\1\36\1\12\1\13\1\14\1\15\1\16\1\17"+
+    "\1\37\1\14\1\21\1\13\1\15\1\40\1\41\1\42"+
+    "\1\45\1\46\1\47\1\50\1\51\1\52\1\53\1\54"+
+    "\1\55\1\56\1\24\1\25\1\26\1\27\1\30\1\12"+
+    "\1\13\1\31\2\13\1\43\1\13\1\42\1\32\1\33"+
+    "\1\34\1\35\1\36\6\0\1\57\4\0\1\60\1\61"+
+    "\41\0\1\62\113\0\1\63\1\0\1\63\36\0\1\64"+
+    "\22\0\1\65\44\0\1\66\4\0\1\66\2\0\1\66"+
+    "\3\0\1\66\5\0\2\66\1\0\2\66\1\0\3\66"+
+    "\2\0\1\66\1\0\2\66\1\0\2\66\45\0\1\67"+
+    "\57\0\1\70\5\0\2\71\1\72\2\0\2\71\1\0"+
+    "\3\71\13\0\1\71\6\0\1\71\2\0\1\71\2\0"+
+    "\4\71\47\0\1\73\1\0\1\74\3\0\2\75\1\76"+
+    "\2\0\2\75\1\0\3\75\13\0\1\75\6\0\1\75"+
+    "\2\0\1\75\2\0\4\75\10\0\1\77\25\0\1\64"+
+    "\25\0\1\100\51\0\1\100\3\0\1\101\35\0\1\102"+
+    "\4\0\1\102\2\0\1\102\3\0\1\102\5\0\2\102"+
+    "\1\0\2\102\1\0\3\102\2\0\1\102\1\0\2\102"+
+    "\1\0\2\102\43\0\1\103\4\0\1\104\15\0\1\105"+
+    "\53\0\1\106\51\0\1\106\3\0\1\107\72\0\1\110"+
+    "\54\0\1\111\12\0\2\71\3\0\2\71\1\0\3\71"+
+    "\13\0\1\71\6\0\1\71\2\0\1\71\2\0\4\71"+
+    "\3\0\2\75\3\0\2\75\1\0\3\75\13\0\1\75"+
+    "\6\0\1\75\2\0\1\75\2\0\4\75\5\0\1\112"+
+    "\3\0\1\113\53\0\1\114\43\0\1\115\6\0\1\113"+
+    "\43\0\1\116\51\0\1\116\1\117\1\120\46\0\1\121"+
+    "\3\0\1\60\53\0\1\122\43\0\1\123\6\0\1\60"+
+    "\46\0\1\113\45\0\1\124\60\0\1\113\43\0\1\125"+
+    "\50\0\1\126\2\0\1\127\52\0\1\60\54\0\1\60"+
+    "\45\0\1\127\100\0\1\130\20\0\1\131\44\0";
+
+  private static int [] zzUnpackTrans() {
+    int [] result = new int[2016];
+    int offset = 0;
+    offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
+    return result;
+  }
+
+  private static int zzUnpackTrans(String packed, int offset, int [] result) {
+    int i = 0;       /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {
+      int count = packed.charAt(i++);
+      int value = packed.charAt(i++);
+      value--;
+      do result[j++] = value; while (--count > 0);
+    }
+    return j;
+  }
+
+
+  /* error codes */
+  private static final int ZZ_UNKNOWN_ERROR = 0;
+  private static final int ZZ_NO_MATCH = 1;
+  private static final int ZZ_PUSHBACK_2BIG = 2;
+
+  /* error messages for the codes above */
+  private static final String ZZ_ERROR_MSG[] = {
+    "Unkown internal scanner error",
+    "Error: could not match input",
+    "Error: pushback value was too large"
+  };
+
+  /**
+   * ZZ_ATTRIBUTE[aState] contains the attributes of state <code>aState</code>
+   */
+  private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
+
+  private static final String ZZ_ATTRIBUTE_PACKED_0 =
+    "\1\10\7\0\1\1\3\11\1\1\2\11\2\1\1\11"+
+    "\1\1\5\11\5\1\1\11\5\1\13\11\3\0\3\11"+
+    "\1\0\1\11\1\0\2\11\1\0\3\11\4\0\1\11"+
+    "\5\0\2\11\2\0\1\1\1\0\1\11\3\0\1\11"+
+    "\1\0\2\11\3\0\1\11";
+
+  private static int [] zzUnpackAttribute() {
+    int [] result = new int[89];
+    int offset = 0;
+    offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
+    return result;
+  }
+
+  private static int zzUnpackAttribute(String packed, int offset, int [] result) {
+    int i = 0;       /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {
+      int count = packed.charAt(i++);
+      int value = packed.charAt(i++);
+      do result[j++] = value; while (--count > 0);
+    }
+    return j;
+  }
+
+  /** the input device */
+  private java.io.Reader zzReader;
+
+  /** the current state of the DFA */
+  private int zzState;
+
+  /** the current lexical state */
+  private int zzLexicalState = YYINITIAL;
+
+  /** this buffer contains the current text to be matched and is
+      the source of the yytext() string */
+  private char zzBuffer[] = new char[ZZ_BUFFERSIZE];
+
+  /** the textposition at the last accepting state */
+  private int zzMarkedPos;
+
+  /** the current text position in the buffer */
+  private int zzCurrentPos;
+
+  /** startRead marks the beginning of the yytext() string in the buffer */
+  private int zzStartRead;
+
+  /** endRead marks the last character in the buffer, that has been read
+      from input */
+  private int zzEndRead;
+
+  /** number of newlines encountered up to the start of the matched text */
+  private int yyline;
+
+  /** the number of characters up to the start of the matched text */
+  private int yychar;
+
+  /**
+   * the number of characters from the last newline up to the start of the
+   * matched text
+   */
+  private int yycolumn;
+
+  /**
+   * zzAtBOL == true <=> the scanner is currently at the beginning of a line
+   */
+  private boolean zzAtBOL = true;
+
+  /** zzAtEOF == true <=> the scanner is at the EOF */
+  private boolean zzAtEOF;
+
+  /** denotes if the user-EOF-code has already been executed */
+  private boolean zzEOFDone;
+
+  /** For the backwards DFA of general lookahead statements */
+  private boolean [] zzFin = new boolean [ZZ_BUFFERSIZE+1];
+
+  /* user code: */
+	private static final int CONS = 1;
+	private static final int VOWEL = 2;
+	private int cv = 0;  // consonant = 1, vowel = 2, everything else = 0
+
+	private String original = "";
+	private String normalized = "";
+	private int problem = 0;
+
+	private void add (String norm) {
+		original = original + yytext();    // the text matched by the current rule
+		normalized = normalized + norm;    // the replacement chosen for that text
+	}
+
+
+  /**
+   * Creates a new scanner
+   * There is also a java.io.InputStream version of this constructor.
+   *
+   * @param   in  the java.io.Reader to read input from.
+   */
+  public MpdlNormalizerLexIT(java.io.Reader in) {
+    zzReader = in;  // the scanner reads all of its input from this reader
+  }
+
+  /**
+   * Creates a new scanner.
+   * There is also a java.io.Reader version of this constructor.
+   *
+   * @param   in  the java.io.InputStream to read input from.
+   */
+  public MpdlNormalizerLexIT(java.io.InputStream in) {
+    this(new java.io.InputStreamReader(in));  // NOTE(review): no charset given, so bytes are decoded with the platform default charset
+  }
+
+  /**
+   * Unpacks the compressed character translation table.
+   *
+   * @param packed   the packed character translation table
+   * @return         the unpacked character translation table
+   */
+  private static char [] zzUnpackCMap(String packed) {
+    char [] map = new char[0x10000];
+    int i = 0;  /* index in packed string  */
+    int j = 0;  /* index in unpacked array */
+    while (i < 168) {
+      int  count = packed.charAt(i++);
+      char value = packed.charAt(i++);
+      do map[j++] = value; while (--count > 0);
+    }
+    return map;
+  }
+
+
+  /**
+   * Refills the input buffer.
+   *
+   * @return      <code>false</code>, iff there was new input.
+   *
+   * @exception   java.io.IOException  if any I/O-Error occurs
+   */
+  private boolean zzRefill() throws java.io.IOException {
+
+    /* first: make room by moving the unread part (from zzStartRead on) to the buffer start */
+    if (zzStartRead > 0) {
+      System.arraycopy(zzBuffer, zzStartRead,
+                       zzBuffer, 0,
+                       zzEndRead-zzStartRead);
+
+      /* shift every stored position by the same amount */
+      zzEndRead-= zzStartRead;
+      zzCurrentPos-= zzStartRead;
+      zzMarkedPos-= zzStartRead;
+      zzStartRead = 0;
+    }
+
+    /* is the buffer big enough? */
+    if (zzCurrentPos >= zzBuffer.length) {
+      /* if not: allocate a buffer of twice the current position and copy the old content */
+      char newBuffer[] = new char[zzCurrentPos*2];
+      System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length);
+      zzBuffer = newBuffer;
+    }
+
+    /* finally: read new input into the free space after zzEndRead */
+    int numRead = zzReader.read(zzBuffer, zzEndRead,
+                                            zzBuffer.length-zzEndRead);
+
+    if (numRead > 0) {
+      zzEndRead+= numRead;
+      return false;  // new input is available
+    }
+    // unlikely but not impossible: read 0 characters, but not at end of stream
+    if (numRead == 0) {
+      int c = zzReader.read();  // single-char read decides between "end of stream" and "one more char"
+      if (c == -1) {
+        return true;
+      } else {
+        zzBuffer[zzEndRead++] = (char) c;
+        return false;
+      }
+    }
+
+	// numRead < 0: the reader reported end of stream
+    return true;
+  }
+
+
+  /**
+   * Closes the input stream.
+   */
+  public final void yyclose() throws java.io.IOException {
+    zzEndRead = zzStartRead;  /* discard whatever is still buffered */
+    zzAtEOF = true;           /* the scanner now reports end of file */
+
+    if (zzReader == null) return;  // no reader, nothing to close
+    zzReader.close();
+  }
+
+
+  /**
+   * Resets the scanner to read from a new input stream.
+   * Does not close the old reader.
+   *
+   * All internal variables are reset, the old input stream
+   * <b>cannot</b> be reused (internal buffer is discarded and lost).
+   * Lexical state is set to <tt>YYINITIAL</tt>.
+   *
+   * @param reader   the new input stream
+   */
+  public final void yyreset(java.io.Reader reader) {
+    zzReader = reader;
+    zzAtBOL  = true;
+    zzAtEOF  = zzEOFDone = false;
+    zzEndRead = zzStartRead = zzCurrentPos = zzMarkedPos = 0;
+    yyline = yychar = yycolumn = 0;
+    zzLexicalState = YYINITIAL;
+    // Hand-added (lost on JFlex regeneration): clear the user-code state too, else a reused scanner prepends the previous word.
+    original = normalized = ""; problem = cv = 0;
+  }
+
+
+  /**
+   * Returns the current lexical state.
+   */
+  public final int yystate() {
+    return this.zzLexicalState;  // the value last set by yybegin or yyreset
+  }
+
+
+  /**
+   * Enters a new lexical state
+   *
+   * @param newState the new lexical state
+   */
+  public final void yybegin(int newState) {
+    this.zzLexicalState = newState;  // used as an index into ZZ_LEXSTATE by yylex
+  }
+
+
+  /**
+   * Returns the text matched by the current regular expression.
+   */
+  public final String yytext() {
+    return String.valueOf(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead);  // copy of the current match
+  }
+
+
+  /**
+   * Returns the character at position <tt>pos</tt> from the
+   * matched text.
+   *
+   * It is equivalent to yytext().charAt(pos), but faster
+   *
+   * @param pos the position of the character to fetch.
+   *            A value from 0 to yylength()-1.
+   *
+   * @return the character at position pos
+   */
+  public final char yycharat(int pos) {
+    return this.zzBuffer[pos + this.zzStartRead];  // pos is relative to the start of the match
+  }
+
+
+  /**
+   * Returns the length of the matched text region.
+   */
+  public final int yylength() {
+    return this.zzMarkedPos - this.zzStartRead;  // end of match minus start of match
+  }
+
+
+  /**
+   * Reports an error that occurred while scanning.
+   *
+   * In a wellformed scanner (no or only correct usage of
+   * yypushback(int) and a match-all fallback rule) this method
+   * will only be called with things that "Can't Possibly Happen".
+   * If this method is called, something is seriously wrong
+   * (e.g. a JFlex bug producing a faulty scanner etc.).
+   *
+   * Usual syntax/scanner level error handling should be done
+   * in error fallback rules.
+   *
+   * @param   errorCode  the code of the errormessage to display
+   */
+  private void zzScanError(int errorCode) {
+    String message;
+    try {
+      message = ZZ_ERROR_MSG[errorCode];
+    }
+    catch (ArrayIndexOutOfBoundsException e) {
+      message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR];
+    }
+
+    throw new Error(message);
+  }
+
+
+  /**
+   * Pushes the specified amount of characters back into the input stream.
+   *
+   * They will be read again by the next call of the scanning method
+   *
+   * @param number  the number of characters to be read again.
+   *                This number must not be greater than yylength()!
+   */
+  public void yypushback(int number)  {
+    final int matchedLength = zzMarkedPos - zzStartRead;
+    if (matchedLength < number)
+      zzScanError(ZZ_PUSHBACK_2BIG);  // always throws
+    zzMarkedPos = zzMarkedPos - number;  // shrink the match; the dropped chars are scanned again
+  }
+
+
+  /**
+   * Resumes scanning until the next regular expression is matched,
+   * the end of input is encountered or an I/O-Error occurs.
+   *
+   * @return      the next token
+   * @exception   java.io.IOException  if any I/O-Error occurs
+   */
+  public java.lang.String yylex() throws java.io.IOException {
+    int zzInput;
+    int zzAction;
+
+    // cached fields:
+    int zzCurrentPosL;
+    int zzMarkedPosL;
+    int zzEndReadL = zzEndRead;
+    char [] zzBufferL = zzBuffer;
+    char [] zzCMapL = ZZ_CMAP;
+
+    int [] zzTransL = ZZ_TRANS;
+    int [] zzRowMapL = ZZ_ROWMAP;
+    int [] zzAttrL = ZZ_ATTRIBUTE;
+
+    while (true) {  // one iteration per matched rule; only the END actions and EOF return
+      zzMarkedPosL = zzMarkedPos;
+      // derive zzAtBOL from the last character of the previous match
+      if (zzMarkedPosL > zzStartRead) {
+        switch (zzBufferL[zzMarkedPosL-1]) {
+        case '\n':
+        case '\u000B':
+        case '\u000C':
+        case '\u0085':
+        case '\u2028':
+        case '\u2029':
+          zzAtBOL = true;
+          break;
+        case '\r':
+          if (zzMarkedPosL < zzEndReadL)
+            zzAtBOL = zzBufferL[zzMarkedPosL] != '\n';
+          else if (zzAtEOF)
+            zzAtBOL = false;
+          else {
+            boolean eof = zzRefill();
+            zzMarkedPosL = zzMarkedPos;
+            zzEndReadL = zzEndRead;
+            zzBufferL = zzBuffer;
+            if (eof)
+              zzAtBOL = false;
+            else
+              zzAtBOL = zzBufferL[zzMarkedPosL] != '\n';
+          }
+          break;
+        default:
+          zzAtBOL = false;
+        }
+      }
+      zzAction = -1;  // no accepting state reached yet
+
+      zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL;
+
+      if (zzAtBOL)
+        zzState = ZZ_LEXSTATE[zzLexicalState+1];
+      else
+        zzState = ZZ_LEXSTATE[zzLexicalState];
+
+      // run the DFA until no transition is possible, remembering the last accepting state
+      zzForAction: {
+        while (true) {
+
+          if (zzCurrentPosL < zzEndReadL)
+            zzInput = zzBufferL[zzCurrentPosL++];
+          else if (zzAtEOF) {
+            zzInput = YYEOF;
+            break zzForAction;
+          }
+          else {
+            // store back cached positions
+            zzCurrentPos  = zzCurrentPosL;
+            zzMarkedPos   = zzMarkedPosL;
+            boolean eof = zzRefill();
+            // get translated positions and possibly new buffer
+            zzCurrentPosL  = zzCurrentPos;
+            zzMarkedPosL   = zzMarkedPos;
+            zzBufferL      = zzBuffer;
+            zzEndReadL     = zzEndRead;
+            if (eof) {
+              zzInput = YYEOF;
+              break zzForAction;
+            }
+            else {
+              zzInput = zzBufferL[zzCurrentPosL++];
+            }
+          }
+          int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ];
+          if (zzNext == -1) break zzForAction;  // no transition for this character class
+          zzState = zzNext;
+
+          int zzAttributes = zzAttrL[zzState];
+          if ( (zzAttributes & 1) == 1 ) {  // bit 1: remember this state and position as the current match
+            zzAction = zzState;
+            zzMarkedPosL = zzCurrentPosL;
+            if ( (zzAttributes & 8) == 8 ) break zzForAction;  // bit 8: stop scanning immediately
+          }
+
+        }
+      }
+
+      // store back cached position
+      zzMarkedPos = zzMarkedPosL;
+
+      switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {  // a negative zzAction (no match) ends up in default
+        case 32:
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 1;
+          { cv = CONS; add("U");
+          }
+        case 40: break;  // the action above falls through into this break; all actions below use the same pattern
+        case 15:
+          { add("Á");  // NOTE(review): the accent actions (cases 15-24) do not assign cv, unlike the other letter actions
+          }
+        case 41: break;
+        case 39:
+          // lookahead expression with fixed lookahead length
+          yypushback(1);
+          { add(yytext());
+          }
+        case 42: break;
+        case 38:
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 3;
+          { add(yytext());
+          }
+        case 43: break;
+        case 37:
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 2;
+          { add(yytext());
+          }
+        case 44: break;
+        case 4:
+          { add(yytext());
+          }
+        case 45: break;
+        case 22:
+          { add("í");
+          }
+        case 46: break;
+        case 9:
+          { cv = VOWEL; add("AE");
+          }
+        case 47: break;
+        case 5:
+          { switch (problem) {  // returns the unmodified input when a problem was flagged
+			case 1: return original;
+			default: return normalized;
+		}
+          }
+        case 48: break;
+        case 29:
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 1;
+          { cv = CONS; add("u");
+          }
+        case 49: break;
+        case 20:
+          { add("á");
+          }
+        case 50: break;
+        case 1:
+          { cv = 0; add(yytext());
+          }
+        case 51: break;
+        case 33:
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 1;
+          { cv = VOWEL; add(yytext().replaceAll("u", "v").replaceAll("U", "V"));
+          }
+        case 52: break;
+        case 34:
+          { cv = VOWEL; add("zio");
+          }
+        case 53: break;
+        case 11:
+          { cv = VOWEL; add("OE");
+          }
+        case 54: break;
+        case 19:
+          { add("Ú");
+          }
+        case 55: break;
+        case 36:
+          // general lookahead, find correct zzMarkedPos
+          { int zzFState = 7;
+            int zzFPos = zzStartRead;
+            if (zzFin.length <= zzBufferL.length) { zzFin = new boolean[zzBufferL.length+1]; }
+            boolean zzFinL[] = zzFin;
+            while (zzFState != -1 && zzFPos < zzMarkedPos) {
+              if ((zzAttrL[zzFState] & 1) == 1) { zzFinL[zzFPos] = true; }
+              zzInput = zzBufferL[zzFPos++];
+              zzFState = zzTransL[ zzRowMapL[zzFState] + zzCMapL[zzInput] ];
+            }
+            if (zzFState != -1 && (zzAttrL[zzFState] & 1) == 1) { zzFinL[zzFPos] = true; }
+
+            zzFState = 8;
+            zzFPos = zzMarkedPos;
+            while (!zzFinL[zzFPos] || (zzAttrL[zzFState] & 1) != 1) {
+              zzInput = zzBufferL[--zzFPos];
+              zzFState = zzTransL[ zzRowMapL[zzFState] + zzCMapL[zzInput] ];
+            };
+            zzMarkedPos = zzFPos;
+          }
+          { cv = VOWEL; add(yytext().replace("ſ", "s"));
+          }
+        case 56: break;
+        case 3:
+          { cv = CONS; add(yytext());
+          }
+        case 57: break;
+        case 31:
+          { cv = CONS; add("QU");
+          }
+        case 58: break;
+        case 16:
+          { add("É");
+          }
+        case 59: break;
+        case 27:
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 1;
+          { switch(cv) {
+			case VOWEL: add(yytext().replace("u", "v").replace("U", "V")); break;
+			default: cv = VOWEL; add(yytext()); break;
+		}
+          }
+        case 60: break;
+        case 7:
+          { cv = CONS; add("ss");
+          }
+        case 61: break;
+        case 6:
+          { cv = CONS; add("s");
+          }
+        case 62: break;
+        case 35:
+          { cv = VOWEL; add("ZIO");
+          }
+        case 63: break;
+        case 2:
+          { cv = VOWEL; add(yytext());
+          }
+        case 64: break;
+        case 18:
+          { add("Ó");
+          }
+        case 65: break;
+        case 24:
+          { add("ú");
+          }
+        case 66: break;
+        case 30:
+          { cv = CONS; add("Qu");
+          }
+        case 67: break;
+        case 21:
+          { add("é");
+          }
+        case 68: break;
+        case 8:
+          { cv = VOWEL; add("ae");
+          }
+        case 69: break;
+        case 14:
+          { switch (problem) {  // returns the empty string when a problem was flagged
+			case 1: return "";
+			default: return normalized;
+		}
+          }
+        case 70: break;
+        case 13:
+          { add("");
+          }
+        case 71: break;
+        case 23:
+          { add("ó");
+          }
+        case 72: break;
+        case 10:
+          { cv = VOWEL; add("oe");
+          }
+        case 73: break;
+        case 28:
+          { cv = CONS; add("qu");
+          }
+        case 74: break;
+        case 12:
+          { problem = 1; add(yytext());
+          }
+        case 75: break;
+        case 25:
+          { switch(cv) {
+			case CONS: add(yytext().replace("v", "u").replace("V", "U")); break;
+			default: cv = CONS; add(yytext()); break;
+		}
+          }
+        case 76: break;
+        case 26:
+          { cv = VOWEL; add("ii");
+          }
+        case 77: break;
+        case 17:
+          { add("Í");
+          }
+        case 78: break;
+        default:
+          if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
+            zzAtEOF = true;
+            return null;  // end of input with nothing left to match
+          }
+          else {
+            zzScanError(ZZ_NO_MATCH);  // always throws
+          }
+      }
+    }
+  }
+
+
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexIT.lex	Tue Feb 22 16:03:45 2011 +0100
@@ -0,0 +1,171 @@
+/*
+ * Normalization rules for Italian text
+ * [this is a JFlex specification]
+ *
+ * Wolfgang Schmidle
+ * version 0.96
+ * 2011-02-21
+ *
+ */
+
+package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang;
+
+%%
+
+%public
+%class MpdlNormalizerLexIT
+%type java.lang.String
+%unicode
+
+// Italian: it, ita
+
+%states DISP, DICT, SEARCH
+// The code between %{ and %} is user code for the generated scanner class.
+%{
+	private static final int CONS = 1;
+	private static final int VOWEL = 2;
+	private int cv = 0;  // class assigned by the most recent rule that sets it: consonant = 1, vowel = 2, everything else = 0
+
+	private String original = "";    // concatenation of all matched input so far
+	private String normalized = "";  // concatenation of the replacements for that input
+	private int problem = 0;         // set to 1 by the "@" rule
+	// Appends the current match to 'original' and its replacement 'norm' to 'normalized'.
+	private void add (String norm) {
+		original += yytext();
+		normalized += norm;
+	}
+%}
+// NOTE(review): ì, Œ and the upper-case grave vowels that have their own rules are not in Vowel; j/J/y/Y are in neither class. Confirm.
+Vowel = [AEIOUaeiouÆæęàèòùœ]
+Cons = [BCDFGHKLMNPQRSTVWXZbcdfghklmnpqrstvwxzſß]
+LR = [lLrR]
+
+// X: an optional hyphen inside or after a pattern
+hyphen = [\u002d\u00ad]  // hyphen and soft hyphen
+X = {hyphen}?
+// END terminates the input; the {END} rules return the result.
+END = \n
+
+prefixCons = (in{X}ter | per | ſu{X}per | ſer)
+
+%%
+
+<DICT, SEARCH> {
+// Grave -> acute. cv is set too, so the u/v rules never see a stale class. Regenerate MpdlNormalizerLexIT.java after editing.
+À { cv = VOWEL; add("Á"); }
+È { cv = VOWEL; add("É"); }
+Ì { cv = VOWEL; add("Í"); }
+Ò { cv = VOWEL; add("Ó"); }
+Ù { cv = VOWEL; add("Ú"); }
+à { cv = VOWEL; add("á"); }
+è { cv = VOWEL; add("é"); }
+ì { cv = VOWEL; add("í"); }
+ò { cv = VOWEL; add("ó"); }
+ù { cv = VOWEL; add("ú"); }
+
+}
+
+<DISP, DICT, SEARCH> {
+// Long s, sharp s and ligatures are expanded; cv records the class of the replaced character.
+ſ { cv = CONS; add("s"); }
+ß { cv = CONS; add("ss"); }
+æ { cv = VOWEL; add("ae"); }
+Æ { cv = VOWEL; add("AE"); }
+œ { cv = VOWEL; add("oe"); }
+Œ { cv = VOWEL; add("OE"); }
+
+ij { cv = VOWEL; add("ii"); }
+
+tio { cv = VOWEL; add("zio"); }
+TIO { cv = VOWEL; add("ZIO"); }
+
+// h rules from Arboreal: the whole words ha, hai, hanno, ho are kept; any other h at the start of a line is dropped
+^ ha / {END} { add(yytext()); }
+^ hai / {END} { add(yytext()); }
+^ han{X}no / {END} { add(yytext()); }
+^ ho / {END} { add(yytext()); }
+^ h { add(""); }
+
+
+// u/v rules are taken from MpdlNormalizerLexLA.lex
+
+// 1. rules for u --> v
+// A leading prefix sets cv = VOWEL although it ends in a consonant; presumably so that a following u before a vowel becomes v.
+^ {prefixCons} / {X} { cv = VOWEL; add(yytext().replace("ſ", "s")); }
+
+^ [uU] / {Vowel} { cv = VOWEL; add(yytext().replaceAll("u", "v").replaceAll("U", "V")); }
+
+// u before a vowel becomes v only when the previous class is VOWEL; otherwise it stays u and counts as a vowel
+[uU] / {Vowel} {
+		switch(cv) {
+			case VOWEL: add(yytext().replace("u", "v").replace("U", "V")); break;
+			default: cv = VOWEL; add(yytext()); break;
+		}
+	}
+
+// 2. rules for v --> u
+
+qv { cv = CONS; add("qu"); }  // the replaced v still counts as consonant
+Qv { cv = CONS; add("Qu"); }
+QV { cv = CONS; add("QU"); }
+// l/r followed by v: v becomes u only when the previous class is CONS; otherwise the pair is kept
+{LR} [vV] {
+		switch(cv) {
+			case CONS: add(yytext().replace("v", "u").replace("V", "U")); break;
+			default: cv = CONS; add(yytext()); break;
+		}
+	}
+// v before a consonant (optionally separated by a hyphen) becomes u
+v / {X} {Cons} { cv = CONS; add("u"); }
+V / {X} {Cons} { cv = CONS; add("U"); }
+
+// 3. override default rule for .
+
+{Vowel} { cv = VOWEL; add(yytext()); }
+{Cons} { cv = CONS; add(yytext()); }
+{hyphen} { add(yytext()); }
+@ { problem = 1; add(yytext()); }
+. { cv = 0; add(yytext()); }
+
+}
+
+// DISP/SEARCH: when a problem was flagged return the unmodified input, otherwise the normalized text.
+<DISP, SEARCH> {
+
+{END} {
+		switch (problem) {
+			case 1: return original;
+			default: return normalized;
+		}
+	}
+}
+// DICT: when a problem was flagged return the empty string, otherwise the normalized text.
+<DICT> {
+
+{END} {
+		switch (problem) {
+			case 1: return "";
+			default: return normalized;
+		}
+	}
+}
+
+
+/*
+
+Assumptions:
+- the routine is called word by word, with a \n at the end of the string
+- words broken across line breaks have already been rejoined
+
+TO DO:
+
+IT: all these rules are taken from Arboreal; do we need them all?
+IT: correct? complete?
+IT: Are the u/v rules really exactly the same as in LA? In particular: the same vowel class?
+IT: Adopt changes made to the Latin u/v rules?
+IT: give Italian example words for the u/v rules
+IT: Do we really need the grave-accent rules from Arboreal in DICT?
+IT: if so: does À --> Á etc. belong in the dictionary layer? And should it be restricted to the last syllable?
+IT: is prefixCons = (inter | per | ſuper | ſer) also valid for Italian?
+
+*/
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexLA.java	Tue Feb 22 16:03:45 2011 +0100
@@ -0,0 +1,990 @@
+/* The following code was generated by JFlex 1.4.3 on 22.02.11 12:04 */
+
+/*
+ * Normalization rules for Latin text
+ * [this is a JFlex specification]
+ *
+ * Wolfgang Schmidle
+ * version 0.96
+ * 2011-02-21
+ *
+ */
+
+package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang;
+
+
+/**
+ * This class is a scanner generated by
+ * <a href="http://www.jflex.de/">JFlex</a> 1.4.3
+ * on 22.02.11 12:04 from the specification file
+ * <tt>MpdlNormalizerLexLA.lex</tt>
+ */
+public class MpdlNormalizerLexLA {
+
+  /** This character denotes the end of file */
+  public static final int YYEOF = -1;
+
+  /** initial size of the lookahead buffer */
+  private static final int ZZ_BUFFERSIZE = 16384;
+
+  /** lexical states */
+  public static final int RENAISSANCE_DICT = 10;
+  public static final int RENAISSANCE_DISP = 8;
+  public static final int SEARCH = 6;
+  public static final int DICT = 4;
+  public static final int YYINITIAL = 0;
+  public static final int RENAISSANCE_SEARCH = 12;
+  public static final int DISP = 2;
+
+  /**
+   * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l
+   * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l
+   *                  at the beginning of a line
+   * l is of the form l = 2*k, k a non negative integer
+   */
+  private static final int ZZ_LEXSTATE[] = {
+     0,  0,  1,  2,  3,  4,  1,  2,  1,  2,  3,  4,  1, 2
+  };
+
+  /**
+   * Translates characters to character classes
+   */
+  private static final String ZZ_CMAP_PACKED =
+    "\12\0\1\5\42\0\1\4\23\0\1\1\3\2\1\1\2\2\1\52"+
+    "\1\1\1\0\1\2\1\3\2\2\1\1\1\2\1\45\1\3\2\2"+
+    "\1\63\1\64\2\2\1\0\1\2\6\0\1\56\1\2\1\46\1\42"+
+    "\1\10\2\2\1\50\1\13\1\26\1\2\1\47\1\37\1\12\1\60"+
+    "\1\16\1\6\1\15\1\31\1\14\1\7\1\11\2\2\1\0\1\2"+
+    "\62\0\1\4\30\0\1\24\30\0\1\22\1\36\1\30\1\54\3\0"+
+    "\1\23\1\0\1\40\1\32\1\0\1\57\1\44\1\33\1\51\1\61"+
+    "\2\0\1\41\1\34\1\53\4\0\1\43\1\35\1\55\1\62\34\0"+
+    "\1\23\71\0\1\25\53\0\1\17\u0181\0\1\27\ud4fe\0\1\20\u0590\0"+
+    "\1\21\u226e\0";
+
+  /**
+   * Translates characters to character classes
+   */
+  private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED);
+
+  /**
+   * Translates DFA states to action switch labels.
+   */
+  private static final int [] ZZ_ACTION = zzUnpackAction();
+
+  private static final String ZZ_ACTION_PACKED_0 =
+    "\10\0\1\1\1\2\2\3\1\4\1\5\1\3\1\2"+
+    "\1\3\1\2\1\6\1\1\1\7\1\10\1\11\1\12"+
+    "\11\1\1\3\2\1\3\2\2\3\2\2\1\3\1\6"+
+    "\3\3\1\1\1\2\1\13\4\0\1\14\1\15\1\16"+
+    "\1\0\1\17\1\20\1\21\1\22\1\0\1\23\20\0"+
+    "\1\24\3\0\1\25\3\0\1\26\1\0\1\27\3\0"+
+    "\1\30\1\31\1\32\1\0\1\33\1\34\2\0\1\35"+
+    "\16\0\1\36\1\0\1\37\1\0\1\40\1\0\1\41"+
+    "\1\42\1\43\1\44\1\0\1\45\1\0\1\46\1\0"+
+    "\1\47\1\0\1\50\3\0\1\51\10\0\1\52\6\0"+
+    "\1\53\1\51\1\54\1\55\1\56\1\57\5\0";
+
+  private static int [] zzUnpackAction() {
+    int [] result = new int[166];
+    int offset = 0;
+    offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
+    return result;
+  }
+
+  private static int zzUnpackAction(String packed, int offset, int [] result) {
+    int i = 0;       /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {
+      int count = packed.charAt(i++);
+      int value = packed.charAt(i++);
+      do result[j++] = value; while (--count > 0);
+    }
+    return j;
+  }
+
+
+  /**
+   * Translates a state to a row index in the transition table
+   */
+  private static final int [] ZZ_ROWMAP = zzUnpackRowMap();
+
+  private static final String ZZ_ROWMAP_PACKED_0 =
+    "\0\0\0\65\0\152\0\237\0\324\0\u0109\0\u013e\0\u0173"+
+    "\0\u01a8\0\u01a8\0\u01a8\0\u01dd\0\u01a8\0\u01a8\0\u0212\0\u0247"+
+    "\0\u027c\0\u02b1\0\u01a8\0\u0173\0\u01a8\0\u01a8\0\u01a8\0\u01a8"+
+    "\0\u02e6\0\u031b\0\u0350\0\u0385\0\u03ba\0\u03ef\0\u0424\0\u0459"+
+    "\0\u048e\0\u04c3\0\u04f8\0\u052d\0\u0562\0\u0597\0\u05cc\0\u0601"+
+    "\0\u0636\0\u066b\0\u06a0\0\u06d5\0\u070a\0\u073f\0\u0774\0\u07a9"+
+    "\0\u07de\0\u0813\0\u01a8\0\u0848\0\u087d\0\u08b2\0\u01a8\0\u01a8"+
+    "\0\u01a8\0\u01a8\0\u08e7\0\u01a8\0\u01a8\0\u01a8\0\u01a8\0\u091c"+
+    "\0\u01a8\0\u0951\0\u0986\0\u09bb\0\u09f0\0\u0a25\0\u0a5a\0\u0a8f"+
+    "\0\u0ac4\0\u0af9\0\u0b2e\0\u0b63\0\u0b98\0\u0bcd\0\u0c02\0\u0c37"+
+    "\0\u0c6c\0\u01a8\0\u0ca1\0\u0cd6\0\u0d0b\0\u01a8\0\u0d40\0\u0d75"+
+    "\0\u0daa\0\u01a8\0\u0ddf\0\u01a8\0\u0e14\0\u0e49\0\u0e7e\0\u01a8"+
+    "\0\u01a8\0\u01a8\0\u0eb3\0\u01a8\0\u01a8\0\u0ee8\0\u0f1d\0\u01a8"+
+    "\0\u0f52\0\u0f87\0\u0fbc\0\u0ff1\0\u1026\0\u105b\0\u1090\0\u10c5"+
+    "\0\u10fa\0\u112f\0\u1164\0\u1199\0\u11ce\0\u07de\0\u01a8\0\u1203"+
+    "\0\u01a8\0\u1238\0\u01a8\0\u126d\0\u01a8\0\u01a8\0\u01a8\0\u01a8"+
+    "\0\u12a2\0\u01a8\0\u12d7\0\u01a8\0\u130c\0\u01a8\0\u1341\0\u01a8"+
+    "\0\u1376\0\u13ab\0\u06d5\0\u13e0\0\u1415\0\u144a\0\u147f\0\u14b4"+
+    "\0\u14e9\0\u01a8\0\u151e\0\u1553\0\u01a8\0\u1588\0\u15bd\0\u15f2"+
+    "\0\u1627\0\u165c\0\u1691\0\u01a8\0\u01a8\0\u01a8\0\u01a8\0\u01a8"+
+    "\0\u01a8\0\u16c6\0\u16fb\0\u1730\0\u1765\0\u179a";
+
+  private static int [] zzUnpackRowMap() {
+    int [] result = new int[166];
+    int offset = 0;
+    offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
+    return result;
+  }
+
+  private static int zzUnpackRowMap(String packed, int offset, int [] result) {
+    int i = 0;  /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {
+      int high = packed.charAt(i++) << 16;
+      result[j++] = high | packed.charAt(i++);
+    }
+    return j;
+  }
+
+  /**
+   * The transition table of the DFA
+   */
+  private static final int [] ZZ_TRANS = zzUnpackTrans();
+
+  private static final String ZZ_TRANS_PACKED_0 =
+    "\20\0\1\10\44\0\1\11\1\12\1\13\1\14\1\15"+
+    "\1\16\1\17\1\20\1\12\1\21\1\13\1\22\1\13"+
+    "\1\14\1\13\1\23\1\24\1\11\1\25\1\26\1\27"+
+    "\1\30\2\11\1\31\1\13\1\32\1\33\1\34\1\35"+
+    "\1\36\1\13\1\37\1\40\1\13\1\41\1\11\1\42"+
+    "\1\13\1\14\1\13\1\11\1\13\1\11\1\43\1\44"+
+    "\1\45\1\11\1\46\2\11\1\47\1\50\1\11\1\12"+
+    "\1\13\1\14\1\15\1\16\1\51\1\52\1\12\1\21"+
+    "\1\13\1\53\1\13\1\14\1\54\1\55\1\24\1\11"+
+    "\1\25\1\26\1\27\1\30\2\11\1\31\1\13\1\32"+
+    "\1\33\1\34\1\35\1\36\1\13\1\37\1\40\1\13"+
+    "\1\41\1\11\1\56\1\13\1\14\1\57\1\11\1\60"+
+    "\1\61\1\43\1\44\1\45\1\11\1\46\2\11\1\62"+
+    "\1\50\1\11\1\12\1\13\1\14\1\15\1\63\1\17"+
+    "\1\20\1\12\1\21\1\13\1\22\1\13\1\14\1\13"+
+    "\1\23\1\24\1\11\1\25\1\26\1\27\1\30\2\11"+
+    "\1\31\1\13\1\32\1\33\1\34\1\35\1\36\1\13"+
+    "\1\37\1\40\1\13\1\41\1\11\1\42\1\13\1\14"+
+    "\1\13\1\11\1\13\1\11\1\43\1\44\1\45\1\11"+
+    "\1\46\2\11\1\47\1\50\1\11\1\12\1\13\1\14"+
+    "\1\15\1\63\1\51\1\52\1\12\1\21\1\13\1\53"+
+    "\1\13\1\14\1\54\1\55\1\24\1\11\1\25\1\26"+
+    "\1\27\1\30\2\11\1\31\1\13\1\32\1\33\1\34"+
+    "\1\35\1\36\1\13\1\37\1\40\1\13\1\41\1\11"+
+    "\1\56\1\13\1\14\1\57\1\11\1\60\1\61\1\43"+
+    "\1\44\1\45\1\11\1\46\2\11\1\62\1\50\13\0"+
+    "\1\64\2\0\1\65\1\66\51\0\1\67\101\0\1\70"+
+    "\141\0\1\71\52\0\1\71\11\0\1\72\15\0\1\73"+
+    "\36\0\1\74\5\0\2\74\2\0\1\74\42\0\1\74"+
+    "\1\0\1\74\1\75\1\76\1\74\3\0\2\77\1\100"+
+    "\1\0\1\77\2\0\2\77\1\0\4\77\2\0\1\77"+
+    "\6\0\1\77\5\0\1\77\2\0\1\77\2\0\4\77"+
+    "\1\0\1\77\11\0\1\77\26\0\1\101\44\0\1\102"+
+    "\2\0\2\103\1\0\2\104\13\0\1\104\5\0\1\104"+
+    "\33\0\1\105\2\0\2\106\1\0\2\107\13\0\1\107"+
+    "\5\0\1\107\33\0\1\110\2\0\2\111\1\0\2\112"+
+    "\13\0\1\112\5\0\1\112\33\0\1\113\2\0\2\114"+
+    "\1\0\2\115\13\0\1\115\5\0\1\115\33\0\1\116"+
+    "\1\0\1\117\2\120\1\0\2\121\13\0\1\121\5\0"+
+    "\1\121\32\0\1\122\1\102\22\0\1\123\5\0\1\124"+
+    "\6\0\1\125\23\0\1\126\1\105\5\0\1\127\1\130"+
+    "\13\0\1\131\40\0\1\132\1\113\33\0\1\133\27\0"+
+    "\1\134\23\0\1\135\5\0\1\136\7\0\1\137\26\0"+
+    "\1\140\52\0\1\141\5\0\1\122\1\102\6\0\1\142"+
+    "\100\0\1\143\112\0\1\26\64\0\1\30\1\0\1\144"+
+    "\4\0\1\74\5\0\2\74\2\0\1\74\42\0\1\74"+
+    "\1\0\1\74\2\0\1\74\3\0\2\145\1\146\1\0"+
+    "\1\145\2\0\2\145\1\0\4\145\2\0\1\145\6\0"+
+    "\1\145\5\0\1\145\2\0\1\145\2\0\4\145\1\0"+
+    "\1\145\11\0\1\145\7\0\1\147\1\0\1\72\15\0"+
+    "\1\73\36\0\1\150\5\0\2\150\2\0\1\150\42\0"+
+    "\1\150\1\0\1\150\1\75\1\76\1\150\13\0\1\151"+
+    "\13\0\1\101\46\0\1\152\63\0\1\153\1\152\63\0"+
+    "\1\154\1\0\1\140\52\0\1\141\51\0\1\155\64\0"+
+    "\1\156\20\0\1\132\60\0\1\150\5\0\2\150\2\0"+
+    "\1\150\42\0\1\150\1\0\1\150\2\0\1\150\13\0"+
+    "\1\157\62\0\1\160\63\0\1\161\1\160\63\0\1\162"+
+    "\57\0\2\77\2\0\1\77\2\0\2\77\1\0\4\77"+
+    "\2\0\1\77\6\0\1\77\5\0\1\77\2\0\1\77"+
+    "\2\0\4\77\1\0\1\77\11\0\1\77\7\0\1\103"+
+    "\65\0\1\163\62\0\1\102\2\0\2\103\61\0\1\106"+
+    "\65\0\1\164\62\0\1\105\2\0\2\106\61\0\1\111"+
+    "\65\0\1\165\62\0\1\110\2\0\2\111\61\0\1\114"+
+    "\65\0\1\166\62\0\1\113\2\0\2\114\61\0\1\120"+
+    "\62\0\1\167\67\0\1\170\62\0\1\116\2\0\2\120"+
+    "\57\0\1\171\1\172\63\0\1\173\1\174\63\0\1\175"+
+    "\64\0\1\176\64\0\1\177\64\0\1\200\1\201\63\0"+
+    "\1\202\1\203\63\0\1\204\1\205\63\0\1\206\1\207"+
+    "\63\0\1\210\64\0\1\204\61\0\2\145\2\0\1\145"+
+    "\2\0\2\145\1\0\4\145\2\0\1\145\6\0\1\145"+
+    "\5\0\1\145\2\0\1\145\2\0\4\145\1\0\1\145"+
+    "\11\0\1\145\44\0\1\211\24\0\1\212\7\0\1\213"+
+    "\65\0\1\214\53\0\1\215\11\0\1\213\112\0\1\216"+
+    "\66\0\1\217\64\0\1\220\22\0\1\221\7\0\1\65"+
+    "\65\0\1\222\53\0\1\223\11\0\1\65\56\0\1\224"+
+    "\61\0\1\122\64\0\1\126\64\0\1\225\64\0\1\134"+
+    "\66\0\1\226\64\0\1\227\64\0\1\230\64\0\1\231"+
+    "\64\0\1\232\64\0\1\233\62\0\1\234\73\0\1\213"+
+    "\54\0\1\235\76\0\1\213\53\0\1\236\64\0\1\237"+
+    "\64\0\1\240\73\0\1\65\66\0\1\65\53\0\1\241"+
+    "\67\0\1\242\64\0\1\243\64\0\1\244\64\0\1\245"+
+    "\64\0\1\143\64\0\1\246\61\0\1\171\64\0\1\173"+
+    "\64\0\1\200\64\0\1\202\64\0\1\206\57\0";
+
+  private static int [] zzUnpackTrans() {
+    int [] result = new int[6095];
+    int offset = 0;
+    offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
+    return result;
+  }
+
+  private static int zzUnpackTrans(String packed, int offset, int [] result) {
+    int i = 0;       /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {
+      int count = packed.charAt(i++);
+      int value = packed.charAt(i++);
+      value--;
+      do result[j++] = value; while (--count > 0);
+    }
+    return j;
+  }
+
+
+  /* error codes */
+  private static final int ZZ_UNKNOWN_ERROR = 0;
+  private static final int ZZ_NO_MATCH = 1;
+  private static final int ZZ_PUSHBACK_2BIG = 2;
+
+  /* error messages for the codes above */
+  private static final String ZZ_ERROR_MSG[] = {
+    "Unkown internal scanner error",
+    "Error: could not match input",
+    "Error: pushback value was too large"
+  };
+
+  /**
+   * ZZ_ATTRIBUTE[aState] contains the attributes of state <code>aState</code>
+   */
+  private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
+
+  private static final String ZZ_ATTRIBUTE_PACKED_0 =
+    "\6\0\1\1\1\0\3\11\1\1\2\11\4\1\1\11"+
+    "\1\1\4\11\32\1\1\11\3\0\4\11\1\0\4\11"+
+    "\1\0\1\11\20\0\1\11\3\0\1\11\3\0\1\11"+
+    "\1\0\1\11\3\0\3\11\1\0\2\11\2\0\1\11"+
+    "\16\0\1\11\1\0\1\11\1\0\1\11\1\0\4\11"+
+    "\1\0\1\11\1\0\1\11\1\0\1\11\1\0\1\11"+
+    "\3\0\1\1\5\0\1\11\2\0\1\11\6\0\6\11"+
+    "\5\0";
+
+  private static int [] zzUnpackAttribute() {
+    int [] result = new int[166];
+    int offset = 0;
+    offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
+    return result;
+  }
+
+  private static int zzUnpackAttribute(String packed, int offset, int [] result) {
+    int i = 0;       /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {
+      int count = packed.charAt(i++);
+      int value = packed.charAt(i++);
+      do result[j++] = value; while (--count > 0);
+    }
+    return j;
+  }
+
+  /** the input device */
+  private java.io.Reader zzReader;
+
+  /** the current state of the DFA */
+  private int zzState;
+
+  /** the current lexical state */
+  private int zzLexicalState = YYINITIAL;
+
+  /** this buffer contains the current text to be matched and is
+      the source of the yytext() string */
+  private char zzBuffer[] = new char[ZZ_BUFFERSIZE];
+
+  /** the textposition at the last accepting state */
+  private int zzMarkedPos;
+
+  /** the current text position in the buffer */
+  private int zzCurrentPos;
+
+  /** startRead marks the beginning of the yytext() string in the buffer */
+  private int zzStartRead;
+
+  /** endRead marks the last character in the buffer, that has been read
+      from input */
+  private int zzEndRead;
+
+  /** number of newlines encountered up to the start of the matched text */
+  private int yyline;
+
+  /** the number of characters up to the start of the matched text */
+  private int yychar;
+
+  /**
+   * the number of characters from the last newline up to the start of the
+   * matched text
+   */
+  private int yycolumn;
+
+  /**
+   * zzAtBOL == true <=> the scanner is currently at the beginning of a line
+   */
+  private boolean zzAtBOL = true;
+
+  /** zzAtEOF == true <=> the scanner is at the EOF */
+  private boolean zzAtEOF;
+
+  /** denotes if the user-EOF-code has already been executed */
+  private boolean zzEOFDone;
+
+  /** For the backwards DFA of general lookahead statements */
+  private boolean [] zzFin = new boolean [ZZ_BUFFERSIZE+1];
+
+  /* user code: */
+	private static final int CONS = 1;
+	private static final int VOWEL = 2;
+	private int cv = 0;  // consonant = 1, vowel = 2, everything else = 0
+
+	private String original = "";
+	private String normalized = "";
+	private int problem = 0;
+
+	private void add (String norm) {
+		original += yytext();
+		normalized += norm;
+	}
+
+
+  /**
+   * Creates a scanner that reads its input from the given reader.
+   * A java.io.InputStream variant of this constructor exists as well.
+   *
+   * @param   in  the java.io.Reader that supplies the characters to scan.
+   */
+  public MpdlNormalizerLexLA(java.io.Reader in) {
+    zzReader = in;
+  }
+
+  /**
+   * Creates a new scanner that reads from a byte stream.
+   * The stream is wrapped in a java.io.InputStreamReader and passed to the
+   * java.io.Reader version of this constructor.
+   *
+   * NOTE(review): no charset is given to the InputStreamReader, so the bytes
+   * are decoded with the platform default encoding. The scanner's rules
+   * handle non-ASCII characters; confirm that callers either use the Reader
+   * constructor or can rely on a suitable default encoding.
+   *
+   * @param   in  the java.io.Inputstream to read input from.
+   */
+  public MpdlNormalizerLexLA(java.io.InputStream in) {
+    this(new java.io.InputStreamReader(in));
+  }
+
+  /**
+   * Unpacks the compressed character translation table.
+   *
+   * @param packed   the packed character translation table
+   * @return         the unpacked character translation table
+   */
+  private static char [] zzUnpackCMap(String packed) {
+    char [] map = new char[0x10000];
+    int i = 0;  /* index in packed string  */
+    int j = 0;  /* index in unpacked array */
+    while (i < 184) {
+      int  count = packed.charAt(i++);
+      char value = packed.charAt(i++);
+      do map[j++] = value; while (--count > 0);
+    }
+    return map;
+  }
+
+
+  /**
+   * Refills the input buffer.
+   *
+   * @return      <code>false</code>, iff there was new input.
+   *
+   * @exception   java.io.IOException  if any I/O-Error occurs
+   */
+  private boolean zzRefill() throws java.io.IOException {
+
+    /* first: make room (if you can) */
+    if (zzStartRead > 0) {
+      System.arraycopy(zzBuffer, zzStartRead,
+                       zzBuffer, 0,
+                       zzEndRead-zzStartRead);
+
+      /* translate stored positions */
+      zzEndRead-= zzStartRead;
+      zzCurrentPos-= zzStartRead;
+      zzMarkedPos-= zzStartRead;
+      zzStartRead = 0;
+    }
+
+    /* is the buffer big enough? */
+    if (zzCurrentPos >= zzBuffer.length) {
+      /* if not: blow it up */
+      char newBuffer[] = new char[zzCurrentPos*2];
+      System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length);
+      zzBuffer = newBuffer;
+    }
+
+    /* finally: fill the buffer with new input */
+    int numRead = zzReader.read(zzBuffer, zzEndRead,
+                                            zzBuffer.length-zzEndRead);
+
+    if (numRead > 0) {
+      zzEndRead+= numRead;
+      return false;
+    }
+    // unlikely but not impossible: read 0 characters, but not at end of stream
+    if (numRead == 0) {
+      int c = zzReader.read();
+      if (c == -1) {
+        return true;
+      } else {
+        zzBuffer[zzEndRead++] = (char) c;
+        return false;
+      }
+    }
+
+	// numRead < 0
+    return true;
+  }
+
+
+  /**
+   * Marks the scanner as being at end of file, invalidates the buffered
+   * input and closes the underlying reader (if there is one).
+   *
+   * @exception   java.io.IOException  if closing the reader fails
+   */
+  public final void yyclose() throws java.io.IOException {
+    zzAtEOF = true;            /* no more input will be delivered */
+    zzEndRead = zzStartRead;   /* nothing left in the buffer      */
+
+    if (zzReader == null) {
+      return;
+    }
+    zzReader.close();
+  }
+
+
+  /**
+   * Points the scanner at a new input stream without closing the old reader.
+   *
+   * The buffer positions, the line/character/column counters and the
+   * beginning-of-line/EOF flags are reset, so buffered input of the old
+   * stream is lost and the old stream <b>cannot</b> be reused.
+   * The lexical state is set to <tt>YYINITIAL</tt>.
+   * The user-code fields (<tt>cv</tt>, <tt>original</tt>,
+   * <tt>normalized</tt>, <tt>problem</tt>) are not touched by this method.
+   *
+   * @param reader   the new input stream
+   */
+  public final void yyreset(java.io.Reader reader) {
+    zzReader = reader;
+
+    /* flags */
+    zzAtBOL = true;
+    zzAtEOF = false;
+    zzEOFDone = false;
+
+    /* buffer positions */
+    zzStartRead = 0;
+    zzEndRead = 0;
+    zzMarkedPos = 0;
+    zzCurrentPos = 0;
+
+    /* position counters */
+    yycolumn = 0;
+    yychar = 0;
+    yyline = 0;
+
+    zzLexicalState = YYINITIAL;
+  }
+
+
+  /**
+   * Reports which lexical state the scanner is in.
+   *
+   * @return the current lexical state
+   */
+  public final int yystate() {
+    return this.zzLexicalState;
+  }
+
+
+  /**
+   * Switches the scanner to another lexical state.
+   *
+   * @param newState the lexical state to enter
+   */
+  public final void yybegin(int newState) {
+    this.zzLexicalState = newState;
+  }
+
+
+  /**
+   * Returns the text of the current match, i.e. the buffer content between
+   * <code>zzStartRead</code> and <code>zzMarkedPos</code>, as a new String.
+   *
+   * @return the matched text
+   */
+  public final String yytext() {
+    final int matchLength = zzMarkedPos - zzStartRead;
+    return new String(zzBuffer, zzStartRead, matchLength);
+  }
+
+
+  /**
+   * Returns a single character of the matched text, read directly from the
+   * buffer.
+   *
+   * The result equals yytext().charAt(pos) but no String is created.
+   *
+   * @param pos the position of the character to fetch.
+   *            A value from 0 to yylength()-1.
+   *
+   * @return the character at position pos
+   */
+  public final char yycharat(int pos) {
+    final int index = zzStartRead + pos;
+    return zzBuffer[index];
+  }
+
+
+  /**
+   * Returns the number of characters in the matched text region, i.e. the
+   * distance between <code>zzStartRead</code> and <code>zzMarkedPos</code>.
+   *
+   * @return the length of the current match
+   */
+  public final int yylength() {
+    final int matchLength = zzMarkedPos - zzStartRead;
+    return matchLength;
+  }
+
+
+  /**
+   * Reports an error that occured while scanning.
+   *
+   * In a wellformed scanner (no or only correct usage of
+   * yypushback(int) and a match-all fallback rule) this method
+   * will only be called with things that "Can't Possibly Happen".
+   * If this method is called, something is seriously wrong
+   * (e.g. a JFlex bug producing a faulty scanner etc.).
+   *
+   * Usual syntax/scanner level error handling should be done
+   * in error fallback rules.
+   *
+   * @param   errorCode  the code of the errormessage to display
+   */
+  private void zzScanError(int errorCode) {
+    String message;
+    try {
+      message = ZZ_ERROR_MSG[errorCode];
+    }
+    catch (ArrayIndexOutOfBoundsException e) {
+      message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR];
+    }
+
+    throw new Error(message);
+  }
+
+
+  /**
+   * Gives back the last <code>number</code> characters of the current match
+   * so that the next call of the scanning method reads them again.
+   *
+   * @param number  the number of characters to be read again.
+   *                This number must not be greater than yylength()!
+   */
+  public void yypushback(int number)  {
+    final int matchLength = zzMarkedPos - zzStartRead;
+    if ( number > matchLength ) {
+      zzScanError(ZZ_PUSHBACK_2BIG);
+    }
+
+    zzMarkedPos = zzMarkedPos - number;
+  }
+
+
+  /**
+   * Resumes scanning until the next regular expression is matched,
+   * the end of input is encountered or an I/O-Error occurs.
+   *
+   * Most rule actions only call add(...) and let the loop continue with the
+   * next match. A value is returned only by the two actions that contain
+   * return statements (labels 5 and 11; presumably the end-of-word rules of
+   * the specification) and when the end of input is reached.
+   *
+   * @return      <code>original</code>, <code>normalized</code> or the empty
+   *              string, as chosen by the returning action from the value of
+   *              <code>problem</code>; <code>null</code> at end of input
+   * @exception   java.io.IOException  if any I/O-Error occurs
+   */
+  public java.lang.String yylex() throws java.io.IOException {
+    int zzInput;
+    int zzAction;
+
+    // cached fields:
+    int zzCurrentPosL;
+    int zzMarkedPosL;
+    int zzEndReadL = zzEndRead;
+    char [] zzBufferL = zzBuffer;
+    char [] zzCMapL = ZZ_CMAP;
+
+    int [] zzTransL = ZZ_TRANS;
+    int [] zzRowMapL = ZZ_ROWMAP;
+    int [] zzAttrL = ZZ_ATTRIBUTE;
+
+    // one iteration per match; left only via return or zzScanError
+    while (true) {
+      zzMarkedPosL = zzMarkedPos;
+
+      // Determine from the last character of the previous match whether the
+      // next match starts at the beginning of a line.
+      if (zzMarkedPosL > zzStartRead) {
+        switch (zzBufferL[zzMarkedPosL-1]) {
+        case '\n':
+        case '\u000B':
+        case '\u000C':
+        case '\u0085':
+        case '\u2028':
+        case '\u2029':
+          zzAtBOL = true;
+          break;
+        case '\r':
+          // a '\r' only ends the line if it is not followed by '\n';
+          // the following character may have to be read first
+          if (zzMarkedPosL < zzEndReadL)
+            zzAtBOL = zzBufferL[zzMarkedPosL] != '\n';
+          else if (zzAtEOF)
+            zzAtBOL = false;
+          else {
+            boolean eof = zzRefill();
+            zzMarkedPosL = zzMarkedPos;
+            zzEndReadL = zzEndRead;
+            zzBufferL = zzBuffer;
+            if (eof)
+              zzAtBOL = false;
+            else
+              zzAtBOL = zzBufferL[zzMarkedPosL] != '\n';
+          }
+          break;
+        default:
+          zzAtBOL = false;
+        }
+      }
+      zzAction = -1;
+
+      // the new match starts where the previous one ended
+      zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL;
+
+      // pick the DFA start state for the lexical state (odd index = at BOL)
+      if (zzAtBOL)
+        zzState = ZZ_LEXSTATE[zzLexicalState+1];
+      else
+        zzState = ZZ_LEXSTATE[zzLexicalState];
+
+
+      // Run the DFA over the input, remembering the last state that set
+      // zzAction and the position reached there.
+      zzForAction: {
+        while (true) {
+
+          if (zzCurrentPosL < zzEndReadL)
+            zzInput = zzBufferL[zzCurrentPosL++];
+          else if (zzAtEOF) {
+            zzInput = YYEOF;
+            break zzForAction;
+          }
+          else {
+            // store back cached positions
+            zzCurrentPos  = zzCurrentPosL;
+            zzMarkedPos   = zzMarkedPosL;
+            boolean eof = zzRefill();
+            // get translated positions and possibly new buffer
+            zzCurrentPosL  = zzCurrentPos;
+            zzMarkedPosL   = zzMarkedPos;
+            zzBufferL      = zzBuffer;
+            zzEndReadL     = zzEndRead;
+            if (eof) {
+              zzInput = YYEOF;
+              break zzForAction;
+            }
+            else {
+              zzInput = zzBufferL[zzCurrentPosL++];
+            }
+          }
+          // -1 in the transition table means: no transition for this input
+          int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ];
+          if (zzNext == -1) break zzForAction;
+          zzState = zzNext;
+
+          // attribute bit 1: this state selects an action (remember it);
+          // attribute bit 8: stop scanning right away in this state
+          int zzAttributes = zzAttrL[zzState];
+          if ( (zzAttributes & 1) == 1 ) {
+            zzAction = zzState;
+            zzMarkedPosL = zzCurrentPosL;
+            if ( (zzAttributes & 8) == 8 ) break zzForAction;
+          }
+
+        }
+      }
+
+      // store back cached position
+      zzMarkedPos = zzMarkedPosL;
+
+      // Execute the action of the remembered state. An action block that
+      // does not return falls through into the following "case N: break;"
+      // label, after which the outer loop scans the next match.
+      switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
+        case 39:
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 2;
+          { add("um");
+          }
+        case 48: break;
+        case 28:
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 1;
+          { cv = CONS; add("U");
+          }
+        case 49: break;
+        case 4:
+          { add(yytext());
+          }
+        case 50: break;
+        case 46:
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 3;
+          { add("Hic");
+          }
+        case 51: break;
+        case 9:
+          { cv = VOWEL; add("AE");
+          }
+        case 52: break;
+        case 1:
+          // flags the word as problematic and copies the match unchanged
+          { problem = 1; cv = 0; add(yytext());
+          }
+        case 53: break;
+        case 5:
+          // returning action: original text if a problem was flagged,
+          // otherwise the normalized text
+          { switch (problem) {
+			case 1: return original;
+			default: return normalized;
+		}
+          }
+        case 54: break;
+        case 18:
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 1;
+          { cv = CONS; add("u");
+          }
+        case 55: break;
+        case 21:
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 1;
+          { add("e");
+          }
+        case 56: break;
+        case 29:
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 1;
+          { cv = VOWEL; add(yytext().replaceAll("u", "v").replaceAll("U", "V"));
+          }
+        case 57: break;
+        case 34:
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 2;
+          { add("et");
+          }
+        case 58: break;
+        case 41:
+          // general lookahead, find correct zzMarkedPos
+          // (a forward pass from state 5 marks candidate end positions in
+          // zzFin, then a backward pass from state 6 picks the match end)
+          { int zzFState = 5;
+            int zzFPos = zzStartRead;
+            if (zzFin.length <= zzBufferL.length) { zzFin = new boolean[zzBufferL.length+1]; }
+            boolean zzFinL[] = zzFin;
+            while (zzFState != -1 && zzFPos < zzMarkedPos) {
+              if ((zzAttrL[zzFState] & 1) == 1) { zzFinL[zzFPos] = true; }
+              zzInput = zzBufferL[zzFPos++];
+              zzFState = zzTransL[ zzRowMapL[zzFState] + zzCMapL[zzInput] ];
+            }
+            if (zzFState != -1 && (zzAttrL[zzFState] & 1) == 1) { zzFinL[zzFPos] = true; }
+
+            zzFState = 6;
+            zzFPos = zzMarkedPos;
+            while (!zzFinL[zzFPos] || (zzAttrL[zzFState] & 1) != 1) {
+              zzInput = zzBufferL[--zzFPos];
+              zzFState = zzTransL[ zzRowMapL[zzFState] + zzCMapL[zzInput] ];
+            };
+            zzMarkedPos = zzFPos;
+          }
+          { cv = VOWEL; add(yytext().replace("ſ", "s"));
+          }
+        case 59: break;
+        case 3:
+          { cv = CONS; add(yytext());
+          }
+        case 60: break;
+        case 27:
+          { cv = VOWEL; add("oi");
+          }
+        case 61: break;
+        case 25:
+          { cv = CONS; add("QU");
+          }
+        case 62: break;
+        case 15:
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 1;
+          // u/U becomes v/V only when the previous token was a vowel
+          { switch(cv) {
+			case VOWEL: add(yytext().replace("u", "v").replace("U", "V")); break;
+			default: cv = VOWEL; add(yytext()); break;
+		}
+          }
+        case 63: break;
+        case 7:
+          { cv = CONS; add("ss");
+          }
+        case 64: break;
+        case 6:
+          { cv = CONS; add("s");
+          }
+        case 65: break;
+        case 22:
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 1;
+          { add("o");
+          }
+        case 66: break;
+        case 33:
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 2;
+          { add("ac");
+          }
+        case 67: break;
+        case 2:
+          { cv = VOWEL; add(yytext());
+          }
+        case 68: break;
+        case 43:
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 3;
+          { add("qui");
+          }
+        case 69: break;
+        case 35:
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 2;
+          { add("er");
+          }
+        case 70: break;
+        case 24:
+          { cv = CONS; add("Qu");
+          }
+        case 71: break;
+        case 30:
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 2;
+          { add("ve");
+          }
+        case 72: break;
+        case 38:
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 2;
+          { add("us");
+          }
+        case 73: break;
+        case 32:
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 2;
+          { add("am");
+          }
+        case 74: break;
+        case 8:
+          { cv = VOWEL; add("ae");
+          }
+        case 75: break;
+        case 11:
+          // returning action: empty string if a problem was flagged,
+          // otherwise the normalized text
+          { switch (problem) {
+			case 1: return "";
+			default: return normalized;
+		}
+          }
+        case 76: break;
+        case 26:
+          { add("ar");
+          }
+        case 77: break;
+        case 45:
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 3;
+          { add("hic");
+          }
+        case 78: break;
+        case 17:
+          { cv = VOWEL; add("uu");
+          }
+        case 79: break;
+        case 40:
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 2;
+          { add("ul");
+          }
+        case 80: break;
+        case 20:
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 1;
+          { add("a");
+          }
+        case 81: break;
+        case 10:
+          { cv = VOWEL; add("oe");
+          }
+        case 82: break;
+        case 16:
+          { cv = VOWEL; add("ui");
+          }
+        case 83: break;
+        case 14:
+          { cv = CONS; add("qu");
+          }
+        case 84: break;
+        case 47:
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 4;
+          { add("que");
+          }
+        case 85: break;
+        case 23:
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 1;
+          { add("u");
+          }
+        case 86: break;
+        case 36:
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 2;
+          { add("es");
+          }
+        case 87: break;
+        case 44:
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 3;
+          { add("Qui");
+          }
+        case 88: break;
+        case 42:
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 1;
+          { add("i");
+          }
+        case 89: break;
+        case 12:
+          { add("X");
+          }
+        case 90: break;
+        case 13:
+          // v/V becomes u/U only when the previous token was a consonant
+          { switch(cv) {
+			case CONS: add(yytext().replace("v", "u").replace("V", "U")); break;
+			default: cv = CONS; add(yytext()); break;
+		}
+          }
+        case 91: break;
+        case 19:
+          { cv = VOWEL; add("ii");
+          }
+        case 92: break;
+        case 31:
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 2;
+          { add("as");
+          }
+        case 93: break;
+        case 37:
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 2;
+          { add("od");
+          }
+        case 94: break;
+        default:
+          // no action selected: either clean end of input (nothing was
+          // consumed) or input that no rule matches
+          if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
+            zzAtEOF = true;
+            return null;
+          }
+          else {
+            zzScanError(ZZ_NO_MATCH);
+          }
+      }
+    }
+  }
+
+
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexLA.lex	Tue Feb 22 16:03:45 2011 +0100
@@ -0,0 +1,214 @@
+/*
+ * Normalization rules for Latin text
+ * [this is a JFlex specification]
+ *
+ * Wolfgang Schmidle
+ * version 0.96
+ * 2011-02-21
+ *
+ */
+
+package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang;
+
+%%
+
+%public
+%class MpdlNormalizerLexLA
+%type java.lang.String
+%unicode
+
+// Latin: la, lat
+
+%states DISP, DICT, SEARCH
+%states RENAISSANCE_DISP, RENAISSANCE_DICT, RENAISSANCE_SEARCH
+
+%{
+	// Character classes that the rule actions record in cv.
+	private static final int CONS = 1;
+	private static final int VOWEL = 2;
+	// Value assigned by the most recent rule action that set cv; the u/v rules branch on it.
+	private int cv = 0;  // consonant = 1, vowel = 2, everything else = 0
+
+	// Matched input text, accumulated unchanged by add().
+	private String original = "";
+	// Normalized replacement text, accumulated by add().
+	private String normalized = "";
+	// Set to 1 by the catch-all "." rule; the {END} rules then do not return the normalized form.
+	private int problem = 0;
+
+	// Appends the currently matched text to original and its replacement norm to normalized.
+	private void add (String norm) {
+		original += yytext();
+		normalized += norm;
+	}
+%}
+
+Vowel = [AEIOUaeiou] // without Ææęàèòùœ
+Cons = [BCDFGHKLMNPQRSTVWXZbcdfghklmnpqrstvwxzſß]
+LR = [lLrR]
+
+hyphen = [\u002d\u00ad]  // hyphen and soft hyphen
+X = {hyphen}?
+
+END = \n
+
+que = (que)?  // optional -que
+enclitic = (que | ve | ne)
+prefixCons = (in{X}ter | per | ſu{X}per | ſer) // "ſer" for forms of ſervare
+
+%%
+
+
+// TEST, see Benedetti, page 444
+𐆑 { add("X"); } // (U+10191; D800+DD91)
+
+
+<DISP, DICT, SEARCH,
+RENAISSANCE_DISP, RENAISSANCE_DICT, RENAISSANCE_SEARCH> {
+
+// 1. simple replacements
+
+// 1.1 single characters
+ſ { cv = CONS; add("s"); }
+ß { cv = CONS; add("ss"); }
+[æę] { cv = VOWEL; add("ae"); }
+Æ { cv = VOWEL; add("AE"); }
+œ { cv = VOWEL; add("oe"); }
+
+// 1.2 character combinations
+ij { cv = VOWEL; add("ii"); }
+
+// 2. superfluous diacritics
+
+// 2.1 acute accent
+q́ue / {END} { add("que"); }  // G
+á / [mrst]? {enclitic} {END} { add("a"); }  // G
+é / [mrst]? {enclitic} {END} { add("e"); }  // G
+í / [mrst]? {enclitic} {END} { add("i"); }  // G
+ó / [mrst]? {enclitic} {END} { add("o"); }  // G
+ú / [mrst]? {enclitic} {END} { add("u"); }  // G
+
+úe / {END} { add("ve"); }  // W ??
+
+// 2.2 grave accent
+à / {que} {END} { add("a"); }  // W G
+àm / {que} {END} { add("am"); }  // W (G)
+às / {que} {END} { add("as"); }  // W (G) (-àsque will likely never occur)
+è / {que} {END} { add("e"); }  // W G
+ò / {que} {END} { add("o"); }  // W G
+òd / {que} {END} { add("od"); }  // W (G)
+ùm / {que} {END} { add("um"); }  // W (G)
+ùs / {que} {END} { add("us"); }  // W G
+
+ès / {que} {END} { add("es"); }  // (G)
+^ quì / {END} { add("qui"); }  // W ??
+^ Quì / {END} { add("Qui"); }  // W ??
+àc / {END} { add("ac"); }  // W ??
+èr / {END} { add("er"); }  // W ??
+èt / {END} { add("et"); }  // W ??
+ù / {END} { add("u"); }  // W ??
+ùl / {END} { add("ul"); }  // W ??
+
+// 2.3 circumflex accent
+^ hîc / {END} { add("hic"); }  // W G
+^ Hîc / {END} { add("Hic"); }  // W G
+^ ô / {END} { add("o"); }  // G
+â / {que} {END} { add("a"); }  // W G
+ûs / {END} { add("us"); }  // W G
+âr { add("ar"); }  // W (G) --> this is only a rough approximation!
+
+// 2.4 trema
+// 2.4.1 common cases
+aë { cv = VOWEL; add("ae"); }
+oë { cv = VOWEL; add("oe"); }
+// 2.4.2 rare cases
+oï { cv = VOWEL; add("oi"); }
+uï { cv = VOWEL; add("ui"); }
+// 2.4.3 extremely rare cases
+uü { cv = VOWEL; add("uu"); }
+
+
+// 3. rules for u and v
+
+// 3.1 rules for u --> v
+
+// peruenias --> pervenias, interuallum --> intervallum
+^ {prefixCons} / {X} { cv = VOWEL; add(yytext().replace("ſ", "s")); }  // not cv = CONS !
+
+// uellet --> vellet
+^ [uU] / {Vowel} { cv = VOWEL; add(yytext().replaceAll("u", "v").replaceAll("U", "V")); }
+
+// diuidatur --> dividatur
+// ut, volui: unchanged
+// no rule for veruina because we cannot distinguish it from volui
+// u/U with a vowel as trailing context: rewritten to v/V only when cv is VOWEL,
+// i.e. when the last rule action that assigned cv recorded a vowel.
+[uU] / {Vowel} {
+		switch(cv) {
+			case VOWEL: add(yytext().replace("u", "v").replace("U", "V")); break;
+			// otherwise the u is kept and recorded as a vowel
+			default: cv = VOWEL; add(yytext()); break;
+		}
+	}
+
+// 3.2 rules for v --> u
+
+// qvam --> quam
+qv { cv = CONS; add("qu"); }  // the replaced v still counts as consonant
+Qv { cv = CONS; add("Qu"); }
+QV { cv = CONS; add("QU"); }
+
+// febrvarius --> februarius
+// curva: unchanged
+// l/r followed by v/V: the v becomes u only when cv is CONS,
+// i.e. when the last rule action that assigned cv recorded a consonant.
+{LR} [vV] {
+		switch(cv) {
+			case CONS: add(yytext().replace("v", "u").replace("V", "U")); break;
+			// otherwise the text is kept unchanged and a consonant is recorded
+			default: cv = CONS; add(yytext()); break;
+		}
+	}
+
+// februarivs --> februarius
+v / {X} {Cons} { cv = CONS; add("u"); }
+V / {X} {Cons} { cv = CONS; add("U"); }
+
+// 3.3 override default rule for .
+
+{Vowel} { cv = VOWEL; add(yytext()); }
+{Cons} { cv = CONS; add(yytext()); }
+{hyphen} { add(yytext()); }
+
+. { problem = 1; cv = 0; add(yytext()); }  // in particular "@", and from Arboreal: "〈" (2329), "〉" (232A), Ç, ç
+
+}
+
+
+<DISP, SEARCH,
+RENAISSANCE_DISP, RENAISSANCE_SEARCH> {
+
+// End of word in the display and search states: when the catch-all rule has
+// flagged a problem the text is returned as it was read, otherwise normalized.
+{END} {
+		switch (problem) {
+			case 1: return original;
+			default: return normalized;
+		}
+	}
+}
+
+<DICT,
+RENAISSANCE_DICT> {
+
+// End of word in the dictionary states: when the catch-all rule has flagged a
+// problem an empty string is returned, otherwise the normalized form.
+{END} {
+		switch (problem) {
+			case 1: return "";
+			default: return normalized;
+		}
+	}
+}
+
+
+/*
+
+Assumptions:
+- the routine is called word by word, with a \n at the end of the string
+- words containing line breaks have already been joined again
+
+
+TO DO:
+
+LA: Reconsider whether Ææęàèòùœ can be left out of the vowel class. They do no harm either, though. (Or do they!?) Distinguish vowel classes before and after the u?
+LA: Go through the diacritics again with Paul
+LA: The disambiguations by means of the diacritics are still missing.
+LA: is J really a problem case?
+LA: are there words like super-rv... or super-lv... in lower or upper case?
+
+*/
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexNL.java	Tue Feb 22 16:03:45 2011 +0100
@@ -0,0 +1,576 @@
+/* The following code was generated by JFlex 1.4.3 on 22.02.11 12:04 */
+
+/*
+ * Normalization rules for Dutch text
+ * [this is a JFlex specification]
+ *
+ * Wolfgang Schmidle
+ * version 0.96
+ * 2011-02-21
+ *
+ */
+
+package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang;
+
+
+/**
+ * This class is a scanner generated by
+ * <a href="http://www.jflex.de/">JFlex</a> 1.4.3
+ * on 22.02.11 12:04 from the specification file
+ * <tt>MpdlNormalizerLexNL.lex</tt>
+ */
+public class MpdlNormalizerLexNL {
+
+  /** This character denotes the end of file */
+  public static final int YYEOF = -1;
+
+  /** initial size of the lookahead buffer */
+  private static final int ZZ_BUFFERSIZE = 16384;
+
+  /** lexical states */
+  public static final int SEARCH = 6;
+  public static final int DICT = 4;
+  public static final int YYINITIAL = 0;
+  public static final int DISP = 2;
+
+  /**
+   * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l
+   * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l
+   *                  at the beginning of a line
+   * l is of the form l = 2*k, k a non negative integer
+   */
+  private static final int ZZ_LEXSTATE[] = {
+     0,  0,  1,  1,  2,  2,  1, 1
+  };
+
+  /**
+   * Translates characters to character classes.
+   *
+   * Run-length encoded as (count, class) pairs. Decoded, the line feed
+   * (U+000A) is class 1, '@' (U+0040) is class 3, the long s (U+017F) is
+   * class 2 and every other char is class 0.
+   */
+  private static final String ZZ_CMAP_PACKED =
+    "\12\0\1\1\65\0\1\3\u013e\0\1\2\ufe80\0";
+
+  /**
+   * Translates characters to character classes
+   */
+  private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED);
+
+  /**
+   * Translates DFA states to action switch labels.
+   */
+  private static final int [] ZZ_ACTION = zzUnpackAction();
+
+  private static final String ZZ_ACTION_PACKED_0 =
+    "\3\0\1\1\1\2\1\3\1\4\1\5";
+
+  private static int [] zzUnpackAction() {
+    int [] result = new int[8];
+    int offset = 0;
+    offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
+    return result;
+  }
+
+  private static int zzUnpackAction(String packed, int offset, int [] result) {
+    int i = 0;       /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {
+      int count = packed.charAt(i++);
+      int value = packed.charAt(i++);
+      do result[j++] = value; while (--count > 0);
+    }
+    return j;
+  }
+
+
+  /**
+   * Translates a state to a row index in the transition table
+   */
+  private static final int [] ZZ_ROWMAP = zzUnpackRowMap();
+
+  private static final String ZZ_ROWMAP_PACKED_0 =
+    "\0\0\0\4\0\10\0\14\0\14\0\14\0\14\0\14";
+
+  private static int [] zzUnpackRowMap() {
+    int [] result = new int[8];
+    int offset = 0;
+    offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
+    return result;
+  }
+
+  private static int zzUnpackRowMap(String packed, int offset, int [] result) {
+    int i = 0;  /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {
+      int high = packed.charAt(i++) << 16;
+      result[j++] = high | packed.charAt(i++);
+    }
+    return j;
+  }
+
+  /**
+   * The transition table of the DFA
+   */
+  private static final int [] ZZ_TRANS = zzUnpackTrans();
+
+  private static final String ZZ_TRANS_PACKED_0 =
+    "\1\4\1\0\1\4\1\5\1\4\1\6\1\7\1\5"+
+    "\1\4\1\10\1\7\1\5\4\0";
+
+  private static int [] zzUnpackTrans() {
+    int [] result = new int[16];
+    int offset = 0;
+    offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
+    return result;
+  }
+
+  private static int zzUnpackTrans(String packed, int offset, int [] result) {
+    int i = 0;       /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {
+      int count = packed.charAt(i++);
+      int value = packed.charAt(i++);
+      value--;
+      do result[j++] = value; while (--count > 0);
+    }
+    return j;
+  }
+
+
+  /* error codes */
+  private static final int ZZ_UNKNOWN_ERROR = 0;
+  private static final int ZZ_NO_MATCH = 1;
+  private static final int ZZ_PUSHBACK_2BIG = 2;
+
+  /* error messages for the codes above */
+  private static final String ZZ_ERROR_MSG[] = {
+    "Unkown internal scanner error",
+    "Error: could not match input",
+    "Error: pushback value was too large"
+  };
+
+  /**
+   * ZZ_ATTRIBUTE[aState] contains the attributes of state <code>aState</code>
+   */
+  private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
+
+  private static final String ZZ_ATTRIBUTE_PACKED_0 =
+    "\3\0\5\11";
+
+  private static int [] zzUnpackAttribute() {
+    int [] result = new int[8];
+    int offset = 0;
+    offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
+    return result;
+  }
+
+  private static int zzUnpackAttribute(String packed, int offset, int [] result) {
+    int i = 0;       /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {
+      int count = packed.charAt(i++);
+      int value = packed.charAt(i++);
+      do result[j++] = value; while (--count > 0);
+    }
+    return j;
+  }
+
+  /** the input device */
+  private java.io.Reader zzReader;
+
+  /** the current state of the DFA */
+  private int zzState;
+
+  /** the current lexical state */
+  private int zzLexicalState = YYINITIAL;
+
+  /** this buffer contains the current text to be matched and is
+      the source of the yytext() string */
+  private char zzBuffer[] = new char[ZZ_BUFFERSIZE];
+
+  /** the textposition at the last accepting state */
+  private int zzMarkedPos;
+
+  /** the current text position in the buffer */
+  private int zzCurrentPos;
+
+  /** startRead marks the beginning of the yytext() string in the buffer */
+  private int zzStartRead;
+
+  /** endRead marks the last character in the buffer, that has been read
+      from input */
+  private int zzEndRead;
+
+  /** number of newlines encountered up to the start of the matched text */
+  private int yyline;
+
+  /** the number of characters up to the start of the matched text */
+  private int yychar;
+
+  /**
+   * the number of characters from the last newline up to the start of the
+   * matched text
+   */
+  private int yycolumn;
+
+  /**
+   * zzAtBOL == true <=> the scanner is currently at the beginning of a line
+   */
+  private boolean zzAtBOL = true;
+
+  /** zzAtEOF == true <=> the scanner is at the EOF */
+  private boolean zzAtEOF;
+
+  /** denotes if the user-EOF-code has already been executed */
+  private boolean zzEOFDone;
+
+  /* user code: */
+	// Matched input text, accumulated unchanged by add().
+	private String original = "";
+	// Normalized replacement text, accumulated by add().
+	private String normalized = "";
+	// Set to 1 by the '@' action in yylex(); the end-of-word actions read it.
+	private int problem = 0;
+
+	// Appends the currently matched text to original and its replacement norm to normalized.
+	private void add (String norm) {
+		original += yytext();
+		normalized += norm;
+	}
+
+
+  /**
+   * Creates a new scanner
+   * There is also a java.io.InputStream version of this constructor.
+   *
+   * @param   in  the java.io.Reader to read input from.
+   */
+  public MpdlNormalizerLexNL(java.io.Reader in) {
+    this.zzReader = in;
+  }
+
+  /**
+   * Creates a new scanner.
+   * There is also java.io.Reader version of this constructor.
+   *
+   * NOTE(review): the stream is wrapped in an InputStreamReader without an
+   * explicit charset, so bytes are decoded with the platform default
+   * charset. Callers that need a fixed encoding should use the Reader
+   * constructor instead.
+   *
+   * @param   in  the java.io.Inputstream to read input from.
+   */
+  public MpdlNormalizerLexNL(java.io.InputStream in) {
+    this(new java.io.InputStreamReader(in));
+  }
+
+  /**
+   * Unpacks the compressed character translation table.
+   *
+   * @param packed   the packed character translation table
+   * @return         the unpacked character translation table
+   */
+  private static char [] zzUnpackCMap(String packed) {
+    char [] map = new char[0x10000];
+    int i = 0;  /* index in packed string  */
+    int j = 0;  /* index in unpacked array */
+    while (i < 14) {
+      int  count = packed.charAt(i++);
+      char value = packed.charAt(i++);
+      do map[j++] = value; while (--count > 0);
+    }
+    return map;
+  }
+
+
+  /**
+   * Refills the input buffer.
+   *
+   * Text before zzStartRead is dropped by moving the remaining buffer
+   * contents to index 0. If zzCurrentPos has reached the end of the buffer,
+   * the buffer is replaced by one of size zzCurrentPos*2 before reading
+   * from zzReader.
+   *
+   * @return      <code>false</code> if new input was read into the buffer,
+   *              <code>true</code> if the reader delivered no further characters.
+   *
+   * @exception   java.io.IOException  if any I/O-Error occurs
+   */
+  private boolean zzRefill() throws java.io.IOException {
+
+    /* first: make room (if you can) */
+    if (zzStartRead > 0) {
+      System.arraycopy(zzBuffer, zzStartRead,
+                       zzBuffer, 0,
+                       zzEndRead-zzStartRead);
+
+      /* translate stored positions */
+      zzEndRead-= zzStartRead;
+      zzCurrentPos-= zzStartRead;
+      zzMarkedPos-= zzStartRead;
+      zzStartRead = 0;
+    }
+
+    /* is the buffer big enough? */
+    if (zzCurrentPos >= zzBuffer.length) {
+      /* if not: blow it up */
+      char newBuffer[] = new char[zzCurrentPos*2];
+      System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length);
+      zzBuffer = newBuffer;
+    }
+
+    /* finally: fill the buffer with new input */
+    int numRead = zzReader.read(zzBuffer, zzEndRead,
+                                            zzBuffer.length-zzEndRead);
+
+    if (numRead > 0) {
+      zzEndRead+= numRead;
+      return false;
+    }
+    // unlikely but not impossible: read 0 characters, but not at end of stream
+    if (numRead == 0) {
+      // fall back to a single-character read to tell "nothing yet" from end of stream
+      int c = zzReader.read();
+      if (c == -1) {
+        return true;
+      } else {
+        zzBuffer[zzEndRead++] = (char) c;
+        return false;
+      }
+    }
+
+	// numRead < 0
+    return true;
+  }
+
+
+  /**
+   * Marks the scanner as being at end of file, drops any unread buffered
+   * input and closes the underlying reader if there is one.
+   *
+   * @exception   java.io.IOException  if closing the reader fails
+   */
+  public final void yyclose() throws java.io.IOException {
+    // Moving zzEndRead back to zzStartRead leaves no unread characters buffered.
+    zzEndRead = zzStartRead;
+    zzAtEOF = true;
+
+    if (zzReader == null) {
+      return;
+    }
+    zzReader.close();
+  }
+
+
+  /**
+   * Resets the scanner to read from a new input stream.
+   * Does not close the old reader.
+   *
+   * All internal scanner variables are reset, the old input stream
+   * <b>cannot</b> be reused (internal buffer is discarded and lost).
+   * Lexical state is set to <tt>YYINITIAL</tt>.
+   *
+   * Note: the user-code fields <tt>original</tt>, <tt>normalized</tt> and
+   * <tt>problem</tt> are not touched here, so whatever earlier yylex()
+   * calls accumulated in them is still present after a reset.
+   *
+   * @param reader   the new input stream
+   */
+  public final void yyreset(java.io.Reader reader) {
+    zzReader = reader;
+    zzAtBOL  = true;
+    zzAtEOF  = false;
+    zzEOFDone = false;
+    zzEndRead = zzStartRead = 0;
+    zzCurrentPos = zzMarkedPos = 0;
+    yyline = yychar = yycolumn = 0;
+    zzLexicalState = YYINITIAL;
+  }
+
+
+  /**
+   * Returns the current lexical state.
+   */
+  public final int yystate() {
+    return zzLexicalState;
+  }
+
+
+  /**
+   * Enters a new lexical state.
+   *
+   * The value is stored as given and used by yylex() as an index into
+   * ZZ_LEXSTATE; the states declared by this class are YYINITIAL, DISP,
+   * DICT and SEARCH.
+   *
+   * @param newState the new lexical state
+   */
+  public final void yybegin(int newState) {
+    zzLexicalState = newState;
+  }
+
+
+  /**
+   * Returns a copy of the text matched by the current regular expression,
+   * i.e. the buffer contents from zzStartRead up to zzMarkedPos.
+   */
+  public final String yytext() {
+    final int matchLength = zzMarkedPos - zzStartRead;
+    return new String(zzBuffer, zzStartRead, matchLength);
+  }
+
+
+  /**
+   * Returns the character at position <tt>pos</tt> of the matched text.
+   *
+   * Gives the same character as yytext().charAt(pos) without building the
+   * string; the index is not checked against yylength().
+   *
+   * @param pos the position of the character to fetch.
+   *            A value from 0 to yylength()-1.
+   *
+   * @return the character at position pos
+   */
+  public final char yycharat(int pos) {
+    final int bufferIndex = zzStartRead + pos;
+    return zzBuffer[bufferIndex];
+  }
+
+
+  /**
+   * Returns the length of the matched text region.
+   */
+  public final int yylength() {
+    return zzMarkedPos-zzStartRead;
+  }
+
+
+  /**
+   * Reports an error that occured while scanning.
+   *
+   * In a wellformed scanner (no or only correct usage of
+   * yypushback(int) and a match-all fallback rule) this method
+   * will only be called with things that "Can't Possibly Happen".
+   * If this method is called, something is seriously wrong
+   * (e.g. a JFlex bug producing a faulty scanner etc.).
+   *
+   * Usual syntax/scanner level error handling should be done
+   * in error fallback rules.
+   *
+   * @param   errorCode  the code of the errormessage to display
+   */
+  private void zzScanError(int errorCode) {
+    String message;
+    try {
+      message = ZZ_ERROR_MSG[errorCode];
+    }
+    catch (ArrayIndexOutOfBoundsException e) {
+      message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR];
+    }
+
+    throw new Error(message);
+  }
+
+
+  /**
+   * Pushes the specified number of characters back into the input stream
+   * by moving zzMarkedPos backwards.
+   *
+   * They will be read again by the next call of the scanning method.
+   *
+   * @param number  the number of characters to be read again.
+   *                This number must not be greater than yylength()!
+   */
+  public void yypushback(int number)  {
+    if ( yylength() < number ) {
+      zzScanError(ZZ_PUSHBACK_2BIG);
+    }
+
+    zzMarkedPos = zzMarkedPos - number;
+  }
+
+
+  /**
+   * Resumes scanning until the next regular expression is matched,
+   * the end of input is encountered or an I/O-Error occurs.
+   *
+   * Actions that only call add(...) do not return, so scanning continues
+   * with the next match. A string is returned by the end-of-word actions
+   * (3 and 5).
+   *
+   * @return      the string produced by an end-of-word action, or
+   *              <code>null</code> when the end of input is reached
+   * @exception   java.io.IOException  if any I/O-Error occurs
+   */
+  public java.lang.String yylex() throws java.io.IOException {
+    int zzInput;
+    int zzAction;
+
+    // cached fields:
+    int zzCurrentPosL;
+    int zzMarkedPosL;
+    int zzEndReadL = zzEndRead;
+    char [] zzBufferL = zzBuffer;
+    char [] zzCMapL = ZZ_CMAP;
+
+    int [] zzTransL = ZZ_TRANS;
+    int [] zzRowMapL = ZZ_ROWMAP;
+    int [] zzAttrL = ZZ_ATTRIBUTE;
+
+    while (true) {
+      zzMarkedPosL = zzMarkedPos;
+
+      zzAction = -1;
+
+      // the next match starts where the previous one ended
+      zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL;
+
+      zzState = ZZ_LEXSTATE[zzLexicalState];
+
+
+      zzForAction: {
+        while (true) {
+
+          if (zzCurrentPosL < zzEndReadL)
+            zzInput = zzBufferL[zzCurrentPosL++];
+          else if (zzAtEOF) {
+            zzInput = YYEOF;
+            break zzForAction;
+          }
+          else {
+            // store back cached positions
+            zzCurrentPos  = zzCurrentPosL;
+            zzMarkedPos   = zzMarkedPosL;
+            boolean eof = zzRefill();
+            // get translated positions and possibly new buffer
+            zzCurrentPosL  = zzCurrentPos;
+            zzMarkedPosL   = zzMarkedPos;
+            zzBufferL      = zzBuffer;
+            zzEndReadL     = zzEndRead;
+            if (eof) {
+              zzInput = YYEOF;
+              break zzForAction;
+            }
+            else {
+              zzInput = zzBufferL[zzCurrentPosL++];
+            }
+          }
+          // row of the current state plus the character class of the input
+          int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ];
+          if (zzNext == -1) break zzForAction;
+          zzState = zzNext;
+
+          int zzAttributes = zzAttrL[zzState];
+          // bit 1: this state has an action, remember it and the match end;
+          // bit 8: stop scanning for a longer match right away
+          if ( (zzAttributes & 1) == 1 ) {
+            zzAction = zzState;
+            zzMarkedPosL = zzCurrentPosL;
+            if ( (zzAttributes & 8) == 8 ) break zzForAction;
+          }
+
+        }
+      }
+
+      // store back cached position
+      zzMarkedPos = zzMarkedPosL;
+
+      // Action cases that do not return fall through into the following
+      // "case N: break;" label, which leaves the switch so the outer loop
+      // scans the next match.
+      switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
+        case 5:
+          // end of word: empty string if a problem was flagged, else the normalized text
+          { switch (problem) {
+			case 1: return "";
+			default: return normalized;
+		}
+          }
+        case 6: break;
+        case 2:
+          // flags the word as a problem and copies the matched text unchanged
+          { problem = 1; add(yytext());
+          }
+        case 7: break;
+        case 4:
+          // the matched text is replaced by "s"
+          { add("s");
+          }
+        case 8: break;
+        case 3:
+          // end of word: unmodified text if a problem was flagged, else the normalized text
+          { switch (problem) {
+			case 1: return original;
+			default: return normalized;
+		}
+          }
+        case 9: break;
+        case 1:
+          // the matched text is copied unchanged
+          { add(yytext());
+          }
+        case 10: break;
+        default:
+          // no action matched: either a clean end of input or unmatchable input
+          if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
+            zzAtEOF = true;
+            return null;
+          }
+          else {
+            zzScanError(ZZ_NO_MATCH);
+          }
+      }
+    }
+  }
+
+
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexNL.lex	Tue Feb 22 16:03:45 2011 +0100
@@ -0,0 +1,83 @@
+/*
+ * Normalization rules for Dutch text
+ * [this is a JFlex specification]
+ *
+ * Wolfgang Schmidle
+ * version 0.96
+ * 2011-02-21
+ *
+ */
+
+package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang;
+
+%%
+
+%public
+%class MpdlNormalizerLexNL
+%type java.lang.String
+%unicode
+
+// Dutch: nl
+
+%states DISP, DICT, SEARCH
+
+%{
+	// Matched input text, accumulated unchanged by add().
+	private String original = "";
+	// Normalized replacement text, accumulated by add().
+	private String normalized = "";
+	// Set to 1 by the "@" rule; the {END} rules then do not return the normalized form.
+	private int problem = 0;
+
+	// Appends the currently matched text to original and its replacement norm to normalized.
+	private void add (String norm) {
+		original += yytext();
+		normalized += norm;
+	}
+%}
+
+END = \n
+
+%%
+
+<DISP, DICT, SEARCH> {
+
+ſ { add("s"); }
+
+}
+
+
+// default rules (written without a state list)
+// "@" flags the word as a problem; any other character matched by "." is copied unchanged
+
+@ { problem = 1; add(yytext()); }
+. { add(yytext()); }
+
+
+<DISP, SEARCH> {
+
+// End of word in the display and search states: when "@" has flagged a
+// problem the text is returned as it was read, otherwise normalized.
+{END} {
+		switch (problem) {
+			case 1: return original;
+			default: return normalized;
+		}
+	}
+}
+
+<DICT> {
+
+// End of word in the dictionary state: when "@" has flagged a problem an
+// empty string is returned, otherwise the normalized form.
+{END} {
+		switch (problem) {
+			case 1: return "";
+			default: return normalized;
+		}
+	}
+}
+
+
+/*
+
+Assumptions:
+- the routine is called word by word, with a \n at the end of the string
+- words containing line breaks have already been joined again
+
+TO DO:
+
+NL: complete?
+
+*/
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexZH.java	Tue Feb 22 16:03:45 2011 +0100
@@ -0,0 +1,638 @@
+/* The following code was generated by JFlex 1.4.3 on 22.02.11 12:04 */
+
+/*
+ * Normalization rules for Chinese text
+ * [this is a JFlex specification]
+ *
+ * Wolfgang Schmidle
+ * version 0.96
+ * 2011-02-21
+ *
+ */
+
+package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang;
+
+
+/**
+ * This class is a scanner generated by
+ * <a href="http://www.jflex.de/">JFlex</a> 1.4.3
+ * on 22.02.11 12:04 from the specification file
+ * <tt>MpdlNormalizerLexZH.lex</tt>
+ */
+public class MpdlNormalizerLexZH {
+
+  /** This character denotes the end of file */
+  public static final int YYEOF = -1;
+
+  /** initial size of the lookahead buffer */
+  private static final int ZZ_BUFFERSIZE = 16384;
+
+  /** lexical states */
+  public static final int SEARCH = 6;
+  public static final int DICT = 4;
+  public static final int YYINITIAL = 0;
+  public static final int DISP = 2;
+
+  /**
+   * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l
+   * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l
+   *                  at the beginning of a line
+   * l is of the form l = 2*k, k a non negative integer
+   */
+  private static final int ZZ_LEXSTATE[] = {
+     0,  0,  1,  1,  2,  2,  3, 3
+  };
+
+  /**
+   * Translates characters to character classes
+   */
+  private static final String ZZ_CMAP_PACKED =
+    "\12\0\1\2\45\0\1\1\1\0\1\1\15\0\1\20\41\0\1\1"+
+    "\22\0\1\1\5\0\1\1\1\0\1\1\u4f84\0\1\3\176\0\1\4"+
+    "\u035a\0\1\4\u0a9a\0\1\6\u0781\0\1\10\u057a\0\1\11\u06bd\0\1\12"+
+    "\15\0\1\7\u0891\0\1\5\u1baf\0\1\13\340\0\1\14\u411a\0\1\16"+
+    "\u040e\0\1\17\u1d8f\0\1\15\u05e2\0";
+
+  /**
+   * Translates characters to character classes
+   */
+  private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED);
+
+  /**
+   * Translates DFA states to action switch labels.
+   */
+  private static final int [] ZZ_ACTION = zzUnpackAction();
+
+  private static final String ZZ_ACTION_PACKED_0 =
+    "\4\0\1\1\1\2\1\3\1\4\1\5\1\6\1\7"+
+    "\1\10\1\11\1\12\1\13\1\14\1\15\1\16\1\1"+
+    "\1\17\1\20\1\21";
+
+  private static int [] zzUnpackAction() {
+    int [] result = new int[22];
+    int offset = 0;
+    offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
+    return result;
+  }
+
+  private static int zzUnpackAction(String packed, int offset, int [] result) {
+    int i = 0;       /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {
+      int count = packed.charAt(i++);
+      int value = packed.charAt(i++);
+      do result[j++] = value; while (--count > 0);
+    }
+    return j;
+  }
+
+
+  /**
+   * Translates a state to a row index in the transition table
+   */
+  private static final int [] ZZ_ROWMAP = zzUnpackRowMap();
+
+  private static final String ZZ_ROWMAP_PACKED_0 =
+    "\0\0\0\21\0\42\0\63\0\104\0\104\0\104\0\104"+
+    "\0\104\0\104\0\104\0\104\0\104\0\104\0\104\0\104"+
+    "\0\104\0\104\0\125\0\104\0\104\0\104";
+
+  private static int [] zzUnpackRowMap() {
+    int [] result = new int[22];
+    int offset = 0;
+    offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
+    return result;
+  }
+
+  private static int zzUnpackRowMap(String packed, int offset, int [] result) {
+    int i = 0;  /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {
+      int high = packed.charAt(i++) << 16;
+      result[j++] = high | packed.charAt(i++);
+    }
+    return j;
+  }
+
+  /**
+   * The transition table of the DFA
+   */
+  private static final int [] ZZ_TRANS = zzUnpackTrans();
+
+  private static final String ZZ_TRANS_PACKED_0 =
+    "\2\5\1\0\15\5\1\6\2\5\1\7\1\10\1\11"+
+    "\1\12\1\13\1\14\1\15\1\16\1\17\1\20\1\21"+
+    "\1\22\1\23\1\5\1\6\1\5\1\24\1\25\1\10"+
+    "\1\11\1\12\1\13\1\14\1\15\1\16\1\17\1\20"+
+    "\1\21\1\22\1\23\1\5\1\6\1\5\1\24\1\7"+
+    "\1\10\1\11\1\12\1\13\1\14\1\15\1\16\1\17"+
+    "\1\20\1\21\1\22\1\23\1\5\1\6\40\0\1\26"+
+    "\1\0";
+
+  private static int [] zzUnpackTrans() {
+    int [] result = new int[102];
+    int offset = 0;
+    offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
+    return result;
+  }
+
+  private static int zzUnpackTrans(String packed, int offset, int [] result) {
+    int i = 0;       /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {
+      int count = packed.charAt(i++);
+      int value = packed.charAt(i++);
+      value--;
+      do result[j++] = value; while (--count > 0);
+    }
+    return j;
+  }
+
+
+  /* error codes */
+  private static final int ZZ_UNKNOWN_ERROR = 0;
+  private static final int ZZ_NO_MATCH = 1;
+  private static final int ZZ_PUSHBACK_2BIG = 2;
+
+  /* error messages for the codes above */
+  private static final String ZZ_ERROR_MSG[] = {
+    "Unkown internal scanner error",
+    "Error: could not match input",
+    "Error: pushback value was too large"
+  };
+
+  /**
+   * ZZ_ATTRIBUTE[aState] contains the attributes of state <code>aState</code>
+   */
+  private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
+
+  private static final String ZZ_ATTRIBUTE_PACKED_0 =
+    "\4\0\16\11\1\1\3\11";
+
+  private static int [] zzUnpackAttribute() {
+    int [] result = new int[22];
+    int offset = 0;
+    offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
+    return result;
+  }
+
+  private static int zzUnpackAttribute(String packed, int offset, int [] result) {
+    int i = 0;       /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {
+      int count = packed.charAt(i++);
+      int value = packed.charAt(i++);
+      do result[j++] = value; while (--count > 0);
+    }
+    return j;
+  }
+
+  /** the input device */
+  private java.io.Reader zzReader;
+
+  /** the current state of the DFA */
+  private int zzState;
+
+  /** the current lexical state */
+  private int zzLexicalState = YYINITIAL;
+
+  /** this buffer contains the current text to be matched and is
+      the source of the yytext() string */
+  private char zzBuffer[] = new char[ZZ_BUFFERSIZE];
+
+  /** the textposition at the last accepting state */
+  private int zzMarkedPos;
+
+  /** the current text position in the buffer */
+  private int zzCurrentPos;
+
+  /** startRead marks the beginning of the yytext() string in the buffer */
+  private int zzStartRead;
+
+  /** endRead marks the last character in the buffer, that has been read
+      from input */
+  private int zzEndRead;
+
+  /** number of newlines encountered up to the start of the matched text */
+  private int yyline;
+
+  /** the number of characters up to the start of the matched text */
+  private int yychar;
+
+  /**
+   * the number of characters from the last newline up to the start of the
+   * matched text
+   */
+  private int yycolumn;
+
+  /**
+   * zzAtBOL == true <=> the scanner is currently at the beginning of a line
+   */
+  private boolean zzAtBOL = true;
+
+  /** zzAtEOF == true <=> the scanner is at the EOF */
+  private boolean zzAtEOF;
+
+  /** denotes if the user-EOF-code has already been executed */
+  private boolean zzEOFDone;
+
+  /* user code: */
+	// Matched input text, accumulated unchanged by add().
+	private String original = "";
+	// Normalized replacement text, accumulated by add().
+	private String normalized = "";
+	// Problem flag, initially 0; presumably set and read by the rule actions in yylex() -- verify there.
+	private int problem = 0;
+
+	// Appends the currently matched text to original and its replacement norm to normalized.
+	private void add (String norm) {
+		original += yytext();
+		normalized += norm;
+	}
+
+
+  /**
+   * Creates a new scanner
+   * There is also a java.io.InputStream version of this constructor.
+   *
+   * @param   in  the java.io.Reader to read input from.
+   */
+  public MpdlNormalizerLexZH(java.io.Reader in) {
+    this.zzReader = in;
+  }
+
+  /**
+   * Creates a new scanner.
+   * There is also java.io.Reader version of this constructor.
+   *
+   * NOTE(review): the stream is wrapped in an InputStreamReader without an
+   * explicit charset, so bytes are decoded with the platform default
+   * charset. Callers that need a fixed encoding should use the Reader
+   * constructor instead.
+   *
+   * @param   in  the java.io.Inputstream to read input from.
+   */
+  public MpdlNormalizerLexZH(java.io.InputStream in) {
+    this(new java.io.InputStreamReader(in));
+  }
+
+  /**
+   * Unpacks the compressed character translation table.
+   *
+   * @param packed   the packed character translation table
+   * @return         the unpacked character translation table
+   */
+  private static char [] zzUnpackCMap(String packed) {
+    char [] map = new char[0x10000];
+    int i = 0;  /* index in packed string  */
+    int j = 0;  /* index in unpacked array */
+    while (i < 90) {
+      int  count = packed.charAt(i++);
+      char value = packed.charAt(i++);
+      do map[j++] = value; while (--count > 0);
+    }
+    return map;
+  }
+
+
+  /**
+   * Refills the input buffer.
+   *
+   * Characters from zzStartRead onwards are first moved to the start of the
+   * buffer and the stored positions are shifted accordingly; the buffer is
+   * doubled when zzCurrentPos has reached its end; then the reader is asked
+   * for more characters.
+   *
+   * @return      <code>false</code>, iff there was new input.
+   *              <code>true</code> therefore means that the end of the input
+   *              was reached.
+   *
+   * @exception   java.io.IOException  if any I/O-Error occurs
+   */
+  private boolean zzRefill() throws java.io.IOException {
+
+    /* first: make room (if you can) */
+    if (zzStartRead > 0) {
+      System.arraycopy(zzBuffer, zzStartRead,
+                       zzBuffer, 0,
+                       zzEndRead-zzStartRead);
+
+      /* translate stored positions */
+      zzEndRead-= zzStartRead;
+      zzCurrentPos-= zzStartRead;
+      zzMarkedPos-= zzStartRead;
+      zzStartRead = 0;
+    }
+
+    /* is the buffer big enough? */
+    if (zzCurrentPos >= zzBuffer.length) {
+      /* if not: blow it up */
+      char newBuffer[] = new char[zzCurrentPos*2];
+      System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length);
+      zzBuffer = newBuffer;
+    }
+
+    /* finally: fill the buffer with new input */
+    int numRead = zzReader.read(zzBuffer, zzEndRead,
+                                            zzBuffer.length-zzEndRead);
+
+    if (numRead > 0) {
+      zzEndRead+= numRead;
+      return false;
+    }
+    // unlikely but not impossible: read 0 characters, but not at end of stream
+    if (numRead == 0) {
+      int c = zzReader.read();  // single-character read to tell "nothing yet" from end of stream
+      if (c == -1) {
+        return true;
+      } else {
+        zzBuffer[zzEndRead++] = (char) c;
+        return false;
+      }
+    }
+
+	// numRead < 0
+    return true;
+  }
+
+
+  /**
+   * Marks the scanner as being at end of file, discards whatever is still
+   * buffered and closes the underlying reader if there is one.
+   *
+   * @exception   java.io.IOException  if closing the reader fails
+   */
+  public final void yyclose() throws java.io.IOException {
+    /* no further input will be delivered */
+    zzAtEOF = true;
+    /* an empty read window invalidates the buffer */
+    zzEndRead = zzStartRead;
+
+    if (zzReader == null) {
+      return;
+    }
+    zzReader.close();
+  }
+
+
+  /**
+   * Points the scanner at a new input stream and rewinds its state.
+   * The previous reader is not closed.
+   *
+   * All buffer positions and the line, character and column counters are
+   * set back to zero, so the old input stream <b>cannot</b> be continued
+   * (whatever was buffered is lost).
+   * The lexical state is set to <tt>YYINITIAL</tt>.
+   *
+   * @param reader   the new input stream
+   */
+  public final void yyreset(java.io.Reader reader) {
+    zzReader = reader;
+
+    // input status flags
+    zzAtBOL  = true;
+    zzAtEOF  = false;
+    zzEOFDone = false;
+
+    // buffer positions
+    zzStartRead = 0;
+    zzEndRead = 0;
+    zzMarkedPos = 0;
+    zzCurrentPos = 0;
+
+    // position counters
+    yycolumn = 0;
+    yychar = 0;
+    yyline = 0;
+
+    zzLexicalState = YYINITIAL;
+  }
+
+
+  /**
+   * Returns the current lexical state.
+   *
+   * @return the value currently stored in zzLexicalState
+   */
+  public final int yystate() {
+    return zzLexicalState;
+  }
+
+
+  /**
+   * Enters a new lexical state.
+   * The value is stored without any validation.
+   *
+   * @param newState the new lexical state
+   */
+  public final void yybegin(int newState) {
+    zzLexicalState = newState;
+  }
+
+
+  /**
+   * Returns the text matched by the current regular expression, i.e. a copy
+   * of the buffer contents between zzStartRead and zzMarkedPos.
+   *
+   * @return the matched text as a new String
+   */
+  public final String yytext() {
+    final int matchedLength = zzMarkedPos - zzStartRead;
+    return new String(zzBuffer, zzStartRead, matchedLength);
+  }
+
+
+  /**
+   * Returns the character at position <tt>pos</tt> from the
+   * matched text.
+   *
+   * It is equivalent to yytext().charAt(pos), but faster
+   * because the character is read directly from the buffer and no String is
+   * built. No range check against the matched text is performed here.
+   *
+   * @param pos the position of the character to fetch.
+   *            A value from 0 to yylength()-1.
+   *
+   * @return the character at position pos
+   */
+  public final char yycharat(int pos) {
+    return zzBuffer[zzStartRead+pos];
+  }
+
+
+  /**
+   * Returns the length of the matched text region.
+   *
+   * @return the number of buffer positions between zzStartRead and zzMarkedPos
+   */
+  public final int yylength() {
+    return zzMarkedPos-zzStartRead;
+  }
+
+
+  /**
+   * Reports an error that occured while scanning.
+   *
+   * In a wellformed scanner (no or only correct usage of
+   * yypushback(int) and a match-all fallback rule) this method
+   * will only be called with things that "Can't Possibly Happen".
+   * If this method is called, something is seriously wrong
+   * (e.g. a JFlex bug producing a faulty scanner etc.).
+   *
+   * Usual syntax/scanner level error handling should be done
+   * in error fallback rules.
+   *
+   * @param   errorCode  the code of the errormessage to display
+   */
+  private void zzScanError(int errorCode) {
+    String message;
+    try {
+      message = ZZ_ERROR_MSG[errorCode];
+    }
+    catch (ArrayIndexOutOfBoundsException e) {
+      message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR];
+    }
+
+    throw new Error(message);
+  }
+
+
+  /**
+   * Pushes the specified number of characters back into the input stream
+   * by moving the end of the current match backwards.
+   *
+   * They will be read again by the next call of the scanning method.
+   *
+   * @param number  the number of characters to be read again.
+   *                This number must not be greater than yylength();
+   *                otherwise a scan error (ZZ_PUSHBACK_2BIG) is reported.
+   */
+  public void yypushback(int number)  {
+    final int matched = yylength();
+    if (matched < number) {
+      zzScanError(ZZ_PUSHBACK_2BIG);
+    }
+
+    zzMarkedPos = zzMarkedPos - number;
+  }
+
+
+  /**
+   * Resumes scanning until the next regular expression is matched,
+   * the end of input is encountered or an I/O-Error occurs.
+   *
+   * Most actions only call add(...) and then reach a numbered
+   * <code>break</code> label, so the outer loop simply continues with the
+   * next match. Only actions 3 and 16 return a string to the caller.
+   * <code>null</code> is returned when the end of input is reached without a
+   * pending match.
+   *
+   * @return      the next token
+   * @exception   java.io.IOException  if any I/O-Error occurs
+   */
+  public java.lang.String yylex() throws java.io.IOException {
+    int zzInput;
+    int zzAction;
+
+    // cached fields:
+    int zzCurrentPosL;
+    int zzMarkedPosL;
+    int zzEndReadL = zzEndRead;
+    char [] zzBufferL = zzBuffer;
+    char [] zzCMapL = ZZ_CMAP;
+
+    int [] zzTransL = ZZ_TRANS;
+    int [] zzRowMapL = ZZ_ROWMAP;
+    int [] zzAttrL = ZZ_ATTRIBUTE;
+
+    while (true) {
+      zzMarkedPosL = zzMarkedPos;
+
+      zzAction = -1;
+
+      zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL;  // the next match starts where the previous one ended
+
+      zzState = ZZ_LEXSTATE[zzLexicalState];
+
+
+      zzForAction: {  // follow transitions until none is possible or the input ends
+        while (true) {
+
+          if (zzCurrentPosL < zzEndReadL)
+            zzInput = zzBufferL[zzCurrentPosL++];
+          else if (zzAtEOF) {
+            zzInput = YYEOF;
+            break zzForAction;
+          }
+          else {
+            // store back cached positions
+            zzCurrentPos  = zzCurrentPosL;
+            zzMarkedPos   = zzMarkedPosL;
+            boolean eof = zzRefill();
+            // get translated positions and possibly new buffer
+            zzCurrentPosL  = zzCurrentPos;
+            zzMarkedPosL   = zzMarkedPos;
+            zzBufferL      = zzBuffer;
+            zzEndReadL     = zzEndRead;
+            if (eof) {
+              zzInput = YYEOF;
+              break zzForAction;
+            }
+            else {
+              zzInput = zzBufferL[zzCurrentPosL++];
+            }
+          }
+          int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ];
+          if (zzNext == -1) break zzForAction;
+          zzState = zzNext;
+
+          int zzAttributes = zzAttrL[zzState];
+          if ( (zzAttributes & 1) == 1 ) {  // this state carries an action: remember it and the match end
+            zzAction = zzState;
+            zzMarkedPosL = zzCurrentPosL;
+            if ( (zzAttributes & 8) == 8 ) break zzForAction;  // flag 8: stop scanning right here
+          }
+
+        }
+      }
+
+      // store back cached position
+      zzMarkedPos = zzMarkedPosL;
+
+      switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
+        case 17:
+          { add("庶");
+          }
+        case 18: break;
+        case 9:
+          { add("時");
+          }
+        case 19: break;
+        case 2:
+          { problem = 1; add(yytext());
+          }
+        case 20: break;
+        case 3:
+          { switch (problem) {
+			case 1: return original;
+			default: return normalized;
+		}
+          }
+        case 21: break;
+        case 10:
+          { add("歷");
+          }
+        case 22: break;
+        case 13:
+          { add("面");
+          }
+        case 23: break;
+        case 14:
+          { add("精");
+          }
+        case 24: break;
+        case 12:
+          { add("陰");
+          }
+        case 25: break;
+        case 8:
+          { add("床");
+          }
+        case 26: break;
+        case 1:
+          { add(yytext());
+          }
+        case 27: break;
+        case 15:
+          { add("");
+          }
+        case 28: break;
+        case 7:
+          { add("并");
+          }
+        case 29: break;
+        case 4:
+          { add("併");
+          }
+        case 30: break;
+        case 11:
+          { add("為");
+          }
+        case 31: break;
+        case 6:
+          { add("奇");
+          }
+        case 32: break;
+        case 5:
+          { add("叟");
+          }
+        case 33: break;
+        case 16:
+          { switch (problem) {
+			case 1: return "";
+			default: return normalized;
+		}
+          }
+        case 34: break;
+        default:
+          if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {  // end of input and nothing consumed for this match
+            zzAtEOF = true;
+            return null;
+          }
+          else {
+            zzScanError(ZZ_NO_MATCH);
+          }
+      }
+    }
+  }
+
+
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexZH.lex	Tue Feb 22 16:03:45 2011 +0100
@@ -0,0 +1,119 @@
+/*
+ * Normalization rules for Chinese text
+ * [this is a JFlex specification]
+ *
+ * Wolfgang Schmidle
+ * version 0.96
+ * 2011-02-21
+ *
+ */
+
+package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang;
+
+%%
+
+%public
+%class MpdlNormalizerLexZH
+%type java.lang.String
+%unicode
+
+// classical Chinese: zh, zho, zho-Hant
+
+%states DISP, DICT, SEARCH
+
+%{
+	private String original = "";  // concatenation of the matched input text, as collected by add()
+	private String normalized = "";  // concatenation of the replacement strings handed to add()
+	private int problem = 0;  // set to 1 by the '@' rule; decides what the END rules return
+
+	private void add (String norm) {  // records the current match together with its normalized form
+		original += yytext();  // keep the text exactly as it was matched
+		normalized += norm;  // append the replacement chosen by the rule (may be empty)
+	}
+%}
+
+ZWS = [\u{200b}]
+
+END = \n
+
+%%
+
+// Normalization in Chinese means that character variants will be replaced by their standard characters
+// if there is no doubt about what the standard character is.
+
+// The input is supposed to be a single Chinese character, but strings of characters are also handled correctly.
+
+<DISP, DICT, SEARCH> {
+
+// Codepoint < FFFF
+
+倂 { add("併"); }  // 5002 --> 4F75
+傁 | 叜 { add("叟"); }  // 5081, 53DC --> 53DF
+竒 { add("奇"); }  // 7AD2 --> 5947
+幷 { add("并"); }  // 5E77 --> 5E76
+牀 { add("床"); }  // 7240 --> 5E8A
+旹 { add("時"); }  // 65F9 --> 6642
+歴 { add("歷"); }  // 6B74 --> 6B77
+爲 { add("為"); }  // 7232 --> 70BA
+隂 { add("陰"); }  // 9682 --> 9670
+靣 { add("面"); }  // 9763 --> 9762
+精 { add("精"); }  // FA1D --> 7CBE (FA1D is a compatibility ideograph)
+
+// Codepoint > FFFF
+
+// note that [ABC] is not equivalent to A | B | C  for codepoints above FFFF due to their internal encoding:
+// for example, 庶 (U+2F88D) is represented as a surrogate pair, i.e. a sequence of two UTF-16 code units: D87E DC8D
+// i.e. never use [ABC] but A | B | C
+
+庶 { add("庶"); }  // 2F88D --> 5EB6  (2F88D is a compatibility ideograph)
+
+}
+
+<DICT, SEARCH> {
+
+// remove Zero Width Space (if there is any in the input string)
+
+{ZWS} { add(""); }
+
+}
+
+// default
+
+@ { problem = 1; add(yytext()); }
+. { add(yytext()); }
+
+
+<DISP, SEARCH> {
+
+{END} {
+		switch (problem) {
+			case 1: return original;
+			default: return normalized;
+		}
+	}
+}
+
+<DICT> {
+
+{END} {
+		switch (problem) {
+			case 1: return "";
+			default: return normalized;
+		}
+	}
+}
+
+
+/*
+
+Assumptions:
+- the routine is called word by word, with a \n at the end of the string
+- words that were split by line breaks have already been put back together
+
+TO DO:
+
+ZH: extend the list
+ZH: what if one really wants to look up the variant that appears in the text? Then one presumably has to copy the character out oneself.
+ZH: should Latin letters cause problem = 1?
+
+*/
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Language.java	Thu Feb 10 14:02:05 2011 +0100
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Language.java	Tue Feb 22 16:03:45 2011 +0100
@@ -45,6 +45,48 @@
       return false;
   }

+  public boolean isGerman(String language) {
+    if (getLanguageId(language).equals("de"))
+      return true;
+    else
+      return false;
+  }
+
+  public boolean isFrench(String language) {
+    if (getLanguageId(language).equals("fr"))
+      return true;
+    else
+      return false;
+  }
+
+  public boolean isEnglish(String language) {
+    if (getLanguageId(language).equals("en"))
+      return true;
+    else
+      return false;
+  }
+
+  public boolean isDutch(String language) {
+    if (getLanguageId(language).equals("nl"))
+      return true;
+    else
+      return false;
+  }
+
+  public boolean isGreek(String language) {
+    if (getLanguageId(language).equals("el"))
+      return true;
+    else
+      return false;
+  }
+
+  public boolean isArabic(String language) {
+    if (getLanguageId(language).equals("ar"))
+      return true;
+    else
+      return false;
+  }
+
   public boolean isItalian(String language) {
     if (getLanguageId(language).equals("it"))
       return true;
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/util/Util.java	Thu Feb 10 14:02:05 2011 +0100
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/util/Util.java	Tue Feb 22 16:03:45 2011 +0100
@@ -19,6 +19,10 @@
     return props;
   }

+  public String test(String inputStr) {  // stub: inputStr is ignored
+    return "BlaBla";  // always the same constant; NOTE(review): looks like leftover debugging code - confirm it is still needed
+  }
+
   public String toYearStr(String inputStr) {
     String retYearStr = inputStr.trim();
     int index = inputStr.indexOf("-");
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/util/XmlUtil.java	Thu Feb 10 14:02:05 2011 +0100
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/util/XmlUtil.java	Tue Feb 22 16:03:45 2011 +0100
@@ -12,6 +12,7 @@
 import java.text.SimpleDateFormat;
 import java.util.ArrayList;
 import java.util.Date;
+import java.util.Iterator;

 import javax.xml.XMLConstants;
 import javax.xml.namespace.NamespaceContext;
@@ -34,6 +35,8 @@
 import javax.xml.xpath.XPathFactory;

 import net.sf.saxon.om.NodeInfo;
+import net.sf.saxon.query.QueryResult;
+import net.sf.saxon.trans.XPathException;

 import org.w3c.dom.Document;
 import org.w3c.dom.DocumentType;
@@ -50,10 +53,52 @@
   static String JAXP_SCHEMA_SOURCE = "http://java.sun.com/xml/jaxp/properties/schemaSource";
   static String W3C_XML_SCHEMA = XMLConstants.W3C_XML_SCHEMA_NS_URI;

+  private NamespaceContext namespaceContext;
+
   public static XmlUtil getInstance() {
     return new XmlUtil();
   }

+  /**
+   * Selects the namespace context that the XPath evaluation methods of this
+   * instance install when their own context field is set.
+   * Only the name "general" is recognized; any other name leaves the
+   * current setting untouched.
+   *
+   * @param nsName name of the predefined namespace context
+   */
+  public void setNsContext(String nsName) {
+    final boolean isGeneral = nsName.equals("general");
+    if (isGeneral) {
+      namespaceContext = getNsContextGeneral();
+    }
+  }
+
+  public NamespaceContext getNsContextGeneral() {
+    NamespaceContext nsContext = new NamespaceContext() {
+      public String getNamespaceURI(String prefix) {
+        String uri;
+        if (prefix.equals("xlink"))
+          uri = "http://www.w3.org/1999/xlink";
+        else if (prefix.equals("xml"))
+          uri = "http://www.w3.org/XML/1998/namespace";
+        else if (prefix.equals("dc"))
+          uri = "http://purl.org/dc/elements/1.1/";
+        else if (prefix.equals("mpiwg"))
+          uri = "http://www.mpiwg-berlin.mpg.de/ns/mpiwg";
+        else
+          uri = null;
+        return uri;
+      }
+      public String getPrefix(String uri) {
+        if (uri.equals("http://www.w3.org/1999/xlink"))
+          return "xlink";
+        else if (uri.equals("http://www.w3.org/XML/1998/namespace"))
+          return "xml";
+        else if (uri.equals("http://purl.org/dc/elements/1.1/"))
+          return "dc";
+        else if (uri.equals("http://www.mpiwg-berlin.mpg.de/ns/mpiwg"))
+          return "mpiwg";
+        else
+          return null;
+      }
+      public Iterator getPrefixes(String namespace) {
+        return null;
+      }
+    };
+    return nsContext;
+  }
+
   public Node doc(String url) throws ApplicationException {
     Node root = null;
     try {
@@ -205,6 +250,8 @@
     ArrayList<String> retStrArray = null;
     try {
       XPath xpath = XPathFactory.newInstance().newXPath();
+      if (namespaceContext != null)
+        xpath.setNamespaceContext(namespaceContext);
       if (nsContext != null)
         xpath.setNamespaceContext(nsContext);
       Object resultObjects = xpath.evaluate(xpathExpression, inputSource, XPathConstants.NODESET);
@@ -221,6 +268,8 @@
     ArrayList<Node> retArray = null;
     try {
       XPath xpath = XPathFactory.newInstance().newXPath();
+      if (namespaceContext != null)
+        xpath.setNamespaceContext(namespaceContext);
       if (nsContext != null)
         xpath.setNamespaceContext(nsContext);
       Object resultObjects = xpath.evaluate(xpathExpression, inputSource, XPathConstants.NODESET);
@@ -297,7 +346,7 @@
    * javax XPath evaluation: returns a NodeList
    * Saxon's XPath evaluation: returns an ArrayList of TinyTextImpl (which could be casted to NodeInfo which could be handled as if it was a dom node)
    */
-  private ArrayList<Node> nodesetToNodeArray(Object nodesetObjects) {
+  private ArrayList<Node> nodesetToNodeArray(Object nodesetObjects) throws ApplicationException {
     ArrayList<Node> retArray = null;
     if (nodesetObjects instanceof NodeList) {
       NodeList resultNodeList = (NodeList) nodesetObjects;
@@ -319,7 +368,16 @@
           retArray.add(n);
         } else if (arrayListNode instanceof NodeInfo) {
           NodeInfo n = (NodeInfo) arrayListNode;
-          // TODO provide clean return value
+          String xmlStr = "";
+          try {
+            xmlStr = QueryResult.serialize(n);
+            DocumentBuilderFactory dbfac = DocumentBuilderFactory.newInstance();
+            DocumentBuilder docBuilder = dbfac.newDocumentBuilder();
+            Node domNode = docBuilder.parse(new InputSource(new StringReader(xmlStr))).getDocumentElement();
+            retArray.add(domNode);
+          } catch (Exception e) {
+            throw new ApplicationException(e);
+          }
         }
       }
     }
@@ -353,11 +411,14 @@
             serializeNode(n, writer, "");
           } else if (arrayListNode instanceof NodeInfo) {
             NodeInfo n = (NodeInfo) arrayListNode;
-            writer.write(n.getStringValue());  // TODO if that really happens
+            String xmlStr = QueryResult.serialize(n);
+            writer.write(xmlStr);
           }
         }
       }
       writer.flush();
+    } catch (XPathException e) {
+      throw new ApplicationException(e);
     } catch (IOException e) {
       throw new ApplicationException(e);
     }
@@ -367,7 +428,10 @@
   public String evaluateToXmlString(String xmlString, String xpathExpression, NamespaceContext nsContext) throws ApplicationException {
     String resultStr = null;
     try {
-      XPath xpath = XPathFactory.newInstance().newXPath();
+      XPathFactory xpathFactory = net.sf.saxon.xpath.XPathFactoryImpl.newInstance();
+      XPath xpath = xpathFactory.newXPath();
+      if (namespaceContext != null)
+        xpath.setNamespaceContext(namespaceContext);
       if (nsContext != null)
         xpath.setNamespaceContext(nsContext);
       Reader stringReader = new StringReader(xmlString);
@@ -382,6 +446,70 @@
     return resultStr;
   }

+  /**
+   * Evaluates an XPath expression against an XML string and returns the
+   * selected nodes.
+   *
+   * The explicitly passed namespace context wins over the one configured on
+   * this instance; if neither is set, no namespace context is installed.
+   *
+   * NOTE(review): <code>XPathFactoryImpl.newInstance()</code> appears to be
+   * the static factory method inherited from javax.xml.xpath.XPathFactory,
+   * in which case the implementation actually returned depends on the JAXP
+   * lookup and not on the class named here - confirm that Saxon is selected.
+   *
+   * @param xmlString        the XML document to query
+   * @param xpathExpression  the XPath expression to evaluate
+   * @param nsContext        namespace context for this call, may be null
+   * @return the selected nodes as converted by nodesetToNodeArray, or null
+   *         if the evaluation produced no result object
+   * @throws ApplicationException wrapping any exception raised on the way
+   */
+  public ArrayList<Node> evaluateToNodeArray(String xmlString, String xpathExpression, NamespaceContext nsContext) throws ApplicationException {
+    try {
+      XPath evaluator = net.sf.saxon.xpath.XPathFactoryImpl.newInstance().newXPath();
+      NamespaceContext effectiveContext = (nsContext != null) ? nsContext : namespaceContext;
+      if (effectiveContext != null) {
+        evaluator.setNamespaceContext(effectiveContext);
+      }
+      InputSource source = new InputSource(new StringReader(xmlString));
+      Object nodeset = evaluator.evaluate(xpathExpression, source, XPathConstants.NODESET);
+      if (nodeset == null) {
+        return null;
+      }
+      return nodesetToNodeArray(nodeset);
+    } catch (Exception e) {
+      throw new ApplicationException(e);
+    }
+  }
+
+  /**
+   * Inserts an XML node string into an XML fragment at a text character
+   * position.
+   *
+   * Positions count text characters only: markup between "&lt;" and "&gt;"
+   * is skipped and an entity reference ("&amp;...;") counts as a single
+   * character. Position 0 means directly in front of the first text
+   * character (in front of the whole entity reference if the text starts
+   * with one); a position n &gt; 0 means directly behind the n-th text
+   * character. A position beyond the available text inserts behind the last
+   * text character. If the fragment contains no text character at all, the
+   * node is inserted at the very beginning of the fragment.
+   *
+   * @param xmlFragment    the serialized XML fragment
+   * @param charPosStr     the text character position as a decimal string
+   * @param newXmlNodeStr  the serialized node to insert
+   * @return the fragment with the node string inserted
+   * @throws NumberFormatException if charPosStr is not a valid integer
+   */
+  public String insertAtCharPos(String xmlFragment, String charPosStr, String newXmlNodeStr) {
+    int charPos = Integer.parseInt(charPosStr);
+    int insertIndex;
+    if (charPos == 0)
+      insertIndex = getFirstCharStart(xmlFragment);
+    else
+      insertIndex = getCharIndex(xmlFragment, charPos);
+    return xmlFragment.substring(0, insertIndex) + newXmlNodeStr + xmlFragment.substring(insertIndex);
+  }
+
+  /**
+   * Finds the string index at which the first text character of the
+   * fragment starts. For an entity reference this is the index of its "&amp;",
+   * so an insertion at this index never splits the reference.
+   *
+   * @param xmlFragment the serialized XML fragment
+   * @return the index of the first character outside of markup, or 0 if
+   *         there is none
+   */
+  private int getFirstCharStart(String xmlFragment) {
+    boolean isElement = false;
+    for (int i = 0; i < xmlFragment.length(); i++) {
+      char c = xmlFragment.charAt(i);
+      if (c == '<')
+        isElement = true;
+      else if (c == '>')
+        isElement = false;  // like in getCharIndex, ">" itself is never a text character
+      else if (! isElement)
+        return i;
+    }
+    return 0;
+  }
+
+  /**
+   * Translates a text character position into a string index of the
+   * fragment.
+   *
+   * @param xmlFragment the serialized XML fragment
+   * @param charPos     the 1-based number of the text character to locate
+   * @return the string index directly behind that text character; if the
+   *         fragment has fewer text characters, the index directly behind
+   *         the last one (0 if there is none)
+   */
+  private int getCharIndex(String xmlFragment, int charPos) {
+    int size = xmlFragment.length();
+    int counter = 0;
+    int charCounter = 0;
+    int counterLastChar = -1;
+    boolean isEntity = false;
+    boolean isElement = false;
+    while (counter < size) {
+      char c = xmlFragment.charAt(counter);
+      switch (c) {
+        case '<': isElement = true; break;
+        case '>': isElement = false; break;
+        case '&': isEntity = true; break;
+        case ';': isEntity = false; break;
+      }
+      // count all chars which are not inside elements and entities
+      // if element closing char ">" is found it should not be counted as a char
+      // if an entity closing char ";" is found it should be counted cause the entity itself is one char long
+      if (! isEntity && ! isElement && !(c == '>')) {
+        charCounter++;
+        counterLastChar = counter;
+      }
+      if (charCounter == charPos) {
+        break;
+      }
+      counter++;
+    }
+    // input charPos was bigger than available chars: return the last available charPos
+    if (counter == size)
+      return counterLastChar + 1;
+    return counter + 1;
+  }
+
+
   /**
    * <p> This will serialize a DOM <code>Node</code> to
    *   the supplied <code>Writer</code>. </p>
--- a/software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdltext/ExternalObject.java	Thu Feb 10 14:02:05 2011 +0100
+++ b/software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdltext/ExternalObject.java	Tue Feb 22 16:03:45 2011 +0100
@@ -23,7 +23,6 @@
 package org.exist.xquery.modules.mpdltext;

 import java.util.ArrayList;
-import java.util.Date;

 import org.exist.dom.QName;
 import org.exist.xquery.BasicFunction;
@@ -39,6 +38,8 @@

 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
 import de.mpg.mpiwg.berlin.mpdl.externalObjects.app.ExtElement;
+import de.mpg.mpiwg.berlin.mpdl.externalObjects.app.ExtObject;
+import de.mpg.mpiwg.berlin.mpdl.externalObjects.app.ExtQuery;
 import de.mpg.mpiwg.berlin.mpdl.externalObjects.app.ExternalObjectsHandler;

 /**
@@ -61,9 +62,9 @@

 	public Sequence eval(Sequence[] args, Sequence contextSequence) throws XPathException {
 	  Sequence operation = args[0];  // read, update or delete
-    Sequence type = args[1];
-    Sequence object = args[2];
-		if (operation.isEmpty() || type.isEmpty())
+    Sequence type = args[1];  // element, query
+    Sequence object = args[2];  // the object as an xml string
+		if (operation.isEmpty() || type.isEmpty() || object.isEmpty())
 			return Sequence.EMPTY_SEQUENCE;
     String operationStr = operation.getStringValue();
     String typeStr = type.getStringValue();
@@ -72,17 +73,11 @@
     ValueSequence result = null;
     String resultStr = "";
     try {
+      ExternalObjectsHandler externalObjectsHandler = ExternalObjectsHandler.getInstance();
       if (typeStr.equals("element")) {
         ExtElement e = ExtElement.parseXmlStr(objectStr);
-        if (operation.equals("create") || operation.equals("update")) {
-          Date now = new Date();
-          e.setModificationDate(now);
-        }
-        String documentId = e.getDocumentId();
-        String pageNumber = e.getPageNumber();
         if (operationStr.equals("read")) {
-          ExternalObjectsHandler externalObjectsHandler = ExternalObjectsHandler.getInstance();
-          ArrayList<ExtElement> elems = externalObjectsHandler.readExternalElements(documentId, pageNumber);
+          ArrayList<ExtElement> elems = externalObjectsHandler.readExternalElements(e);
           if (elems != null && elems.size() > 0) {
             resultStr = "<result>";
             for (int i=0; i<elems.size(); i++) {
@@ -93,14 +88,32 @@
             resultStr = resultStr + "</result>";
           }
         } else if (operationStr.equals("create")) {
-          // TODO
+          externalObjectsHandler.createExternalElement(e);
         } else if (operationStr.equals("update")) {
-          // TODO
+          externalObjectsHandler.updateExternalElement(e);
         } else if (operationStr.equals("delete")) {
-          // TODO
+          externalObjectsHandler.deleteExternalElement(e);
         }
       } else if (typeStr.equals("query")) {
-          // TODO
+        ExtQuery q = ExtQuery.parseXmlStr(objectStr);
+        if (operationStr.equals("read")) {
+          ArrayList<ExtObject> objects = externalObjectsHandler.readExternalObjects(q);
+          if (objects != null && objects.size() > 0) {
+            resultStr = "<result>";
+            for (int i=0; i<objects.size(); i++) {
+              ExtObject obj = objects.get(i);
+              String elemXmlStr = obj.getXmlString();
+              resultStr = resultStr + elemXmlStr;
+            }
+            resultStr = resultStr + "</result>";
+          }
+        } else if (operationStr.equals("create")) {
+          externalObjectsHandler.createExternalObject(q);
+        } else if (operationStr.equals("update")) {
+          externalObjectsHandler.updateExternalObject(q);
+        } else if (operationStr.equals("delete")) {
+          externalObjectsHandler.deleteExternalObject(q);
+        }
       }
       result = new ValueSequence();
       result.add(new StringValue(resultStr));
--- a/software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdltext/MPDLTextModule.java	Thu Feb 10 14:02:05 2011 +0100
+++ b/software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdltext/MPDLTextModule.java	Tue Feb 22 16:03:45 2011 +0100
@@ -51,7 +51,6 @@
     new FunctionDef(EncodeBig5.signature, EncodeBig5.class),
     new FunctionDef(LuceneQueryParser.signature, LuceneQueryParser.class),
     new FunctionDef(ExternalObject.signature, ExternalObject.class),
-    new FunctionDef(InsertAtCharPos.signature, InsertAtCharPos.class),
     new FunctionDef(ToCLevelGenerator.signature, ToCLevelGenerator.class)
 	};
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/software/eXist/webapp/mpdl/_stuff/futureDev/insert.xql	Tue Feb 22 16:03:45 2011 +0100
@@ -0,0 +1,67 @@
+xquery version "1.0";
+
+module namespace mpdl-text = "http://www.mpiwg-berlin.mpg.de/ns/mpdl/text";
+
+declare function mpdl-text:insert($fragment as element(), $externalObjects as element()*) {  (: inserts the first external object through the 5-argument overload, then recurses with the remaining objects :)
+  let $firstObject := $externalObjects[1]
+  let $xmlNodeId := $firstObject/@xmlNodeId
+  let $posNode := $fragment//*[@xmlNodeId = $xmlNodeId]  (: target element(s): descendants of $fragment carrying the same xmlNodeId :)
+  let $before := $firstObject/@before
+  let $boolBefore :=
+    if ($before = "true")
+    then true()
+    else false()
+  let $charPosStr := $firstObject/@charPos
+  let $charPos :=
+    if($charPosStr != "" and not(empty($charPosStr)))
+    then number($charPosStr)
+    else -1  (: -1 stands for "no character position given" :)
+  let $newNode := $firstObject/content
+  let $size := count($externalObjects)
+  let $otherObjects :=
+    if ($size > 1)
+    then subsequence($externalObjects, 2, $size)
+    else ()
+  let $insertedFragment := mpdl-text:insert($fragment, $posNode, $boolBefore, $charPos, $newNode)  (: NOTE(review): also bound when $externalObjects is empty; the result is only used in the $size >= 1 branch below :)
+  let $result :=
+    if ($size >= 1)
+    then
+      mpdl-text:insert($insertedFragment, $otherObjects)
+    else
+      $fragment  (: recursion ends here: nothing left to insert :)
+    return $result
+};
+
+declare function mpdl-text:insert($element as element(), $node, $before, $charPos, $newNode) {  (: rebuilds $element recursively and places $newNode before, after or inside the element matching $node :)
+  if ($element = $node and $before and $charPos = -1)  (: NOTE(review): "=" compares atomized values, not node identity - confirm that "is" was not intended :)
+  then
+  ($newNode,
+  element {node-name($node)}
+    {$node/@*,
+     for $child in $node/node()
+        return if ($child instance of element())
+          then mpdl-text:insert($child, $node, $before, $charPos, $newNode)
+          else $child
+    })
+  else if ($element = $node and not($before) and $charPos = -1)  (: same as above, but $newNode follows the rebuilt element :)
+  then
+  (element {node-name($node)}
+    {$node/@*,
+     for $child in $node/node()
+        return if ($child instance of element())
+          then mpdl-text:insert($child, $node, $before, $charPos, $newNode)
+          else $child
+    }, $newNode)
+  else if ($element = $node and $charPos >= 0)  (: a character position is given: insert inside the serialized element :)
+  then
+    util:parse(mpdltext:insertAtCharPos(util:serialize($node, ()), util:serialize($newNode, ()), $charPos))  (: NOTE(review): the prefixes "util" and "mpdltext" are not declared in the visible part of this module - confirm the imports :)
+  else
+  element {node-name($element)}
+    {$element/@*,
+     for $child in $element/node()
+        return if ($child instance of element())
+          then mpdl-text:insert($child, $node, $before, $charPos, $newNode)
+          else $child
+    }
+};
+
--- a/software/eXist/webapp/mpdl/interface/page-fragment.xql	Thu Feb 10 14:02:05 2011 +0100
+++ b/software/eXist/webapp/mpdl/interface/page-fragment.xql	Tue Feb 22 16:03:45 2011 +0100
@@ -110,6 +110,7 @@
   else ()
 let $pageHeader := string($pb1/@rhead)
 let $pageNumberOrig := string($pb1/@o)
+let $pageNumberOrigNorm := string($pb1/@o-norm)

 let $documentIdentifier :=
   if ($docbase = 'archimedes')
@@ -227,16 +228,16 @@
   if(not(empty($externalElements)))
   then true()
   else false()
-let $returnPageFragmentTmpp :=
+let $returnPageFragmentWithExtObjects :=
+  <result>
+    <externalElements>{$externalElements}</externalElements>
+    <fragment>{$returnPageFragmentTmp}</fragment>
+  </result>
+let $returnPageFragment :=
   if (contains($options, "withXmlNodeId") or $containsExternalElements)
-  then mpdl-text:insertNodeIdAttribute($returnPageFragmentTmp/*[1])
+  then mpdl-text:transform($returnPageFragmentWithExtObjects, concat($presentationPath, "/insertExternalElements.xsl"))
   else $returnPageFragmentTmp

-let $returnPageFragment :=
-  if($containsExternalElements)
-  then mpdl-text:insert($returnPageFragmentTmpp/*[1], $externalElements)
-  else $returnPageFragmentTmpp
-
 let $pageFigureAnchors := $returnPageFragment//anchor[@type = 'figure']
 let $pageFigures :=
     for $pageFigureAnchor in $pageFigureAnchors
@@ -337,6 +338,7 @@
         <number>{$pn}</number>
         <header>{$pageHeader}</header>
         <number-orig>{$pageNumberOrig}</number-orig>
+        <number-orig-norm>{$pageNumberOrigNorm}</number-orig-norm>
         <sentence-number>{$sn}</sentence-number>
         <digilib-available>{$digilibAvailable}</digilib-available>
         <image-available>{$imageIsAvailable}</image-available>
--- a/software/eXist/webapp/mpdl/page-query-result.xql	Thu Feb 10 14:02:05 2011 +0100
+++ b/software/eXist/webapp/mpdl/page-query-result.xql	Tue Feb 22 16:03:45 2011 +0100
@@ -270,16 +270,16 @@
   if(not(empty($externalElements)))
   then true()
   else false()
-let $returnPageFragmentTmpp :=
+let $returnPageFragmentWithExtObjects :=
+  <result>
+    <externalElements>{$externalElements}</externalElements>
+    <fragment>{$returnPageFragmentTmp}</fragment>
+  </result>
+let $returnPageFragment :=
   if (contains($options, "withXmlNodeId") or $containsExternalElements)
-  then mpdl-text:insertNodeIdAttribute($returnPageFragmentTmp/*[1])
+  then mpdl-text:transform($returnPageFragmentWithExtObjects, concat($presentationPath, "/insertExternalElements.xsl"))
   else $returnPageFragmentTmp

-let $returnPageFragment :=
-  if($containsExternalElements)
-  then mpdl-text:insert($returnPageFragmentTmpp/*[1], $externalElements)
-  else $returnPageFragmentTmpp
-
 let $pageFigureAnchors := $returnPageFragment//anchor[@type = 'figure']
 let $pageFigures :=
     for $pageFigureAnchor in $pageFigureAnchors
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/software/eXist/webapp/mpdl/presentation/insertExternalElements.xsl	Tue Feb 22 16:03:45 2011 +0100
@@ -0,0 +1,91 @@
+<?xml version="1.0"?>
+<xsl:stylesheet version="2.0"
+  xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+  xmlns:xs="http://www.w3.org/2001/XMLSchema"
+  xmlns:saxon="http://saxon.sf.net/"
+  xmlns:text="http://www.mpiwg-berlin.mpg.de/ns/mpdl/text"
+  xmlns:mpdlxmlutil="java:de.mpg.mpiwg.berlin.mpdl.util.XmlUtil"
+>
+
+<xsl:output method="xml" encoding="utf-8"/>
+<xsl:output name="myXml" indent="yes" omit-xml-declaration="yes"/>
+
+<xsl:variable name="externalElements" select="/result/externalElements"/>
+<xsl:variable name="fragment" select="/result/fragment"/>
+
+<!-- The result and fragment wrappers emit nothing themselves; only their children are processed. -->
+<xsl:template match="result | fragment">
+  <xsl:apply-templates/>
+</xsl:template>
+<!-- externalElements produces no output of its own (empty template).
+     Its content is reached only through the global $externalElements variable,
+     which the element() template reads when it looks for nodes to insert. -->
+<xsl:template match="externalElements"/>
+
+<!-- Attributes, text, comments and processing instructions are copied through unchanged. -->
+<!-- xsl:copy does not use its content for these node kinds, so none is supplied. -->
+<xsl:template match="attribute()|text()|comment()|processing-instruction()">
+  <xsl:copy/>
+</xsl:template>
+
+<xsl:template match="element()">
+  <!-- Copies a fragment element, adding an xmlNodeId attribute derived from its path. When an
+       external element targets this node, the first child of its content is inserted: before
+       or after the copy when charPos is negative (chosen by @before), or inside the element
+       at character position charPos otherwise. -->
+  <xsl:variable name="elemXmlNodeIdTmp" select="saxon:path(.)"/>
+  <!-- Keep only what follows the third '/' of the path (presumably drops the /result/fragment wrapper steps; confirm). -->
+  <xsl:variable name="elemXmlNodeId" select="concat('/', substring-after(substring-after(substring-after($elemXmlNodeIdTmp, '/'), '/'), '/'))"/>
+  <!-- Match on the child's own @xmlNodeId. The former index-of() over //@xmlNodeId raised an error
+       when several external elements shared one node id (multi-item predicate) and could select the
+       wrong child when their content carried xmlNodeId attributes too. Only the first match is used. -->
+  <xsl:variable name="extElem" select="$externalElements/*[@xmlNodeId = $elemXmlNodeId][1]"/>
+  <xsl:variable name="extElemContent" select="$extElem/content/*[1]"/>
+  <xsl:variable name="extElemContentSerialized" select="saxon:serialize($extElemContent, 'myXml')"/>
+  <!-- string() of a missing attribute is '', so no explicit empty check is needed. -->
+  <xsl:variable name="before" select="string($extElem/@before)"/>
+  <!-- A charPos of -1 stands for: no character position given. -->
+  <xsl:variable name="charPos">
+    <xsl:choose>
+      <xsl:when test="empty($extElem/@charPos) or $extElem/@charPos = ''">
+        <xsl:value-of select="xs:integer(-1)"/>
+      </xsl:when>
+      <xsl:otherwise>
+        <xsl:sequence select="xs:integer($extElem/@charPos)"/>
+      </xsl:otherwise>
+    </xsl:choose>
+  </xsl:variable>
+  <xsl:variable name="elemSerialized" select="saxon:serialize(., 'myXml')"/>
+  <xsl:variable name="insertedElemSerialized" select="mpdlxmlutil:insertAtCharPos(mpdlxmlutil:new(), $elemSerialized, string($charPos), $extElemContentSerialized)"/>
+  <xsl:variable name="insertedElemDeSerializedTmp" select="saxon:parse($insertedElemSerialized)"/>
+  <!-- Rebuild the element around the re-parsed children; these children are not passed through apply-templates. -->
+  <xsl:variable name="insertedElemDeSerialized">
+    <xsl:element name="{name()}">
+      <xsl:apply-templates select="attribute()"/>
+      <xsl:attribute name="xmlNodeId">
+        <xsl:value-of select="$elemXmlNodeId"/>
+      </xsl:attribute>
+      <xsl:sequence select="$insertedElemDeSerializedTmp/*[1]/node()"/>
+    </xsl:element>
+  </xsl:variable>
+  <xsl:if test="$charPos &lt; 0 and $before = 'true'">
+    <xsl:sequence select="$extElemContent"/>
+  </xsl:if>
+  <xsl:if test="$charPos &gt;= 0">
+    <xsl:sequence select="$insertedElemDeSerialized"/>
+  </xsl:if>
+  <xsl:if test="$charPos &lt; 0">
+    <xsl:copy>
+      <xsl:apply-templates select="attribute()"/>
+      <xsl:attribute name="xmlNodeId">
+        <xsl:value-of select="$elemXmlNodeId"/>
+      </xsl:attribute>
+      <xsl:apply-templates select="element()|text()|comment()|processing-instruction()"/>
+    </xsl:copy>
+  </xsl:if>
+  <xsl:if test="$charPos &lt; 0 and $before = 'false'">
+    <xsl:sequence select="$extElemContent"/>
+  </xsl:if>
+</xsl:template>
+
+</xsl:stylesheet>
--- a/software/eXist/webapp/mpdl/presentation/pageFragmentHtml.xsl	Thu Feb 10 14:02:05 2011 +0100
+++ b/software/eXist/webapp/mpdl/presentation/pageFragmentHtml.xsl	Tue Feb 22 16:03:45 2011 +0100
@@ -43,6 +43,7 @@
     <xsl:variable name="pageHeader" select="header"/>
     <xsl:variable name="pageNumber" select="number(number)"/>
     <xsl:variable name="pageNumberOrig" select="number-orig"/>
+    <xsl:variable name="pageNumberOrigNorm" select="number-orig-norm"/>
     <xsl:variable name="documentValue" select="concat('document=', $documentUri)"/>
     <xsl:variable name="pnValue" select="concat('pn=', $pageNumber)"/>
     <xsl:variable name="modeValue" select="concat('mode=', $mode)"/>
@@ -58,6 +59,10 @@
         <xsl:when test="$pageNumberOrig = ''"></xsl:when>
         <xsl:otherwise><div class="pageNumberOrig"><xsl:value-of select="$pageNumberOrig"/></div></xsl:otherwise>
       </xsl:choose>
+      <xsl:choose>
+        <xsl:when test="$pageNumberOrigNorm = ''"></xsl:when>
+        <xsl:otherwise><div class="pageNumberOrigNorm"><xsl:value-of select="$pageNumberOrigNorm"/></div></xsl:otherwise>
+      </xsl:choose>
       <xsl:if test="$pageHeader != ''">
         <div class="pageHeaderTitle"><xsl:value-of select="$pageHeader"/></div>
       </xsl:if>
@@ -282,6 +287,21 @@
   <p class="bf center"><xsl:apply-templates mode="text"/></p>
 </xsl:template>

+<!-- TEI: segmentation   -->
+<xsl:template match="seg" mode="text">
+  <!-- Wrap the segment content in a link only when the seg carries an xlink:href. -->
+  <span class="seg">
+    <xsl:choose>
+      <xsl:when test="exists(@xlink:href)">
+        <a href="{@xlink:href}"><xsl:apply-templates mode="text"/></a>
+      </xsl:when>
+      <xsl:otherwise>
+        <xsl:apply-templates mode="text"/>
+      </xsl:otherwise>
+    </xsl:choose>
+  </span>
+</xsl:template>
+
 <xsl:template match="div" mode="text">
   <xsl:variable name="type" select="@type"/>
   <xsl:variable name="level" select="@level"/>
@@ -401,8 +421,19 @@
   <xsl:if test="$collectionName = 'archimedes'"><xsl:apply-templates mode="text"/></xsl:if>
 </xsl:template>

+<!-- TEI: reference    -->
 <xsl:template match="ref" mode="text">
-  <span class="ref"><xsl:apply-templates mode="text"/></span>
+  <!-- Render the reference as a link only when it has a target attribute. -->
+  <span class="ref">
+    <xsl:choose>
+      <xsl:when test="exists(@target)">
+        <a href="{@target}"><xsl:apply-templates mode="text"/></a>
+      </xsl:when>
+      <xsl:otherwise>
+        <xsl:apply-templates mode="text"/>
+      </xsl:otherwise>
+    </xsl:choose>
+  </span>
 </xsl:template>

 <xsl:template match="foreign" mode="text">
@@ -510,7 +541,7 @@
         <xsl:value-of select="concat('↓ (', $href, ')')"/>
       </a>
     </xsl:when>
-    <xsl:otherwise><a><xsl:attribute name="href"><xsl:value-of select="@xlink:href"/></xsl:attribute>Anchor of type: <xsl:value-of select="@type"/>, href: <xsl:value-of select="@xlink:href"/></a></xsl:otherwise>
+    <xsl:otherwise><a><xsl:attribute name="href"><xsl:value-of select="@xlink:href"/></xsl:attribute><xsl:apply-templates mode="text"/></a></xsl:otherwise>
   </xsl:choose>
 </xsl:template>

@@ -661,13 +692,13 @@
     <!-- Show the sentence in color light grey if it is given as sn -->
     <xsl:when test="$sn >= 0 and $sn = $actualSN">
       <span class="s highlight">
-        <xsl:if test="contains($options, 'withXmlNodeId')"><xsl:attribute name="xmlNodeId"><xsl:value-of select="@xmlNodeId"/></xsl:attribute></xsl:if>
+        <xsl:if test="not(empty(@xmlNodeId))"><xsl:attribute name="xmlNodeId"><xsl:value-of select="@xmlNodeId"/></xsl:attribute></xsl:if>
         <xsl:apply-templates mode="text"/>
       </span>
     </xsl:when>
     <xsl:otherwise>
       <span class="s">
-        <xsl:if test="contains($options, 'withXmlNodeId')"><xsl:attribute name="xmlNodeId"><xsl:value-of select="@xmlNodeId"/></xsl:attribute></xsl:if>
+        <xsl:if test="not(empty(@xmlNodeId))"><xsl:attribute name="xmlNodeId"><xsl:value-of select="@xmlNodeId"/></xsl:attribute></xsl:if>
         <xsl:apply-templates mode="text"/>
       </span>
     </xsl:otherwise>
--- a/software/eXist/webapp/mpdl/presentation/pageHtml.xsl	Thu Feb 10 14:02:05 2011 +0100
+++ b/software/eXist/webapp/mpdl/presentation/pageHtml.xsl	Tue Feb 22 16:03:45 2011 +0100
@@ -813,6 +813,21 @@
   <p class="bf center"><xsl:apply-templates mode="text"/></p>
 </xsl:template>

+<!-- TEI: segmentation   -->
+<xsl:template match="seg" mode="text">
+  <!-- Wrap the segment content in a link only when the seg carries an xlink:href. -->
+  <span class="seg">
+    <xsl:choose>
+      <xsl:when test="exists(@xlink:href)">
+        <a href="{@xlink:href}"><xsl:apply-templates mode="text"/></a>
+      </xsl:when>
+      <xsl:otherwise>
+        <xsl:apply-templates mode="text"/>
+      </xsl:otherwise>
+    </xsl:choose>
+  </span>
+</xsl:template>
+
 <xsl:template match="div" mode="text">
   <xsl:variable name="type" select="@type"/>
   <xsl:variable name="level" select="@level"/>
@@ -932,8 +947,19 @@
   <xsl:if test="$collectionName = 'archimedes'"><xsl:apply-templates mode="text"/></xsl:if>
 </xsl:template>

+<!-- TEI: reference    -->
 <xsl:template match="ref" mode="text">
-  <span class="ref"><xsl:apply-templates mode="text"/></span>
+  <!-- Render the reference as a link only when it has a target attribute. -->
+  <span class="ref">
+    <xsl:choose>
+      <xsl:when test="exists(@target)">
+        <a href="{@target}"><xsl:apply-templates mode="text"/></a>
+      </xsl:when>
+      <xsl:otherwise>
+        <xsl:apply-templates mode="text"/>
+      </xsl:otherwise>
+    </xsl:choose>
+  </span>
 </xsl:template>

 <xsl:template match="foreign" mode="text">
@@ -1041,7 +1067,7 @@
         <xsl:value-of select="concat('↓ (', $href, ')')"/>
       </a>
     </xsl:when>
-    <xsl:otherwise><a><xsl:attribute name="href"><xsl:value-of select="$href"/></xsl:attribute>Anchor of type: <xsl:value-of select="$type"/>, href: <xsl:value-of select="$href"/></a></xsl:otherwise>
+    <xsl:otherwise><a><xsl:attribute name="href"><xsl:value-of select="$href"/></xsl:attribute><xsl:apply-templates mode="text"/></a></xsl:otherwise>
   </xsl:choose>
 </xsl:template>

@@ -1191,13 +1217,13 @@
     <!-- Show the sentence in color light grey if it is given as sn -->
     <xsl:when test="$sn >= 0 and $sn = $actualSN">
       <span class="s highlight">
-        <xsl:if test="contains($options, 'withXmlNodeId')"><xsl:attribute name="xmlNodeId"><xsl:value-of select="@xmlNodeId"/></xsl:attribute></xsl:if>
+        <xsl:if test="not(empty(@xmlNodeId))"><xsl:attribute name="xmlNodeId"><xsl:value-of select="@xmlNodeId"/></xsl:attribute></xsl:if>
         <xsl:apply-templates mode="text"/>
       </span>
     </xsl:when>
     <xsl:otherwise>
       <span class="s">
-        <xsl:if test="contains($options, 'withXmlNodeId')"><xsl:attribute name="xmlNodeId"><xsl:value-of select="@xmlNodeId"/></xsl:attribute></xsl:if>
+        <xsl:if test="not(empty(@xmlNodeId))"><xsl:attribute name="xmlNodeId"><xsl:value-of select="@xmlNodeId"/></xsl:attribute></xsl:if>
         <xsl:apply-templates mode="text"/>
       </span>
     </xsl:otherwise>
--- a/software/eXist/webapp/mpdl/text/all.xql	Thu Feb 10 14:02:05 2011 +0100
+++ b/software/eXist/webapp/mpdl/text/all.xql	Tue Feb 22 16:03:45 2011 +0100
@@ -35,69 +35,6 @@
     }
 };

-declare function mpdl-text:insert($fragment as element(), $externalObjects as element()*) {
-  let $firstObject := $externalObjects[1]
-  let $xmlNodeId := $firstObject/@xmlNodeId
-  let $posNode := $fragment//*[@xmlNodeId = $xmlNodeId]
-  let $before := $firstObject/@before
-  let $boolBefore :=
-    if ($before = "true")
-    then true()
-    else false()
-  let $charPosStr := $firstObject/@charPos
-  let $charPos :=
-    if($charPosStr != "" and not(empty($charPosStr)))
-    then number($charPosStr)
-    else -1
-  let $newNode := $firstObject/content
-  let $size := count($externalObjects)
-  let $otherObjects :=
-    if ($size > 1)
-    then subsequence($externalObjects, 2, $size)
-    else ()
-  let $insertedFragment := mpdl-text:insert($fragment, $posNode, $boolBefore, $charPos, $newNode)
-  let $result :=
-    if ($size >= 1)
-    then
-      mpdl-text:insert($insertedFragment, $otherObjects)
-    else
-      $fragment
-    return $result
-};
-
-declare function mpdl-text:insert($element as element(), $node, $before, $charPos, $newNode) {
-  if ($element = $node and $before and $charPos = -1)
-  then
-  ($newNode,
-  element {node-name($node)}
-    {$node/@*,
-     for $child in $node/node()
-        return if ($child instance of element())
-          then mpdl-text:insert($child, $node, $before, $charPos, $newNode)
-          else $child
-    })
-  else if ($element = $node and not($before) and $charPos = -1)
-  then
-  (element {node-name($node)}
-    {$node/@*,
-     for $child in $node/node()
-        return if ($child instance of element())
-          then mpdl-text:insert($child, $node, $before, $charPos, $newNode)
-          else $child
-    }, $newNode)
-  else if ($element = $node and $charPos >= 0)
-  then
-    util:parse(mpdltext:insertAtCharPos(util:serialize($node, ()), util:serialize($newNode, ()), $charPos))
-  else
-  element {node-name($element)}
-    {$element/@*,
-     for $child in $element/node()
-        return if ($child instance of element())
-          then mpdl-text:insert($child, $node, $before, $charPos, $newNode)
-          else $child
-    }
-};
-
 declare function mpdl-text:indexTerms($mpdlCollectionName, $language, $document, $indexTermsStartStr, $pn as xs:int, $pageSize as xs:int) as node()* {
   let $index :=
     if ($mpdlCollectionName = 'archimedes')