Mercurial > hg > mpdl-group

--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/client/DocumentHandler.java	Tue Feb 08 14:36:38 2011 +0100
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/client/DocumentHandler.java	Tue Feb 08 14:54:09 2011 +0100
@@ -1,6 +1,5 @@
 package de.mpg.mpiwg.berlin.mpdl.client;

-
 import java.io.File;
 import java.io.FilenameFilter;
 import java.net.MalformedURLException;
@@ -368,4 +367,4 @@
     endOfOperation = new Date().getTime();
   }

-}
+}
\ No newline at end of file
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/escidoc/TestESciDoc.java	Tue Feb 08 14:36:38 2011 +0100
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/escidoc/TestESciDoc.java	Tue Feb 08 14:54:09 2011 +0100
@@ -61,11 +61,11 @@
       TestESciDoc test = new TestESciDoc();
       test.init("jwillenborg");  // init eSciDoc-Session with cookie as user jwillenborg

-      // test.grant("aeisemann", "admin");
-      String uid = test.getUserId("aeisemann");
+      // test.grant("urte", "admin");
+      String uid = test.getUserId("urte");
       String users = test.getAllUsers();
-      String grantAdmin = test.getGrantHrefByUserNameAndRoleName("aeisemann", "escidoc:role-system-administrator");
-      String grants = test.getGrantsByUserName("aeisemann");
+      String grantAdmin = test.getGrantHrefByUserNameAndRoleName("urte", "escidoc:role-system-administrator");
+      String grants = test.getGrantsByUserName("urte");
       String bla = "";

       // test.testSchemaValidation();
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/externalObjects/app/ExtElement.java	Tue Feb 08 14:54:09 2011 +0100
@@ -0,0 +1,171 @@
+package de.mpg.mpiwg.berlin.mpdl.externalObjects.app;
+
+import java.util.Date;
+
+import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
+import de.mpg.mpiwg.berlin.mpdl.util.XmlUtil;
+
+/**
+ * External object which has, in addition to the fields of ExtObject, a page
+ * number and the optional fields xmlNodeId, before, charPos and xpath. It can
+ * be built from its XML form (parseXmlStr) and delivers that XML form
+ * (getXmlString).
+ */
+public class ExtElement extends ExtObject {
+  private String pageNumber;
+  private String xmlNodeId;
+  private String before;
+  private String charPos;
+  private String xpath;
+
+  /**
+   * Builds an element from its XML form.
+   *
+   * @param xmlStr XML string with an <code>object</code> root element
+   * @return the element with all fields which were read from xmlStr
+   * @throws ApplicationException if uid, documentId or pageNumber could not be read
+   */
+  public static ExtElement parseXmlStr(String xmlStr) throws ApplicationException {
+    XmlUtil xmlUtil = XmlUtil.getInstance();
+    String uid = xmlUtil.evaluateToString(xmlStr, "/object/@uid", null);
+    String dateStr = xmlUtil.evaluateToString(xmlStr, "/object/@modificationDate", null);
+    String docId = xmlUtil.evaluateToString(xmlStr, "/object/@documentId", null);
+    String pageNumber = xmlUtil.evaluateToString(xmlStr, "/object/@pageNumber", null);
+    String xmlNodeId = xmlUtil.evaluateToString(xmlStr, "/object/@xmlNodeId", null);
+    String before = xmlUtil.evaluateToString(xmlStr, "/object/@before", null);
+    String charPos = xmlUtil.evaluateToString(xmlStr, "/object/@charPos", null);
+    String xpath = xmlUtil.evaluateToString(xmlStr, "/object/@xpath", null);
+    String content = xmlUtil.evaluateToXmlString(xmlStr, "/object/content/*", null);
+    Date modDate = xmlUtil.toDate(dateStr);
+    if (uid == null || docId == null || pageNumber == null)
+      throw new ApplicationException("one of the required fields could not be read in: " + xmlStr);
+    ExtElement e = new ExtElement();
+    e.setUid(uid);
+    e.setModificationDate(modDate);
+    e.setDocumentId(docId);
+    e.setPageNumber(pageNumber);
+    e.setXmlNodeId(xmlNodeId);
+    e.setXpath(xpath);
+    e.setBefore(before);
+    e.setCharPos(charPos);
+    e.setContent(content);
+    return e;
+  }
+
+  @Override
+  public String toString() {
+    return getXmlString();
+  }
+
+  /**
+   * Delivers the XML form of this element. Fields which are null are left out and
+   * attribute values are escaped. The element itself is not changed by the call.
+   *
+   * @return the XML string; parseXmlStr is the way back
+   */
+  public String getXmlString() {
+    String dateStr = null;
+    if (modificationDate != null)
+      dateStr = XmlUtil.getInstance().toXsDate(modificationDate);
+    StringBuilder xml = new StringBuilder("<object");
+    appendAttribute(xml, "uid", uid);
+    appendAttribute(xml, "documentId", documentId);
+    appendAttribute(xml, "pageNumber", pageNumber);
+    appendAttribute(xml, "xmlNodeId", xmlNodeId);
+    appendAttribute(xml, "before", before);
+    appendAttribute(xml, "charPos", charPos);
+    appendAttribute(xml, "xpath", xpath);
+    appendAttribute(xml, "modificationDate", dateStr);
+    xml.append(">");
+    if (content != null)
+      xml.append("<content>").append(markContent(dateStr)).append("</content>");
+    xml.append("</object>");
+    return xml.toString();
+  }
+
+  /**
+   * Delivers the content with uid and modificationDate written as attributes into its
+   * first start tag. The content is delivered as it is if it has no tag or if the
+   * first start tag has a uid attribute already. The content field is not touched,
+   * so that toString() and getXmlString() have no side effects.
+   */
+  private String markContent(String dateStr) {
+    int firstClose = content.indexOf('>');
+    if (firstClose == -1)
+      return content;
+    String startTag = content.substring(0, firstClose);
+    if (startTag.matches("(?s).*\\suid\\s*=.*"))
+      return content;
+    // an empty element tag ends with "/>": the attributes have to go before the slash
+    int insertPos = startTag.endsWith("/") ? firstClose - 1 : firstClose;
+    StringBuilder marked = new StringBuilder(content.substring(0, insertPos));
+    appendAttribute(marked, "uid", uid);
+    appendAttribute(marked, "modificationDate", dateStr);
+    marked.append(" ").append(content.substring(insertPos));
+    return marked.toString();
+  }
+
+  /** Appends <code> name="value"</code> with an escaped value; appends nothing if value is null. */
+  private static void appendAttribute(StringBuilder xml, String name, String value) {
+    if (value == null)
+      return;
+    xml.append(" ").append(name).append("=\"").append(escapeAttributeValue(value)).append("\"");
+  }
+
+  /** Replaces the characters &amp;, &lt;, &gt; and &quot; by their predefined XML entities. */
+  private static String escapeAttributeValue(String value) {
+    StringBuilder escaped = new StringBuilder(value.length());
+    for (int i = 0; i < value.length(); i++) {
+      char c = value.charAt(i);
+      switch (c) {
+        case '&': escaped.append("&amp;"); break;
+        case '<': escaped.append("&lt;"); break;
+        case '>': escaped.append("&gt;"); break;
+        case '"': escaped.append("&quot;"); break;
+        default: escaped.append(c); break;
+      }
+    }
+    return escaped.toString();
+  }
+
+  public String getXpath() {
+    return xpath;
+  }
+
+  public void setXpath(String xpath) {
+    this.xpath = xpath;
+  }
+
+  public String getXmlNodeId() {
+    return xmlNodeId;
+  }
+
+  public void setXmlNodeId(String xmlNodeId) {
+    this.xmlNodeId = xmlNodeId;
+  }
+
+  public String getCharPos() {
+    return charPos;
+  }
+
+  public void setCharPos(String charPos) {
+    this.charPos = charPos;
+  }
+
+  public String getPageNumber() {
+    return pageNumber;
+  }
+
+  public void setPageNumber(String pageNumber) {
+    this.pageNumber = pageNumber;
+  }
+
+  public String getBefore() {
+    return before;
+  }
+
+  public void setBefore(String before) {
+    this.before = before;
+  }
+
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/externalObjects/app/ExtObject.java	Tue Feb 08 14:54:09 2011 +0100
@@ -0,0 +1,56 @@
+package de.mpg.mpiwg.berlin.mpdl.externalObjects.app;
+
+import java.util.Date;
+
+/**
+ * Base class of the external objects. It holds the fields which all of them
+ * have: uid, modification date, document id and content. The fields are
+ * protected, so that subclasses can work with them directly.
+ */
+public class ExtObject {
+  protected String uid;
+  protected Date modificationDate;
+  protected String documentId;
+  protected String content;
+
+  /** @return the uid, null if it was not set */
+  public String getUid() {
+    return this.uid;
+  }
+
+  /** @return the document id, null if it was not set */
+  public String getDocumentId() {
+    return this.documentId;
+  }
+
+  /** @return the stored modification date (no copy), null if it was not set */
+  public Date getModificationDate() {
+    return this.modificationDate;
+  }
+
+  /** @return the content, null if it was not set */
+  public String getContent() {
+    return this.content;
+  }
+
+  /** @param uid the new uid, stored as it is */
+  public void setUid(String uid) {
+    this.uid = uid;
+  }
+
+  /** @param documentId the new document id, stored as it is */
+  public void setDocumentId(String documentId) {
+    this.documentId = documentId;
+  }
+
+  /** @param modificationDate the new modification date; the given instance is stored, not a copy */
+  public void setModificationDate(Date modificationDate) {
+    this.modificationDate = modificationDate;
+  }
+
+  /** @param content the new content, stored as it is */
+  public void setContent(String content) {
+    this.content = content;
+  }
+
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/externalObjects/app/ExtQuery.java	Tue Feb 08 14:54:09 2011 +0100
@@ -0,0 +1,135 @@
+package de.mpg.mpiwg.berlin.mpdl.externalObjects.app;
+
+import java.util.Date;
+
+import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
+import de.mpg.mpiwg.berlin.mpdl.util.XmlUtil;
+
+/**
+ * External object which stores a query. In addition to the fields of ExtObject
+ * it has a query type and an optional query name. It can be built from its XML
+ * form (parseXmlStr) and delivers that XML form (getXmlString).
+ */
+public class ExtQuery extends ExtObject {
+  private String queryType;  // url, fulltext or fulltextMorph
+  private String queryName;  // optional: name of the query
+
+  /**
+   * Builds a query from its XML form.
+   *
+   * @param xmlStr XML string with an <code>object</code> root element
+   * @return the query with all fields which were read from xmlStr
+   * @throws ApplicationException if uid, documentId, queryType or content could not be read
+   */
+  public static ExtQuery parseXmlStr(String xmlStr) throws ApplicationException {
+    XmlUtil xmlUtil = XmlUtil.getInstance();
+    String uid = xmlUtil.evaluateToString(xmlStr, "/object/@uid", null);
+    String dateStr = xmlUtil.evaluateToString(xmlStr, "/object/@modificationDate", null);
+    String docId = xmlUtil.evaluateToString(xmlStr, "/object/@documentId", null);
+    String queryType = xmlUtil.evaluateToString(xmlStr, "/object/@queryType", null);
+    String queryName = xmlUtil.evaluateToString(xmlStr, "/object/@queryName", null);
+    String content = xmlUtil.evaluateToXmlString(xmlStr, "/object/content/*", null);
+    Date modDate = xmlUtil.toDate(dateStr);
+    if (uid == null || docId == null || queryType == null || content == null)
+      throw new ApplicationException("one of the required fields could not be read in: " + xmlStr);
+    ExtQuery e = new ExtQuery();
+    e.setUid(uid);
+    e.setModificationDate(modDate);
+    e.setDocumentId(docId);
+    e.setQueryType(queryType);
+    e.setQueryName(queryName);
+    e.setContent(content);
+    return e;
+  }
+
+  @Override
+  public String toString() {
+    return getXmlString();
+  }
+
+  /**
+   * Delivers the XML form of this query. Fields which are null are left out and
+   * attribute values are escaped. The query itself is not changed by the call.
+   *
+   * @return the XML string; parseXmlStr is the way back
+   */
+  public String getXmlString() {
+    String dateStr = null;
+    if (modificationDate != null)
+      dateStr = XmlUtil.getInstance().toXsDate(modificationDate);
+    StringBuilder xml = new StringBuilder("<object");
+    appendAttribute(xml, "type", "query");
+    appendAttribute(xml, "uid", uid);
+    appendAttribute(xml, "queryType", queryType);
+    appendAttribute(xml, "queryName", queryName);
+    appendAttribute(xml, "modificationDate", dateStr);
+    appendAttribute(xml, "documentId", documentId);
+    xml.append(">");
+    if (content != null)
+      xml.append("<content>").append(markContent(dateStr)).append("</content>");
+    xml.append("</object>");
+    return xml.toString();
+  }
+
+  /**
+   * Delivers the content with uid and modificationDate written as attributes into its
+   * first start tag. The content is delivered as it is if it has no tag or if the
+   * first start tag has a uid attribute already. The content field is not touched,
+   * so that toString() and getXmlString() have no side effects.
+   */
+  private String markContent(String dateStr) {
+    int firstClose = content.indexOf('>');
+    if (firstClose == -1)
+      return content;
+    String startTag = content.substring(0, firstClose);
+    if (startTag.matches("(?s).*\\suid\\s*=.*"))
+      return content;
+    // an empty element tag ends with "/>": the attributes have to go before the slash
+    int insertPos = startTag.endsWith("/") ? firstClose - 1 : firstClose;
+    StringBuilder marked = new StringBuilder(content.substring(0, insertPos));
+    appendAttribute(marked, "uid", uid);
+    appendAttribute(marked, "modificationDate", dateStr);
+    marked.append(" ").append(content.substring(insertPos));
+    return marked.toString();
+  }
+
+  /** Appends <code> name="value"</code> with an escaped value; appends nothing if value is null. */
+  private static void appendAttribute(StringBuilder xml, String name, String value) {
+    if (value == null)
+      return;
+    xml.append(" ").append(name).append("=\"").append(escapeAttributeValue(value)).append("\"");
+  }
+
+  /** Replaces the characters &amp;, &lt;, &gt; and &quot; by their predefined XML entities. */
+  private static String escapeAttributeValue(String value) {
+    StringBuilder escaped = new StringBuilder(value.length());
+    for (int i = 0; i < value.length(); i++) {
+      char c = value.charAt(i);
+      switch (c) {
+        case '&': escaped.append("&amp;"); break;
+        case '<': escaped.append("&lt;"); break;
+        case '>': escaped.append("&gt;"); break;
+        case '"': escaped.append("&quot;"); break;
+        default: escaped.append(c); break;
+      }
+    }
+    return escaped.toString();
+  }
+
+  public String getQueryType() {
+    return queryType;
+  }
+
+  public void setQueryType(String queryType) {
+    this.queryType = queryType;
+  }
+
+  public String getQueryName() {
+    return queryName;
+  }
+
+  public void setQueryName(String queryName) {
+    this.queryName = queryName;
+  }
+
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/externalObjects/app/ExternalObjectsHandler.java	Tue Feb 08 14:54:09 2011 +0100
@@ -0,0 +1,222 @@
+package de.mpg.mpiwg.berlin.mpdl.externalObjects.app;
+
+import java.io.UnsupportedEncodingException;
+import java.util.ArrayList;
+import java.util.Date;
+
+import com.sleepycat.je.Cursor;
+import com.sleepycat.je.Database;
+import com.sleepycat.je.DatabaseEntry;
+import com.sleepycat.je.DatabaseException;
+import com.sleepycat.je.LockMode;
+import com.sleepycat.je.OperationStatus;
+
+import de.mpg.mpiwg.berlin.mpdl.util.Util;
+import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
+import de.mpg.mpiwg.berlin.mpdl.externalObjects.db.DbEnvExternalObjects;
+import de.mpg.mpiwg.berlin.mpdl.general.MpdlConstants;
+
+/**
+ * Singleton which writes external elements into a Berkeley DB database and
+ * reads and deletes them there. An element is stored under the key
+ * documentId + "###" + pageNumber, its value is the XML string of the
+ * element. Keys and values are stored as UTF-8 bytes.
+ */
+public class ExternalObjectsHandler {
+  private static final String KEY_SEPARATOR = "###";
+  private static final String ENCODING = "utf-8";
+  private static ExternalObjectsHandler instance;
+  private static String MPDL_DATA_DIR = MpdlConstants.MPDL_EXIST_DATA_DIR;
+  private static String DB_DIR_EXTERNAL_OBJECTS = MPDL_DATA_DIR + "/dataBerkeleyDB/externalObjects";
+  private DbEnvExternalObjects dbEnvExternalObjects;
+  private Date beginOfOperation;
+  private Date endOfOperation;
+
+  /**
+   * Delivers the handler. The first call creates it and opens the database
+   * environment with its databases.
+   *
+   * @throws ApplicationException if the database environment could not be opened
+   */
+  public static synchronized ExternalObjectsHandler getInstance() throws ApplicationException {
+    if (instance == null) {
+      // publish the handler only after init() has succeeded: otherwise a failed init()
+      // would leave a handler without open databases behind for all later callers
+      ExternalObjectsHandler handler = new ExternalObjectsHandler();
+      handler.init();
+      instance = handler;
+    }
+    return instance;
+  }
+
+  /**
+   * Reads all elements which are stored for a page of a document.
+   *
+   * @return the elements; an empty list if there are none
+   */
+  public ArrayList<ExtElement> readExternalElements(String documentId, String pageNumber) throws ApplicationException {
+    return readDBExternalElements(documentId, pageNumber);
+  }
+
+  /** Stores the element under the key built of its document id and page number. */
+  public void writeExternalElement(ExtElement element) throws ApplicationException {
+    writeDBExternalElement(element);
+  }
+
+  /**
+   * Deletes by the key built of the document id and page number of the element.
+   * NOTE(review): the key does not contain the uid and Database.delete() discards all
+   * records of a key, so this presumably removes every element of that page - confirm
+   * that this is intended.
+   */
+  public void deleteExternalElement(ExtElement element) throws ApplicationException {
+    deleteDBExternalElement(element);
+  }
+
+  private void writeDBExternalElement(ExtElement element) throws ApplicationException {
+    try {
+      String valueStr = element.getXmlString();
+      DatabaseEntry dbEntryKey = new DatabaseEntry(keyBytes(element.getDocumentId(), element.getPageNumber()));
+      DatabaseEntry dbEntryValue = new DatabaseEntry(valueStr.getBytes(ENCODING));
+      Database elementDB = dbEnvExternalObjects.getElementDB();
+      elementDB.put(null, dbEntryKey, dbEntryValue);
+    } catch (DatabaseException e) {
+      throw new ApplicationException(e);
+    } catch (UnsupportedEncodingException e) {
+      throw new ApplicationException(e);
+    }
+  }
+
+  private void deleteDBExternalElement(ExtElement element) throws ApplicationException {
+    try {
+      DatabaseEntry dbEntryKey = new DatabaseEntry(keyBytes(element.getDocumentId(), element.getPageNumber()));
+      Database elementDB = dbEnvExternalObjects.getElementDB();
+      elementDB.delete(null, dbEntryKey);
+    } catch (DatabaseException e) {
+      throw new ApplicationException(e);
+    } catch (UnsupportedEncodingException e) {
+      throw new ApplicationException(e);
+    }
+  }
+
+  private ArrayList<ExtElement> readDBExternalElements(String documentId, String pageNumber) throws ApplicationException {
+    ArrayList<ExtElement> retElements = new ArrayList<ExtElement>();
+    try {
+      DatabaseEntry dbEntryKey = new DatabaseEntry(keyBytes(documentId, pageNumber));
+      DatabaseEntry foundValue = new DatabaseEntry();
+      Database elementDB = dbEnvExternalObjects.getElementDB();
+      Cursor cursor = elementDB.openCursor(null, null);
+      try {
+        // step through all records (duplicates) which are stored under the key
+        OperationStatus operationStatus = cursor.getSearchKey(dbEntryKey, foundValue, LockMode.DEFAULT);
+        while (operationStatus == OperationStatus.SUCCESS) {
+          String foundValueStr = new String(foundValue.getData(), ENCODING);
+          retElements.add(ExtElement.parseXmlStr(foundValueStr));
+          operationStatus = cursor.getNextDup(dbEntryKey, foundValue, LockMode.DEFAULT);
+        }
+      } finally {
+        // close the cursor also if reading or parsing of a record fails
+        cursor.close();
+      }
+    } catch (DatabaseException e) {
+      throw new ApplicationException(e);
+    } catch (UnsupportedEncodingException e) {
+      throw new ApplicationException(e);
+    }
+    return retElements;
+  }
+
+  /** Builds the database key documentId + "###" + pageNumber as UTF-8 bytes. */
+  private static byte[] keyBytes(String documentId, String pageNumber) throws UnsupportedEncodingException {
+    String keyStr = documentId + KEY_SEPARATOR + pageNumber;
+    return keyStr.getBytes(ENCODING);
+  }
+
+  private void init() throws ApplicationException {
+    dbEnvExternalObjects = new DbEnvExternalObjects();
+    dbEnvExternalObjects.setDataDir(DB_DIR_EXTERNAL_OBJECTS);
+    dbEnvExternalObjects.init();
+    dbEnvExternalObjects.openDatabases();
+  }
+
+  /** Test program: reads the sample data and prints them together with the needed time. */
+  public static void main(String[] args) throws ApplicationException {
+    ExternalObjectsHandler handler = getInstance();
+    handler.beginOperation();
+    System.out.print("Start ...");
+    try {
+      // handler.deleteSampleData();
+      // handler.writeSampleData();
+      handler.readSampleData();
+    } finally {
+      handler.end();  // close the database environment also if the sample run fails
+    }
+    handler.endOperation();
+    Double elapsedTime = new Util().getSecondWithMillisecondsBetween(handler.beginOfOperation, handler.endOfOperation);
+    System.out.println("End.");
+    System.out.println("Needed time: " + elapsedTime + " seconds");
+  }
+
+  private void deleteSampleData() throws ApplicationException {
+    ExtElement e = new ExtElement();
+    e.setUid("joe");
+    e.setDocumentId("/archimedes/it/l223.xml");
+    e.setPageNumber("17");
+    deleteExternalElement(e);
+  }
+
+  private void writeSampleData() throws ApplicationException {
+    Date now = new Date();
+
+    String sId = "1.2.2.2.2.5";
+    ExtElement e = new ExtElement();
+    e.setUid("joe");
+    e.setModificationDate(now);
+    e.setDocumentId("/archimedes/it/l223.xml");
+    e.setPageNumber("17");
+    e.setXmlNodeId(sId);
+    e.setContent("<note>This is a test note to sentence " + sId + "</note>");
+    writeExternalElement(e);
+
+    ExtElement e2 = new ExtElement();
+    String sId2 = "1.2.2.2.2.7";
+    e2.setUid("michael");
+    e2.setModificationDate(now);
+    e2.setDocumentId("/archimedes/it/l223.xml");
+    e2.setPageNumber("17");
+    e2.setXmlNodeId(sId2);
+    e2.setCharPos("18");
+    e2.setContent("<note>This is a test note to sentence " + sId2 + "</note>");
+    writeExternalElement(e2);
+
+    /*
+    String sId3 = "1.2.2.2.2.8.15.3.3";
+    e3.setUid("joe");
+    e3.setModificationDate(now);
+    e3.setDocumentId("/archimedes/it/l223.xml");
+    e3.setPageNumber("17");
+    e3.setXmlNodeId(sId3);
+    e2.setContent("<note>This is an external test note to sentence " + sId3 + "</note>");
+    writeExternalElement(e3);
+     */
+
+  }
+
+  private void readSampleData() throws ApplicationException {
+    ArrayList<ExtElement> elements = readExternalElements("/archimedes/it/l223.xml", "17");
+    System.out.println(elements);
+  }
+
+  private void end() throws ApplicationException {
+    dbEnvExternalObjects.close();
+  }
+
+  private void beginOperation() {
+    beginOfOperation = new Date();
+  }
+
+  private void endOperation() {
+    endOfOperation = new Date();
+  }
+
+}
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/externalObjects/db/DbEnvExternalObjects.java	Tue Feb 08 14:54:09 2011 +0100
@@ -0,0 +1,135 @@
+package de.mpg.mpiwg.berlin.mpdl.externalObjects.db;
+
+import java.io.File;
+
+import com.sleepycat.je.Database;
+import com.sleepycat.je.DatabaseConfig;
+import com.sleepycat.je.DatabaseException;
+import com.sleepycat.je.Environment;
+import com.sleepycat.je.EnvironmentConfig;
+
+import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
+
+/**
+ * Berkeley DB environment of the external objects with its two databases
+ * "ElementDB" and "ObjectDB". Both databases are transactional and allow
+ * sorted duplicates, i.e. many records with the same key.
+ */
+public class DbEnvExternalObjects {
+  private String dataDir;
+  private File envPath;
+  private Environment env;
+  private EnvironmentConfig envConfig;
+  private DatabaseConfig dbConfig;
+  private Database elementDB;
+  private Database objectDB;
+
+  public DbEnvExternalObjects() {
+  }
+
+  /** Sets the home directory of the environment; has to be called before init(). */
+  public void setDataDir(String dataDir) {
+    this.dataDir = dataDir;
+  }
+
+  /**
+   * Configures and opens the environment in the data directory. The directory is
+   * created if it is missing.
+   *
+   * @throws ApplicationException if no data directory is set, if the directory could
+   *         not be created or if the environment could not be opened
+   */
+  public void init() throws ApplicationException {
+    if (dataDir == null)
+      throw new ApplicationException("the data directory of the external objects is not set");
+    envPath = new File(dataDir);
+    // the environment needs an existing home directory, so create it on the first start
+    if (! envPath.isDirectory() && ! envPath.mkdirs())
+      throw new ApplicationException("could not create the data directory: " + envPath);
+    try {
+      envConfig = new EnvironmentConfig();
+      dbConfig = new DatabaseConfig();
+      envConfig.setReadOnly(false);
+      dbConfig.setReadOnly(false);
+      envConfig.setAllowCreate(true);
+      dbConfig.setAllowCreate(true);
+      envConfig.setTransactional(true);
+      dbConfig.setTransactional(true);
+      // allow duplicates for keys
+      dbConfig.setSortedDuplicates(true);
+      env = new Environment(envPath, envConfig);
+    } catch (DatabaseException e) {
+      throw new ApplicationException(e);
+    }
+  }
+
+  /** Opens the two databases and creates them if they do not exist. */
+  public void openDatabases() throws ApplicationException {
+    try {
+      elementDB = env.openDatabase(null, "ElementDB", dbConfig);
+      objectDB = env.openDatabase(null, "ObjectDB", dbConfig);
+    } catch (DatabaseException e) {
+      throw new ApplicationException(e);
+    }
+  }
+
+  /** Closes the two databases and removes them from the environment. */
+  public void removeDatabases() throws ApplicationException {
+    try {
+      closeDatabases();
+      env.removeDatabase(null, "ElementDB");
+      env.removeDatabase(null, "ObjectDB");
+    } catch (DatabaseException e) {
+      throw new ApplicationException(e);
+    }
+  }
+
+  public Environment getEnv() {
+    return env;
+  }
+
+  public Database getElementDB() {
+    return elementDB;
+  }
+
+  public Database getObjectDB() {
+    return objectDB;
+  }
+
+  /**
+   * Closes the databases and the environment. The environment is closed also if a
+   * database could not be closed, and all handles are forgotten afterwards, so that
+   * a second call does nothing.
+   */
+  public void close() throws ApplicationException {
+    if (env == null)
+      return;
+    try {
+      try {
+        closeDatabases();
+      } finally {
+        env.close();
+      }
+    } catch (DatabaseException e) {
+      throw new ApplicationException(e);
+    } finally {
+      env = null;
+    }
+  }
+
+  /** Closes both database handles; the second one is closed also if the first close fails. */
+  private void closeDatabases() throws DatabaseException {
+    Database first = objectDB;
+    Database second = elementDB;
+    objectDB = null;
+    elementDB = null;
+    try {
+      if (first != null)
+        first.close();
+    } finally {
+      if (second != null)
+        second.close();
+    }
+  }
+}
+
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/general/MpdlConstants.java	Tue Feb 08 14:36:38 2011 +0100
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/general/MpdlConstants.java	Tue Feb 08 14:54:09 2011 +0100
@@ -13,7 +13,7 @@
   public static String MPDL_PROJECT_NAME = "mpdl";
   public static String TYPE_STATIC = "static";
   public static String TYPE_DYNAMIC = "dynamic";
-
+
   // eXist settings: data
   public static String MPDL_EXIST_DATA_DIR = EXIST_HOME + "/webapp/WEB-INF/dataMpdl";  // other call would be: ConfigurationHelper.getExistHome()
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlNormalizer.java	Tue Feb 08 14:36:38 2011 +0100
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlNormalizer.java	Tue Feb 08 14:54:09 2011 +0100
@@ -1,23 +1,19 @@
 package de.mpg.mpiwg.berlin.mpdl.lt.analyzer;

-import java.io.BufferedReader;
 import java.io.IOException;
-import java.io.InputStreamReader;
-import java.io.UnsupportedEncodingException;
+import java.io.StringReader;
 import java.util.ArrayList;

 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
+import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang.MpdlNormalizerLexAll;
 import de.mpg.mpiwg.berlin.mpdl.lt.doc.regularization.Regularization;
 import de.mpg.mpiwg.berlin.mpdl.lt.doc.regularization.RegularizationManager;
 import de.mpg.mpiwg.berlin.mpdl.lt.general.Language;

 public class MpdlNormalizer {
-  static final private String IT_VOWELS = "AEIOUaeiou" +
-                                          "\u00c6\u00e6" + // AE ligatures
-                                          "\u0152\u0153";  // OE ligatures
-  static final private String IT_CONS = "BCDFGHKLMNPQRSTVWXZ" +
-                                        "bcdfghklmnpqrstvwxz" +
-                                        "\u017f\u00df";  // long/sharp S
+  public static final int MODE_4LEXICA = 1;  // normalization for lexica etc. which sometimes contain only ascii
+  public static final int MODE_4HUMAN_READERS = 2;  // normalization for human readers
+  private int normMode = MODE_4LEXICA;  // default mode; changed by setNormMode
   private String[] normFunctionsToUse = {"reg", "norm"};  // default is to use all of these normalization functions
   private String language;
   private int[] offsets;
@@ -32,6 +28,10 @@
     this.language = language;
   }

+  public void setNormMode(int normMode) {
+    this.normMode = normMode;
+  }
+
   /**
    * Applies the normalization rules in <code>language</code> to
    * <code>s</code>, without offset tracking.
@@ -52,8 +52,11 @@
       }
     }
     if (useNormFunction()) {
-      // normalize the string by string replace
-      normStr = normalize(normStr, null);
+      // normalize the string by string replacements
+      if (normMode == MODE_4LEXICA)
+        normStr = normalize4Lexica(normStr, null);
+      else if (normMode == MODE_4HUMAN_READERS)
+        normStr = normalize4HumanReaders(normStr);
     }
     return normStr;
   }
@@ -92,7 +95,7 @@
    * @param offsets      character offset table
    * @return             normalized string
    */
-  public String normalize(String s, int[] offsets) {
+  private String normalize4Lexica(String s, int[] offsets) {
     this.offsets = offsets;
     if (language.equals("la") || language.equals("lat")) {
       StringBuffer buf = new StringBuffer();
@@ -479,9 +482,11 @@
           case '\u00e4': replace = "ae"; break;
           case '\u00f6': replace = "oe"; break;
           case '\u00fc': replace = "ue"; break;
+          case '\u00ad': break; // soft hyphen
           case '\u00e9': replace = "e"; break;
-          case '\u00ad': break; // soft hyphen
-          case '-': break;
+          // new in MPDL project by J. Willenborg
+          case '\u017f': replace = "s"; break; // LATIN SMALL LETTER LONG S
+          // case '-': break;
           default: replace += c; break;
         }
         buf.append(replace);
@@ -1007,16 +1012,126 @@
       return s;
     }
   }
-
-  /**
-   * Returns the offset table.
-   *
-   * @return             offset table
-   */
-  public int[] getOffsetTable() {
-    return offsets;
+
+  // Normalizes s for human readers with the generated lexer MpdlNormalizerLexAll. Only Latin and
+  // Chinese have a lexer state; all other languages fall back to normalize4Lexica.
+  private String normalize4HumanReaders(String s) {
+    MpdlNormalizerLexAll lexer = new MpdlNormalizerLexAll(new StringReader(s + "\n"));
+    // the lexer state selects the rules of the language
+    if (Language.getInstance().isLatin(language)) {
+      lexer.yybegin(MpdlNormalizerLexAll.LA);
+    } else if (Language.getInstance().isChinese(language)) {
+      lexer.yybegin(MpdlNormalizerLexAll.ZH);
+    } else {
+      // TODO normalization for all languages
+      return normalize4Lexica(s, null);  // old function
+    }
+    StringBuilder normalized = new StringBuilder();
+    try {
+      String token = lexer.yylex();
+      while (token != null) {  // null ends the token stream
+        normalized.append(token);
+        token = lexer.yylex();
+      }
+    } catch (IOException e) {
+      // not expected for a StringReader; the loop is left here, because trying again after
+      // a failed read would repeat the same failure forever
+    }
+    return normalized.toString();
   }

+  /*
+  // explicit words
+  normStr = normStr.replaceAll("aliàs", "alias");
+  normStr = normStr.replaceAll("hîc", "hic");
+  normStr = normStr.replaceAll("quòd", "quod");
+  normStr = normStr.replaceAll("Quòd", "Quod");
+  normStr = normStr.replaceAll("QVòd", "Quod");
+  normStr = normStr.replaceAll("Cùmque", "Cumque");
+  normStr = normStr.replaceAll("aër", "aer");
+  // ij
+  normStr = normStr.replaceAll("ij", "ii");
+  // qu/qv
+  normStr = normStr.replaceAll("qv", "qu");
+  // normStr = normStr.replaceAll("qV", "qU");
+  normStr = normStr.replaceAll("Qv", "Qu");
+  normStr = normStr.replaceAll("QV", "QU");
+  // u/v
+  String vowels = getVowels();
+  String consonants = getConsonants();
+  normStr = normStr.replaceAll("([" + vowels + "])([-]*)u([" + vowels +"])", "$1$2v$3");  // vowel + u + vowel --> vowel + v + vowel
+  normStr = normStr.replaceAll("([" + vowels + "])([-]*)U([" + vowels +"])", "$1$2V$3");  // vowel + U + vowel --> vowel + V + vowel
+  normStr = normStr.replaceAll("([" + consonants + "])([-]*)v([" + consonants +"])", "$1$2u$3");  // consonant + v + consonant --> consonant + u + consonant
+  normStr = normStr.replaceAll("([" + consonants + "])([-]*)V([" + consonants +"])", "$1$2U$3");  // consonant + V + consonant --> consonant + U + consonant
+  normStr = normStr.replaceAll("^v([" + consonants +"])", "u$1");  // v + consonant --> u + consonant
+  normStr = normStr.replaceAll("^V([" + consonants +"])", "U$1");  // V + consonant --> U + consonant
+  // end of word: diacritica
+  normStr = normStr.replaceAll("à$", "a");
+  normStr = normStr.replaceAll("è$", "e");
+  normStr = normStr.replaceAll("ò$", "o");
+  normStr = normStr.replaceAll("àm$", "am");
+  normStr = normStr.replaceAll("ùm$", "um");
+  String normStrTmp = normStr;
+  normStr = "";
+  for (int i = 0; i < normStrTmp.length(); i++) {
+    char c = normStrTmp.charAt(i);
+    String replace = "";
+    switch (c) {
+      case 'ſ': replace = "s"; break;
+      case 'ß': replace = "ss"; break;
+      case 'æ': replace = "ae"; break;
+      case 'Æ': replace = "AE"; break;
+      case 'ę': replace = "ae"; break;
+      case 'œ': replace = "oe"; break;
+      default: replace += c; break;
+    }
+    normStr = normStr + replace;
+  }
+
+
+  private String getVowels() {
+    String retStr = null;
+    if (Language.getInstance().isItalian(language)) {
+      retStr = "AEIOUaeiou" +
+               "\u00c6\u00e6" + // AE ligatures
+               "\u0152\u0153";  // OE ligatures
+    } else if (Language.getInstance().isLatin(language)) {
+      retStr = "AEIOUaeiouÆœęàèòù";
+    }
+    // TODO all languages
+    return retStr;
+  }
+
+  private String getConsonants() {
+    String retStr = null;
+    if (Language.getInstance().isItalian(language)) {
+      retStr = "BCDFGHKLMNPQRSTVWXZ" +
+               "bcdfghklmnpqrstvwxz" +
+               "ſß";  // long/sharp S
+    } else if (Language.getInstance().isLatin(language)) {
+      retStr = "BCDFGHKLMNPQRSTVWXZ" +
+               "bcdfghklmnpqrstvwxz" +
+               "ſß";  // long/sharp S
+    }
+    // TODO all languages
+    return retStr;
+  }
+
+
+
+
+
+  *
+  *
+  *
+  *
+  */
+
+
+
+
+
+
   /**
    * Returns a copy of an integer array with the element at
    * <code>index</code> removed ("killed").
@@ -1024,7 +1139,7 @@
    * @param array        integer array
    * @param index        index of element to remove
    */
-  static private int[] arrayKill(int[] array, int index) {
+  private int[] arrayKill(int[] array, int index) {
     int[] newArray = new int[array.length - 1];
     System.arraycopy(array, 0, newArray, 0, index);
     System.arraycopy(array, index + 1, newArray, index, array.length - index - 1);
@@ -1040,7 +1155,7 @@
    * @param value        value to insert into new slots
    * @param count        number of new slots to insert
    */
-  static private int[] arrayInsert(int[] array, int index, int value, int count) {
+  private int[] arrayInsert(int[] array, int index, int value, int count) {
     int[] newArray = new int[array.length + count];
     System.arraycopy(array, 0, newArray, 0, index);
     for (int i = 0; i < count; i++) newArray[index + i] = value;
@@ -1048,31 +1163,4 @@
     return newArray;
   }

-  /**
-   * We provide <code>main()</code> so that our services will be available
-   * outside Java (i.e., so we can run as a Un*x-style filter).
-   */
-  static public void main(String[] argv) throws ApplicationException {
-    if (argv.length != 1) {
-      System.err.println("You must specify a language.");
-      System.exit(1);
-    }
-    String rec;
-    StringBuffer buf = new StringBuffer();
-    BufferedReader bin = null;
-    try {
-      bin = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
-      while ((rec = bin.readLine()) != null)
-        buf.append(rec + "\n");
-    }
-    catch (UnsupportedEncodingException e) {
-      System.err.println(e);
-      System.exit(1);
-    } catch (IOException e) {
-      System.err.println(e);
-      System.exit(1);
-    }
-    MpdlNormalizer orth = new MpdlNormalizer(argv[0]);
-    System.out.print(orth.normalize(buf.toString()));
-  }
 }
\ No newline at end of file
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizer.java	Tue Feb 08 14:36:38 2011 +0100
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizer.java	Tue Feb 08 14:54:09 2011 +0100
@@ -11,11 +11,14 @@
 public class MpdlTokenizer extends Tokenizer {
   private static final int MAX_WORD_LEN = 255;
   private static final int IO_BUFFER_SIZE = 1024;
-  private String language;  // TODO make the tokenizer language dependent
+  private static String SPECIAL_NOT_WORD_DELIM_SYMBOL = new Character('\u2424').toString();
+  private boolean regWithoutSemicolon = false;  // hack: in some cases there are words with a semicolon, then the normalization should be without semicolon
+  private boolean isInNotWordDelimMode = false;
   private int offset = 0, bufferIndex = 0, dataLen = 0;
   private char[] buffer = new char[MAX_WORD_LEN];
   private char[] ioBuffer = new char[IO_BUFFER_SIZE];
   private MpdlNormalizer normalizer;
+  private String language;

   public MpdlTokenizer(Reader input, String language) {
     super(input);
@@ -28,12 +31,22 @@
     this.normalizer = normalizer;
   }

+  public void setRegWithoutSemicolon(boolean regWithoutSemicolon) {
+    this.regWithoutSemicolon = regWithoutSemicolon;
+  }
+
+  public boolean isRegWithoutSemicolon() {
+    return regWithoutSemicolon;
+  }
+
   /** Returns true iff a character should be included in a token.  This
    * tokenizer generates as tokens adjacent sequences of characters which
    * satisfy this predicate.  Characters for which this is false are used to
    * define token boundaries and are not included in tokens. */
   protected boolean isTokenChar(char c) {
     boolean isTokenChar = true;
+    if (isRegWithoutSemicolon() && c == ';')  // hack: with regWithoutSemicolon set, ';' stays inside the token (special case for regularization and normalization; feel free to remove it later)
+      return true;
     switch (c) {
       case ' ': isTokenChar = false; break;
       case '.': isTokenChar = false; break;
@@ -51,12 +64,37 @@
       case '&': isTokenChar = false; break;
       case '+': isTokenChar = false; break;
       case '"': isTokenChar = false; break;
+      case '„': isTokenChar = false; break;
+      case '“': isTokenChar = false; break;
+      case '«': isTokenChar = false; break;
+      case '»': isTokenChar = false; break;
       case '\'': isTokenChar = false; break;
-      // case '\t': isTokenChar = false; break;
-      // case '\n': isTokenChar = false; break;  // do not break words which are on another line
+      case '\t': isTokenChar = false; break; // a tab is a token boundary; next() keeps it in the token only while in not-word-delim mode
+      case '\n': isTokenChar = false; break;  // a newline is a token boundary; next() keeps it in the token only while in not-word-delim mode
     }
     return isTokenChar;
   }
+
+  /**
+   * Tells whether c is whitespace that has to stay inside the current token:
+   * while isInNotWordDelimMode is set, blank, tab and newline count as token
+   * characters; in every other situation the answer is false.
+   */
+  protected boolean isTokenCharInNotWordDelimMode(char c) {
+    if (! isInNotWordDelimMode)
+      return false;
+    boolean isKeptWhitespace = c == ' ' || c == '\t' || c == '\n';
+    return isKeptWhitespace;
+  }
+
+  /** Tells whether c is U+2424 (the unicode symbol for newline), the marker
+   *  on which next() switches into its not-word-delim mode. */
+  protected boolean isSpecialNotWordDelimSymbol(char c) {
+    // U+2424 is the only marker symbol recognised here
+    final char marker = '\u2424';
+    return c == marker;
+  }
+

   /** Called on each token character to normalize it before it is added to the
    * token.  The default implementation does nothing.  Subclasses may use this
@@ -67,6 +105,8 @@

   /** Returns the next token in the stream, or null at EOS. */
   public final Token next() throws IOException {
+    if (language != null && language.equals("zh"))  // language "zh" is handled by the separate nextChinese() scanner
+      return nextChinese();
     int length = 0;
     int start = offset;
     while (true) {
@@ -84,7 +124,13 @@
       } else {
         c = ioBuffer[bufferIndex++];
       }
-      if (isTokenChar(c)) {              // if it's a token char
+      if (isInNotWordDelimMode && isTokenChar(c) && (! isSpecialNotWordDelimSymbol(c))) {  // a token char other than the marker ends the not-word-delim mode
+        isInNotWordDelimMode = false;
+      }
+      if (isSpecialNotWordDelimSymbol(c)) {  // marker seen: from here on blanks, tabs and newlines are kept in the token
+        isInNotWordDelimMode = true;
+      }
+      if (isTokenChar(c) || isTokenCharInNotWordDelimMode(c)) {              // if it's a token char
         if (length == 0)                 // start of token
           start = offset - 1;
         buffer[length++] = normalize(c); // buffer it, normalized
@@ -93,8 +139,10 @@
       } else if (length > 0)             // at non-Letter w/ chars
         break;                           // return 'em
     }
+    isInNotWordDelimMode = false;  // reset the mode once the characters of this token are collected
     Token newToken = new Token(start, start + length);
     newToken.setTermBuffer(buffer, 0, length);
+    removeSpecialSymbols(newToken);  // remove some special symbols in token (e.g. symbol for word delimiting xml elements)
     if (normalizer != null) {
       char[] termBuffer = newToken.termBuffer();
       int termBufferLength = newToken.termLength();
@@ -110,4 +158,75 @@
     }
     return newToken;
   }
+
+  /** Deletes every SPECIAL_NOT_WORD_DELIM_SYMBOL (the symbol which marks word delimiting xml elements) from the token's
+   *  term text, stores the cleaned text back into the same token and returns it; only the term buffer is modified here. */
+  private Token removeSpecialSymbols(Token token) {
+    char[] termBuffer = token.termBuffer();
+    int termBufferLength = token.termLength();
+    String tokenText = new String(termBuffer, 0, termBufferLength);
+    String newTokenText = tokenText.replace(SPECIAL_NOT_WORD_DELIM_SYMBOL, "");  // literal replacement: the marker must never be interpreted as a regular expression
+    int newTokenTextLength = newTokenText.length();
+    char[] newTokenTextBuffer = newTokenText.toCharArray();
+    token.setTermBuffer(newTokenTextBuffer, 0, newTokenTextLength);
+    return token;
+  }
+
+  /*
+   * chinese Tokenizer: taken from org.apache.lucene.analysis.cn.ChineseTokenizer
+   *
+   */
+  private int length;
+  private int start;
+
+  private final void push(char c) {  // appends the lower-cased c to the token that is being collected in buffer
+    if (length == 0) start = offset-1;            // start of token: nextChinese() has already advanced offset past c
+    buffer[length++] = Character.toLowerCase(c);  // buffer it
+  }
+
+  /** Builds a Token from the first length chars of buffer, covering the offsets
+   *  start to start+length; yields null when no character has been pushed. */
+  private final Token flush() {
+    if (length <= 0)
+      return null;
+    return new Token(new String(buffer, 0, length), start, start+length);
+  }
+
+  public final Token nextChinese() throws IOException {  // returns the next token, or null when the input ends with nothing collected
+    length = 0;
+    start = offset;
+    while (true) {
+      final char c;
+      offset++;
+      if (bufferIndex >= dataLen) {  // ioBuffer is used up: refill it from the reader
+        dataLen = input.read(ioBuffer);
+        bufferIndex = 0;
+      }
+      if (dataLen == -1)  // reader exhausted: hand out what has been collected (flush() gives null if nothing)
+        return flush();
+      else
+        c = ioBuffer[bufferIndex++];
+      switch(Character.getType(c)) {
+        case Character.DECIMAL_DIGIT_NUMBER:
+        case Character.LOWERCASE_LETTER:
+        case Character.UPPERCASE_LETTER:  // digits and cased letters are collected into one token
+          push(c);
+          if (length == MAX_WORD_LEN)  // buffer is full: cut the token here
+            return flush();
+          break;
+        case Character.OTHER_LETTER:  // a letter of type OTHER_LETTER always becomes a token of its own
+          if (length>0) {  // a token is pending: un-read c and return the pending token first
+            bufferIndex--;
+            offset--;
+            return flush();
+          }
+          push(c);
+          return flush();
+        default:  // any other character ends a pending token and is itself dropped
+          if (length>0)
+            return flush();
+          break;
+      }
+    }
+  }
 }
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizerAnalyzer.java	Tue Feb 08 14:36:38 2011 +0100
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizerAnalyzer.java	Tue Feb 08 14:54:09 2011 +0100
@@ -16,6 +16,7 @@
 public class MpdlTokenizerAnalyzer extends Analyzer {
   protected String language = MpdlConstants.DEFAULT_LANGUAGE;
   protected MpdlNormalizer normalizer = null;
+  private boolean regWithoutSemicolon = false;  // hack: in some cases there are words with a semicolon, then the normalization should be without semicolon

   public MpdlTokenizerAnalyzer(String language) {
     this.language = language;
@@ -27,8 +28,18 @@
     this.normalizer = normalizer;
   }

+  public void setRegWithoutSemicolon(boolean regWithoutSemicolon) {  // value is passed on to each MpdlTokenizer this analyzer creates
+    this.regWithoutSemicolon = regWithoutSemicolon;
+  }
+
+  public boolean isRegWithoutSemicolon() {  // current value of the semicolon hack flag
+    return regWithoutSemicolon;
+  }
+
   public TokenStream tokenStream(String fieldName, Reader reader) {
-    TokenStream result = new MpdlTokenizer(reader, language, normalizer);
+    MpdlTokenizer tmpTokenizer = new MpdlTokenizer(reader, language, normalizer);
+    tmpTokenizer.setRegWithoutSemicolon(regWithoutSemicolon); // hack: feel free to remove it later
+    TokenStream result = (TokenStream) tmpTokenizer;
     result = new MpdlFilter(result);  // filter to remove the hyphen in a token etc.
     result = new LowerCaseFilter(result);
     return result;
@@ -38,7 +49,9 @@
     ArrayList<Token> token = new ArrayList<Token>();
     try {
       Reader reader = new StringReader(inputString);
-      TokenStream result = new MpdlTokenizer(reader, language, normalizer);
+      MpdlTokenizer tmpTokenizer = new MpdlTokenizer(reader, language, normalizer);
+      tmpTokenizer.setRegWithoutSemicolon(regWithoutSemicolon);  // hack: feel free to remove it later
+      TokenStream result = (TokenStream) tmpTokenizer;
       result = new MpdlFilter(result);  // filter to remove the hyphen in a token etc.
       result = new LowerCaseFilter(result);
       Token t = result.next();
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexAll.java	Tue Feb 08 14:54:09 2011 +0100
@@ -0,0 +1,820 @@
+/* The following code was generated by JFlex 1.4.3 on 27.01.11 13:29 */
+
+/*
+ * Normalization rules for all languages
+ * [this is a JFlex specification]
+ *
+ * Wolfgang Schmidle
+ * 2011-01-25
+ *
+ */
+
+package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang;
+
+
+/**
+ * This class is a scanner generated by
+ * <a href="http://www.jflex.de/">JFlex</a> 1.4.3
+ * on 27.01.11 13:29 from the specification file
+ * <tt>MpdlNormalizerLexAll.lex</tt>
+ */
+public class MpdlNormalizerLexAll {
+
+  /** This character denotes the end of file */
+  public static final int YYEOF = -1;
+
+  /** initial size of the lookahead buffer */
+  private static final int ZZ_BUFFERSIZE = 16384;
+
+  /** lexical states */
+  public static final int YYINITIAL = 0;
+  public static final int ZH = 4;
+  public static final int LA = 2;
+
+  /**
+   * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l
+   * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l
+   *                  at the beginning of a line
+   * l is of the form l = 2*k, k a non negative integer
+   */
+  private static final int ZZ_LEXSTATE[] = {
+     0,  0,  1,  2,  3, 3
+  };
+
+  /**
+   * Translates characters to character classes
+   */
+  private static final String ZZ_CMAP_PACKED =
+    "\12\0\1\7\66\0\1\1\3\2\1\1\3\2\1\1\1\0\1\2"+
+    "\1\3\2\2\1\1\1\2\1\40\1\3\2\2\1\36\1\41\2\2"+
+    "\1\0\1\2\6\0\1\31\1\2\1\21\1\27\1\6\2\2\1\17"+
+    "\1\15\1\16\1\2\1\3\1\23\1\2\1\33\1\2\1\4\1\3"+
+    "\1\24\1\2\1\5\1\37\2\2\1\0\1\2\113\0\1\13\30\0"+
+    "\1\11\1\22\5\0\1\12\1\0\1\25\2\0\1\32\2\0\1\20"+
+    "\1\34\2\0\1\26\6\0\1\30\2\0\1\35\34\0\1\12\71\0"+
+    "\1\14\53\0\1\10\u6479\0\1\43\u057a\0\1\44\u0f5d\0\1\42\u5dab\0"+
+    "\1\46\u040e\0\1\47\u1d8f\0\1\45\u05e2\0";
+
+  /**
+   * Translates characters to character classes
+   */
+  private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED);
+
+  /**
+   * Translates DFA states to action switch labels.
+   */
+  private static final int [] ZZ_ACTION = zzUnpackAction();
+
+  private static final String ZZ_ACTION_PACKED_0 =
+    "\4\0\1\1\1\2\1\3\1\4\3\5\1\4\1\6"+
+    "\1\7\1\10\1\11\1\12\1\13\10\4\4\5\1\14"+
+    "\1\15\1\16\1\17\1\1\1\20\1\21\1\22\1\23"+
+    "\1\24\1\25\1\26\1\0\1\27\3\0\1\30\1\0"+
+    "\1\31\3\0\1\32\1\33\1\34\1\35\1\36\1\37"+
+    "\1\0\1\40\2\0\1\41\1\0\1\42\3\0\1\43"+
+    "\1\0\1\44\1\0\1\45\11\0\1\46\5\0";
+
+  private static int [] zzUnpackAction() {
+    int [] result = new int[89];
+    int offset = 0;
+    offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
+    return result;
+  }
+
+  private static int zzUnpackAction(String packed, int offset, int [] result) {
+    int i = 0;       /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {
+      int count = packed.charAt(i++);
+      int value = packed.charAt(i++);
+      do result[j++] = value; while (--count > 0);
+    }
+    return j;
+  }
+
+
+  /**
+   * Translates a state to a row index in the transition table
+   */
+  private static final int [] ZZ_ROWMAP = zzUnpackRowMap();
+
+  private static final String ZZ_ROWMAP_PACKED_0 =
+    "\0\0\0\50\0\120\0\170\0\240\0\240\0\240\0\240"+
+    "\0\240\0\310\0\360\0\u0118\0\240\0\240\0\240\0\240"+
+    "\0\240\0\240\0\u0140\0\u0168\0\u0190\0\u01b8\0\u01e0\0\u0208"+
+    "\0\u0230\0\u0258\0\u0280\0\u02a8\0\u02d0\0\u02f8\0\240\0\240"+
+    "\0\240\0\240\0\u0320\0\240\0\240\0\240\0\240\0\240"+
+    "\0\240\0\240\0\u0348\0\240\0\u0370\0\u0398\0\u03c0\0\240"+
+    "\0\u03e8\0\240\0\u0410\0\u0438\0\u0460\0\240\0\240\0\240"+
+    "\0\240\0\240\0\240\0\u0488\0\240\0\u04b0\0\u04d8\0\240"+
+    "\0\u0500\0\240\0\u0528\0\u0550\0\u0578\0\240\0\u05a0\0\240"+
+    "\0\u05c8\0\240\0\u05f0\0\u0618\0\u0640\0\u0668\0\u0690\0\u06b8"+
+    "\0\u06e0\0\u0708\0\u0730\0\240\0\u0758\0\u0780\0\u07a8\0\u07d0"+
+    "\0\u07f8";
+
+  private static int [] zzUnpackRowMap() {
+    int [] result = new int[89];
+    int offset = 0;
+    offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
+    return result;
+  }
+
+  private static int zzUnpackRowMap(String packed, int offset, int [] result) {
+    int i = 0;  /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {
+      int high = packed.charAt(i++) << 16;
+      result[j++] = high | packed.charAt(i++);
+    }
+    return j;
+  }
+
+  /**
+   * The transition table of the DFA
+   */
+  private static final int [] ZZ_TRANS = zzUnpackTrans();
+
+  private static final String ZZ_TRANS_PACKED_0 =
+    "\7\5\1\6\40\5\1\7\1\10\1\11\1\12\1\13"+
+    "\1\14\1\10\1\15\1\16\1\17\1\20\1\21\1\22"+
+    "\1\23\1\7\1\11\1\7\1\11\1\24\2\11\1\25"+
+    "\1\26\1\11\1\27\1\30\1\7\1\31\2\7\1\32"+
+    "\1\33\1\34\1\35\7\7\1\10\1\11\1\12\1\13"+
+    "\1\14\1\10\1\15\1\16\1\17\1\20\1\21\1\22"+
+    "\1\23\1\7\1\36\1\7\1\11\1\24\2\11\1\25"+
+    "\1\26\1\11\1\27\1\30\1\7\1\31\2\7\1\32"+
+    "\1\33\1\34\1\35\6\7\7\5\1\6\32\5\1\37"+
+    "\1\40\1\41\1\42\1\43\1\5\107\0\1\44\1\0"+
+    "\1\45\45\0\1\46\11\0\1\47\3\0\2\47\3\0"+
+    "\4\47\4\0\1\47\2\0\2\47\1\0\2\47\1\0"+
+    "\1\47\1\50\1\51\1\47\27\0\1\52\35\0\1\53"+
+    "\2\0\1\54\13\0\1\55\1\56\27\0\1\57\2\0"+
+    "\1\60\44\0\1\61\2\0\1\62\17\0\1\63\43\0"+
+    "\1\64\1\65\55\0\1\20\47\0\1\22\1\0\1\66"+
+    "\14\0\1\67\3\0\2\67\3\0\4\67\4\0\1\67"+
+    "\2\0\2\67\1\0\2\67\1\0\1\67\2\0\1\67"+
+    "\13\0\3\70\3\0\2\70\5\0\1\70\1\0\1\70"+
+    "\1\0\2\70\2\0\1\70\7\0\3\70\45\0\1\71"+
+    "\1\0\1\72\10\0\3\73\3\0\2\73\5\0\1\73"+
+    "\1\0\1\73\1\0\2\73\2\0\1\73\7\0\3\73"+
+    "\26\0\1\74\76\0\1\75\5\0\1\76\46\0\1\77"+
+    "\2\0\1\100\44\0\1\101\2\0\1\102\45\0\1\103"+
+    "\47\0\1\104\46\0\1\105\2\0\1\106\44\0\1\107"+
+    "\2\0\1\110\44\0\1\111\2\0\1\112\61\0\1\113"+
+    "\34\0\1\114\46\0\1\115\47\0\1\116\50\0\1\117"+
+    "\47\0\1\120\46\0\1\121\47\0\1\122\47\0\1\123"+
+    "\51\0\1\124\47\0\1\54\46\0\1\125\47\0\1\126"+
+    "\50\0\1\60\47\0\1\62\46\0\1\127\47\0\1\130"+
+    "\47\0\1\131\50\0\1\100\47\0\1\102\47\0\1\106"+
+    "\47\0\1\110\47\0\1\112\40\0";
+
+  private static int [] zzUnpackTrans() {
+    int [] result = new int[2080];
+    int offset = 0;
+    offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
+    return result;
+  }
+
+  private static int zzUnpackTrans(String packed, int offset, int [] result) {
+    int i = 0;       /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {
+      int count = packed.charAt(i++);
+      int value = packed.charAt(i++);
+      value--;
+      do result[j++] = value; while (--count > 0);
+    }
+    return j;
+  }
+
+
+  /* error codes */
+  private static final int ZZ_UNKNOWN_ERROR = 0;
+  private static final int ZZ_NO_MATCH = 1;
+  private static final int ZZ_PUSHBACK_2BIG = 2;
+
+  /* error messages for the codes above */
+  private static final String ZZ_ERROR_MSG[] = {
+    "Unkown internal scanner error",
+    "Error: could not match input",
+    "Error: pushback value was too large"
+  };
+
+  /**
+   * ZZ_ATTRIBUTE[aState] contains the attributes of state <code>aState</code>
+   */
+  private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
+
+  private static final String ZZ_ATTRIBUTE_PACKED_0 =
+    "\4\0\5\11\3\1\6\11\14\1\4\11\1\1\7\11"+
+    "\1\0\1\11\3\0\1\11\1\0\1\11\3\0\6\11"+
+    "\1\0\1\11\2\0\1\11\1\0\1\11\3\0\1\11"+
+    "\1\0\1\11\1\0\1\11\11\0\1\11\5\0";
+
+  private static int [] zzUnpackAttribute() {
+    int [] result = new int[89];
+    int offset = 0;
+    offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
+    return result;
+  }
+
+  private static int zzUnpackAttribute(String packed, int offset, int [] result) {
+    int i = 0;       /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {
+      int count = packed.charAt(i++);
+      int value = packed.charAt(i++);
+      do result[j++] = value; while (--count > 0);
+    }
+    return j;
+  }
+
+  /** the input device */
+  private java.io.Reader zzReader;
+
+  /** the current state of the DFA */
+  private int zzState;
+
+  /** the current lexical state */
+  private int zzLexicalState = YYINITIAL;
+
+  /** this buffer contains the current text to be matched and is
+      the source of the yytext() string */
+  private char zzBuffer[] = new char[ZZ_BUFFERSIZE];
+
+  /** the textposition at the last accepting state */
+  private int zzMarkedPos;
+
+  /** the current text position in the buffer */
+  private int zzCurrentPos;
+
+  /** startRead marks the beginning of the yytext() string in the buffer */
+  private int zzStartRead;
+
+  /** endRead marks the last character in the buffer, that has been read
+      from input */
+  private int zzEndRead;
+
+  /** number of newlines encountered up to the start of the matched text */
+  private int yyline;
+
+  /** the number of characters up to the start of the matched text */
+  private int yychar;
+
+  /**
+   * the number of characters from the last newline up to the start of the
+   * matched text
+   */
+  private int yycolumn;
+
+  /**
+   * zzAtBOL == true <=> the scanner is currently at the beginning of a line
+   */
+  private boolean zzAtBOL = true;
+
+  /** zzAtEOF == true <=> the scanner is at the EOF */
+  private boolean zzAtEOF;
+
+  /** denotes if the user-EOF-code has already been executed */
+  private boolean zzEOFDone;
+
+  /* user code: */
+	int cv = 0;  // consonant = 1, vowel = 2, everything else = 0
+
+
+  /**
+   * Creates a new scanner
+   * There is also a java.io.InputStream version of this constructor.
+   *
+   * @param   in  the java.io.Reader to read input from.
+   */
+  public MpdlNormalizerLexAll(java.io.Reader in) {
+    this.zzReader = in;
+  }
+
+  /**
+   * Creates a new scanner.
+   * There is also a java.io.Reader version of this constructor.
+   *
+   * @param   in  the java.io.InputStream to read input from.
+   */
+  public MpdlNormalizerLexAll(java.io.InputStream in) {
+    this(new java.io.InputStreamReader(in));  // NOTE(review): no charset is passed, so bytes are decoded with the platform default charset
+  }
+
+  /**
+   * Unpacks the compressed character translation table.
+   *
+   * @param packed   the packed character translation table
+   * @return         the unpacked character translation table
+   */
+  private static char [] zzUnpackCMap(String packed) {
+    char [] map = new char[0x10000];
+    int i = 0;  /* index in packed string  */
+    int j = 0;  /* index in unpacked array */
+    while (i < 172) {
+      int  count = packed.charAt(i++);
+      char value = packed.charAt(i++);
+      do map[j++] = value; while (--count > 0);
+    }
+    return map;
+  }
+
+
+  /**
+   * Refills the input buffer.
+   *
+   * @return      <code>false</code>, iff there was new input.
+   *
+   * @exception   java.io.IOException  if any I/O-Error occurs
+   */
+  private boolean zzRefill() throws java.io.IOException {
+
+    /* first: make room (if you can) */
+    if (zzStartRead > 0) {
+      System.arraycopy(zzBuffer, zzStartRead,
+                       zzBuffer, 0,
+                       zzEndRead-zzStartRead);
+
+      /* translate stored positions */
+      zzEndRead-= zzStartRead;
+      zzCurrentPos-= zzStartRead;
+      zzMarkedPos-= zzStartRead;
+      zzStartRead = 0;
+    }
+
+    /* is the buffer big enough? */
+    if (zzCurrentPos >= zzBuffer.length) {
+      /* if not: blow it up */
+      char newBuffer[] = new char[zzCurrentPos*2];
+      System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length);
+      zzBuffer = newBuffer;
+    }
+
+    /* finally: fill the buffer with new input */
+    int numRead = zzReader.read(zzBuffer, zzEndRead,
+                                            zzBuffer.length-zzEndRead);
+
+    if (numRead > 0) {
+      zzEndRead+= numRead;
+      return false;
+    }
+    // unlikely but not impossible: read 0 characters, but not at end of stream
+    if (numRead == 0) {
+      int c = zzReader.read();
+      if (c == -1) {
+        return true;
+      } else {
+        zzBuffer[zzEndRead++] = (char) c;
+        return false;
+      }
+    }
+
+	// numRead < 0
+    return true;
+  }
+
+
+  /**
+   * Closes the input stream.
+   */
+  public final void yyclose() throws java.io.IOException {
+    zzAtEOF = true;            /* indicate end of file */
+    zzEndRead = zzStartRead;  /* invalidate buffer    */
+
+    if (zzReader != null)
+      zzReader.close();
+  }
+
+
+  /**
+   * Resets the scanner to read from a new input stream.
+   * Does not close the old reader.
+   *
+   * All internal variables are reset, the old input stream
+   * <b>cannot</b> be reused (internal buffer is discarded and lost).
+   * Lexical state is set to <tt>YYINITIAL</tt>.
+   *
+   * @param reader   the new input stream
+   */
+  public final void yyreset(java.io.Reader reader) {
+    zzReader = reader;
+    zzAtBOL  = true;
+    zzAtEOF  = false;
+    zzEOFDone = false;
+    zzEndRead = zzStartRead = 0;
+    zzCurrentPos = zzMarkedPos = 0;
+    yyline = yychar = yycolumn = 0;
+    zzLexicalState = YYINITIAL;
+  }
+
+
+  /**
+   * Returns the current lexical state.
+   */
+  public final int yystate() {
+    return zzLexicalState;
+  }
+
+
+  /**
+   * Enters a new lexical state
+   *
+   * @param newState the new lexical state
+   */
+  public final void yybegin(int newState) {
+    zzLexicalState = newState;
+  }
+
+
+  /**
+   * Returns the text matched by the current regular expression.
+   */
+  public final String yytext() {
+    return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead );
+  }
+
+
+  /**
+   * Returns the character at position <tt>pos</tt> from the
+   * matched text.
+   *
+   * It is equivalent to yytext().charAt(pos), but faster
+   *
+   * @param pos the position of the character to fetch.
+   *            A value from 0 to yylength()-1.
+   *
+   * @return the character at position pos
+   */
+  public final char yycharat(int pos) {
+    return zzBuffer[zzStartRead+pos];
+  }
+
+
+  /**
+   * Returns the length of the matched text region.
+   */
+  public final int yylength() {
+    return zzMarkedPos-zzStartRead;
+  }
+
+
+  /**
+   * Reports an error that occurred while scanning.
+   *
+   * In a wellformed scanner (no or only correct usage of
+   * yypushback(int) and a match-all fallback rule) this method
+   * will only be called with things that "Can't Possibly Happen".
+   * If this method is called, something is seriously wrong
+   * (e.g. a JFlex bug producing a faulty scanner etc.).
+   *
+   * Usual syntax/scanner level error handling should be done
+   * in error fallback rules.
+   *
+   * @param   errorCode  the code of the error message to display
+   */
+  private void zzScanError(int errorCode) {
+    String message;
+    try {
+      message = ZZ_ERROR_MSG[errorCode];
+    }
+    catch (ArrayIndexOutOfBoundsException e) {
+      message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR];
+    }
+
+    throw new Error(message);
+  }
+
+
+  /**
+   * Pushes the specified amount of characters back into the input stream.
+   *
+   * They will be read again by the next call of the scanning method.
+   *
+   * @param number  the number of characters to be read again.
+   *                This number must not be greater than yylength()!
+   */
+  public void yypushback(int number)  {
+    if ( number > yylength() )
+      zzScanError(ZZ_PUSHBACK_2BIG);
+
+    zzMarkedPos -= number;
+  }
+
+
+  /**
+   * Resumes scanning until the next regular expression is matched,
+   * the end of input is encountered or an I/O-Error occurs.
+   *
+   * @return      the next token
+   * @exception   java.io.IOException  if any I/O-Error occurs
+   */
+  public java.lang.String yylex() throws java.io.IOException {
+    int zzInput;
+    int zzAction;
+
+    // cached fields:
+    int zzCurrentPosL;
+    int zzMarkedPosL;
+    int zzEndReadL = zzEndRead;
+    char [] zzBufferL = zzBuffer;
+    char [] zzCMapL = ZZ_CMAP;
+
+    int [] zzTransL = ZZ_TRANS;
+    int [] zzRowMapL = ZZ_ROWMAP;
+    int [] zzAttrL = ZZ_ATTRIBUTE;
+
+    while (true) {
+      zzMarkedPosL = zzMarkedPos;
+
+      if (zzMarkedPosL > zzStartRead) {
+        switch (zzBufferL[zzMarkedPosL-1]) {
+        case '\n':
+        case '\u000B':
+        case '\u000C':
+        case '\u0085':
+        case '\u2028':
+        case '\u2029':
+          zzAtBOL = true;
+          break;
+        case '\r':
+          if (zzMarkedPosL < zzEndReadL)
+            zzAtBOL = zzBufferL[zzMarkedPosL] != '\n';
+          else if (zzAtEOF)
+            zzAtBOL = false;
+          else {
+            boolean eof = zzRefill();
+            zzMarkedPosL = zzMarkedPos;
+            zzEndReadL = zzEndRead;
+            zzBufferL = zzBuffer;
+            if (eof)
+              zzAtBOL = false;
+            else
+              zzAtBOL = zzBufferL[zzMarkedPosL] != '\n';
+          }
+          break;
+        default:
+          zzAtBOL = false;
+        }
+      }
+      zzAction = -1;
+
+      zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL;
+
+      if (zzAtBOL)
+        zzState = ZZ_LEXSTATE[zzLexicalState+1];
+      else
+        zzState = ZZ_LEXSTATE[zzLexicalState];
+
+
+      zzForAction: {
+        while (true) {
+
+          if (zzCurrentPosL < zzEndReadL)
+            zzInput = zzBufferL[zzCurrentPosL++];
+          else if (zzAtEOF) {
+            zzInput = YYEOF;
+            break zzForAction;
+          }
+          else {
+            // store back cached positions
+            zzCurrentPos  = zzCurrentPosL;
+            zzMarkedPos   = zzMarkedPosL;
+            boolean eof = zzRefill();
+            // get translated positions and possibly new buffer
+            zzCurrentPosL  = zzCurrentPos;
+            zzMarkedPosL   = zzMarkedPos;
+            zzBufferL      = zzBuffer;
+            zzEndReadL     = zzEndRead;
+            if (eof) {
+              zzInput = YYEOF;
+              break zzForAction;
+            }
+            else {
+              zzInput = zzBufferL[zzCurrentPosL++];
+            }
+          }
+          int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ];
+          if (zzNext == -1) break zzForAction;
+          zzState = zzNext;
+
+          int zzAttributes = zzAttrL[zzState];
+          if ( (zzAttributes & 1) == 1 ) {
+            zzAction = zzState;
+            zzMarkedPosL = zzCurrentPosL;
+            if ( (zzAttributes & 8) == 8 ) break zzForAction;
+          }
+
+        }
+      }
+
+      // store back cached position
+      zzMarkedPos = zzMarkedPosL;
+
+      switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
+        case 25:
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 1;
+          { return "o";
+          }
+        case 39: break;
+        case 22:
+          { cv = 2; return "ii";
+          }
+        case 40: break;
+        case 35:
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 2;
+          { return "od";
+          }
+        case 41: break;
+        case 7:
+          { cv = 1; return "s";
+          }
+        case 42: break;
+        case 24:
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 1;
+          { return "e";
+          }
+        case 43: break;
+        case 29:
+          { cv = 1; return "Qu";
+          }
+        case 44: break;
+        case 19:
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 1;
+          { switch(cv) {
+									case 2: return "v";
+									default: cv = 2; return "u";
+								}
+          }
+        case 45: break;
+        case 9:
+          { cv = 2; return "ae";
+          }
+        case 46: break;
+        case 15:
+          { return "精";
+          }
+        case 47: break;
+        case 3:
+          { cv = 0; return yytext();
+          }
+        case 48: break;
+        case 27:
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 1;
+          { switch(cv) {
+									case 2: return "V";
+									default: cv = 2; return "U";
+								}
+          }
+        case 49: break;
+        case 2:
+          { return "";
+          }
+        case 50: break;
+        case 33:
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 2;
+          { return "am";
+          }
+        case 51: break;
+        case 18:
+          { cv = 1; return "qu";
+          }
+        case 52: break;
+        case 14:
+          { return "歷";
+          }
+        case 53: break;
+        case 8:
+          { cv = 1; return "ss";
+          }
+        case 54: break;
+        case 4:
+          { cv = 2; return yytext();
+          }
+        case 55: break;
+        case 32:
+          { return "庶";
+          }
+        case 56: break;
+        case 6:
+          { cv = 0; return "";
+          }
+        case 57: break;
+        case 16:
+          { switch(cv) {
+									case 1: return yytext().replace("v", "u");
+									default: cv = 1; return yytext();
+								}
+          }
+        case 58: break;
+        case 12:
+          { return "奇";
+          }
+        case 59: break;
+        case 38:
+          { return "hic";
+          }
+        case 60: break;
+        case 26:
+          { cv = 2; return "oi";
+          }
+        case 61: break;
+        case 36:
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 2;
+          { return "um";
+          }
+        case 62: break;
+        case 17:
+          { switch(cv) {
+									case 1: return yytext().replace("V", "U");
+									default: cv = 1; return yytext();
+								}
+          }
+        case 63: break;
+        case 21:
+          { cv = 2; return "uu";
+          }
+        case 64: break;
+        case 31:
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 1;
+          { cv = 1; return "U";
+          }
+        case 65: break;
+        case 1:
+          { return yytext();
+          }
+        case 66: break;
+        case 34:
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 2;
+          { return "as";
+          }
+        case 67: break;
+        case 23:
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 1;
+          { return "a";
+          }
+        case 68: break;
+        case 13:
+          { return "時";
+          }
+        case 69: break;
+        case 10:
+          { cv = 2; return "AE";
+          }
+        case 70: break;
+        case 37:
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 2;
+          { return "us";
+          }
+        case 71: break;
+        case 5:
+          { cv = 1; return yytext();
+          }
+        case 72: break;
+        case 28:
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 1;
+          { cv = 1; return "u";
+          }
+        case 73: break;
+        case 30:
+          { cv = 1; return "QU";
+          }
+        case 74: break;
+        case 20:
+          { cv = 2; return "ui";
+          }
+        case 75: break;
+        case 11:
+          { cv = 2; return "oe";
+          }
+        case 76: break;
+        default:
+          if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
+            zzAtEOF = true;
+            return null;
+          }
+          else {
+            zzScanError(ZZ_NO_MATCH);
+          }
+      }
+    }
+  }
+
+
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexAll.lex	Tue Feb 08 14:54:09 2011 +0100
@@ -0,0 +1,143 @@
+/*
+ * Normalization rules for all languages
+ * [this is a JFlex specification]
+ *
+ * Wolfgang Schmidle
+ * 2011-01-25
+ *
+ */
+
+package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang;
+
+%%
+
+%public
+%class MpdlNormalizerLexAll
+%type java.lang.String
+%unicode
+// %debug
+
+%states LA, ZH
+
+%{
+	int cv = 0;  // consonant = 1, vowel = 2, everything else = 0
+%}
+
+VOWEL=[AEIOUaeiouÆæęàèòùœ]
+CONS=[BCDFGHKLMNPQRSTVWXZbcdfghklmnpqrstvwxzſß]
+LR=[lLrR]
+QUE=(que)?
+END=\n
+
+%%
+
+<LA> {
+
+// 1. simple replacements
+
+// 1.1 single characters
+ſ				{ cv = 1; return "s"; }
+ß				{ cv = 1; return "ss"; }
+[æę]			{ cv = 2; return "ae"; }
+Æ				{ cv = 2; return "AE"; }
+œ				{ cv = 2; return "oe"; }
+// 1.2 character combinations
+ij				{ cv = 2; return "ii"; }
+
+// 2. diacritics
+
+// 2.1 superfluous diacritics in single words
+^ hîc {END} 			{ return "hic"; }
+
+// 2.2 superfluous diacritics at the end of a word
+// 2.2.1 common cases
+à / {QUE} {END}			{ return "a"; }
+àm / {QUE} {END}	{ return "am"; }
+às / {QUE} {END}		{ return "as"; }  // (-àsque will likely never occur)
+// à / [ms]? {QUE} {END}		{ return "a"; }
+è / {QUE} {END}			{ return "e"; }
+ò / {QUE} {END}			{ return "o"; }
+òd / {QUE} {END}		{ return "od"; }
+ùm / {QUE} {END}		{ return "um"; }
+ùs / {QUE} {END}		{ return "us"; }
+
+// 2.3 superfluous diacritics within a word
+// 2.3.1 common cases
+aë				{ cv = 2; return "ae"; }
+oë				{ cv = 2; return "oe"; }
+// 2.3.2 rare cases
+oï				{ cv = 2; return "oi"; }
+uï				{ cv = 2; return "ui"; }
+// 2.3.3 extremely rare cases
+uü			{ cv = 2; return "uu"; }
+
+// 3. rules for u and v
+
+// 3.1 rules for u
+
+u/{VOWEL} 		{
+								switch(cv) {
+									case 2: return "v";  // previous token was a vowel and a vowel follows: u is written as v (cv is left unchanged)
+									default: cv = 2; return "u";  // otherwise keep the u and record it as a vowel
+								}
+							}
+U/{VOWEL}		{
+								switch(cv) {
+									case 2: return "V";  // upper-case counterpart: U between two vowels is written as V (cv is left unchanged)
+									default: cv = 2; return "U";  // otherwise keep the U and record it as a vowel
+								}
+							}
+
+// 3.2 rules for v
+
+qv			{ cv = 1; return "qu"; }  // the replaced v still counts as consonant
+Qv		{ cv = 1; return "Qu"; }
+QV		{ cv = 1; return "QU"; }
+
+{LR}v					{
+								switch(cv) {
+									case 1: return yytext().replace("v", "u");  // l/r preceded by a consonant: the v after it is written as u
+									default: cv = 1; return yytext();  // otherwise return the pair unchanged and record a consonant
+								}
+							}
+{LR}V					{
+								switch(cv) {
+									case 1: return yytext().replace("V", "U");  // l/r preceded by a consonant: the V after it is written as U
+									default: cv = 1; return yytext();  // otherwise return the pair unchanged and record a consonant
+								}
+							}
+
+v/{CONS}			{ cv = 1; return "u"; }
+V/{CONS}			{ cv = 1; return "U"; }
+
+
+// default
+
+{VOWEL}		{ cv = 2; return yytext(); }
+{CONS}			{ cv = 1; return yytext(); }
+\n					{ cv = 0; return ""; }
+.					{ cv = 0; return yytext(); }
+
+}
+
+<ZH> {
+
+// Codepoint < FFFF
+
+竒	{ return "奇"; }  // 7AD2 --> 5947
+旹	{ return "時"; }  // 65F9 --> 6642
+歴	{ return "歷"; }  // 6B74 --> 6B77
+精	{ return "精"; }  // FA1D --> 7CBE (FA1D is a compatibility ideograph)
+
+// Codepoint > FFFF
+
+庶	{ return "庶"; }  // 2F88D --> 5EB6  (2F88D is a compatibility ideograph)
+
+
+}
+
+
+// default (can be overridden by individual languages)
+
+\n					{ return ""; }
+.					{ return yytext(); }
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/DictionarizerContentHandler.java	Tue Feb 08 14:36:38 2011 +0100
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/DictionarizerContentHandler.java	Tue Feb 08 14:54:09 2011 +0100
@@ -15,6 +15,7 @@
   private static int MARK_SIZE = MARK.length();
   private static int ELEMENT_TYPE_CHARACTERS = 1;
   private static int ELEMENT_TYPE_COMPLEX = 2;
+  private static String SPECIAL_NOT_WORD_DELIM_SYMBOL = new Character('\u2424').toString();
   private String xmlnsString = "";
   private String language;
   private String outputXmlFragment = "";
@@ -176,6 +177,17 @@
       return isComplex;
     }

+    /**
+     * Reports whether this element separates words. Elements named in the list below (e.g. a line
+     * break) are treated as not delimiting words; feel free to add/remove some element names.
+     * @return false for lb, cb, gap, figure, image, note, handwritten and anchor, otherwise true
+     */
+    private boolean isWordDelimiterElement() {
+      boolean notWordDelimiter = name.equals("lb") || name.equals("cb") || name.equals("gap") || name.equals("figure")
+        || name.equals("image") || name.equals("note") || name.equals("handwritten") || name.equals("anchor");
+      return ! notWordDelimiter;
+    }
+
     private String toXmlString() throws SAXException {
       String retString = "";
       String elemLanguage = language;  // default value for the document/page
@@ -200,15 +212,20 @@
             if (! composite.isComplex()) {
               if (composite.value != null && ! composite.value.equals("")) {
                 String compositeValueStr = composite.value;
-                compositesChars += compositeValueStr;
-                compositesCharsWithMarks += compositeValueStr;
+                compositesChars = compositesChars + compositeValueStr;
+                compositesCharsWithMarks = compositesCharsWithMarks + compositeValueStr;
               }
             } else {
+              if (! composite.isWordDelimiterElement()) {
+                compositesChars = compositesChars + SPECIAL_NOT_WORD_DELIM_SYMBOL;  // add a special symbol at the position of the "not word delimiter element" (e.g. line break)
+              }
               complexElements.add(composite);
               compositesCharsWithMarks += MARK;
             }
           }
           String compositesCharsDictionarized = characters2DictWords(compositesChars, elemLanguage);
+          compositesChars = compositesChars.replaceAll(SPECIAL_NOT_WORD_DELIM_SYMBOL, "");
+          compositesCharsDictionarized = compositesCharsDictionarized.replaceAll(SPECIAL_NOT_WORD_DELIM_SYMBOL, "");
           if (complexElements.size() > 0) {
             for (int i=0; i<complexElements.size(); i++) {
               int indexComplexElemCompositesCharsWithMarks = compositesCharsWithMarks.indexOf(MARK);
@@ -251,7 +268,8 @@
           String wordTokenText = wordToken.termText();
           LexHandler lexHandler = LexHandler.getInstance();
           // delivers lex entries by help of the morphology component (lex entry of the stem of the normalized word form)
-          ArrayList<String> lexEntryKeys = lexHandler.getLexEntryKeys(wordTokenText, language, false);
+          String wordTokenTextWithoutSpecialSymbols = removeSpecialSymbols(wordTokenText);
+          ArrayList<String> lexEntryKeys = lexHandler.getLexEntryKeys(wordTokenTextWithoutSpecialSymbols, language, false);
           if (lexEntryKeys != null) {
             String lexForms = "";
             for (int j=0; j<lexEntryKeys.size(); j++) {
@@ -259,7 +277,7 @@
               lexForms = lexForms + lexEntryKey + " ";
             }
             lexForms = lexForms.substring(0, lexForms.length() - 1);
-            retStr = retStr + beforeStrDeresolved + "<w lang=\"" + language + "\"" + " form=\"" + wordTokenText + "\"" + " lexForms=\"" + lexForms + "\">" + wordStrDeresolved + "</w>";
+            retStr = retStr + beforeStrDeresolved + "<w lang=\"" + language + "\"" + " form=\"" + wordTokenTextWithoutSpecialSymbols + "\"" + " lexForms=\"" + lexForms + "\">" + wordStrDeresolved + "</w>";
           } else {
             retStr = retStr + beforeStrDeresolved + wordStrDeresolved;
           }
@@ -272,5 +290,12 @@
       }
       return retStr;
     }
+
+    /**
+     * Deletes every blank, line feed and hyphen from the given string and returns the result.
+     */
+    private String removeSpecialSymbols(String inputStr) {
+      return inputStr.replace(" ", "").replace("\n", "").replace("-", "");
+    }
   }
 }
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/NormalizeCharsContentHandler.java	Tue Feb 08 14:36:38 2011 +0100
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/NormalizeCharsContentHandler.java	Tue Feb 08 14:54:09 2011 +0100
@@ -102,6 +102,7 @@
     String retStr = "";
     try {
       MpdlTokenizerAnalyzer tokenizerAnalyzer = new MpdlTokenizerAnalyzer(language);
+      tokenizerAnalyzer.setRegWithoutSemicolon(true);  // hack: feel free to remove it later
       ArrayList<Token> wordTokens = tokenizerAnalyzer.getToken(charactersStr);
       int endPos = 0;
       for (int i=0; i < wordTokens.size(); i++) {
@@ -111,10 +112,9 @@
         String beforeStrDeresolved = StringUtilEscapeChars.deresolveXmlEntities(beforeStr);
         endPos = wordToken.endOffset();
         String wordStr = charactersStr.substring(startPos, endPos);
-
         MpdlNormalizer mpdlNormalizer = new MpdlNormalizer(normalizeFunctions, language);
+        mpdlNormalizer.setNormMode(MpdlNormalizer.MODE_4HUMAN_READERS);
         String normalizedWordStr = mpdlNormalizer.normalize(wordStr);
-
         String normalizedWordStrDeresolved = StringUtilEscapeChars.deresolveXmlEntities(normalizedWordStr);
         // String wordTokenText = wordToken.termText();
         retStr = retStr + beforeStrDeresolved + normalizedWordStrDeresolved;
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/regularization/RegularizationManager.java	Tue Feb 08 14:36:38 2011 +0100
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/regularization/RegularizationManager.java	Tue Feb 08 14:54:09 2011 +0100
@@ -55,7 +55,9 @@
     // instance.writeAllRegs();

     ArrayList<Regularization> regs = instance.findRegsByNorm("la", "Illiusque");
+    ArrayList<Regularization> regs2 = instance.findRegsByNorm("la", "Itaque");
     Regularization bla = regs.get(0);
+    Regularization bla2 = regs2.get(0);

     instance.end();
     instance.endOperation();
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Language.java	Tue Feb 08 14:36:38 2011 +0100
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Language.java	Tue Feb 08 14:54:09 2011 +0100
@@ -26,6 +26,8 @@
     languageIds.put("lat", "la");
     languageIds.put("nl", "nl");
     languageIds.put("zh", "zh");
+    languageIds.put("zho", "zh");
+    languageIds.put("zho-Hant", "zh");
   }

   public String getLanguageId(String language) {
@@ -35,4 +37,25 @@
     retLanguageId = languageIds.get(language);
     return retLanguageId;
   }
+
+  public boolean isLatin(String language) {
+    if (getLanguageId(language).equals("la"))
+      return true;
+    else
+      return false;
+  }
+
+  public boolean isItalian(String language) {
+    if (getLanguageId(language).equals("it"))
+      return true;
+    else
+      return false;
+  }
+
+  public boolean isChinese(String language) {
+    if (getLanguageId(language).equals("zh"))
+      return true;
+    else
+      return false;
+  }
 }
\ No newline at end of file
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Transcoder.java	Tue Feb 08 14:36:38 2011 +0100
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Transcoder.java	Tue Feb 08 14:54:09 2011 +0100
@@ -62,7 +62,42 @@
     return encodedUnicodeStr;
     */
   }
+

+  /** Converts inputStr by running it through the Unicode2BetacodeLex scanner and joining its tokens. */
+  public String transcodeFromUnicode2BetaCode(String inputStr) throws ApplicationException {
+    Unicode2BetacodeLex unicode2BetacodeLex = new Unicode2BetacodeLex(new StringReader(inputStr));
+    StringBuilder retStr = new StringBuilder();  // avoids quadratic String concatenation
+    try {
+      // yylex() delivers one transcoded token per call and null at the end of the input
+      String token = unicode2BetacodeLex.yylex();
+      while (token != null) {
+        retStr.append(token);
+        token = unicode2BetacodeLex.yylex();
+      }
+    } catch (IOException e) {
+      throw new ApplicationException(e);
+    }
+    return retStr.toString();
+  }
+
+  /** Converts inputStr by running it through the Unicode2BuckwalterLex scanner and joining its tokens. */
+  public String transcodeFromUnicode2Buckwalter(String inputStr) throws ApplicationException {
+    Unicode2BuckwalterLex unicode2BuckwalterLex = new Unicode2BuckwalterLex(new StringReader(inputStr));
+    StringBuilder retStr = new StringBuilder();  // avoids quadratic String concatenation
+    try {
+      // yylex() delivers one transcoded token per call and null at the end of the input
+      String token = unicode2BuckwalterLex.yylex();
+      while (token != null) {
+        retStr.append(token);
+        token = unicode2BuckwalterLex.yylex();
+      }
+    } catch (IOException e) {
+      throw new ApplicationException(e);
+    }
+    return retStr.toString();
+  }
+
   public String transcodeFromBuckwalter2Unicode(String inputStr) throws ApplicationException {
     StringReader strReader = new StringReader(inputStr);
     Buckwalter2UnicodeLex buckwalter2UnicodeLex = new Buckwalter2UnicodeLex(strReader);
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Unicode2Betacode.lex	Tue Feb 08 14:54:09 2011 +0100
@@ -0,0 +1,319 @@
+package de.mpg.mpiwg.berlin.mpdl.lt.general;
+
+%%
+
+%class Unicode2BetacodeLex
+%public
+%type java.lang.String
+%unicode
+%%
+
+
+"<"[^>]\u+">" { return yytext(); }
+
+"H"    { return "*j"; }
+"h"     { return "j"; }
+"F"    { return "*v"; }
+"f"    { return "v"; }
+"\u03a3"    { return "*s"; }
+
+"." 	{ return "!"; }
+"\u00B7"   { return ":"; }  /* MPDL update  */
+
+"\u1F00"	{ return "a)"; }
+"\u1F01"	{ return "a("; }
+"\u1F02"	{ return "a)\\"; }
+"\u1F03"	{ return "a(\\"; }
+"\u1F04"	{ return "a)/"; }
+"\u1F05"	{ return "a(/"; }
+"\u1F06"	{ return "a)="; }
+"\u1F07"	{ return "a(="; }
+"\u1F08"	{ return "*)a"; }
+"\u1F09"	{ return "*(a"; }
+"\u1F0A"	{ return "*)\\a"; }
+"\u1F0B"	{ return "*(\\a"; }
+"\u1F0C"	{ return "*)/a"; }
+"\u1F0D"	{ return "*(/a"; }
+"\u1F0E"	{ return "*)=a"; }
+"\u1F0F"	{ return "*(=a"; }
+"\u1F10"	{ return "e)"; }
+"\u1F11"	{ return "e("; }
+"\u1F12"	{ return "e)\\"; }
+"\u1F13"	{ return "e(\\"; }
+"\u1F14"	{ return "e)/"; }
+"\u1F15"	{ return "e(/"; }
+"\u1F18"	{ return "*)e"; }
+"\u1F19"	{ return "*(e"; }
+"\u1F1A"	{ return "*)\\e"; }
+"\u1F1B"	{ return "*(\\e"; }
+"\u1F1C"	{ return "*)/e"; }
+"\u1F1D"	{ return "*(/e"; }
+"\u1F20"	{ return "h)"; }
+"\u1F21"	{ return "h("; }
+"\u1F22"	{ return "h)\\"; }
+"\u1F23"	{ return "h(\\"; }
+"\u1F24"	{ return "h)/"; }
+"\u1F25"	{ return "h(/"; }
+"\u1F26"	{ return "h)="; }
+"\u1F27"	{ return "h(="; }
+"\u1F28"	{ return "*)h"; }
+"\u1F29"	{ return "*(h"; }
+"\u1F2A"	{ return "*)\\h"; }
+"\u1F2B"	{ return "*(\\h"; }
+"\u1F2C"	{ return "*)/h"; }
+"\u1F2D"	{ return "*(/h"; }
+"\u1F2E"	{ return "*)=h"; }
+"\u1F2F"	{ return "*(=h"; }
+"\u1F30"	{ return "i)"; }
+"\u1F31"	{ return "i("; }
+"\u1F32"	{ return "i)\\"; }
+"\u1F33"	{ return "i(\\"; }
+"\u1F34"	{ return "i)/"; }
+"\u1F35"	{ return "i(/"; }
+"\u1F36"	{ return "i)="; }
+"\u1F37"	{ return "i(="; }
+"\u1F38"	{ return "*)i"; }
+"\u1F39"	{ return "*(i"; }
+"\u1F3A"	{ return "*)\\i"; }
+"\u1F3B"	{ return "*(\\i"; }
+"\u1F3C"	{ return "*)/i"; }
+"\u1F3D"	{ return "*(/i"; }
+"\u1F3E"	{ return "*)=i"; }
+"\u1F3F"	{ return "*(=i"; }
+"\u1F40"	{ return "o)"; }
+"\u1F41"	{ return "o("; }
+"\u1F42"	{ return "o)\\"; }
+"\u1F43"	{ return "o(\\"; }
+"\u1F44"	{ return "o)/"; }
+"\u1F45"	{ return "o(/"; }
+"\u1F48"	{ return "*)o"; }
+"\u1F49"	{ return "*(o"; }
+"\u1F4A"	{ return "*)\\o"; }
+"\u1F4B"	{ return "*(\\o"; }
+"\u1F4C"	{ return "*)/o"; }
+"\u1F4D"	{ return "*(/o"; }
+"\u1F50"	{ return "u)"; }
+"\u1F51"	{ return "u("; }
+"\u1F52"	{ return "u)\\"; }
+"\u1F53"	{ return "u(\\"; }
+"\u1F54"	{ return "u)/"; }
+"\u1F55"	{ return "u(/"; }
+"\u1F56"	{ return "u)="; }
+"\u1F57"	{ return "u(="; }
+"\u1F59"	{ return "*(u"; }
+"\u1F5B"	{ return "*(\\u"; }
+"\u1F5D"	{ return "*(/u"; }
+"\u1F5F"	{ return "*(=u"; }
+"\u1F60"	{ return "w)"; }
+"\u1F61"	{ return "w("; }
+"\u1F62"	{ return "w)\\"; }
+"\u1F63"	{ return "w(\\"; }
+"\u1F64"	{ return "w)/"; }
+"\u1F65"	{ return "w(/"; }
+"\u1F66"	{ return "w)="; }
+"\u1F67"	{ return "w(="; }
+"\u1F68"	{ return "*)w"; }
+"\u1F69"	{ return "*(w"; }
+"\u1F6A"	{ return "*)\\w"; }
+"\u1F6B"	{ return "*(\\w"; }
+"\u1F6C"	{ return "*)/w"; }
+"\u1F6D"	{ return "*(/w"; }
+"\u1F6E"	{ return "*)=w"; }
+"\u1F6F"	{ return "*(=w"; }
+"\u1F70"	{ return "a\\"; }
+"\u1F71"	{ return "a/"; }
+"\u1F72"	{ return "e\\"; }
+"\u1F73"	{ return "e/"; }
+"\u1F74"	{ return "h\\"; }
+"\u1F75"	{ return "h/"; }
+"\u1F76"	{ return "i\\"; }
+"\u1F77"	{ return "i/"; }
+"\u1F78"	{ return "o\\"; }
+"\u1F79"	{ return "o/"; }
+"\u1F7A"	{ return "u\\"; }
+"\u1F7B"	{ return "u/"; }
+"\u1F7C"	{ return "w\\"; }
+"\u1F7D"	{ return "w/"; }
+"\u1F80"	{ return "a)|"; }
+"\u1F81"	{ return "a(|"; }
+"\u1F82"	{ return "a)\\|"; }
+"\u1F83"	{ return "a(\\|"; }
+"\u1F84"	{ return "a)/|"; }
+"\u1F85"	{ return "a(/|"; }
+"\u1F86"	{ return "a)=|"; }
+"\u1F87"	{ return "a(=|"; }
+"\u1F88"	{ return "*)|a"; }
+"\u1F89"	{ return "*(|a"; }
+"\u1F8A"	{ return "*)\\|a"; }
+"\u1F8B"	{ return "*(\\|a"; }
+"\u1F8C"	{ return "*)/|a"; }
+"\u1F8D"	{ return "*(/|a"; }
+"\u1F8E"	{ return "*)=|a"; }
+"\u1F8F"	{ return "*(=|a"; }
+"\u1F90"	{ return "h)|"; }
+"\u1F91"	{ return "h(|"; }
+"\u1F92"	{ return "h)\\|"; }
+"\u1F93"	{ return "h(\\|"; }
+"\u1F94"	{ return "h)/|"; }
+"\u1F95"	{ return "h(/|"; }
+"\u1F96"	{ return "h)=|"; }
+"\u1F97"	{ return "h(=|"; }
+"\u1F98"	{ return "*)|h"; }
+"\u1F99"	{ return "*(|h"; }
+"\u1F9A"	{ return "*)\\|h"; }
+"\u1F9B"	{ return "*(\\|h"; }
+"\u1F9C"	{ return "*)/|h"; }
+"\u1F9D"	{ return "*(/|h"; }
+"\u1F9E"	{ return "*)=|h"; }
+"\u1F9F"	{ return "*(=|h"; }
+"\u1FA0"	{ return "w)|"; }
+"\u1FA1"	{ return "w(|"; }
+"\u1FA2"	{ return "w)\\|"; }
+"\u1FA3"	{ return "w(\\|"; }
+"\u1FA4"	{ return "w)/|"; }
+"\u1FA5"	{ return "w(/|"; }
+"\u1FA6"	{ return "w)=|"; }
+"\u1FA7"	{ return "w(=|"; }
+"\u1FA8"	{ return "*)|w"; }
+"\u1FA9"	{ return "*(|w"; }
+"\u1FAA"	{ return "*)\\|w"; }
+"\u1FAB"	{ return "*(\\|w"; }
+"\u1FAC"	{ return "*)/|w"; }
+"\u1FAD"	{ return "*(/|w"; }
+"\u1FAE"	{ return "*)=|w"; }
+"\u1FAF"	{ return "*(=|w"; }
+"\u1FB0"	{ return "a^"; }
+"\u1FB1"	{ return "a_"; }
+"\u1FB2"	{ return "a\\|"; }
+"\u1FB3"	{ return "a|"; }
+"\u1FB4"	{ return "a/|"; }
+"\u1FB6"	{ return "a="; }
+"\u1FB7"	{ return "a=|"; }
+"\u1FB8"	{ return "*a^"; }
+"\u1FB9"	{ return "*a_"; }
+"\u1FBA"	{ return "*a\\"; }
+"\u1FBB"	{ return "*a/"; }
+"\u1FBC"	{ return "*a|"; }
+"\u1FC2"	{ return "h\\|"; }
+"\u1FC3"	{ return "h|"; }
+"\u1FC4"	{ return "h/|"; }
+"\u1FC6"	{ return "h="; }
+"\u1FC7"	{ return "h=|"; }
+"\u1FC8"	{ return "*e\\"; }
+"\u1FC9"	{ return "*e/"; }
+"\u1FCA"	{ return "*h\\"; }
+"\u1FCB"	{ return "*h/"; }
+"\u1FCC"	{ return "*h|"; }
+"\u1FD0"	{ return "i^"; }
+"\u1FD1"	{ return "i_"; }
+"\u1FD2"	{ return "i+\\"; }
+"\u1FD3"	{ return "i+/"; }
+"\u1FD6"	{ return "i="; }
+"\u1FD7"	{ return "i+="; }
+"\u1FD8"	{ return "*i^"; }
+"\u1FD9"	{ return "*i_"; }
+"\u1FDA"	{ return "*i\\"; }
+"\u1FDB"	{ return "*i/"; }
+"\u1FE0"	{ return "u^"; }
+"\u1FE1"	{ return "u_"; }
+"\u1FE2"	{ return "u+\\"; }
+"\u1FE3"	{ return "u+/"; }
+"\u1FE4"	{ return "r)"; }
+"\u1FE5"	{ return "r("; }
+"\u1FE6"	{ return "u="; }
+"\u1FE7"	{ return "u+="; }
+"\u1FE8"	{ return "*u^"; }
+"\u1FE9"	{ return "*u_"; }
+"\u1FEA"	{ return "*u\\"; }
+"\u1FEB"	{ return "*u/"; }
+"\u1FEC"	{ return "*(r"; }
+"\u1FF2"	{ return "w\\|"; }
+"\u1FF3"	{ return "w|"; }
+"\u1FF4"	{ return "w/|"; }
+"\u1FFA"	{ return "*w\\"; }
+"\u1FFB"	{ return "*w/"; }
+"\u1FFC"	{ return "*w|"; }
+"\u1FF6"	{ return "w="; }
+"\u1FF7"	{ return "w=|"; }
+"\u1FF8"	{ return "*o\\"; }
+"\u1FF9"	{ return "*o/"; }
+
+"\u0300"	{ return "\\"; }
+"\u0301"		{ return "/"; }
+"\u0304"		{ return "_"; }
+"\u0306"		{ return "^"; }
+"\u0308"		{ return "+"; }
+"\u0302"		{ return "="; }
+"\u0313"		{ return ")"; }
+"\u0314"		{ return "("; }
+"\u0323"		{ return "?"; }
+"\u0345"		{ return "|"; }
+
+"\u03b1"		 { return "a"; }  /* MPDL update  */
+"\u0391"   { return "*a"; }  /* MPDL update  */
+"\u03b2"		{ return "b"; }   /* MPDL update  */
+"\u0392"   { return "*b"; }  /* MPDL update  */
+"\u03b3"		{ return "g"; }   /* MPDL update  */
+"\u0393"   { return "*g"; }  /* MPDL update  */
+"\u03b4"		{ return "d"; }   /* MPDL update  */
+"\u0394"   { return "*d"; }  /* MPDL update  */
+"\u03b5"		{ return "e"; }   /* MPDL update  */
+"\u0395"   { return "*e"; }  /* MPDL update  */
+"\u03b6"		{ return "z"; }   /* MPDL update  */
+"\u0396"   { return "*z"; }  /* MPDL update  */
+"\u03b7"		{ return "h"; }   /* MPDL update  */
+"\u0397"   { return "*h"; }  /* MPDL update  */
+"\u03b8"		{ return "q"; }   /* MPDL update  */
+"\u0398"   { return "*q"; }  /* MPDL update  */
+"\u03b9"		{ return "i"; }   /* MPDL update  */
+"\u0399"   { return "*i"; }  /* MPDL update  */
+"\u03ba"		{ return "k"; }   /* MPDL update  */
+"\u039a"   { return "*k"; }  /* MPDL update  */
+"\u03bb"		{ return "l"; }   /* MPDL update  */
+"\u039b"   { return "*l"; }  /* MPDL update  */
+"\u03bc"		{ return "m"; }   /* MPDL update  */
+"\u039c"   { return "*m"; }  /* MPDL update  */
+"\u03bd"		{ return "n"; }   /* MPDL update  */
+"\u039d"   { return "*n"; }  /* MPDL update  */
+"\u03be"		{ return "c"; }   /* MPDL update  */
+"\u039e"   { return "*c"; }  /* MPDL update  */
+"\u03bf"		{ return "o"; }   /* MPDL update  */
+"\u039f"   { return "*o"; }  /* MPDL update  */
+"\u03c0"		{ return "p"; }   /* MPDL update  */
+"\u03a0"   { return "*p"; }  /* MPDL update  */
+"\u03c1"		{ return "r"; }   /* MPDL update  */
+"\u03a1"   { return "*r"; }  /* MPDL update  */
+
+"\u03a3"  { return "*s"; }  /* MPDL update; NOTE(review): same pattern and result as the capital sigma rule near the top of this spec, so this copy is never selected */
+"\u03c3"	{ return "s1"; } /* mdh 2002-01-07 */
+"\u03c2"/\-\-	{ return "s"; }
+"\u03c3"/\&gt; }[a-z\?\!0-9*=\/()\'\-] { return "s"; }  /* MPDL update  */
+"\u03c2"/\&lt; { return "s"; }  /* MPDL update  */
+"\u03c3"/[\[\]][a-z\?\!0-9*=\/()\'\-] { return "s"; }  /* MPDL update  */
+"\u03c2"/\??[^a-z0-9*=\/()\'\-\[\?] { return "s"; }
+"\u03c3"		{ return "s"; }  /* MPDL update; NOTE(review): never selected, the earlier plain rule for the same character (U+03C3, returns "s1") comes first - confirm which output is intended */
+
+"\u03c4"		{ return "t"; }   /* MPDL update  */
+"\u03a4"   { return "*t"; }  /* MPDL update  */
+"\u03c5"		{ return "u"; }   /* MPDL update  */
+"\u03a5"   { return "*u"; }  /* MPDL update  */
+"\u03c6"		{ return "f"; }   /* MPDL update  */
+"\u03a6"   { return "*f"; }  /* MPDL update  */
+"\u03c7"		{ return "x"; }   /* MPDL update  */
+"\u03a7"   { return "*x"; }  /* MPDL update  */
+"\u03c8"		{ return "y"; }   /* MPDL update  */
+"\u03a8"   { return "*y"; }  /* MPDL update  */
+"\u03c9"		{ return "w"; }   /* MPDL update  */
+"\u03a9"   { return "*w"; }  /* MPDL update  */
+
+[\&_]"vert;"   { return "|"; }
+[\&_]"lpar;"   { return "("; }
+[\&_]"rpar;"   { return ")"; }
+[\_\&]"lt;"    { return "&lt;"; }
+[\_\&]"gt;"    { return "&gt;"; }
+"&#039;"       { return "'"; }  /* MPDL update  */
+
+"&"[a-zA-Z]+";"  { return yytext(); }
+
+.       { return yytext(); }
+\n      { return yytext(); }
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Unicode2BetacodeLex.java	Tue Feb 08 14:54:09 2011 +0100
@@ -0,0 +1,1866 @@
+/* The following code was generated by JFlex 1.4.3 on 14.12.10 15:03 */
+
+package de.mpg.mpiwg.berlin.mpdl.lt.general;
+
+
+/**
+ * This class is a scanner generated by
+ * <a href="http://www.jflex.de/">JFlex</a> 1.4.3
+ * on 14.12.10 15:03 from the specification file
+ * <tt>/Users/jwillenborg/test/jflex/Unicode2Betacode.lex</tt>
+ */
+public class Unicode2BetacodeLex {
+
+  /** This character denotes the end of file */
+  public static final int YYEOF = -1;
+
+  /** initial size of the lookahead buffer */
+  private static final int ZZ_BUFFERSIZE = 16384;
+
+  /** lexical states */
+  public static final int YYINITIAL = 0;
+
+  /**
+   * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l
+   * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l
+   *                  at the beginning of a line
+   * l is of the form l = 2*k, k a non negative integer
+   */
+  private static final int ZZ_LEXSTATE[] = {
+     0, 0
+  };
+
+  /**
+   * Translates characters to character classes
+   */
+  private static final String ZZ_CMAP_PACKED =
+    "\12\0\1\0\26\0\1\u0118\1\0\1\u0130\2\0\1\u0113\4\u011c\2\0"+
+    "\1\u0112\1\11\1\u011c\1\u0131\2\u011c\1\u0132\5\u011c\1\u0133\1\0\1\u0116"+
+    "\1\1\1\u011c\1\2\1\u011b\1\0\5\u0134\1\6\1\u0134\1\4\22\u0134"+
+    "\1\u011d\1\0\1\u011a\1\0\1\u012a\1\0\1\u012f\3\u0135\1\u012c\1\7"+
+    "\1\u0114\1\5\3\u0135\1\u0119\3\u0135\1\u012e\1\u0135\1\u012d\1\u0135\1\u0115"+
+    "\1\3\1\u012b\4\u0135\2\0\1\u0117\71\0\1\12\u0248\0\1\344\1\345"+
+    "\1\351\1\0\1\346\1\0\1\347\1\0\1\350\12\0\1\352\1\353"+
+    "\16\0\1\354\41\0\1\355\113\0\1\357\1\361\1\363\1\365\1\367"+
+    "\1\371\1\373\1\375\1\377\1\u0101\1\u0103\1\u0105\1\u0107\1\u0109\1\u010b"+
+    "\1\u010d\1\u010f\1\0\1\10\1\u011f\1\u0121\1\u0123\1\u0125\1\u0127\1\u0129"+
+    "\7\0\1\356\1\360\1\362\1\364\1\366\1\370\1\372\1\374\1\376"+
+    "\1\u0100\1\u0102\1\u0104\1\u0106\1\u0108\1\u010a\1\u010c\1\u010e\1\u0111\1\u0110"+
+    "\1\u011e\1\u0120\1\u0122\1\u0124\1\u0126\1\u0128\u1b36\0\1\13\1\14\1\15"+
+    "\1\16\1\17\1\20\1\21\1\22\1\23\1\24\1\25\1\26\1\27"+
+    "\1\30\1\31\1\32\1\33\1\34\1\35\1\36\1\37\1\40\2\0"+
+    "\1\41\1\42\1\43\1\44\1\45\1\46\2\0\1\47\1\50\1\51"+
+    "\1\52\1\53\1\54\1\55\1\56\1\57\1\60\1\61\1\62\1\63"+
+    "\1\64\1\65\1\66\1\67\1\70\1\71\1\72\1\73\1\74\1\75"+
+    "\1\76\1\77\1\100\1\101\1\102\1\103\1\104\1\105\1\106\1\107"+
+    "\1\110\1\111\1\112\1\113\1\114\2\0\1\115\1\116\1\117\1\120"+
+    "\1\121\1\122\2\0\1\123\1\124\1\125\1\126\1\127\1\130\1\131"+
+    "\1\132\1\0\1\133\1\0\1\134\1\0\1\135\1\0\1\136\1\137"+
+    "\1\140\1\141\1\142\1\143\1\144\1\145\1\146\1\147\1\150\1\151"+
+    "\1\152\1\153\1\154\1\155\1\156\1\157\1\160\1\161\1\162\1\163"+
+    "\1\164\1\165\1\166\1\167\1\170\1\171\1\172\1\173\1\174\2\0"+
+    "\1\175\1\176\1\177\1\200\1\201\1\202\1\203\1\204\1\205\1\206"+
+    "\1\207\1\210\1\211\1\212\1\213\1\214\1\215\1\216\1\217\1\220"+
+    "\1\221\1\222\1\223\1\224\1\225\1\226\1\227\1\230\1\231\1\232"+
+    "\1\233\1\234\1\235\1\236\1\237\1\240\1\241\1\242\1\243\1\244"+
+    "\1\245\1\246\1\247\1\250\1\251\1\252\1\253\1\254\1\255\1\256"+
+    "\1\257\1\260\1\261\1\0\1\262\1\263\1\264\1\265\1\266\1\267"+
+    "\1\270\5\0\1\271\1\272\1\273\1\0\1\274\1\275\1\276\1\277"+
+    "\1\300\1\301\1\302\3\0\1\303\1\304\1\305\1\306\2\0\1\307"+
+    "\1\310\1\311\1\312\1\313\1\314\4\0\1\315\1\316\1\317\1\320"+
+    "\1\321\1\322\1\323\1\324\1\325\1\326\1\327\1\330\1\331\5\0"+
+    "\1\332\1\333\1\334\1\0\1\340\1\341\1\342\1\343\1\335\1\336"+
+    "\1\337\ue003\0";
+
+  /**
+   * Translates characters to character classes
+   */
+  private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED);
+
+  /**
+   * Translates DFA states to action switch labels.
+   */
+  private static final int [] ZZ_ACTION = zzUnpackAction();
+
+  private static final String ZZ_ACTION_PACKED_0 =
+    "\1\0\2\1\1\2\1\3\1\4\1\5\1\6\1\7"+
+    "\1\10\1\11\1\12\1\13\1\14\1\15\1\16\1\17"+
+    "\1\20\1\21\1\22\1\23\1\24\1\25\1\26\1\27"+
+    "\1\30\1\31\1\32\1\33\1\34\1\35\1\36\1\37"+
+    "\1\40\1\41\1\42\1\43\1\44\1\45\1\46\1\47"+
+    "\1\50\1\51\1\52\1\53\1\54\1\55\1\56\1\57"+
+    "\1\60\1\61\1\62\1\63\1\64\1\65\1\66\1\67"+
+    "\1\70\1\71\1\72\1\73\1\74\1\75\1\76\1\77"+
+    "\1\100\1\101\1\102\1\103\1\104\1\105\1\106\1\107"+
+    "\1\110\1\111\1\112\1\113\1\114\1\115\1\116\1\117"+
+    "\1\120\1\121\1\122\1\123\1\124\1\125\1\126\1\127"+
+    "\1\130\1\131\1\132\1\133\1\134\1\135\1\136\1\137"+
+    "\1\140\1\141\1\142\1\143\1\144\1\145\1\146\1\147"+
+    "\1\150\1\151\1\152\1\153\1\154\1\155\1\156\1\157"+
+    "\1\160\1\161\1\162\1\163\1\164\1\165\1\166\1\167"+
+    "\1\170\1\171\1\172\1\173\1\174\1\175\1\176\1\177"+
+    "\1\200\1\201\1\202\1\203\1\204\1\205\1\206\1\207"+
+    "\1\210\1\211\1\212\1\213\1\214\1\215\1\216\1\217"+
+    "\1\220\1\221\1\222\1\223\1\224\1\225\1\226\1\227"+
+    "\1\230\1\231\1\232\1\233\1\234\1\235\1\236\1\237"+
+    "\1\240\1\241\1\242\1\243\1\244\1\245\1\246\1\247"+
+    "\1\250\1\251\1\252\1\253\1\254\1\255\1\256\1\257"+
+    "\1\260\1\261\1\262\1\263\1\264\1\265\1\266\1\267"+
+    "\1\270\1\271\1\272\1\273\1\274\1\275\1\276\1\277"+
+    "\1\300\1\301\1\302\1\303\1\304\1\305\1\306\1\307"+
+    "\1\310\1\311\1\312\1\313\1\314\1\315\1\316\1\317"+
+    "\1\320\1\321\1\322\1\323\1\324\1\325\1\326\1\327"+
+    "\1\330\1\331\1\332\1\333\1\334\1\335\1\336\1\337"+
+    "\1\340\1\341\1\342\1\343\1\344\1\345\1\346\1\347"+
+    "\1\350\1\351\1\352\1\353\1\354\1\355\1\356\1\357"+
+    "\1\360\1\361\1\362\1\363\1\364\1\365\1\366\1\367"+
+    "\1\370\1\371\1\372\1\373\1\374\1\375\1\376\1\377"+
+    "\1\u0100\1\u0101\1\u0102\1\u0103\1\u0104\1\u0105\1\u0106\1\u0107"+
+    "\1\u0108\1\u0109\1\u010a\1\u010b\1\u010c\1\u010d\1\u010e\2\1"+
+    "\1\u010f\1\u0110\1\u0111\1\u0112\1\u0113\1\u0114\1\u0115\1\u0116"+
+    "\1\u0117\1\u0118\1\u0119\1\u011a\1\1\3\0\1\u011b\1\0"+
+    "\1\u011b\33\0\1\u011c\1\u011d\17\0\1\u011e";
+
+  private static int [] zzUnpackAction() {
+    int [] result = new int[338];
+    int offset = 0;
+    offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
+    return result;
+  }
+
+  private static int zzUnpackAction(String packed, int offset, int [] result) {
+    int i = 0;       /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {  /* packed is consumed as (count, value) char pairs */
+      int count = packed.charAt(i++);  /* run length */
+      int value = packed.charAt(i++);  /* value to repeat */
+      do result[j++] = value; while (--count > 0);  /* do-while: writes at least one element per pair */
+    }
+    return j;  /* index just past the last element written */
+  }
+
+
+  /**
+   * Translates a state to a row index in the transition table
+   */
+  private static final int [] ZZ_ROWMAP = zzUnpackRowMap();
+
+  private static final String ZZ_ROWMAP_PACKED_0 =
+    "\0\0\0\u0136\0\u026c\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+
+    "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+
+    "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+
+    "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+
+    "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+
+    "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+
+    "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+
+    "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+
+    "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+
+    "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+
+    "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+
+    "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+
+    "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+
+    "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+
+    "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+
+    "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+
+    "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+
+    "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+
+    "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+
+    "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+
+    "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+
+    "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+
+    "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+
+    "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+
+    "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+
+    "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+
+    "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+
+    "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+
+    "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+
+    "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+
+    "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+
+    "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+
+    "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+
+    "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u03a2"+
+    "\0\u04d8\0\u060e\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136"+
+    "\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0136\0\u0744\0\u087a"+
+    "\0\u09b0\0\u0ae6\0\u0136\0\u0c1c\0\u0d52\0\u0e88\0\u0fbe\0\u10f4"+
+    "\0\u122a\0\u1360\0\u1496\0\u15cc\0\u1702\0\u1838\0\u196e\0\u1aa4"+
+    "\0\u1bda\0\u1d10\0\u1e46\0\u1f7c\0\u20b2\0\u21e8\0\u231e\0\u2454"+
+    "\0\u258a\0\u26c0\0\u27f6\0\u292c\0\u2a62\0\u2b98\0\u2cce\0\u2e04"+
+    "\0\u0136\0\u0136\0\u2f3a\0\u3070\0\u31a6\0\u32dc\0\u3412\0\u3548"+
+    "\0\u367e\0\u37b4\0\u38ea\0\u3a20\0\u3b56\0\u3c8c\0\u3dc2\0\u3ef8"+
+    "\0\u402e\0\u0136";
+
+  private static int [] zzUnpackRowMap() {
+    int [] result = new int[338];
+    int offset = 0;
+    offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
+    return result;
+  }
+
+  private static int zzUnpackRowMap(String packed, int offset, int [] result) {
+    int i = 0;  /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {  /* two chars per entry: no run-length encoding here */
+      int high = packed.charAt(i++) << 16;  /* first char: upper 16 bits */
+      result[j++] = high | packed.charAt(i++);  /* second char: lower 16 bits */
+    }
+    return j;  /* index just past the last element written */
+  }
+
+  /**
+   * The transition table of the DFA
+   */
+  private static final int [] ZZ_TRANS = zzUnpackTrans();
+
+  private static final String ZZ_TRANS_PACKED_0 =
+    "\1\2\1\3\2\2\1\4\1\5\1\6\1\7\1\10"+
+    "\1\11\1\12\1\13\1\14\1\15\1\16\1\17\1\20"+
+    "\1\21\1\22\1\23\1\24\1\25\1\26\1\27\1\30"+
+    "\1\31\1\32\1\33\1\34\1\35\1\36\1\37\1\40"+
+    "\1\41\1\42\1\43\1\44\1\45\1\46\1\47\1\50"+
+    "\1\51\1\52\1\53\1\54\1\55\1\56\1\57\1\60"+
+    "\1\61\1\62\1\63\1\64\1\65\1\66\1\67\1\70"+
+    "\1\71\1\72\1\73\1\74\1\75\1\76\1\77\1\100"+
+    "\1\101\1\102\1\103\1\104\1\105\1\106\1\107\1\110"+
+    "\1\111\1\112\1\113\1\114\1\115\1\116\1\117\1\120"+
+    "\1\121\1\122\1\123\1\124\1\125\1\126\1\127\1\130"+
+    "\1\131\1\132\1\133\1\134\1\135\1\136\1\137\1\140"+
+    "\1\141\1\142\1\143\1\144\1\145\1\146\1\147\1\150"+
+    "\1\151\1\152\1\153\1\154\1\155\1\156\1\157\1\160"+
+    "\1\161\1\162\1\163\1\164\1\165\1\166\1\167\1\170"+
+    "\1\171\1\172\1\173\1\174\1\175\1\176\1\177\1\200"+
+    "\1\201\1\202\1\203\1\204\1\205\1\206\1\207\1\210"+
+    "\1\211\1\212\1\213\1\214\1\215\1\216\1\217\1\220"+
+    "\1\221\1\222\1\223\1\224\1\225\1\226\1\227\1\230"+
+    "\1\231\1\232\1\233\1\234\1\235\1\236\1\237\1\240"+
+    "\1\241\1\242\1\243\1\244\1\245\1\246\1\247\1\250"+
+    "\1\251\1\252\1\253\1\254\1\255\1\256\1\257\1\260"+
+    "\1\261\1\262\1\263\1\264\1\265\1\266\1\267\1\270"+
+    "\1\271\1\272\1\273\1\274\1\275\1\276\1\277\1\300"+
+    "\1\301\1\302\1\303\1\304\1\305\1\306\1\307\1\310"+
+    "\1\311\1\312\1\313\1\314\1\315\1\316\1\317\1\320"+
+    "\1\321\1\322\1\323\1\324\1\325\1\326\1\327\1\330"+
+    "\1\331\1\332\1\333\1\334\1\335\1\336\1\337\1\340"+
+    "\1\341\1\342\1\343\1\344\1\345\1\346\1\347\1\350"+
+    "\1\351\1\352\1\353\1\354\1\355\1\356\1\357\1\360"+
+    "\1\361\1\362\1\363\1\364\1\365\1\366\1\367\1\370"+
+    "\1\371\1\372\1\373\1\374\1\375\1\376\1\377\1\u0100"+
+    "\1\u0101\1\u0102\1\u0103\1\u0104\1\u0105\1\u0106\1\u0107\1\u0108"+
+    "\1\u0109\1\u010a\1\u010b\1\u010c\1\u010d\1\u010e\1\u010f\1\u0110"+
+    "\1\u0111\1\2\1\u0112\12\2\1\u0113\1\u0114\1\u0115\1\u0116"+
+    "\1\u0117\1\u0118\1\u0119\1\u011a\1\u011b\1\u011c\1\u011d\1\u011e"+
+    "\1\u011f\13\2\u0136\0\2\u0120\1\0\u0133\u0120\u0113\0\1\u0121"+
+    "\6\0\1\u0122\2\0\1\u0122\30\0\3\u0123\1\0\1\u0123"+
+    "\1\0\1\u0123\1\0\u010a\u0123\1\u0124\1\u0125\2\0\3\u0123"+
+    "\1\0\1\u0123\1\u0126\2\0\15\u0123\5\0\1\u0123\3\0"+
+    "\1\u0123\4\0\5\u0127\u010c\0\1\u0128\1\u0127\3\0\1\u0129"+
+    "\21\0\1\u012a\1\u0127\1\u012b\2\u0127\1\u012c\3\0\2\u0127"+
+    "\u0114\0\1\u012d\4\0\1\u012e\21\0\1\u012f\1\0\1\u0130"+
+    "\13\0\1\u0131\u0246\0\1\u0132\44\0\1\u0123\1\0\1\u0123"+
+    "\1\0\1\u0123\u010a\0\1\u0123\1\0\2\u0123\2\0\2\u0123"+
+    "\1\0\2\u0123\16\0\5\u0123\1\0\3\u0123\1\0\1\u0123"+
+    "\u0112\0\1\u0123\u013c\0\1\u0133\34\0\3\u0123\1\0\1\u0123"+
+    "\1\0\1\u0123\1\0\u010a\u0123\1\0\1\u0123\2\0\3\u0123"+
+    "\1\0\1\u0123\3\0\15\u0123\5\0\1\u0123\3\0\1\u0123"+
+    "\4\0\5\u0127\u010c\0\2\u0127\1\2\2\0\1\u0127\21\0"+
+    "\5\u0127\4\0\2\u0127\3\0\5\u0127\u010c\0\1\u0127\1\u0134"+
+    "\1\2\2\0\1\u0127\21\0\5\u0127\4\0\2\u0127\3\0"+
+    "\5\u0127\u010c\0\1\u0127\1\u0135\1\2\2\0\1\u0127\21\0"+
+    "\3\u0127\1\u0136\1\u0127\4\0\2\u0127\3\0\5\u0127\u010c\0"+
+    "\2\u0127\1\2\2\0\1\u0127\21\0\1\u0127\1\u0137\3\u0127"+
+    "\4\0\2\u0127\3\0\5\u0127\u010c\0\2\u0127\1\2\2\0"+
+    "\1\u0127\21\0\3\u0127\1\u0138\1\u0127\4\0\2\u0127\u0131\0"+
+    "\1\u0139\u0119\0\1\u013a\u0135\0\1\u013b\30\0\1\u013c\u0133\0"+
+    "\1\u013d\u0137\0\1\u013e\11\0\1\2\1\u0131\u0247\0\1\u013f"+
+    "\u0135\0\1\u0140\43\0\5\u0127\u010c\0\2\u0127\1\u0141\2\0"+
+    "\1\u0127\21\0\5\u0127\4\0\2\u0127\3\0\5\u0127\u010c\0"+
+    "\2\u0127\1\u0142\2\0\1\u0127\21\0\5\u0127\4\0\2\u0127"+
+    "\3\0\5\u0127\u010c\0\2\u0127\1\2\2\0\1\u0127\21\0"+
+    "\4\u0127\1\u0143\4\0\2\u0127\3\0\5\u0127\u010c\0\2\u0127"+
+    "\1\2\2\0\1\u0127\21\0\2\u0127\1\u0144\2\u0127\4\0"+
+    "\2\u0127\3\0\5\u0127\u010c\0\2\u0127\1\2\2\0\1\u0127"+
+    "\21\0\4\u0127\1\u0145\4\0\2\u0127\u0132\0\1\u0146\u0119\0"+
+    "\1\u0141\u0135\0\1\u0142\u014e\0\1\u0147\u0133\0\1\u0148\u0137\0"+
+    "\1\u0149\u011c\0\1\u014a\u0135\0\1\u0123\42\0\5\u0127\u010c\0"+
+    "\2\u0127\1\2\2\0\1\u0127\21\0\2\u0127\1\u014b\2\u0127"+
+    "\4\0\2\u0127\3\0\5\u0127\u010c\0\1\u0127\1\u014c\1\2"+
+    "\2\0\1\u0127\21\0\5\u0127\4\0\2\u0127\3\0\5\u0127"+
+    "\u010c\0\2\u0127\1\2\2\0\1\u0127\21\0\2\u0127\1\u014d"+
+    "\2\u0127\4\0\2\u0127\u0133\0\1\u014e\u012f\0\1\u014f\u011d\0"+
+    "\1\u0150\u014d\0\1\u0151\u011f\0\1\u0122\41\0\5\u0127\u010c\0"+
+    "\2\u0127\1\353\2\0\1\u0127\21\0\5\u0127\4\0\2\u0127"+
+    "\3\0\5\u0127\u010c\0\2\u0127\1\355\2\0\1\u0127\21\0"+
+    "\5\u0127\4\0\2\u0127\3\0\5\u0127\u010c\0\2\u0127\1\352"+
+    "\2\0\1\u0127\21\0\5\u0127\4\0\2\u0127\u0116\0\1\u0152"+
+    "\u0135\0\1\353\u0135\0\1\355\u0135\0\1\352\37\0";
+
+  private static int [] zzUnpackTrans() {
+    int [] result = new int[16740];
+    int offset = 0;
+    offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
+    return result;
+  }
+
+  private static int zzUnpackTrans(String packed, int offset, int [] result) {
+    int i = 0;       /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {  /* packed is consumed as (count, value) char pairs */
+      int count = packed.charAt(i++);  /* run length */
+      int value = packed.charAt(i++);  /* stored value, shifted up by one */
+      value--;  /* undo the shift: a stored 0 becomes -1, which yylex() treats as "no transition" */
+      do result[j++] = value; while (--count > 0);  /* do-while: writes at least one element per pair */
+    }
+    return j;  /* index just past the last element written */
+  }
+
+
+  /* error codes */
+  private static final int ZZ_UNKNOWN_ERROR = 0;
+  private static final int ZZ_NO_MATCH = 1;
+  private static final int ZZ_PUSHBACK_2BIG = 2;
+
+  /* error messages for the codes above */
+  private static final String ZZ_ERROR_MSG[] = {
+    "Unkown internal scanner error",
+    "Error: could not match input",
+    "Error: pushback value was too large"
+  };
+
+  /**
+   * ZZ_ATTRIBUTE[aState] contains the attributes of state <code>aState</code>
+   */
+  private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
+
+  private static final String ZZ_ATTRIBUTE_PACKED_0 =
+    "\1\0\1\11\1\1\u010c\11\3\1\14\11\1\1\3\0"+
+    "\1\11\1\0\1\1\33\0\2\11\17\0\1\11";
+
+  private static int [] zzUnpackAttribute() {
+    int [] result = new int[338];
+    int offset = 0;
+    offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
+    return result;
+  }
+
+  private static int zzUnpackAttribute(String packed, int offset, int [] result) {
+    int i = 0;       /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {  /* packed is consumed as (count, value) char pairs */
+      int count = packed.charAt(i++);  /* run length */
+      int value = packed.charAt(i++);  /* attribute bits; yylex() tests bit 1 and bit 8 */
+      do result[j++] = value; while (--count > 0);  /* do-while: writes at least one element per pair */
+    }
+    return j;  /* index just past the last element written */
+  }
+
+  /** the input device */
+  private java.io.Reader zzReader;
+
+  /** the current state of the DFA */
+  private int zzState;
+
+  /** the current lexical state */
+  private int zzLexicalState = YYINITIAL;
+
+  /** this buffer contains the current text to be matched and is
+      the source of the yytext() string */
+  private char zzBuffer[] = new char[ZZ_BUFFERSIZE];
+
+  /** the textposition at the last accepting state */
+  private int zzMarkedPos;
+
+  /** the current text position in the buffer */
+  private int zzCurrentPos;
+
+  /** startRead marks the beginning of the yytext() string in the buffer */
+  private int zzStartRead;
+
+  /** endRead marks the last character in the buffer, that has been read
+      from input */
+  private int zzEndRead;
+
+  /** number of newlines encountered up to the start of the matched text */
+  private int yyline;
+
+  /** the number of characters up to the start of the matched text */
+  private int yychar;
+
+  /**
+   * the number of characters from the last newline up to the start of the
+   * matched text
+   */
+  private int yycolumn;
+
+  /**
+   * zzAtBOL == true <=> the scanner is currently at the beginning of a line
+   */
+  private boolean zzAtBOL = true;
+
+  /** zzAtEOF == true <=> the scanner is at the EOF */
+  private boolean zzAtEOF;
+
+  /** denotes if the user-EOF-code has already been executed */
+  private boolean zzEOFDone;
+
+
+  /**
+   * Creates a new scanner
+   * There is also a java.io.InputStream version of this constructor.
+   *
+   * @param   in  the java.io.Reader to read input from.
+   */
+  public Unicode2BetacodeLex(java.io.Reader in) {
+    this.zzReader = in;
+  }
+
+  /**
+   * Creates a new scanner that decodes the stream with the platform default charset.
+   * There is also a java.io.Reader version of this constructor.
+   *
+   * @param   in  the java.io.InputStream to read input from.
+   */
+  public Unicode2BetacodeLex(java.io.InputStream in) {
+    this(new java.io.InputStreamReader(in));  // no charset argument: the platform default is used
+  }
+
+  /**
+   * Unpacks the run-length encoded character translation table.
+   *
+   * @param packed   the packed character translation table, as (count, value) char pairs
+   * @return         the unpacked character translation table
+   */
+  private static char [] zzUnpackCMap(String packed) {
+    char [] map = new char[0x10000];  /* one entry per possible char value */
+    int i = 0;  /* index in packed string  */
+    int j = 0;  /* index in unpacked array */
+    while (i < 724) {  /* generated bound; presumably the length of the packed string - TODO confirm */
+      int  count = packed.charAt(i++);  /* run length */
+      char value = packed.charAt(i++);  /* character class to repeat */
+      do map[j++] = value; while (--count > 0);  /* do-while: writes at least one element per pair */
+    }
+    return map;
+  }
+
+
+  /**
+   * Refills the input buffer.
+   *
+   * @return      <code>false</code> iff new input was read, <code>true</code> at end of input.
+   *
+   * @exception   java.io.IOException  if any I/O-Error occurs
+   */
+  private boolean zzRefill() throws java.io.IOException {
+
+    /* first: make room (if you can) */
+    if (zzStartRead > 0) {
+      System.arraycopy(zzBuffer, zzStartRead,
+                       zzBuffer, 0,
+                       zzEndRead-zzStartRead);
+
+      /* translate stored positions */
+      zzEndRead-= zzStartRead;
+      zzCurrentPos-= zzStartRead;
+      zzMarkedPos-= zzStartRead;
+      zzStartRead = 0;
+    }
+
+    /* is the buffer big enough? */
+    if (zzCurrentPos >= zzBuffer.length) {
+      /* if not: blow it up */
+      char newBuffer[] = new char[zzCurrentPos*2];
+      System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length);
+      zzBuffer = newBuffer;
+    }
+
+    /* finally: fill the buffer with new input */
+    int numRead = zzReader.read(zzBuffer, zzEndRead,
+                                            zzBuffer.length-zzEndRead);
+
+    if (numRead > 0) {
+      zzEndRead+= numRead;
+      return false;
+    }
+    // unlikely but not impossible: read 0 characters, but not at end of stream
+    if (numRead == 0) {
+      int c = zzReader.read();
+      if (c == -1) {
+        return true;
+      } else {
+        zzBuffer[zzEndRead++] = (char) c;
+        return false;
+      }
+    }
+
+	// numRead < 0
+    return true;
+  }
+
+
+  /**
+   * Closes the input stream.
+   */
+  public final void yyclose() throws java.io.IOException {
+    zzAtEOF = true;            /* indicate end of file */
+    zzEndRead = zzStartRead;  /* invalidate buffer    */
+
+    if (zzReader != null)
+      zzReader.close();
+  }
+
+
+  /**
+   * Resets the scanner to read from a new input stream.
+   * Does not close the old reader.
+   *
+   * All internal variables are reset, the old input stream
+   * <b>cannot</b> be reused (internal buffer is discarded and lost).
+   * Lexical state is set to <tt>YYINITIAL</tt>.
+   *
+   * @param reader   the new input stream
+   */
+  public final void yyreset(java.io.Reader reader) {
+    zzReader = reader;
+    zzAtBOL  = true;
+    zzAtEOF  = false;
+    zzEOFDone = false;
+    zzEndRead = zzStartRead = 0;  /* any buffered but unread input is dropped */
+    zzCurrentPos = zzMarkedPos = 0;
+    yyline = yychar = yycolumn = 0;
+    zzLexicalState = YYINITIAL;
+  }
+
+
+  /**
+   * Returns the current lexical state.
+   */
+  public final int yystate() {
+    return zzLexicalState;
+  }
+
+
+  /**
+   * Enters a new lexical state
+   *
+   * @param newState the new lexical state
+   */
+  public final void yybegin(int newState) {
+    zzLexicalState = newState;
+  }
+
+
+  /**
+   * Returns the text matched by the current regular expression.
+   */
+  public final String yytext() {
+    return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead );
+  }
+
+
+  /**
+   * Returns the character at position <tt>pos</tt> from the
+   * matched text.
+   *
+   * It is equivalent to yytext().charAt(pos), but faster
+   *
+   * @param pos the position of the character to fetch.
+   *            A value from 0 to yylength()-1.
+   *
+   * @return the character at position pos
+   */
+  public final char yycharat(int pos) {
+    return zzBuffer[zzStartRead+pos];
+  }
+
+
+  /**
+   * Returns the length of the matched text region.
+   */
+  public final int yylength() {
+    return zzMarkedPos-zzStartRead;
+  }
+
+
+  /**
+   * Reports an error that occurred while scanning.
+   *
+   * In a well-formed scanner (no or only correct usage of
+   * yypushback(int) and a match-all fallback rule) this method
+   * will only be called with things that "Can't Possibly Happen".
+   * If this method is called, something is seriously wrong
+   * (e.g. a JFlex bug producing a faulty scanner etc.).
+   *
+   * Usual syntax/scanner level error handling should be done
+   * in error fallback rules.
+   *
+   * @param   errorCode  the code of the error message to display
+   */
+  private void zzScanError(int errorCode) {
+    String message;
+    try {
+      message = ZZ_ERROR_MSG[errorCode];
+    }
+    catch (ArrayIndexOutOfBoundsException e) {
+      message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR];  /* code outside the table: use the generic message */
+    }
+
+    throw new Error(message);  /* always throws; java.lang.Error is unchecked */
+  }
+
+
+  /**
+   * Pushes the specified amount of characters back into the input stream.
+   *
+   * They will be read again by the next call of the scanning method.
+   *
+   * @param number  the number of characters to be read again.
+   *                This number must not be greater than yylength()!
+   */
+  public void yypushback(int number)  {
+    if ( number > yylength() )
+      zzScanError(ZZ_PUSHBACK_2BIG);  /* throws, so the assignment below is not reached */
+
+    zzMarkedPos -= number;  /* yylex() restarts scanning at zzMarkedPos, so these chars are rescanned */
+  }
+
+
+  /**
+   * Resumes scanning until the next regular expression is matched,
+   * the end of input is encountered or an I/O-Error occurs.
+   *
+   * @return      the next token
+   * @exception   java.io.IOException  if any I/O-Error occurs
+   */
+  public java.lang.String yylex() throws java.io.IOException {
+    int zzInput;
+    int zzAction;
+
+    // cached fields:
+    int zzCurrentPosL;
+    int zzMarkedPosL;
+    int zzEndReadL = zzEndRead;
+    char [] zzBufferL = zzBuffer;
+    char [] zzCMapL = ZZ_CMAP;
+
+    int [] zzTransL = ZZ_TRANS;
+    int [] zzRowMapL = ZZ_ROWMAP;
+    int [] zzAttrL = ZZ_ATTRIBUTE;
+
+    while (true) {
+      zzMarkedPosL = zzMarkedPos;
+
+      zzAction = -1;
+
+      zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL;
+
+      zzState = ZZ_LEXSTATE[zzLexicalState];
+
+
+      zzForAction: {
+        while (true) {
+
+          if (zzCurrentPosL < zzEndReadL)
+            zzInput = zzBufferL[zzCurrentPosL++];
+          else if (zzAtEOF) {
+            zzInput = YYEOF;
+            break zzForAction;
+          }
+          else {
+            // store back cached positions
+            zzCurrentPos  = zzCurrentPosL;
+            zzMarkedPos   = zzMarkedPosL;
+            boolean eof = zzRefill();
+            // get translated positions and possibly new buffer
+            zzCurrentPosL  = zzCurrentPos;
+            zzMarkedPosL   = zzMarkedPos;
+            zzBufferL      = zzBuffer;
+            zzEndReadL     = zzEndRead;
+            if (eof) {
+              zzInput = YYEOF;
+              break zzForAction;
+            }
+            else {
+              zzInput = zzBufferL[zzCurrentPosL++];
+            }
+          }
+          int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ];
+          if (zzNext == -1) break zzForAction;
+          zzState = zzNext;
+
+          int zzAttributes = zzAttrL[zzState];
+          if ( (zzAttributes & 1) == 1 ) {
+            zzAction = zzState;
+            zzMarkedPosL = zzCurrentPosL;
+            if ( (zzAttributes & 8) == 8 ) break zzForAction;
+          }
+
+        }
+      }
+
+      // store back cached position
+      zzMarkedPos = zzMarkedPosL;
+
+      switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
+        case 266:
+          { return "p";
+          }
+        case 287: break;
+        case 102:
+          { return "*(w";
+          }
+        case 288: break;
+        case 20:
+          { return "*(\\a";
+          }
+        case 289: break;
+        case 21:
+          { return "*)/a";
+          }
+        case 290: break;
+        case 181:
+          { return "*a/";
+          }
+        case 291: break;
+        case 237:
+          { return "*a";
+          }
+        case 292: break;
+        case 260:
+          { return "n";
+          }
+        case 293: break;
+        case 89:
+          { return "*(u";
+          }
+        case 294: break;
+        case 16:
+          { return "a(=";
+          }
+        case 295: break;
+        case 30:
+          { return "e(/";
+          }
+        case 296: break;
+        case 195:
+          { return "i+\\";
+          }
+        case 297: break;
+        case 222:
+          { return "w=";
+          }
+        case 298: break;
+        case 210:
+          { return "u+=";
+          }
+        case 299: break;
+        case 99:
+          { return "w)=";
+          }
+        case 300: break;
+        case 256:
+          { return "l";
+          }
+        case 301: break;
+        case 205:
+          { return "u+\\";
+          }
+        case 302: break;
+        case 23:
+          { return "*)=a";
+          }
+        case 303: break;
+        case 225:
+          { return "*o/";
+          }
+        case 304: break;
+        case 44:
+          { return "h(=";
+          }
+        case 305: break;
+        case 3:
+          { return "j";
+          }
+        case 306: break;
+        case 103:
+          { return "*)\\w";
+          }
+        case 307: break;
+        case 152:
+          { return "*(/|h";
+          }
+        case 308: break;
+        case 165:
+          { return "*)\\|w";
+          }
+        case 309: break;
+        case 248:
+          { return "h";
+          }
+        case 310: break;
+        case 76:
+          { return "*(o";
+          }
+        case 311: break;
+        case 159:
+          { return "w)/|";
+          }
+        case 312: break;
+        case 178:
+          { return "*a^";
+          }
+        case 313: break;
+        case 141:
+          { return "h)\\|";
+          }
+        case 314: break;
+        case 106:
+          { return "*(/w";
+          }
+        case 315: break;
+        case 275:
+          { return "f";
+          }
+        case 316: break;
+        case 227:
+          { return "/";
+          }
+        case 317: break;
+        case 91:
+          { return "*(/u";
+          }
+        case 318: break;
+        case 242:
+          { return "d";
+          }
+        case 319: break;
+        case 161:
+          { return "w)=|";
+          }
+        case 320: break;
+        case 57:
+          { return "i)/";
+          }
+        case 321: break;
+        case 154:
+          { return "*(=|h";
+          }
+        case 322: break;
+        case 95:
+          { return "w)\\";
+          }
+        case 323: break;
+        case 108:
+          { return "*(=w";
+          }
+        case 324: break;
+        case 116:
+          { return "i/";
+          }
+        case 325: break;
+        case 238:
+          { return "b";
+          }
+        case 326: break;
+        case 207:
+          { return "r)";
+          }
+        case 327: break;
+        case 147:
+          { return "*)|h";
+          }
+        case 328: break;
+        case 62:
+          { return "*(i";
+          }
+        case 329: break;
+        case 230:
+          { return "+";
+          }
+        case 330: break;
+        case 77:
+          { return "*)\\o";
+          }
+        case 331: break;
+        case 166:
+          { return "*(\\|w";
+          }
+        case 332: break;
+        case 71:
+          { return "o)\\";
+          }
+        case 333: break;
+        case 92:
+          { return "*(=u";
+          }
+        case 334: break;
+        case 232:
+          { return ")";
+          }
+        case 335: break;
+        case 14:
+          { return "a(/";
+          }
+        case 336: break;
+        case 122:
+          { return "w/";
+          }
+        case 337: break;
+        case 206:
+          { return "u+/";
+          }
+        case 338: break;
+        case 80:
+          { return "*(/o";
+          }
+        case 339: break;
+        case 97:
+          { return "w)/";
+          }
+        case 340: break;
+        case 123:
+          { return "a)|";
+          }
+        case 341: break;
+        case 229:
+          { return "^";
+          }
+        case 342: break;
+        case 32:
+          { return "*(e";
+          }
+        case 343: break;
+        case 286:
+          { return "'";
+          }
+        case 344: break;
+        case 42:
+          { return "h(/";
+          }
+        case 345: break;
+        case 53:
+          { return "i)";
+          }
+        case 346: break;
+        case 174:
+          { return "a|";
+          }
+        case 347: break;
+        case 63:
+          { return "*)\\i";
+          }
+        case 348: break;
+        case 139:
+          { return "h)|";
+          }
+        case 349: break;
+        case 193:
+          { return "i^";
+          }
+        case 350: break;
+        case 18:
+          { return "*(a";
+          }
+        case 351: break;
+        case 74:
+          { return "o(/";
+          }
+        case 352: break;
+        case 93:
+          { return "w)";
+          }
+        case 353: break;
+        case 66:
+          { return "*(/i";
+          }
+        case 354: break;
+        case 101:
+          { return "*)w";
+          }
+        case 355: break;
+        case 7:
+          { return "!";
+          }
+        case 356: break;
+        case 33:
+          { return "*)\\e";
+          }
+        case 357: break;
+        case 15:
+          { return "a)=";
+          }
+        case 358: break;
+        case 29:
+          { return "e)/";
+          }
+        case 359: break;
+        case 68:
+          { return "*(=i";
+          }
+        case 360: break;
+        case 125:
+          { return "a)\\|";
+          }
+        case 361: break;
+        case 36:
+          { return "*(/e";
+          }
+        case 362: break;
+        case 115:
+          { return "i\\";
+          }
+        case 363: break;
+        case 201:
+          { return "*i\\";
+          }
+        case 364: break;
+        case 112:
+          { return "e/";
+          }
+        case 365: break;
+        case 218:
+          { return "w/|";
+          }
+        case 366: break;
+        case 176:
+          { return "a=";
+          }
+        case 367: break;
+        case 19:
+          { return "*)\\a";
+          }
+        case 368: break;
+        case 43:
+          { return "h)=";
+          }
+        case 369: break;
+        case 133:
+          { return "*)\\|a";
+          }
+        case 370: break;
+        case 270:
+          { return "s1";
+          }
+        case 371: break;
+        case 247:
+          { return "*z";
+          }
+        case 372: break;
+        case 204:
+          { return "u_";
+          }
+        case 373: break;
+        case 143:
+          { return "h)/|";
+          }
+        case 374: break;
+        case 22:
+          { return "*(/a";
+          }
+        case 375: break;
+        case 82:
+          { return "u(";
+          }
+        case 376: break;
+        case 75:
+          { return "*)o";
+          }
+        case 377: break;
+        case 223:
+          { return "w=|";
+          }
+        case 378: break;
+        case 278:
+          { return "*x";
+          }
+        case 379: break;
+        case 121:
+          { return "w\\";
+          }
+        case 380: break;
+        case 200:
+          { return "*i_";
+          }
+        case 381: break;
+        case 219:
+          { return "*w\\";
+          }
+        case 382: break;
+        case 25:
+          { return "e)";
+          }
+        case 383: break;
+        case 145:
+          { return "h)=|";
+          }
+        case 384: break;
+        case 151:
+          { return "*)/|h";
+          }
+        case 385: break;
+        case 24:
+          { return "*(=a";
+          }
+        case 386: break;
+        case 4:
+          { return "*v";
+          }
+        case 387: break;
+        case 192:
+          { return "*h|";
+          }
+        case 388: break;
+        case 39:
+          { return "h)\\";
+          }
+        case 389: break;
+        case 272:
+          { return "*t";
+          }
+        case 390: break;
+        case 134:
+          { return "*(\\|a";
+          }
+        case 391: break;
+        case 214:
+          { return "*u/";
+          }
+        case 392: break;
+        case 61:
+          { return "*)i";
+          }
+        case 393: break;
+        case 269:
+          { return "*r";
+          }
+        case 394: break;
+        case 160:
+          { return "w(/|";
+          }
+        case 395: break;
+        case 13:
+          { return "a)/";
+          }
+        case 396: break;
+        case 153:
+          { return "*)=|h";
+          }
+        case 397: break;
+        case 267:
+          { return "*p";
+          }
+        case 398: break;
+        case 111:
+          { return "e\\";
+          }
+        case 399: break;
+        case 88:
+          { return "u(=";
+          }
+        case 400: break;
+        case 31:
+          { return "*)e";
+          }
+        case 401: break;
+        case 188:
+          { return "*e\\";
+          }
+        case 402: break;
+        case 110:
+          { return "a/";
+          }
+        case 403: break;
+        case 162:
+          { return "w(=|";
+          }
+        case 404: break;
+        case 41:
+          { return "h)/";
+          }
+        case 405: break;
+        case 261:
+          { return "*n";
+          }
+        case 406: break;
+        case 226:
+          { return "\\";
+          }
+        case 407: break;
+        case 96:
+          { return "w(\\";
+          }
+        case 408: break;
+        case 148:
+          { return "*(|h";
+          }
+        case 409: break;
+        case 257:
+          { return "*l";
+          }
+        case 410: break;
+        case 211:
+          { return "*u^";
+          }
+        case 411: break;
+        case 198:
+          { return "i+=";
+          }
+        case 412: break;
+        case 279:
+          { return "y";
+          }
+        case 413: break;
+        case 17:
+          { return "*)a";
+          }
+        case 414: break;
+        case 73:
+          { return "o)/";
+          }
+        case 415: break;
+        case 72:
+          { return "o(\\";
+          }
+        case 416: break;
+        case 118:
+          { return "o/";
+          }
+        case 417: break;
+        case 168:
+          { return "*(/|w";
+          }
+        case 418: break;
+        case 2:
+          { return "*j";
+          }
+        case 419: break;
+        case 281:
+          { return "w";
+          }
+        case 420: break;
+        case 48:
+          { return "*(\\h";
+          }
+        case 421: break;
+        case 49:
+          { return "*)/h";
+          }
+        case 422: break;
+        case 9:
+          { return "a)";
+          }
+        case 423: break;
+        case 216:
+          { return "w\\|";
+          }
+        case 424: break;
+        case 249:
+          { return "*h";
+          }
+        case 425: break;
+        case 273:
+          { return "u";
+          }
+        case 426: break;
+        case 171:
+          { return "a^";
+          }
+        case 427: break;
+        case 175:
+          { return "a/|";
+          }
+        case 428: break;
+        case 285:
+          { return "&lt;";
+          }
+        case 429: break;
+        case 276:
+          { return "*f";
+          }
+        case 430: break;
+        case 38:
+          { return "h(";
+          }
+        case 431: break;
+        case 283:
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 1;
+          { return "s";
+          }
+        case 432: break;
+        case 51:
+          { return "*)=h";
+          }
+        case 433: break;
+        case 127:
+          { return "a)/|";
+          }
+        case 434: break;
+        case 170:
+          { return "*(=|w";
+          }
+        case 435: break;
+        case 69:
+          { return "o)";
+          }
+        case 436: break;
+        case 243:
+          { return "*d";
+          }
+        case 437: break;
+        case 185:
+          { return "h/|";
+          }
+        case 438: break;
+        case 250:
+          { return "q";
+          }
+        case 439: break;
+        case 163:
+          { return "*)|w";
+          }
+        case 440: break;
+        case 8:
+          { return ":";
+          }
+        case 441: break;
+        case 177:
+          { return "a=|";
+          }
+        case 442: break;
+        case 239:
+          { return "*b";
+          }
+        case 443: break;
+        case 158:
+          { return "w(\\|";
+          }
+        case 444: break;
+        case 109:
+          { return "a\\";
+          }
+        case 445: break;
+        case 264:
+          { return "o";
+          }
+        case 446: break;
+        case 129:
+          { return "a)=|";
+          }
+        case 447: break;
+        case 86:
+          { return "u(/";
+          }
+        case 448: break;
+        case 180:
+          { return "*a\\";
+          }
+        case 449: break;
+        case 11:
+          { return "a)\\";
+          }
+        case 450: break;
+        case 187:
+          { return "h=|";
+          }
+        case 451: break;
+        case 258:
+          { return "m";
+          }
+        case 452: break;
+        case 191:
+          { return "*h/";
+          }
+        case 453: break;
+        case 113:
+          { return "h\\";
+          }
+        case 454: break;
+        case 190:
+          { return "*h\\";
+          }
+        case 455: break;
+        case 196:
+          { return "i+/";
+          }
+        case 456: break;
+        case 254:
+          { return "k";
+          }
+        case 457: break;
+        case 215:
+          { return "*(r";
+          }
+        case 458: break;
+        case 27:
+          { return "e)\\";
+          }
+        case 459: break;
+        case 117:
+          { return "o\\";
+          }
+        case 460: break;
+        case 252:
+          { return "i";
+          }
+        case 461: break;
+        case 224:
+          { return "*o\\";
+          }
+        case 462: break;
+        case 144:
+          { return "h(/|";
+          }
+        case 463: break;
+        case 179:
+          { return "*a_";
+          }
+        case 464: break;
+        case 221:
+          { return "*w|";
+          }
+        case 465: break;
+        case 240:
+          { return "g";
+          }
+        case 466: break;
+        case 55:
+          { return "i)\\";
+          }
+        case 467: break;
+        case 209:
+          { return "u=";
+          }
+        case 468: break;
+        case 87:
+          { return "u)=";
+          }
+        case 469: break;
+        case 244:
+          { return "e";
+          }
+        case 470: break;
+        case 146:
+          { return "h(=|";
+          }
+        case 471: break;
+        case 83:
+          { return "u)\\";
+          }
+        case 472: break;
+        case 40:
+          { return "h(\\";
+          }
+        case 473: break;
+        case 262:
+          { return "c";
+          }
+        case 474: break;
+        case 136:
+          { return "*(/|a";
+          }
+        case 475: break;
+        case 236:
+          { return "a";
+          }
+        case 476: break;
+        case 208:
+          { return "r(";
+          }
+        case 477: break;
+        case 46:
+          { return "*(h";
+          }
+        case 478: break;
+        case 228:
+          { return "_";
+          }
+        case 479: break;
+        case 183:
+          { return "h\\|";
+          }
+        case 480: break;
+        case 233:
+          { return "(";
+          }
+        case 481: break;
+        case 138:
+          { return "*(=|a";
+          }
+        case 482: break;
+        case 194:
+          { return "i_";
+          }
+        case 483: break;
+        case 167:
+          { return "*)/|w";
+          }
+        case 484: break;
+        case 54:
+          { return "i(";
+          }
+        case 485: break;
+        case 131:
+          { return "*)|a";
+          }
+        case 486: break;
+        case 47:
+          { return "*)\\h";
+          }
+        case 487: break;
+        case 184:
+          { return "h|";
+          }
+        case 488: break;
+        case 149:
+          { return "*)\\|h";
+          }
+        case 489: break;
+        case 94:
+          { return "w(";
+          }
+        case 490: break;
+        case 50:
+          { return "*(/h";
+          }
+        case 491: break;
+        case 120:
+          { return "u/";
+          }
+        case 492: break;
+        case 85:
+          { return "u)/";
+          }
+        case 493: break;
+        case 169:
+          { return "*)=|w";
+          }
+        case 494: break;
+        case 156:
+          { return "w(|";
+          }
+        case 495: break;
+        case 202:
+          { return "*i/";
+          }
+        case 496: break;
+        case 52:
+          { return "*(=h";
+          }
+        case 497: break;
+        case 128:
+          { return "a(/|";
+          }
+        case 498: break;
+        case 157:
+          { return "w)\\|";
+          }
+        case 499: break;
+        case 60:
+          { return "i(=";
+          }
+        case 500: break;
+        case 164:
+          { return "*(|w";
+          }
+        case 501: break;
+        case 150:
+          { return "*(\\|h";
+          }
+        case 502: break;
+        case 220:
+          { return "*w/";
+          }
+        case 503: break;
+        case 186:
+          { return "h=";
+          }
+        case 504: break;
+        case 81:
+          { return "u)";
+          }
+        case 505: break;
+        case 130:
+          { return "a(=|";
+          }
+        case 506: break;
+        case 280:
+          { return "*y";
+          }
+        case 507: break;
+        case 203:
+          { return "u^";
+          }
+        case 508: break;
+        case 104:
+          { return "*(\\w";
+          }
+        case 509: break;
+        case 12:
+          { return "a(\\";
+          }
+        case 510: break;
+        case 105:
+          { return "*)/w";
+          }
+        case 511: break;
+        case 182:
+          { return "*a|";
+          }
+        case 512: break;
+        case 282:
+          { return "*w";
+          }
+        case 513: break;
+        case 199:
+          { return "*i^";
+          }
+        case 514: break;
+        case 100:
+          { return "w(=";
+          }
+        case 515: break;
+        case 90:
+          { return "*(\\u";
+          }
+        case 516: break;
+        case 26:
+          { return "e(";
+          }
+        case 517: break;
+        case 1:
+          { return yytext();
+          }
+        case 518: break;
+        case 142:
+          { return "h(\\|";
+          }
+        case 519: break;
+        case 274:
+          { return "*u";
+          }
+        case 520: break;
+        case 28:
+          { return "e(\\";
+          }
+        case 521: break;
+        case 107:
+          { return "*)=w";
+          }
+        case 522: break;
+        case 173:
+          { return "a\\|";
+          }
+        case 523: break;
+        case 6:
+          { return "*s";
+          }
+        case 524: break;
+        case 45:
+          { return "*)h";
+          }
+        case 525: break;
+        case 251:
+          { return "*q";
+          }
+        case 526: break;
+        case 119:
+          { return "u\\";
+          }
+        case 527: break;
+        case 56:
+          { return "i(\\";
+          }
+        case 528: break;
+        case 213:
+          { return "*u\\";
+          }
+        case 529: break;
+        case 284:
+          { return "&gt;";
+          }
+        case 530: break;
+        case 78:
+          { return "*(\\o";
+          }
+        case 531: break;
+        case 189:
+          { return "*e/";
+          }
+        case 532: break;
+        case 79:
+          { return "*)/o";
+          }
+        case 533: break;
+        case 265:
+          { return "*o";
+          }
+        case 534: break;
+        case 135:
+          { return "*)/|a";
+          }
+        case 535: break;
+        case 84:
+          { return "u(\\";
+          }
+        case 536: break;
+        case 235:
+          { return "|";
+          }
+        case 537: break;
+        case 58:
+          { return "i(/";
+          }
+        case 538: break;
+        case 259:
+          { return "*m";
+          }
+        case 539: break;
+        case 212:
+          { return "*u_";
+          }
+        case 540: break;
+        case 114:
+          { return "h/";
+          }
+        case 541: break;
+        case 246:
+          { return "z";
+          }
+        case 542: break;
+        case 255:
+          { return "*k";
+          }
+        case 543: break;
+        case 277:
+          { return "x";
+          }
+        case 544: break;
+        case 64:
+          { return "*(\\i";
+          }
+        case 545: break;
+        case 65:
+          { return "*)/i";
+          }
+        case 546: break;
+        case 137:
+          { return "*)=|a";
+          }
+        case 547: break;
+        case 253:
+          { return "*i";
+          }
+        case 548: break;
+        case 98:
+          { return "w(/";
+          }
+        case 549: break;
+        case 5:
+          { return "v";
+          }
+        case 550: break;
+        case 124:
+          { return "a(|";
+          }
+        case 551: break;
+        case 234:
+          { return "?";
+          }
+        case 552: break;
+        case 172:
+          { return "a_";
+          }
+        case 553: break;
+        case 217:
+          { return "w|";
+          }
+        case 554: break;
+        case 10:
+          { return "a(";
+          }
+        case 555: break;
+        case 241:
+          { return "*g";
+          }
+        case 556: break;
+        case 155:
+          { return "w)|";
+          }
+        case 557: break;
+        case 37:
+          { return "h)";
+          }
+        case 558: break;
+        case 271:
+          { return "t";
+          }
+        case 559: break;
+        case 231:
+          { return "=";
+          }
+        case 560: break;
+        case 67:
+          { return "*)=i";
+          }
+        case 561: break;
+        case 34:
+          { return "*(\\e";
+          }
+        case 562: break;
+        case 35:
+          { return "*)/e";
+          }
+        case 563: break;
+        case 140:
+          { return "h(|";
+          }
+        case 564: break;
+        case 132:
+          { return "*(|a";
+          }
+        case 565: break;
+        case 245:
+          { return "*e";
+          }
+        case 566: break;
+        case 268:
+          { return "r";
+          }
+        case 567: break;
+        case 59:
+          { return "i)=";
+          }
+        case 568: break;
+        case 70:
+          { return "o(";
+          }
+        case 569: break;
+        case 126:
+          { return "a(\\|";
+          }
+        case 570: break;
+        case 263:
+          { return "*c";
+          }
+        case 571: break;
+        case 197:
+          { return "i=";
+          }
+        case 572: break;
+        default:
+          if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
+            zzAtEOF = true;
+            return null;
+          }
+          else {
+            zzScanError(ZZ_NO_MATCH);
+          }
+      }
+    }
+  }
+
+
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Unicode2Buckwalter.lex	Tue Feb 08 14:54:09 2011 +0100
@@ -0,0 +1,121 @@
+package de.mpg.mpiwg.berlin.mpdl.lt.general;
+
+%%
+%{
+    /*
+     * Unicode (Arabic) to Buckwalter transliteration
+     */
+
+%}
+
+%class Unicode2BuckwalterLex
+%public
+%type java.lang.String
+%unicode
+%%
+
+
+"<"[^>]+">" { return yytext(); }
+
+"\u0621"   { return "'"; }  /* Hamza  */
+"\u0622"   { return "|"; }  /* ALEF WITH MADDA ABOVE  from AraMorph */
+"\u0623"   { return ">"; }  /* Alif + HamzaAbove  */
+"\u0624"   { return "&"; }  /* Waw + HamzaAbove  */
+"\u0625"   { return "<"; }  /* Alif + HamzaBelow  */
+"\u0626"   { return "}"; }  /* Ya + HamzaAbove  */
+"\u0627"   { return "A"; }  /* Alif  */
+"\u0628"   { return "b"; }  /* Ba  */
+"\u0629"   { return "p"; }  /* TaMarbuta  */
+"\u062A"   { return "t"; }  /* Ta  */
+"\u062B"   { return "v"; }  /* Tha  */
+"\u062C"   { return "j"; }  /* Jeem  */
+"\u062D"   { return "H"; }  /* HHa  */
+"\u062E"   { return "x"; }  /* Kha  */
+"\u062F"   { return "d"; }  /* Dal  */
+"\u0630"   { return "*"; }  /* Thal  */
+"\u0631"   { return "r"; }  /* Ra  */
+"\u0632"   { return "z"; }  /* Zain  */
+"\u0633"   { return "s"; }  /* Seen  */
+"\u0634"   { return "$"; }  /* Sheen  */
+"\u0635"   { return "S"; }  /* Sad  */
+"\u0636"   { return "D"; }  /* DDad  */
+"\u0637"   { return "T"; }  /* TTa  */
+"\u0638"   { return "Z"; }  /* DTha  */
+"\u0639"   { return "E"; }  /* Ain  */
+"\u063A"   { return "g"; }  /* Ghain  */
+
+"\u0640"   { return "_"; }  /* Tatweel  */
+"\u0641"   { return "f"; }  /* Fa  */
+"\u0642"   { return "q"; }  /* Qaf  */
+"\u0643"   { return "k"; }  /* Kaf  */
+"\u0644"   { return "l"; }  /* Lam  */
+"\u0645"   { return "m"; }  /* Meem  */
+"\u0646"   { return "n"; }  /* Noon  */
+"\u0647"   { return "h"; }  /* Ha  */
+"\u0648"   { return "w"; }  /* Waw  */
+"\u0649"   { return "Y"; }  /* AlifMaksura  */
+"\u064A"   { return "y"; }  /* Ya  */
+"\u064B"   { return "F"; }  /* Fathatan  */
+"\u064C"   { return "N"; }  /* Dammatan  */
+"\u064D"   { return "K"; }  /* Kasratan  */
+"\u064E"   { return "a"; }  /* Fatha  */
+"\u064F"   { return "u"; }  /* Damma  */
+"\u0650"   { return "i"; }  /* Kasra  */
+"\u0651"   { return "~"; }  /* Shadda  */
+"\u0652"   { return "o"; }  /* Sukun  */
+"\u0653"   { return "^"; }  /* Maddah  */
+"\u0654"   { return "#"; }  /* HamzaAbove  */
+
+"\u0670"   { return "`"; }  /* AlifKhanjareeya  */
+"\u0671"   { return "{"; }  /* Alif + HamzatWasl  */
+
+"\u067E"   { return "P"; }  /* PEH  from AraMorph   */
+"\u0686"   { return "J"; }  /* TCHEH  from AraMorph */
+"\u06A4"   { return "V"; }  /* VEH  from AraMorph */
+"\u06AF"   { return "G"; }  /* GAF  from AraMorph */
+"\u0698"   { return "R"; }  /* JEH  from AraMorph */
+"\u061F"   { return "?"; }  /* QUESTION MARK  from AraMorph */
+
+"\u06DC"   { return ":"; }  /* SmallHighSeen  */
+"\u06DF"   { return "@"; }  /* SmallHighRoundedZero  */
+
+"\u06E2"   { return "["; }  /* SmallHighMeemIsolatedForm  */
+"\u06E3"   { return ";"; }  /* SmallLowSeen  */
+"\u06E5"   { return ","; }  /* SmallWaw  */
+"\u06E6"   { return "."; }  /* SmallYa  */
+"\u06E8"   { return "!"; }  /* SmallHighNoon  */
+"\u06EA"   { return "-"; }  /* EmptyCentreLowStop  */
+"\u06EB"   { return "+"; }  /* EmptyCentreHighStop  */
+"\u06EC"   { return "%"; }  /* RoundedHighStopWithFilledCentre  */
+"\u06ED"   { return "]"; }  /* SmallLowMeem  */
+
+[\&_]"vert;"   { return "|"; }
+[\&_]"lpar;"   { return "("; }
+[\&_]"rpar;"   { return ")"; }
+[\_\&]"lt;"    { return "&lt;"; }
+[\_\&]"gt;"    { return "&gt;"; }
+"&#039;"       { return "'"; }
+
+"&"[a-zA-Z]+";"  { return yytext(); }
+
+.       { return yytext(); }
+\n      { return yytext(); }
+
+/* causes problems   */
+/* "\u06E0"   { return "\\""; }  SmallHighUprightRectangularZero  */
+
+
+/* duplicate targets: "," and ";" are already produced by rules above    */
+/*  "\u060C"   { return ","; }  COMMA  from AraMorph */
+/*  "\u061B"   { return ";"; }  SEMICOLON  from AraMorph */
+
+/* not contained in Buckwalter   */
+/* \u0679 : ARABIC LETTER TTEH   */
+/* \u0688 : ARABIC LETTER DDAL   */
+/* \u06A9 : ARABIC LETTER KEHEH  */
+/* \u0691 : ARABIC LETTER RREH   */
+/* \u06BA : ARABIC LETTER NOON GHUNNA  */
+/* \u06BE : ARABIC LETTER HEH DOACHASHMEE  */
+/* \u06C1 : ARABIC LETTER HEH GOAL  */
+/* \u06D2 : ARABIC LETTER YEH BARREE  */
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Unicode2BuckwalterLex.java	Tue Feb 08 14:54:09 2011 +0100
@@ -0,0 +1,882 @@
+/* The following code was generated by JFlex 1.4.3 on 14.12.10 17:12 */
+
+package de.mpg.mpiwg.berlin.mpdl.lt.general;
+
+
+/**
+ * This class is a scanner generated by
+ * <a href="http://www.jflex.de/">JFlex</a> 1.4.3
+ * on 14.12.10 17:12 from the specification file
+ * <tt>/Users/jwillenborg/test/jflex/Unicode2Buckwalter.lex</tt>
+ */
+public class Unicode2BuckwalterLex {
+
+  /** This character denotes the end of file */
+  public static final int YYEOF = -1;
+
+  /** initial size of the lookahead buffer */
+  private static final int ZZ_BUFFERSIZE = 16384;
+
+  /** lexical states */
+  public static final int YYINITIAL = 0;
+
+  /**
+   * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l
+   * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l
+   *                  at the beginning of a line
+   * l is of the form l = 2*k, k a non negative integer
+   */
+  private static final int ZZ_LEXSTATE[] = {
+     0, 0
+  };
+
+  /**
+   * Translates characters to character classes
+   */
+  private static final String ZZ_CMAP_PACKED =
+    "\12\0\1\0\30\0\1\120\2\0\1\117\11\0\1\121\2\0\1\122"+
+    "\5\0\1\123\1\0\1\112\1\1\1\0\1\2\2\0\32\124\4\0"+
+    "\1\105\1\0\1\115\3\124\1\107\1\124\1\116\4\124\1\113\3\124"+
+    "\1\114\1\124\1\110\1\124\1\111\1\124\1\106\4\124\u05a4\0\1\71"+
+    "\1\0\1\3\1\4\1\5\1\6\1\7\1\10\1\11\1\12\1\13"+
+    "\1\14\1\15\1\16\1\17\1\20\1\21\1\22\1\23\1\24\1\25"+
+    "\1\26\1\27\1\30\1\31\1\32\1\33\1\34\5\0\1\35\1\36"+
+    "\1\37\1\40\1\41\1\42\1\43\1\44\1\45\1\46\1\47\1\50"+
+    "\1\51\1\52\1\53\1\54\1\55\1\56\1\57\1\60\1\61\33\0"+
+    "\1\62\1\63\14\0\1\64\7\0\1\65\21\0\1\70\13\0\1\66"+
+    "\12\0\1\67\54\0\1\72\2\0\1\73\2\0\1\74\1\75\1\0"+
+    "\1\76\1\77\1\0\1\100\1\0\1\101\1\102\1\103\1\104\uf912\0";
+
+  /**
+   * Translates characters to character classes (table unpacked from ZZ_CMAP_PACKED by zzUnpackCMap)
+   */
+  private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED);
+
+  /**
+   * Translates DFA states to action switch labels.
+   */
+  private static final int [] ZZ_ACTION = zzUnpackAction();
+
+  private static final String ZZ_ACTION_PACKED_0 =
+    "\1\0\2\1\1\2\1\3\1\4\1\5\1\6\1\7"+
+    "\1\10\1\11\1\12\1\13\1\14\1\15\1\16\1\17"+
+    "\1\20\1\21\1\22\1\23\1\24\1\25\1\26\1\27"+
+    "\1\30\1\31\1\32\1\33\1\34\1\35\1\36\1\37"+
+    "\1\40\1\41\1\42\1\43\1\44\1\45\1\46\1\47"+
+    "\1\50\1\51\1\52\1\53\1\54\1\55\1\56\1\57"+
+    "\1\60\1\61\1\62\1\63\1\64\1\65\1\66\1\67"+
+    "\1\70\1\71\1\72\1\73\1\74\1\75\1\76\1\77"+
+    "\1\100\1\101\1\102\1\103\2\1\30\0\1\104\1\0"+
+    "\1\105\13\0\1\106\1\107";
+
+  private static int [] zzUnpackAction() {
+    /* table of 111 action labels, decoded from the packed string starting at index 0 */
+    final int [] actions = new int[111];
+    zzUnpackAction(ZZ_ACTION_PACKED_0, 0, actions);
+    return actions;
+  }
+
+  private static int zzUnpackAction(String packed, int offset, int [] result) {
+    /* packed holds (count, value) char pairs; a count of 0 still writes the value once */
+    int out = offset;  /* next free slot in the unpacked array */
+    final int packedLength = packed.length();
+    for (int in = 0; in < packedLength; in += 2) {
+      int remaining = packed.charAt(in);
+      final int value = packed.charAt(in + 1);
+      do { result[out++] = value; } while (--remaining > 0);
+    }
+    return out;
+  }
+
+
+  /**
+   * Translates a state to a row index in the transition table
+   */
+  private static final int [] ZZ_ROWMAP = zzUnpackRowMap();
+
+  private static final String ZZ_ROWMAP_PACKED_0 =
+    "\0\0\0\125\0\252\0\125\0\125\0\125\0\125\0\125"+
+    "\0\125\0\125\0\125\0\125\0\125\0\125\0\125\0\125"+
+    "\0\125\0\125\0\125\0\125\0\125\0\125\0\125\0\125"+
+    "\0\125\0\125\0\125\0\125\0\125\0\125\0\125\0\125"+
+    "\0\125\0\125\0\125\0\125\0\125\0\125\0\125\0\125"+
+    "\0\125\0\125\0\125\0\125\0\125\0\125\0\125\0\125"+
+    "\0\125\0\125\0\125\0\125\0\125\0\125\0\125\0\125"+
+    "\0\125\0\125\0\125\0\125\0\125\0\125\0\125\0\125"+
+    "\0\125\0\125\0\125\0\125\0\125\0\377\0\u0154\0\u01a9"+
+    "\0\u01fe\0\u0253\0\u02a8\0\u02fd\0\u0352\0\u03a7\0\u03fc\0\u0451"+
+    "\0\u04a6\0\u04fb\0\u0550\0\u05a5\0\u05fa\0\u064f\0\u06a4\0\u06f9"+
+    "\0\u074e\0\u07a3\0\u07f8\0\u084d\0\u08a2\0\u08f7\0\u094c\0\125"+
+    "\0\u09a1\0\125\0\u09f6\0\u0a4b\0\u0aa0\0\u0af5\0\u0b4a\0\u0b9f"+
+    "\0\u0bf4\0\u0c49\0\u0c9e\0\u0cf3\0\u0d48\0\125\0\125";
+
+  private static int [] zzUnpackRowMap() {
+    /* table of 111 row indices, decoded from the packed string starting at index 0 */
+    final int [] rowMap = new int[111];
+    zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, 0, rowMap);
+    return rowMap;
+  }
+
+  private static int zzUnpackRowMap(String packed, int offset, int [] result) {
+    /* every two chars form one int entry: upper 16 bits first, then lower 16 bits */
+    int out = offset;  /* next free slot in the unpacked array */
+    final int packedLength = packed.length();
+    for (int in = 0; in < packedLength; in += 2) {
+      final int upper = packed.charAt(in) << 16;
+      result[out++] = upper | packed.charAt(in + 1);
+    }
+    return out;
+  }
+
+  /**
+   * The transition table of the DFA
+   */
+  private static final int [] ZZ_TRANS = zzUnpackTrans();
+
+  private static final String ZZ_TRANS_PACKED_0 =
+    "\1\2\1\3\1\2\1\4\1\5\1\6\1\7\1\10"+
+    "\1\11\1\12\1\13\1\14\1\15\1\16\1\17\1\20"+
+    "\1\21\1\22\1\23\1\24\1\25\1\26\1\27\1\30"+
+    "\1\31\1\32\1\33\1\34\1\35\1\36\1\37\1\40"+
+    "\1\41\1\42\1\43\1\44\1\45\1\46\1\47\1\50"+
+    "\1\51\1\52\1\53\1\54\1\55\1\56\1\57\1\60"+
+    "\1\61\1\62\1\63\1\64\1\65\1\66\1\67\1\70"+
+    "\1\71\1\72\1\73\1\74\1\75\1\76\1\77\1\100"+
+    "\1\101\1\102\1\103\1\104\1\105\1\106\11\2\1\107"+
+    "\5\2\125\0\2\110\1\0\122\110\106\0\1\111\1\0"+
+    "\1\112\2\0\1\113\2\0\1\114\114\0\1\115\1\116"+
+    "\1\117\1\116\1\0\1\120\2\116\1\121\1\0\1\122"+
+    "\3\0\1\116\2\110\1\2\122\110\107\0\1\123\131\0"+
+    "\1\124\121\0\1\125\2\0\1\126\121\0\1\127\121\0"+
+    "\1\116\1\130\2\116\1\2\4\116\5\0\1\116\106\0"+
+    "\4\116\1\2\4\116\5\0\1\116\106\0\4\116\1\2"+
+    "\1\116\1\131\2\116\5\0\1\116\106\0\3\116\1\132"+
+    "\1\2\1\116\1\133\2\116\5\0\1\116\106\0\3\116"+
+    "\1\134\1\2\4\116\5\0\1\116\121\0\1\135\113\0"+
+    "\1\136\131\0\1\137\121\0\1\140\127\0\1\141\121\0"+
+    "\1\142\120\0\2\116\1\143\1\116\1\2\4\116\5\0"+
+    "\1\116\106\0\4\116\1\2\2\116\1\144\1\116\5\0"+
+    "\1\116\106\0\4\116\1\140\4\116\5\0\1\116\106\0"+
+    "\4\116\1\2\2\116\1\145\1\116\5\0\1\116\106\0"+
+    "\4\116\1\142\4\116\5\0\1\116\122\0\1\146\113\0"+
+    "\1\147\123\0\1\150\124\0\1\151\122\0\3\116\1\152"+
+    "\1\2\4\116\5\0\1\116\106\0\2\116\1\153\1\116"+
+    "\1\2\4\116\5\0\1\116\106\0\2\116\1\154\1\116"+
+    "\1\2\4\116\5\0\1\116\123\0\1\155\113\0\1\5"+
+    "\124\0\1\156\124\0\1\157\120\0\4\116\1\5\4\116"+
+    "\5\0\1\116\106\0\4\116\1\156\4\116\5\0\1\116"+
+    "\106\0\4\116\1\157\4\116\5\0\1\116\112\0\1\4"+
+    "\12\0";
+
+  private static int [] zzUnpackTrans() {
+    /* flat table of 3485 transition entries, decoded starting at index 0 */
+    final int [] transitions = new int[3485];
+    zzUnpackTrans(ZZ_TRANS_PACKED_0, 0, transitions);
+    return transitions;
+  }
+
+  private static int zzUnpackTrans(String packed, int offset, int [] result) {
+    /* packed holds (count, value + 1) char pairs; a count of 0 still writes once */
+    int out = offset;  /* next free slot in the unpacked array */
+    final int packedLength = packed.length();
+    for (int in = 0; in < packedLength; in += 2) {
+      int remaining = packed.charAt(in);
+      /* values are stored shifted up by one, so a stored 0 decodes to -1 */
+      final int value = packed.charAt(in + 1) - 1;
+      do { result[out++] = value; } while (--remaining > 0);
+    }
+    return out;
+  }
+
+
+  /* error codes */
+  private static final int ZZ_UNKNOWN_ERROR = 0;
+  private static final int ZZ_NO_MATCH = 1;
+  private static final int ZZ_PUSHBACK_2BIG = 2;
+
+  /* error messages, listed in the order of the codes above (0, 1, 2) */
+  private static final String ZZ_ERROR_MSG[] = {
+    "Unknown internal scanner error",
+    "Error: could not match input",
+    "Error: pushback value was too large"
+  };
+
+  /**
+   * ZZ_ATTRIBUTE[aState] contains the attributes of state <code>aState</code>
+   */
+  private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
+
+  private static final String ZZ_ATTRIBUTE_PACKED_0 =
+    "\1\0\1\11\1\1\102\11\2\1\30\0\1\11\1\0"+
+    "\1\11\13\0\2\11";
+
+  private static int [] zzUnpackAttribute() {
+    /* table of 111 attribute values, decoded from the packed string starting at index 0 */
+    final int [] attributes = new int[111];
+    zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, 0, attributes);
+    return attributes;
+  }
+
+  private static int zzUnpackAttribute(String packed, int offset, int [] result) {
+    int i = 0;       /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {
+      int count = packed.charAt(i++);
+      int value = packed.charAt(i++);
+      do result[j++] = value; while (--count > 0);
+    }
+    return j;
+  }
+
+  /** the input device */
+  private java.io.Reader zzReader;
+
+  /** the current state of the DFA */
+  private int zzState;
+
+  /** the current lexical state */
+  private int zzLexicalState = YYINITIAL;
+
+  /** this buffer contains the current text to be matched and is
+      the source of the yytext() string */
+  private char zzBuffer[] = new char[ZZ_BUFFERSIZE];
+
+  /** the textposition at the last accepting state */
+  private int zzMarkedPos;
+
+  /** the current text position in the buffer */
+  private int zzCurrentPos;
+
+  /** startRead marks the beginning of the yytext() string in the buffer */
+  private int zzStartRead;
+
+  /** endRead marks the last character in the buffer, that has been read
+      from input */
+  private int zzEndRead;
+
+  /** number of newlines encountered up to the start of the matched text */
+  private int yyline;
+
+  /** the number of characters up to the start of the matched text */
+  private int yychar;
+
+  /**
+   * the number of characters from the last newline up to the start of the
+   * matched text
+   */
+  private int yycolumn;
+
+  /**
+   * zzAtBOL == true <=> the scanner is currently at the beginning of a line
+   */
+  private boolean zzAtBOL = true;
+
+  /** zzAtEOF == true <=> the scanner is at the EOF */
+  private boolean zzAtEOF;
+
+  /** denotes if the user-EOF-code has already been executed */
+  private boolean zzEOFDone;
+
+  /* user code: */
+    /*
+     * Unicode to Buckwalter conversion
+     */
+
+
+
+  /**
+   * Creates a new scanner
+   * There is also a java.io.InputStream version of this constructor.
+   *
+   * @param   in  the java.io.Reader to read input from.
+   */
+  public Unicode2BuckwalterLex(java.io.Reader in) {
+    this.zzReader = in;
+  }
+
+  /**
+   * Creates a new scanner.
+   * There is also a java.io.Reader version of this constructor.
+   *
+   * @param   in  the java.io.Inputstream to read input from.
+   */
+  public Unicode2BuckwalterLex(java.io.InputStream in) {
+    this(new java.io.InputStreamReader(in));
+  }
+
+  /**
+   * Unpacks the compressed character translation table.
+   *
+   * @param packed   the packed character translation table
+   * @return         the unpacked character translation table
+   */
+  private static char [] zzUnpackCMap(String packed) {
+    char [] map = new char[0x10000];
+    int i = 0;  /* index in packed string  */
+    int j = 0;  /* index in unpacked array */
+    while (i < 240) {
+      int  count = packed.charAt(i++);
+      char value = packed.charAt(i++);
+      do map[j++] = value; while (--count > 0);
+    }
+    return map;
+  }
+
+
+  /**
+   * Refills the input buffer.
+   *
+   * @return      <code>false</code>, iff there was new input.
+   *
+   * @exception   java.io.IOException  if any I/O-Error occurs
+   */
+  private boolean zzRefill() throws java.io.IOException {
+
+    /* first: make room (if you can) */
+    if (zzStartRead > 0) {
+      System.arraycopy(zzBuffer, zzStartRead,
+                       zzBuffer, 0,
+                       zzEndRead-zzStartRead);
+
+      /* translate stored positions */
+      zzEndRead-= zzStartRead;
+      zzCurrentPos-= zzStartRead;
+      zzMarkedPos-= zzStartRead;
+      zzStartRead = 0;
+    }
+
+    /* is the buffer big enough? */
+    if (zzCurrentPos >= zzBuffer.length) {
+      /* if not: blow it up */
+      char newBuffer[] = new char[zzCurrentPos*2];
+      System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length);
+      zzBuffer = newBuffer;
+    }
+
+    /* finally: fill the buffer with new input */
+    int numRead = zzReader.read(zzBuffer, zzEndRead,
+                                            zzBuffer.length-zzEndRead);
+
+    if (numRead > 0) {
+      zzEndRead+= numRead;
+      return false;
+    }
+    // unlikely but not impossible: read 0 characters, but not at end of stream
+    if (numRead == 0) {
+      int c = zzReader.read();
+      if (c == -1) {
+        return true;
+      } else {
+        zzBuffer[zzEndRead++] = (char) c;
+        return false;
+      }
+    }
+
+	// numRead < 0
+    return true;
+  }
+
+
+  /**
+   * Closes the input stream.
+   */
+  public final void yyclose() throws java.io.IOException {
+    zzAtEOF = true;            /* indicate end of file */
+    zzEndRead = zzStartRead;  /* invalidate buffer    */
+
+    if (zzReader != null)
+      zzReader.close();
+  }
+
+
+  /**
+   * Resets the scanner to read from a new input stream.
+   * Does not close the old reader.
+   *
+   * All internal variables are reset, the old input stream
+   * <b>cannot</b> be reused (internal buffer is discarded and lost).
+   * Lexical state is set to <tt>YYINITIAL</tt>.
+   *
+   * @param reader   the new input stream
+   */
+  public final void yyreset(java.io.Reader reader) {
+    zzReader = reader;
+    zzAtBOL  = true;
+    zzAtEOF  = false;
+    zzEOFDone = false;
+    zzEndRead = zzStartRead = 0;
+    zzCurrentPos = zzMarkedPos = 0;
+    yyline = yychar = yycolumn = 0;
+    zzLexicalState = YYINITIAL;
+  }
+
+
+  /**
+   * Returns the current lexical state.
+   */
+  public final int yystate() {
+    return zzLexicalState;
+  }
+
+
+  /**
+   * Enters a new lexical state
+   *
+   * @param newState the new lexical state
+   */
+  public final void yybegin(int newState) {
+    zzLexicalState = newState;
+  }
+
+
+  /**
+   * Returns the text matched by the current regular expression.
+   */
+  public final String yytext() {
+    return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead );
+  }
+
+
+  /**
+   * Returns the character at position <tt>pos</tt> from the
+   * matched text.
+   *
+   * It is equivalent to yytext().charAt(pos), but faster
+   *
+   * @param pos the position of the character to fetch.
+   *            A value from 0 to yylength()-1.
+   *
+   * @return the character at position pos
+   */
+  public final char yycharat(int pos) {
+    return zzBuffer[zzStartRead+pos];
+  }
+
+
+  /**
+   * Returns the length of the matched text region.
+   */
+  public final int yylength() {
+    return zzMarkedPos-zzStartRead;
+  }
+
+
+  /**
+   * Reports an error that occurred while scanning.
+   *
+   * In a wellformed scanner (no or only correct usage of
+   * yypushback(int) and a match-all fallback rule) this method
+   * will only be called with things that "Can't Possibly Happen".
+   * If this method is called, something is seriously wrong
+   * (e.g. a JFlex bug producing a faulty scanner etc.).
+   *
+   * Usual syntax/scanner level error handling should be done
+   * in error fallback rules.
+   *
+   * @param   errorCode  the code of the errormessage to display
+   */
+  private void zzScanError(int errorCode) {
+    String message;
+    try {
+      message = ZZ_ERROR_MSG[errorCode];
+    }
+    catch (ArrayIndexOutOfBoundsException e) {
+      message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR];
+    }
+
+    throw new Error(message);
+  }
+
+
+  /**
+   * Pushes the specified amount of characters back into the input stream.
+   *
+   * They will be read again by the next call of the scanning method
+   *
+   * @param number  the number of characters to be read again.
+   *                This number must not be greater than yylength()!
+   */
+  public void yypushback(int number)  {
+    if ( number > yylength() )
+      zzScanError(ZZ_PUSHBACK_2BIG);
+
+    zzMarkedPos -= number;
+  }
+
+
+  /**
+   * Resumes scanning until the next regular expression is matched,
+   * the end of input is encountered or an I/O-Error occurs.
+   *
+   * @return      the next token
+   * @exception   java.io.IOException  if any I/O-Error occurs
+   */
+  public java.lang.String yylex() throws java.io.IOException {
+    int zzInput;
+    int zzAction;
+
+    // cached fields:
+    int zzCurrentPosL;
+    int zzMarkedPosL;
+    int zzEndReadL = zzEndRead;
+    char [] zzBufferL = zzBuffer;
+    char [] zzCMapL = ZZ_CMAP;
+
+    int [] zzTransL = ZZ_TRANS;
+    int [] zzRowMapL = ZZ_ROWMAP;
+    int [] zzAttrL = ZZ_ATTRIBUTE;
+
+    while (true) {
+      zzMarkedPosL = zzMarkedPos;
+
+      zzAction = -1;
+
+      zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL;
+
+      zzState = ZZ_LEXSTATE[zzLexicalState];
+
+
+      zzForAction: {
+        while (true) {
+
+          if (zzCurrentPosL < zzEndReadL)
+            zzInput = zzBufferL[zzCurrentPosL++];
+          else if (zzAtEOF) {
+            zzInput = YYEOF;
+            break zzForAction;
+          }
+          else {
+            // store back cached positions
+            zzCurrentPos  = zzCurrentPosL;
+            zzMarkedPos   = zzMarkedPosL;
+            boolean eof = zzRefill();
+            // get translated positions and possibly new buffer
+            zzCurrentPosL  = zzCurrentPos;
+            zzMarkedPosL   = zzMarkedPos;
+            zzBufferL      = zzBuffer;
+            zzEndReadL     = zzEndRead;
+            if (eof) {
+              zzInput = YYEOF;
+              break zzForAction;
+            }
+            else {
+              zzInput = zzBufferL[zzCurrentPosL++];
+            }
+          }
+          int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ];
+          if (zzNext == -1) break zzForAction;
+          zzState = zzNext;
+
+          int zzAttributes = zzAttrL[zzState];
+          if ( (zzAttributes & 1) == 1 ) {
+            zzAction = zzState;
+            zzMarkedPosL = zzCurrentPosL;
+            if ( (zzAttributes & 8) == 8 ) break zzForAction;
+          }
+
+        }
+      }
+
+      // store back cached position
+      zzMarkedPos = zzMarkedPosL;
+
+      switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
+        case 23:
+          { return "D";
+          }
+        case 72: break;
+        case 17:
+          { return "*";
+          }
+        case 73: break;
+        case 46:
+          { return "o";
+          }
+        case 74: break;
+        case 60:
+          { return ";";
+          }
+        case 75: break;
+        case 63:
+          { return "!";
+          }
+        case 76: break;
+        case 29:
+          { return "f";
+          }
+        case 77: break;
+        case 36:
+          { return "w";
+          }
+        case 78: break;
+        case 67:
+          { return "]";
+          }
+        case 79: break;
+        case 70:
+          { return ")";
+          }
+        case 80: break;
+        case 69:
+          { return "&gt;";
+          }
+        case 81: break;
+        case 34:
+          { return "n";
+          }
+        case 82: break;
+        case 24:
+          { return "T";
+          }
+        case 83: break;
+        case 57:
+          { return ":";
+          }
+        case 84: break;
+        case 41:
+          { return "K";
+          }
+        case 85: break;
+        case 12:
+          { return "v";
+          }
+        case 86: break;
+        case 71:
+          { return "(";
+          }
+        case 87: break;
+        case 33:
+          { return "m";
+          }
+        case 88: break;
+        case 22:
+          { return "S";
+          }
+        case 89: break;
+        case 45:
+          { return "~";
+          }
+        case 90: break;
+        case 16:
+          { return "d";
+          }
+        case 91: break;
+        case 52:
+          { return "J";
+          }
+        case 92: break;
+        case 43:
+          { return "u";
+          }
+        case 93: break;
+        case 59:
+          { return "[";
+          }
+        case 94: break;
+        case 8:
+          { return "A";
+          }
+        case 95: break;
+        case 2:
+          { return "'";
+          }
+        case 96: break;
+        case 32:
+          { return "l";
+          }
+        case 97: break;
+        case 55:
+          { return "R";
+          }
+        case 98: break;
+        case 7:
+          { return "}";
+          }
+        case 99: break;
+        case 11:
+          { return "t";
+          }
+        case 100: break;
+        case 25:
+          { return "Z";
+          }
+        case 101: break;
+        case 58:
+          { return "@";
+          }
+        case 102: break;
+        case 5:
+          { return "&";
+          }
+        case 103: break;
+        case 31:
+          { return "k";
+          }
+        case 104: break;
+        case 3:
+          { return "|";
+          }
+        case 105: break;
+        case 9:
+          { return "b";
+          }
+        case 106: break;
+        case 14:
+          { return "H";
+          }
+        case 107: break;
+        case 62:
+          { return ".";
+          }
+        case 108: break;
+        case 20:
+          { return "s";
+          }
+        case 109: break;
+        case 37:
+          { return "Y";
+          }
+        case 110: break;
+        case 56:
+          { return "?";
+          }
+        case 111: break;
+        case 66:
+          { return "%";
+          }
+        case 112: break;
+        case 13:
+          { return "j";
+          }
+        case 113: break;
+        case 51:
+          { return "P";
+          }
+        case 114: break;
+        case 50:
+          { return "{";
+          }
+        case 115: break;
+        case 1:
+          { return yytext();
+          }
+        case 116: break;
+        case 42:
+          { return "a";
+          }
+        case 117: break;
+        case 54:
+          { return "G";
+          }
+        case 118: break;
+        case 64:
+          { return "-";
+          }
+        case 119: break;
+        case 18:
+          { return "r";
+          }
+        case 120: break;
+        case 4:
+          { return ">";
+          }
+        case 121: break;
+        case 21:
+          { return "$";
+          }
+        case 122: break;
+        case 44:
+          { return "i";
+          }
+        case 123: break;
+        case 19:
+          { return "z";
+          }
+        case 124: break;
+        case 68:
+          { return "&lt;";
+          }
+        case 125: break;
+        case 49:
+          { return "`";
+          }
+        case 126: break;
+        case 39:
+          { return "F";
+          }
+        case 127: break;
+        case 61:
+          { return ",";
+          }
+        case 128: break;
+        case 30:
+          { return "q";
+          }
+        case 129: break;
+        case 48:
+          { return "#";
+          }
+        case 130: break;
+        case 35:
+          { return "h";
+          }
+        case 131: break;
+        case 40:
+          { return "N";
+          }
+        case 132: break;
+        case 38:
+          { return "y";
+          }
+        case 133: break;
+        case 28:
+          { return "_";
+          }
+        case 134: break;
+        case 26:
+          { return "E";
+          }
+        case 135: break;
+        case 65:
+          { return "+";
+          }
+        case 136: break;
+        case 10:
+          { return "p";
+          }
+        case 137: break;
+        case 53:
+          { return "V";
+          }
+        case 138: break;
+        case 6:
+          { return "<";
+          }
+        case 139: break;
+        case 27:
+          { return "g";
+          }
+        case 140: break;
+        case 15:
+          { return "x";
+          }
+        case 141: break;
+        case 47:
+          { return "^";
+          }
+        case 142: break;
+        default:
+          if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
+            zzAtEOF = true;
+            return null;
+          }
+          else {
+            zzScanError(ZZ_NO_MATCH);
+          }
+      }
+    }
+  }
+
+
+}
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/lex/db/LexHandler.java	Tue Feb 08 14:36:38 2011 +0100
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/lex/db/LexHandler.java	Tue Feb 08 14:54:09 2011 +0100
@@ -59,6 +59,8 @@
         if (! hasLexEntry) {
           hasLexEntry = hasLexEntryKey(lName, language);
         }
+        if (language.equals("de") || language.equals("fr") || language.equals("nl"))   // TODO Lexika für diese Sprachen in BerkeleyDB einbringen (für frund nl  auch eine bessere Morph.) und dann diese Zeilen wieder löschen
+          lexEntryKeys.add(lName);
         if (! lName.equals(formName) && hasLexEntry) {
           lexEntryKeys.add(lName);
         }
@@ -72,6 +74,8 @@

   public boolean hasLexEntryKey(String formName, String language) throws ApplicationException {
     boolean hasLexEntry = false;
+    if (language.equals("zh"))   // jedes chin. einzelne Zeichen hat autom. immer einen Lexikoneintrag
+      return true;
     ArrayList<Lexicon> statLexicons = Lexica.getInstance().getLexicons(language);
     if (statLexicons != null) {
       for (int i=0; i<statLexicons.size(); i++) {
@@ -118,6 +122,10 @@
       encodedStr = transcoder.transcodeFromBuckwalter2Unicode(inputStr);
     } else if (fromEncoding.equals("betacode") && toEncoding.equals("unicode")) {
       encodedStr = transcoder.transcodeFromBetaCode2Unicode(inputStr);
+    } else if (fromEncoding.equals("unicode") && toEncoding.equals("betacode")) {
+      encodedStr = transcoder.transcodeFromUnicode2BetaCode(inputStr);
+    } else if (fromEncoding.equals("unicode") && toEncoding.equals("buckwalter")) {
+      encodedStr = transcoder.transcodeFromUnicode2Buckwalter(inputStr);
     }
     return encodedStr;
   }
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/util/XmlUtil.java	Tue Feb 08 14:36:38 2011 +0100
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/util/XmlUtil.java	Tue Feb 08 14:54:09 2011 +0100
@@ -5,6 +5,7 @@
 import java.io.Reader;
 import java.io.StringReader;
 import java.io.StringWriter;
+import java.io.Writer;
 import java.net.URL;
 import java.text.DateFormat;
 import java.text.ParseException;
@@ -35,6 +36,7 @@
 import net.sf.saxon.om.NodeInfo;

 import org.w3c.dom.Document;
+import org.w3c.dom.DocumentType;
 import org.w3c.dom.NamedNodeMap;
 import org.w3c.dom.Node;
 import org.w3c.dom.NodeList;
@@ -324,6 +326,144 @@
     return retArray;
   }

+
+  /*
+   * XPath evaluation: handles both the javax and Saxon's implementation
+   * javax XPath evaluation: returns a NodeList
+   * Saxon's XPath evaluation: returns an ArrayList of TinyTextImpl (which can be cast to NodeInfo and then handled as if it were a DOM node)
+   */
+  private String nodesetToXmlString(Object nodesetObjects) throws ApplicationException {
+    // all output is collected in memory and handed back as one string
+    Writer out = new StringWriter();
+    try {
+      if (nodesetObjects instanceof NodeList) {
+        // NodeList result: serialize each DOM node in list order
+        NodeList domNodes = (NodeList) nodesetObjects;
+        int count = domNodes.getLength();
+        for (int pos = 0; pos < count; pos++) {
+          serializeNode(domNodes.item(pos), out, "");
+        }
+      } else if (nodesetObjects instanceof ArrayList) {
+        // ArrayList result: entries are either DOM nodes or Saxon NodeInfo objects
+        for (Object entry : (ArrayList<?>) nodesetObjects) {
+          // entries that are neither Node nor NodeInfo are skipped
+          if (entry instanceof Node) {
+            serializeNode((Node) entry, out, "");
+          } else if (entry instanceof NodeInfo) {
+            NodeInfo saxonNode = (NodeInfo) entry;
+            out.write(saxonNode.getStringValue());  // TODO if that really happens
+          }
+        }
+      }
+      // a result of any other type contributes nothing, so the string stays empty
+      out.flush();
+    } catch (IOException e) {
+      // report writer failures through the application's own exception type
+      throw new ApplicationException(e);
+    }
+    return out.toString();
+  }
+
+  // Evaluates xpathExpression against xmlString and returns the matching node set as serialized text (null if the evaluation yields null).
+  public String evaluateToXmlString(String xmlString, String xpathExpression, NamespaceContext nsContext) throws ApplicationException {
+    try {
+      XPath evaluator = XPathFactory.newInstance().newXPath();
+      if (nsContext != null) {
+        evaluator.setNamespaceContext(nsContext);
+      }
+      InputSource xmlSource = new InputSource(new StringReader(xmlString));
+      Object matches = evaluator.evaluate(xpathExpression, xmlSource, XPathConstants.NODESET);
+      if (matches == null)
+        return null;
+      return nodesetToXmlString(matches);
+    } catch (Exception e) {
+      // any failure (evaluation as well as serialization) is wrapped for the caller
+      throw new ApplicationException(e);
+    }
+  }
+
+  /**
+   * <p> This will serialize a DOM <code>Node</code> to
+   *   the supplied <code>Writer</code>. </p>
+   *
+   * @param node DOM <code>Node</code> to serialize.
+   * @param writer <code>Writer</code> to write to.
+   * @param indentLevel current indentation.
+   */
+  private void serializeNode(Node node, Writer writer, String indentLevel) throws ApplicationException {
+    try {
+      // Each DOM node type has its own textual form; node types not listed below are written as nothing
+      switch (node.getNodeType()) {
+        case Node.DOCUMENT_NODE:
+          writer.write("<?xml version=\"1.0\"?>");
+          writer.write("\n");
+          // recurse on each child, restarting at indentation level zero
+          NodeList nodes = node.getChildNodes();
+          if (nodes != null) {
+            for (int i=0; i<nodes.getLength(); i++) {
+              serializeNode(nodes.item(i), writer, "");
+            }
+          }
+          break;
+        case Node.ELEMENT_NODE:
+          String name = node.getNodeName();
+          writer.write(indentLevel + "<" + name);
+          NamedNodeMap attributes = node.getAttributes();
+          for (int i=0; i<attributes.getLength(); i++) {
+            Node current = attributes.item(i);
+            writer.write(" " + current.getNodeName() + "=\"" + current.getNodeValue() + "\"");  // NOTE(review): value is written verbatim, without XML escaping
+          }
+          writer.write(">");
+          // recurse on each child, two spaces deeper
+          NodeList children = node.getChildNodes();
+          if (children != null) {
+            if ((children.item(0) != null) && (children.item(0).getNodeType() == Node.ELEMENT_NODE)) {
+              writer.write("\n");  // first child is an element: start it on its own line
+            }
+            for (int i=0; i<children.getLength(); i++) {
+              serializeNode(children.item(i), writer, indentLevel + "  ");
+            }
+            if ((children.item(0) != null) && (children.item(children.getLength()-1).getNodeType() == Node.ELEMENT_NODE)) {
+              writer.write(indentLevel);  // last child was an element: align the closing tag
+            }
+          }
+          writer.write("</" + name + ">");
+          writer.write("\n");
+          break;
+        case Node.TEXT_NODE:
+          writer.write(node.getNodeValue());  // NOTE(review): text is written verbatim, without XML escaping
+          break;
+        case Node.CDATA_SECTION_NODE:
+          writer.write("<![CDATA[" + node.getNodeValue() + "]]>");
+          break;
+        case Node.COMMENT_NODE:
+          writer.write(indentLevel + "<!-- " + node.getNodeValue() + " -->");
+          writer.write("\n");
+          break;
+        case Node.PROCESSING_INSTRUCTION_NODE:
+          writer.write("<?" + node.getNodeName() + " " + node.getNodeValue() + "?>");
+          writer.write("\n");
+          break;
+        case Node.ENTITY_REFERENCE_NODE:
+          writer.write("&" + node.getNodeName() + ";");
+          break;
+        case Node.DOCUMENT_TYPE_NODE:
+          DocumentType docType = (DocumentType)node;
+          writer.write("<!DOCTYPE " + docType.getName());
+          if (docType.getPublicId() != null)  {
+            writer.write(" PUBLIC \"" + docType.getPublicId() + "\" ");  // the public id belongs in the serialized output, not on stdout
+          } else {
+            writer.write(" SYSTEM ");
+          }
+          writer.write("\"" + docType.getSystemId() + "\">");
+          writer.write("\n");
+          break;
+      }
+    } catch (IOException e) {
+      throw new ApplicationException(e);
+    }
+  }
+
   // TODO not used yet, test it
   public Node doc(File xmlFile, File schemaFile) throws ApplicationException {
     Node root = null;
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdltext/ExternalObject.java	Tue Feb 08 14:54:09 2011 +0100
@@ -0,0 +1,113 @@
+/*
+ *  eXist Open Source Native XML Database: Extension module
+ *  Copyright (C) 2008 Josef Willenborg
+ *  jwillenborg@mpiwg-berlin.mpg.de
+ *  http://www.mpiwg-berlin.mpg.de
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public License
+ *  as published by the Free Software Foundation; either version 2
+ *  of the License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ *  $Id: TextModule.java $
+ */
+package org.exist.xquery.modules.mpdltext;
+
+import java.util.ArrayList;
+import java.util.Date;
+
+import org.exist.dom.QName;
+import org.exist.xquery.BasicFunction;
+import org.exist.xquery.Cardinality;
+import org.exist.xquery.FunctionSignature;
+import org.exist.xquery.XPathException;
+import org.exist.xquery.XQueryContext;
+import org.exist.xquery.value.Sequence;
+import org.exist.xquery.value.SequenceType;
+import org.exist.xquery.value.StringValue;
+import org.exist.xquery.value.Type;
+import org.exist.xquery.value.ValueSequence;
+
+import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
+import de.mpg.mpiwg.berlin.mpdl.externalObjects.app.ExtElement;
+import de.mpg.mpiwg.berlin.mpdl.externalObjects.app.ExternalObjectsHandler;
+
+/**
+ * @author Josef Willenborg (jwillenborg@mpiwg-berlin.mpg.de)
+ */
+public class ExternalObject extends BasicFunction {
+
+	public final static FunctionSignature signature =
+		new FunctionSignature(
+			new QName("externalObject", MPDLTextModule.NAMESPACE_URI, MPDLTextModule.PREFIX),
+			"A function which add, update, delete or read external elements",
+			new SequenceType[] { new SequenceType(Type.STRING, Cardinality.EXACTLY_ONE),
+                           new SequenceType(Type.STRING, Cardinality.EXACTLY_ONE),
+			                     new SequenceType(Type.STRING, Cardinality.EXACTLY_ONE) },
+			new SequenceType(Type.STRING, Cardinality.EXACTLY_ONE));
+
+	public ExternalObject(XQueryContext context) {
+		super(context, signature);
+	}
+
+	public Sequence eval(Sequence[] args, Sequence contextSequence) throws XPathException {
+	  Sequence operation = args[0];  // read, update or delete
+    Sequence type = args[1];
+    Sequence object = args[2];
+		if (operation.isEmpty() || type.isEmpty())
+			return Sequence.EMPTY_SEQUENCE;
+    String operationStr = operation.getStringValue();
+    String typeStr = type.getStringValue();
+    String objectStr = object.getStringValue();
+
+    ValueSequence result = null;
+    String resultStr = "";
+    try {
+      if (typeStr.equals("element")) {
+        ExtElement e = ExtElement.parseXmlStr(objectStr);
+        if (operation.equals("create") || operation.equals("update")) {
+          Date now = new Date();
+          e.setModificationDate(now);
+        }
+        String documentId = e.getDocumentId();
+        String pageNumber = e.getPageNumber();
+        if (operationStr.equals("read")) {
+          ExternalObjectsHandler externalObjectsHandler = ExternalObjectsHandler.getInstance();
+          ArrayList<ExtElement> elems = externalObjectsHandler.readExternalElements(documentId, pageNumber);
+          if (elems != null && elems.size() > 0) {
+            resultStr = "<result>";
+            for (int i=0; i<elems.size(); i++) {
+              ExtElement elem = elems.get(i);
+              String elemXmlStr = elem.getXmlString();
+              resultStr = resultStr + elemXmlStr;
+            }
+            resultStr = resultStr + "</result>";
+          }
+        } else if (operationStr.equals("create")) {
+          // TODO
+        } else if (operationStr.equals("update")) {
+          // TODO
+        } else if (operationStr.equals("delete")) {
+          // TODO
+        }
+      } else if (typeStr.equals("query")) {
+          // TODO
+      }
+      result = new ValueSequence();
+      result.add(new StringValue(resultStr));
+    } catch (ApplicationException e) {
+      throw new XPathException(e);
+    }
+		return result;
+	}
+
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdltext/InsertAtCharPos.java	Tue Feb 08 14:54:09 2011 +0100
@@ -0,0 +1,103 @@
+/*
+ *  eXist Open Source Native XML Database: Extension module
+ *  Copyright (C) 2008 Josef Willenborg
+ *  jwillenborg@mpiwg-berlin.mpg.de
+ *  http://www.mpiwg-berlin.mpg.de
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public License
+ *  as published by the Free Software Foundation; either version 2
+ *  of the License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ *  $Id: TextModule.java $
+ */
+package org.exist.xquery.modules.mpdltext;
+
+import org.exist.dom.QName;
+import org.exist.xquery.BasicFunction;
+import org.exist.xquery.Cardinality;
+import org.exist.xquery.FunctionSignature;
+import org.exist.xquery.XPathException;
+import org.exist.xquery.XQueryContext;
+import org.exist.xquery.value.Sequence;
+import org.exist.xquery.value.SequenceType;
+import org.exist.xquery.value.StringValue;
+import org.exist.xquery.value.Type;
+import org.exist.xquery.value.ValueSequence;
+
+public class InsertAtCharPos extends BasicFunction {
+	// XQuery extension function "insertAtCharPos": (xml string, xml string to insert, char position) -> xml string.
+	public final static FunctionSignature signature =
+		new FunctionSignature(
+			new QName("insertAtCharPos", MPDLTextModule.NAMESPACE_URI, MPDLTextModule.PREFIX),
+			"A function which inserts in the xml element node string (first parameter) the given xml element " +
+			"node string (second parameter) at the given char position (third parameter).",
+			new SequenceType[] { new SequenceType(Type.STRING, Cardinality.EXACTLY_ONE),
+			                     new SequenceType(Type.STRING, Cardinality.EXACTLY_ONE),
+			                     new SequenceType(Type.INT, Cardinality.EXACTLY_ONE)
+			                    },
+			new SequenceType(Type.STRING, Cardinality.EXACTLY_ONE));
+
+	public InsertAtCharPos(XQueryContext context) {
+		super(context, signature);
+	}
+	// Inserts args[1] into args[0] after the args[2]-th text character (markup and entity bodies are not counted).
+	public Sequence eval(Sequence[] args, Sequence contextSequence) throws XPathException {
+	  Sequence elementInputStr = args[0];
+    Sequence insertElementStr = args[1];
+    Sequence charPosStr = args[2];
+    String elementInputStrStr = elementInputStr.getStringValue();
+    String insertElementStrStr = insertElementStr.getStringValue();
+    String charPosStrStr = charPosStr.getStringValue();
+    int charPos = Integer.parseInt(charPosStrStr);
+    int strCharIndex = getIndex(elementInputStrStr, charPos);
+    if (charPos == 0)  // position 0 = directly before the first text character; clamp so text-less input does not give -1
+      strCharIndex = Math.max(0, getIndex(elementInputStrStr, 1) - 1);
+    String resultStr = elementInputStrStr.substring(0, strCharIndex) + insertElementStrStr + elementInputStrStr.substring(strCharIndex);
+		ValueSequence result = new ValueSequence();
+    result.add(new StringValue(resultStr));
+		return result;
+	}
+	// Returns the string index just behind the charPos-th counted character of xmlString.
+	private int getIndex(String xmlString, int charPos) {
+	  int size = xmlString.length();
+	  int counter = 0;               // current index in xmlString
+    int charCounter = 0;           // number of text characters counted so far
+    int counterLastChar = -1;      // index of the last counted character, -1 if none
+	  boolean isEntity = false;
+    boolean isElement = false;
+	  while (counter < size) {
+      char c = xmlString.charAt(counter);
+      switch (c) {
+        case '<': isElement = true; break;
+        case '>': isElement = false; break;
+        case '&': isEntity = true; break;
+        case ';': isEntity = false; break;
+      }
+      // count all chars which are not inside elements and entities
+      // if element closing char ">" is found it should not be counted as a char
+      // if an entity closing char ";" is found it should be counted cause the entity itself is one char long
+      if (! isEntity && ! isElement && !(c == '>')) {
+        charCounter++;
+        counterLastChar = counter;
+      }
+      if (charCounter == charPos) {
+        break;
+      }
+      counter++;
+	  }
+	  // input charPos was bigger than available chars: return the last available charPos
+	  if (counter == size)
+	    return counterLastChar + 1;
+	  return counter + 1;
+	}
+}
--- a/software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdltext/MPDLTextModule.java	Tue Feb 08 14:36:38 2011 +0100
+++ b/software/eXist/mpdl-modules/src/org/exist/xquery/modules/mpdltext/MPDLTextModule.java	Tue Feb 08 14:54:09 2011 +0100
@@ -50,7 +50,9 @@
     new FunctionDef(GetBig5EncodedTerms.signature, GetBig5EncodedTerms.class),
     new FunctionDef(EncodeBig5.signature, EncodeBig5.class),
     new FunctionDef(LuceneQueryParser.signature, LuceneQueryParser.class),
-    new FunctionDef(ToCLevelGenerator.signature, ToCLevelGenerator.class)
+    new FunctionDef(ExternalObject.signature, ExternalObject.class),
+    new FunctionDef(InsertAtCharPos.signature, InsertAtCharPos.class),
+    new FunctionDef(ToCLevelGenerator.signature, ToCLevelGenerator.class)
 	};

 	public MPDLTextModule() {