view software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/dict/db/DBLexWriter.java @ 19:4a3641ae14d2

Erstellung
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Wed, 09 Nov 2011 15:32:05 +0100
parents
children
line wrap: on
line source

package de.mpg.mpiwg.berlin.mpdl.lt.dict.db;

import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;

import com.sleepycat.je.Cursor;
import com.sleepycat.je.Database;
import com.sleepycat.je.DatabaseEntry;
import com.sleepycat.je.DatabaseException;
import com.sleepycat.je.LockMode;
import com.sleepycat.je.OperationStatus;
import com.sleepycat.je.util.DbLoad;
import com.sun.org.apache.xerces.internal.parsers.SAXParser;

import de.mpg.mpiwg.berlin.mpdl.util.StringUtils;
import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
import de.mpg.mpiwg.berlin.mpdl.lt.dict.app.Lexica;
import de.mpg.mpiwg.berlin.mpdl.lt.dict.app.Lexicon;
import de.mpg.mpiwg.berlin.mpdl.lt.dict.app.LexiconEntry;
import de.mpg.mpiwg.berlin.mpdl.lt.general.Constants;
import de.mpg.mpiwg.berlin.mpdl.lt.text.transcode.Transcoder;

public class DBLexWriter {
  // Singleton instance, created on first call of getInstance().
  private static DBLexWriter instance;
  // Base data directory as delivered by Constants.
  private static String DATA_DIR = Constants.getInstance().getDataDir();
  // Directory of the lexicon files: the ".dump" files are read from here and the ".xml" exports are written here.
  private static String DATA_FILES_DIR_LEXICA = DATA_DIR + "/dataFiles/pollux";
  // Directory which is handed to DbEnvLex as its data directory.
  private static String DB_DIR_LEXICA = DATA_DIR + "/dataBerkeleyDB/pollux";
  // Database environment wrapper; assigned in initReadWrite() or initReadOnly().
  private DbEnvLex dbEnvLexica;
  // Time stamps set by beginOperation() and endOperation(); only referenced by commented-out timing code in main().
  private Date beginOfOperation;
  private Date endOfOperation;
  
  /**
   * Delivers the shared writer object and creates it on the first call.
   * The creation is not synchronized.
   *
   * @return the one and only DBLexWriter of this class
   * @throws ApplicationException declared for callers; see constructor / field initialization
   */
  public static DBLexWriter getInstance() throws ApplicationException {
    if (instance != null) {
      return instance;
    }
    instance = new DBLexWriter();
    return instance;
  }

  /**
   * Command line entry point: opens the lexicon environment read/write, exports all
   * local lexicons to XML files and closes the environment again.
   *
   * Further maintenance steps exist in this class and can be called here instead when needed:
   * initReadOnly(), readSampleData(), testTranscoder(), printSizeOfAllLexicons(),
   * loadPolluxDbDumpsToDb(), copyAndRepairAndTranscodeDumps().
   *
   * @param args not used
   * @throws ApplicationException if one of the steps fails
   */
  public static void main(String[] args) throws ApplicationException {
    DBLexWriter writer = getInstance();
    writer.beginOperation();
    System.out.print("Start ...");
    writer.initReadWrite();
    writer.writeLexiconsToFiles();
    writer.end();
    writer.endOperation();
    // beginOfOperation / endOfOperation are available here if the elapsed time should be reported
    System.out.println("End.");
  }

  /**
   * Creates the lexicon database environment on DB_DIR_LEXICA and initializes it for read/write.
   *
   * @throws ApplicationException if the environment could not be initialized
   */
  private void initReadWrite() throws ApplicationException {
    DbEnvLex env = new DbEnvLex();
    // the field is assigned before initialization, so it is set even if initialization fails
    dbEnvLexica = env;
    env.setDataDir(DB_DIR_LEXICA);
    env.initReadWrite();
  }
  
  /**
   * Creates the lexicon database environment on DB_DIR_LEXICA, initializes it read only
   * and opens one database for each local lexicon.
   *
   * @throws ApplicationException if the environment or one of the databases could not be opened
   */
  private void initReadOnly() throws ApplicationException {
    DbEnvLex env = new DbEnvLex();
    // the field is assigned before initialization, so it is set even if initialization fails
    dbEnvLexica = env;
    env.setDataDir(DB_DIR_LEXICA);
    env.initReadOnly();
    for (Lexicon lexicon : Lexica.getInstance().getLocalLexicons()) {
      env.openDatabase(lexicon.getName());
    }
  }
  
  /**
   * Loads the dump file of every local lexicon into the database environment.
   *
   * @throws ApplicationException if one of the dumps could not be loaded
   */
  private void loadPolluxDbDumpsToDb() throws ApplicationException {
    for (Lexicon lexicon : Lexica.getInstance().getLocalLexicons()) {
      loadDbDumpToDb(lexicon.getName());
    }
  }
  
  /**
   * Loads the dump file "<lexiconName>.dump" from DATA_FILES_DIR_LEXICA into the
   * database "<lexiconName>Dump.db" of the lexicon environment by means of DbLoad.
   *
   * @param lexiconName name of the lexicon whose dump is loaded
   * @throws ApplicationException if the dump file is missing, could not be read or could not be loaded
   */
  private void loadDbDumpToDb(String lexiconName) throws ApplicationException {
    String dumpFileName = DATA_FILES_DIR_LEXICA + "/" + lexiconName + ".dump";
    String dbName = lexiconName + "Dump.db";
    try {
      BufferedReader bufferedReader = new BufferedReader(new FileReader(dumpFileName));
      try {
        DbLoad loader = new DbLoad();
        loader.setEnv(dbEnvLexica.getEnv());
        loader.setDbName(dbName);
        loader.setInputReader(bufferedReader);
        loader.setIgnoreUnknownConfig(true);
        loader.load();
      } finally {
        // close the dump file also when loading fails (it was left open in that case before)
        bufferedReader.close();
      }
    } catch (FileNotFoundException e) {
      throw new ApplicationException(e);
    } catch (IOException e) {
      throw new ApplicationException(e);
    } catch (DatabaseException e) {
      throw new ApplicationException(e);
    }
  }
  
  /**
   * Debug helper: looks up one sample form in four lexicons, prints the found entries
   * and then prints the first keys of two lexicons.
   *
   * @throws ApplicationException if the database could not be read
   */
  private void readSampleData() throws ApplicationException {
    try {
      // the result is not used; the call is kept because it touches the environment and may fail
      dbEnvLexica.getEnv().getDatabaseNames();
      // all lookups are done first, printing follows afterwards
      String autenriethEntry = readEntry("autenrieth", "au)to/s");
      String lewisShortEntry = readEntry("ls", "laudabilis");
      String lsjEntry = readEntry("lsjUnicode", "ἄδρεπτος");
      String salmoneEntry = readEntry("salmoneUnicode", "ءرش");
      System.out.println("Autenrieth: autos: " + autenriethEntry);
      System.out.println("Lewis & Short: Laudabilis: " + lewisShortEntry);
      System.out.println("LSJ: ἄδρεπτος: " + lsjEntry);
      System.out.println("Salmone: طب: " + salmoneEntry);
      printSampleEntries("salmoneUnicode", 10);
      printSampleEntries("lsjUnicode", 1000);
    } catch (DatabaseException e) {
      throw new ApplicationException(e);
    }
  }
  
  /**
   * Closes, for every local lexicon, the lexicon database and its "Dump" database
   * and finally closes the database environment.
   *
   * @throws ApplicationException if closing fails
   */
  private void end() throws ApplicationException {
    for (Lexicon lexicon : Lexica.getInstance().getLocalLexicons()) {
      String name = lexicon.getName();
      dbEnvLexica.closeDatabase(name);
      dbEnvLexica.closeDatabase(name + "Dump");
    }
    dbEnvLexica.close();
  }

  /**
   * Reads the value which is stored under the given form in the given lexicon database.
   *
   * @param lexiconName name of the lexicon database
   * @param formName key (word form) which is searched, encoded as UTF-8 for the lookup
   * @return the stored value decoded as UTF-8 or null if the key was not found
   * @throws ApplicationException if the database access fails
   */
  private String readEntry(String lexiconName, String formName) throws ApplicationException {
    String retString = null;
    try {
      DatabaseEntry dbEntryKey = new DatabaseEntry(formName.getBytes("utf-8"));
      Database lexDB = dbEnvLexica.getLexiconDB(lexiconName);
      Cursor cursor = lexDB.openCursor(null, null);
      try {
        DatabaseEntry foundValue = new DatabaseEntry();
        OperationStatus operationStatus = cursor.getSearchKey(dbEntryKey, foundValue, LockMode.DEFAULT);
        if (operationStatus == OperationStatus.SUCCESS) {
          retString = new String(foundValue.getData(), "utf-8");
        }
      } finally {
        // release the cursor also when the lookup fails (it was left open in that case before)
        cursor.close();
      }
    } catch (DatabaseException e) {
      throw new ApplicationException(e);
    } catch (UnsupportedEncodingException e) {
      throw new ApplicationException(e);
    }
    return retString;
  }
  
  /**
   * Temporary variant of printSizeOfAllLexicons(): prints the record counts of the lexicon "lsj" only.
   *
   * @throws ApplicationException if the lexicon could not be read
   */
  private void printSizeOfAllLexiconsTemp() throws ApplicationException {
    final String name = "lsj";
    int[] counts = getSizes(name);
    int recordCount = counts[0];
    int notValidCount = counts[1];
    System.out.println(name + ": " + recordCount + " records (" + notValidCount + " of them are not xml valid)");
  }
  
  /**
   * Prints for each local lexicon the number of records and the number of records
   * which are not XML valid.
   *
   * @throws ApplicationException if one of the lexicons could not be read
   */
  private void printSizeOfAllLexicons() throws ApplicationException {
    for (Lexicon lexicon : Lexica.getInstance().getLocalLexicons()) {
      String name = lexicon.getName();
      int[] counts = getSizes(name);
      int recordCount = counts[0];
      int notValidCount = counts[1];
      System.out.println(name + ": " + recordCount + " records (" + notValidCount + " of them are not xml valid)");
    }
  }
  
  /**
   * Counts the records of a lexicon database and how many of them are not XML valid.
   * For each record the part between "<repaired-entry>" and "</repaired-entry>" of the
   * value is parsed by xmlParse().
   *
   * Each record is now evaluated before the cursor is moved on. Before, the cursor was
   * advanced first, so the first record was never evaluated and the loop body ran once
   * more after the cursor had already reached the end.
   * A record whose value does not contain the repaired-entry markers is counted as
   * not XML valid instead of failing with a StringIndexOutOfBoundsException.
   *
   * @param lexiconName name of the lexicon database
   * @return array of length 2: [0] number of records, [1] number of records which are not XML valid
   * @throws ApplicationException if the database access fails
   */
  private int[] getSizes(String lexiconName) throws ApplicationException {
    final String beginMarker = "<repaired-entry>";
    final String endMarker = "</repaired-entry>";
    int size = 0;
    int sizeXmlNotValidEntries = 0;
    try {
      dbEnvLexica.openDatabase(lexiconName);
      Database lexDB = dbEnvLexica.getLexiconDB(lexiconName);
      Cursor cursor = lexDB.openCursor(null, null);
      try {
        DatabaseEntry dbEntryKey = new DatabaseEntry();
        DatabaseEntry dbEntryValue = new DatabaseEntry();
        OperationStatus operationStatus = cursor.getFirst(dbEntryKey, dbEntryValue, LockMode.DEFAULT);
        while (operationStatus == OperationStatus.SUCCESS) {
          String dbEntryKeyStr = new String(dbEntryKey.getData(), "utf-8");
          String dbEntryValueStr = new String(dbEntryValue.getData(), "utf-8");
          int begin = dbEntryValueStr.indexOf(beginMarker);
          int end = dbEntryValueStr.indexOf(endMarker);
          boolean xmlValid = false;
          if (begin != -1 && end != -1 && begin < end) {
            String repairedEntryStr = dbEntryValueStr.substring(begin, end) + endMarker;
            LexiconEntry dbLexEntry = new LexiconEntry(lexiconName, dbEntryKeyStr, repairedEntryStr);
            LexiconEntry xmlLexiconEntry = xmlParse(dbLexEntry);
            xmlValid = xmlLexiconEntry.isXmlValid();
          }
          if (! xmlValid) {
            sizeXmlNotValidEntries++;
          }
          size++;
          // move on only after the current record has been evaluated
          operationStatus = cursor.getNext(dbEntryKey, dbEntryValue, LockMode.DEFAULT);
        }
      } finally {
        // release the cursor also when reading or parsing fails
        cursor.close();
      }
    } catch (DatabaseException e) {
      throw new ApplicationException(e);
    } catch (UnsupportedEncodingException e) {
      throw new ApplicationException(e);
    }
    return new int[] { size, sizeXmlNotValidEntries };
  }
  
  /**
   * Copies all entries of each "<lexiconName>Dump" database into the database "<lexiconName>".
   * On the way each entry value is repaired (lexicon specific and general string replacements,
   * then xmlParseAndRepair()) and, for Betacode and Buckwalter lexicons, key and value are
   * transcoded to Unicode. The stored value has the form
   * "<content><xml-valid>..</xml-valid><original-entry>..</original-entry><repaired-entry>..</repaired-entry></content>".
   *
   * The whole dump database of a lexicon is read into memory before it is processed.
   *
   * @throws ApplicationException if the database access or the repair/transcoding fails
   */
  private void copyAndRepairAndTranscodeDumps() throws ApplicationException {
    try {
      ArrayList<Lexicon> lexicons = Lexica.getInstance().getLocalLexicons();
      for (int i=0; i<lexicons.size(); i++) {
        Lexicon lexicon = lexicons.get(i);
        String lexiconName = lexicon.getName();
        // source: all entries of the dump database; target: the database with the plain lexicon name
        HashMap<String, DatabaseEntry> lexDumpHashMap = getWholeLexiconHashMap(lexiconName + "Dump");
        dbEnvLexica.openDatabase(lexiconName);
        Database lexDB = dbEnvLexica.getLexiconDB(lexiconName);
        Iterator<String> lexDumpIter = lexDumpHashMap.keySet().iterator();
        while (lexDumpIter.hasNext()) {
          String lexDumpKeyStr = lexDumpIter.next();
          DatabaseEntry lexDumpValue = lexDumpHashMap.get(lexDumpKeyStr);
          byte[] lexDumpValueBytes = lexDumpValue.getData();
          // lexDumpValueStr stays untouched (stored as original entry), newLexValueStr is repaired step by step
          String lexDumpValueStr = new String(lexDumpValueBytes, "utf-8");
          String newLexValueStr = new String(lexDumpValueBytes, "utf-8");
          // repair lsj
          if (lexiconName.equals("lsj")) {
            newLexValueStr = newLexValueStr.replaceAll("<br>", "<br/>");
            newLexValueStr = newLexValueStr.replaceAll("<p>", "<p/>");
            String elementNameGreek = "G";
            newLexValueStr = deleteNestedTags(elementNameGreek, newLexValueStr); // delete tags <G> and </G> inside <G> 
            newLexValueStr = newLexValueStr.replaceAll("lang=greek", "lang=\"greek\"");
            // if only start tags or only end tags of "sense" are contained, these tags are removed
            // NOTE(review): String.matches() has to match the whole value and "." does not match line terminators by default,
            // so values containing line breaks deliver false here - TODO confirm that entries are single-line
            boolean senseContained = newLexValueStr.matches(".*<sense.*>.*");
            boolean endSenseContained = newLexValueStr.matches(".*</sense>.*");
            if (senseContained && ! endSenseContained)
              newLexValueStr = newLexValueStr.replaceAll("<sense .*?>", ""); 
            else if (!senseContained && endSenseContained)
              newLexValueStr = newLexValueStr.replaceAll("</sense>", ""); 
            // same handling for "ref"
            boolean refContained = newLexValueStr.matches(".*<ref.*>.*");
            boolean endRefContained = newLexValueStr.matches(".*</ref>.*");
            if (refContained && ! endRefContained)
              newLexValueStr = newLexValueStr.replaceAll("<ref .*?>", ""); 
            else if (!refContained && endRefContained)
              newLexValueStr = newLexValueStr.replaceAll("</ref>", ""); 
            /*
            boolean itypeContained = newLexValueStr.matches(".*<itype.*>.*");
            boolean endItypeContained = newLexValueStr.matches(".*</itype>.*");
            if (itypeContained && ! endItypeContained)
              newLexValueStr = newLexValueStr.replaceAll("<itype .*?>", ""); 
            else if (!itypeContained && endItypeContained)
              newLexValueStr = newLexValueStr.replaceAll("</itype>", "");
            */ 
          }
          // repair cooper
          if (lexiconName.equals("cooper")) {
            newLexValueStr = newLexValueStr.replaceAll("<PB>", "");   // TODO hack
            newLexValueStr = newLexValueStr.replaceAll("<p>", "<p/>");   // TODO hack
          }
          // repair baretti
          if (lexiconName.equals("baretti")) {
            newLexValueStr = newLexValueStr.replaceAll("<li>", "<li/>");   // TODO hack
          }
          // repair for all lexicons
          newLexValueStr = newLexValueStr.replaceAll("type=style", "type=\"style\"");
          newLexValueStr = newLexValueStr.replaceAll("type=dom", "type=\"dom\"");
          newLexValueStr = newLexValueStr.replaceAll("<\\*>", ""); 
          newLexValueStr = newLexValueStr.replaceAll("<p />", "<p/>");
          LexiconEntry newLexEntryTemp = new LexiconEntry(lexiconName, lexDumpKeyStr, newLexValueStr);  // lexDumpKeyStr is not transcoded yet but it will not be used in further in the code 
          LexiconEntry newLexEntry = xmlParseAndRepair(newLexEntryTemp);
          // the validity flag is stored together with the entry
          String xmlValidString = "<xml-valid>true</xml-valid>";
          if (! newLexEntry.isXmlValid()) {
            xmlValidString = "<xml-valid>false</xml-valid>";
          }
          newLexValueStr = newLexEntry.getContent();
          // transcode the Betacode lexicon entries to Unicode (key and value)
          // the key is always transcoded, the value only if it is XML valid
          if (lexicon.isBetacodeLexicon()) {
            Transcoder transcoder = Transcoder.getInstance();
            lexDumpKeyStr = transcoder.transcodeFromBetaCode2Unicode(lexDumpKeyStr);
            String elementName = "G";
            if (newLexEntry.isXmlValid()) {
              newLexValueStr = transcodeByElementName("fromBetacode2Unicode", elementName, newLexValueStr);
            }
          }
          // transcode the Buckwalter entries to Unicode (key and value)
          // the key is always transcoded, the value only if it is XML valid
          if (lexicon.isBuckwalterLexicon()) {
            Transcoder transcoder = Transcoder.getInstance();
            lexDumpKeyStr = transcoder.transcodeFromBuckwalter2Unicode(lexDumpKeyStr);
            String elementName = "AR";
            if (newLexEntry.isXmlValid()) {
              newLexValueStr = transcodeByElementName("fromBuckwalter2Unicode", elementName, newLexValueStr);
            }
          }
          // put the entry into database 
          // the untouched original value is embedded as it is (not escaped) beside the repaired value
          newLexValueStr = "<content>" + xmlValidString + "<original-entry>" + lexDumpValueStr + "</original-entry>" + "<repaired-entry>" + newLexValueStr + "</repaired-entry>" + "</content>";
          DatabaseEntry newLexDumpKey = new DatabaseEntry(lexDumpKeyStr.getBytes("utf-8"));
          DatabaseEntry newLexValue = new DatabaseEntry(newLexValueStr.getBytes("utf-8"));
          // first argument null: no explicit transaction is passed
          lexDB.put(null, newLexDumpKey, newLexValue);
        }
      }
    } catch (DatabaseException e) {
      throw new ApplicationException(e);
    } catch (UnsupportedEncodingException e) {
      throw new ApplicationException(e);
    }
  }
  
  /**
   * Debug helper: walks over the first records of a lexicon database and prints for each
   * record with a non empty key the key and the size of the value.
   *
   * @param lexiconName name of the lexicon database
   * @param count maximum number of records which are visited (records with empty key are counted too)
   * @throws ApplicationException if the database access fails
   */
  private void printSampleEntries(String lexiconName, int count) throws ApplicationException {
    try {
      dbEnvLexica.openDatabase(lexiconName);
      Database lexDB = dbEnvLexica.getLexiconDB(lexiconName);
      Cursor cursor = lexDB.openCursor(null, null);
      try {
        int counter = 0;
        DatabaseEntry dbEntryKey = new DatabaseEntry();
        DatabaseEntry dbEntryValue = new DatabaseEntry();
        OperationStatus operationStatus = cursor.getFirst(dbEntryKey, dbEntryValue, LockMode.DEFAULT);
        while (operationStatus == OperationStatus.SUCCESS && counter < count) {
          if (dbEntryKey.getSize() > 0) {
            String dbEntryKeyStr = new String(dbEntryKey.getData(), "utf-8");
            System.out.println(lexiconName + ": key: " + dbEntryKeyStr + " value size: " +  dbEntryValue.getSize());
          }
          operationStatus = cursor.getNext(dbEntryKey, dbEntryValue, LockMode.DEFAULT);
          counter++;
        }
      } finally {
        // release the cursor also when reading fails (it was left open in that case before)
        cursor.close();
      }
    } catch (DatabaseException e) {
      throw new ApplicationException(e);
    } catch (UnsupportedEncodingException e) {
      throw new ApplicationException(e);
    }
  }

  /**
   * Scratch method for manual experiments (e.g. in a debugger) with the Transcoder,
   * with regular expressions for nested <G> tags and with transcodeByElementName().
   * All results are only assigned to local variables; nothing is printed, returned or checked.
   *
   * @throws ApplicationException if transcoding fails
   */
  private void testTranscoder() throws ApplicationException {
    // sample inputs: Betacode text inside <G> elements, partly with nested or unbalanced <G> tags
    String testStr = "<G>hfhf fdfd<G>ei)mi/</G> (<tr>sum</tr>), Aeol. <G>e)/mmi</G> hfhfh </G><author>Sapph.</author>2.15, <author>Theoc.</author>20.32; Cret. <G>h)mi/</G> <title>GDI</title> 4959a; <per>2</per><number>sg.</number> <G>ei)=</G>, Ep. and Ion. <cit><G>ei)s</G> <author>Od.</author>17.388</cit>, al., Aeol. <G>e)/ssi</G>, Ep. and Dor. <cit><G>e)ssi/</G> <author>Il.</author>1.176</cit>, <author>Pi.</author>";
    String testStr2 = "aaaaa <G>1111a <G>2222a</G> <G>3333a</G> 1111a</G> aaaaa bbbbb <G>1111b <G>2222b</G> <G>3333b</G> 1111b</G> bbbbb ";
    String testStr3 = "<G>e)pano/rqwsin e)/xein</G>, opp <G>a)ni/aton ei)=nai *hi</G>3. 1165 b18. --<G>e)panorqw/seis kai boh/qeiai *rb</G>5. 1383 a20.";
    String testStr4 = "<G>suni^hmi</G> <author>Ar.</author><title>Av.</title>946 (s. v.l.), <author>Strato Com.</author>1.3: with variation of quantity, <G>plei=ston ou)=lon i(/ei <G>[i^]</G>, i)/oulon i(/ei [i_</G>] <title>Carm.Pop.</title> 1.]:&#x2014" +
                         ";<br><tr>release, let go</tr>, <cit><G>h(=ka ..po/das kai\\ xei=re fe/resqai</G> <author>Od.</author>12.442</cit>; <G>h(=ke fe/resqai</G> <tr>let</tr> him float" + 
                         "off, <author>Il.</author>21.120; <tr>let fall</tr>, <G>ka\\d de\\ ka/rhtos h(=ke ko/mas</G> <tr>made</tr> his locks <tr>flow</tr> down from his head, <author>Od.<" +
                         "/author>6.231; [<cit><G>e)qei/ras] i(/ei lo/fon a)mfi/</G> .... ggg";
    String testStr5 = "plei=ston ou)=lon i(/ei ";
    String testStr6 = "*a as< as as: *)a *s ss ";
    // Betacode -> Unicode; each result overwrites the previous one
    Transcoder t = Transcoder.getInstance();
    String transcoded = t.transcodeFromBetaCode2Unicode(testStr4);
    transcoded = t.transcodeFromBetaCode2Unicode(testStr5);
    transcoded = t.transcodeFromBetaCode2Unicode(testStr6);
    
    // Buckwalter -> Unicode; each result overwrites the previous one
    String arabTestStr1 = "^nutaf";
    String arabTestStr2 = "min";
    String arabTestStr3 = "Aal-Hiyal (^qAla ^&gt;arisTwTAlys) yataEaj~aba Aal-nAs minhA &lt;im~A fy Aal-&gt;a$yA' Aal~aty taEriDu TabEAF fa-mim~A lA yuElamu Eil~atuhu wa-&lt;im~A fy Aal-&gt;a$yA' Aal-muxAlifap li-l-TabE fa-mim~A yuEmalu bi-Aal-SinAEap li-manfaEap Aal-nAs li-&gt;an~a Aal-TabyEap tulzimu &gt;abadAF jihap wAHidap wa-&gt;am~A manAfiE Aal-nAs fa-&lt;in~ahA taxtalifu &lt;ixtilAfAF kavyrAF.";
    transcoded = t.transcodeFromBuckwalter2Unicode(arabTestStr1);
    transcoded = t.transcodeFromBuckwalter2Unicode(arabTestStr2);
    transcoded = t.transcodeFromBuckwalter2Unicode(arabTestStr3);
    
    // regular expression experiments for removing nested <G> tags
    // String deletedNestedTags = deleteNestedTags("G", testStr4);
    // String regExpr = "(<G>.*?)<G>(.*?)</G>(.*?)<G>(.*?)</G>(.*?</G>)";
    String regExpr = "(<G>.*?)<G>(.*)(</G>){1,}(.*?</G>)";
    // String regExpr = "(<G>.*?)<G>(.*?)</G>(.*?)<G>(.*?)</G>(.*?</G>)";
    String replaceStr = testStr2.replaceAll(regExpr, "$1$2$4");
    // String replaceStr2 = testStr2.replaceAll("<G>(.*)<G>(.*)</G>(.*)<G>(.*)</G>(.*)</G>", "<G>$2$3$4$5</G>");
    // the first assignment is overwritten immediately; only the second expression is compiled below
    regExpr = "<G>.*?(<G>.*?</G>){1,}.*?</G>";
    regExpr = "(<G>.*?)<G>(.*?)</G>(.*?){1,}(.*?</G>)";
    // String regExpr = "[a-zA-Z0-9]+?\\[.+?\\]/" + "|" + "[a-zA-Z0-9]+?/" + "|" + "[a-zA-Z0-9]+?\\[.+\\]$" + "|" + "[a-zA-Z0-9]+?$"; // pathName example: "/archimedes[@xmlns:xlink eq "http://www.w3.org/1999/xlink"]/text/body/chap/p[@type eq "main"]/s/foreign[@lang eq "en"]"
    Pattern p = Pattern.compile(regExpr, Pattern.CASE_INSENSITIVE | Pattern.MULTILINE); // both flags enabled
    Matcher m = p.matcher(testStr2);
    while (m.find()) {
      // each match is only extracted into matchStr; "bla" is an unused placeholder
      int msBeginPos = m.start();
      int msEndPos = m.end();
      String matchStr = testStr2.substring(msBeginPos, msEndPos);
      String bla = "";
    }

    // transcodeByElementName() with a complex input, an input without <G> elements and an empty input
    String retStr = transcodeByElementName("fromBetacode2Unicode", "G", testStr);
    retStr = transcodeByElementName("fromBetacode2Unicode", "G", "bla");
    retStr = transcodeByElementName("fromBetacode2Unicode", "G", "");
  }
  
  /**
   * Transcodes the text content of all elements with the given name (tags without attributes)
   * and leaves everything else unchanged. The input is scanned from left to right; scanning stops
   * as soon as no begin tag or no end tag is left or an end tag occurs before the next begin tag.
   * The remaining input is appended unchanged.
   *
   * @param transcodeDirection "fromBetacode2Unicode" or "fromBuckwalter2Unicode"; with any other value
   *        the element content is only passed through StringUtils.deleteSpecialXmlEntities()
   * @param elementName name of the elements whose content is transcoded
   * @param inputStr string which contains the elements
   * @return the string with transcoded element contents or null if inputStr or elementName is null
   * @throws ApplicationException if transcoding fails
   */
  private String transcodeByElementName(String transcodeDirection, String elementName, String inputStr) throws ApplicationException {
    if (elementName == null || inputStr == null) {
      return null;
    }
    final String openTag = "<" + elementName + ">";
    final String closeTag = "</" + elementName + ">";
    Transcoder transcoder = Transcoder.getInstance();
    StringBuilder result = new StringBuilder();
    String remaining = inputStr;
    while (true) {
      int openPos = remaining.indexOf(openTag);
      int closePos = remaining.indexOf(closeTag);
      boolean elementFound = openPos != -1 && closePos != -1 && openPos < closePos;
      if (! elementFound) {
        break;
      }
      String prefix = remaining.substring(0, openPos);
      String content = remaining.substring(openPos + openTag.length(), closePos);
      content = StringUtils.deleteSpecialXmlEntities(content);
      String converted;
      if (transcodeDirection.equals("fromBetacode2Unicode")) {
        converted = transcoder.transcodeFromBetaCode2Unicode(content);
      } else if (transcodeDirection.equals("fromBuckwalter2Unicode")) {
        converted = transcoder.transcodeFromBuckwalter2Unicode(content);
      } else {
        converted = content;
      }
      result.append(prefix).append(openTag).append(converted).append(closeTag);
      // continue behind the end tag which was just handled
      remaining = remaining.substring(closePos + closeTag.length());
    }
    result.append(remaining);
    return result.toString();
  }
  
  /**
   * Deletes begin and end tags of the given element (tags without attributes) which occur
   * nested inside an element of the same name, e.g. "<G>a <G>b</G> c</G>" becomes "<G>a b c</G>".
   * If the input contains no end tag at all it is delivered unchanged.
   *
   * Corrections against the former version:
   * - If no closing tag could be found for a begin tag, the rest of the input is taken as
   *   content of this element and the element is closed. Before, a part of the input was
   *   appended a second time in this case (substring with index -1 + tag length).
   * - The closing tag is searched in the part which starts at the begin tag, so that a stray
   *   end tag before the begin tag does not lead to a StringIndexOutOfBoundsException.
   * - The tags are removed by literal replacement, not by regular expression replacement.
   *
   * @param elementName name of the element
   * @param inputStr string which may contain nested elements
   * @return the string without nested tags of the element
   */
  private String deleteNestedTags(String elementName, String inputStr) {
    String elemBeginTag = "<" + elementName + ">";
    String elemEndTag = "</" + elementName + ">";
    if (inputStr.indexOf(elemEndTag) == -1) {
      // as before: without any end tag nothing is changed
      return inputStr;
    }
    StringBuilder outputStr = new StringBuilder();
    String inputStrTmp = inputStr;
    int begin = inputStrTmp.indexOf(elemBeginTag);
    while (begin != -1) {
      // search the closing tag relative to the begin tag
      String fromBeginTag = inputStrTmp.substring(begin);
      int relativeEnd = getIndexClosedTag(0, elementName, fromBeginTag);
      String origStr;
      if (relativeEnd == -1) {
        // no end tag could be found: everything behind the begin tag is content
        origStr = fromBeginTag.substring(elemBeginTag.length());
      } else {
        origStr = fromBeginTag.substring(elemBeginTag.length(), relativeEnd);
      }
      origStr = origStr.replace(elemBeginTag, "");
      origStr = origStr.replace(elemEndTag, "");
      outputStr.append(inputStrTmp.substring(0, begin));
      outputStr.append(elemBeginTag);
      outputStr.append(origStr);
      outputStr.append(elemEndTag);
      if (relativeEnd == -1) {
        // the whole rest has been consumed
        inputStrTmp = "";
        break;
      }
      inputStrTmp = fromBeginTag.substring(relativeEnd + elemEndTag.length());
      begin = inputStrTmp.indexOf(elemBeginTag);
    }
    outputStr.append(inputStrTmp);
    return outputStr.toString();
  }
  
  /**
   * Searches the end tag which closes the element beginning at position begin, taking into
   * account elements of the same name which are nested inside (one level): the first end tag
   * is delivered for which the text since the begin tag (resp. since the previous end tag)
   * contains no further begin tag.
   *
   * The search starts behind the begin tag. Before, it started at the beginning of the
   * input, so an end tag in front of position begin caused a StringIndexOutOfBoundsException.
   *
   * @param begin index of the begin tag in inputStr
   * @param elementName name of the element
   * @param inputStr string which is searched
   * @return index of the closing end tag in inputStr or -1 if none could be found
   */
  private int getIndexClosedTag(int begin, String elementName, String inputStr) {
    String elemBeginTag = "<" + elementName + ">";
    String elemEndTag = "</" + elementName + ">";
    // start of the segment which is checked for a further begin tag
    int segmentStart = begin + elemBeginTag.length();
    int indexEndTag = inputStr.indexOf(elemEndTag, segmentStart);
    while (indexEndTag != -1) {
      String segment = inputStr.substring(segmentStart, indexEndTag);
      if (segment.indexOf(elemBeginTag) == -1) {
        return indexEndTag;
      }
      // this end tag belongs to a nested element: continue behind it
      segmentStart = indexEndTag + elemEndTag.length();
      indexEndTag = inputStr.indexOf(elemEndTag, segmentStart);
    }
    return -1;
  }
  
  /**
   * Reads all records of a lexicon database into memory.
   * Records with an empty key are skipped.
   *
   * @param lexiconName name of the database
   * @return map from key (decoded as UTF-8) to a new DatabaseEntry built from the value bytes
   * @throws ApplicationException if the database access fails
   */
  private HashMap<String, DatabaseEntry> getWholeLexiconHashMap(String lexiconName) throws ApplicationException {
    HashMap<String, DatabaseEntry> lexHashMap = new HashMap<String, DatabaseEntry>();
    try {
      dbEnvLexica.openDatabase(lexiconName);
      Database lexDB = dbEnvLexica.getLexiconDB(lexiconName);
      Cursor cursor = lexDB.openCursor(null, null);
      try {
        DatabaseEntry dbEntryKey = new DatabaseEntry();
        DatabaseEntry dbEntryValue = new DatabaseEntry();
        OperationStatus operationStatus = cursor.getFirst(dbEntryKey, dbEntryValue, LockMode.DEFAULT);
        while (operationStatus == OperationStatus.SUCCESS) {
          if (dbEntryKey.getSize() > 0) {
            String dbEntryKeyStr = new String(dbEntryKey.getData(), "utf-8");
            // dbEntryValue is reused by the cursor, so the value gets its own entry object
            DatabaseEntry newDbEntryValue = new DatabaseEntry(dbEntryValue.getData());
            lexHashMap.put(dbEntryKeyStr, newDbEntryValue);
          }
          operationStatus = cursor.getNext(dbEntryKey, dbEntryValue, LockMode.DEFAULT);
        }
      } finally {
        // release the cursor also when reading fails (it was left open in that case before)
        cursor.close();
      }
    } catch (DatabaseException e) {
      throw new ApplicationException(e);
    } catch (UnsupportedEncodingException e) {
      throw new ApplicationException(e);
    }
    return lexHashMap;
  }
  
  /**
   * Parses the entry and tries to repair it; parse and repair are done in three passes.
   * If the entry is still not XML valid afterwards, its content is set back to the
   * content it had when this method was called.
   *
   * @param lexEntry entry which is parsed and repaired (the object itself is modified)
   * @return the entry after the repair passes
   * @throws ApplicationException if parsing fails with an I/O problem
   */
  private LexiconEntry xmlParseAndRepair(LexiconEntry lexEntry) throws ApplicationException {
    final int repairPasses = 3;
    String initialContent = lexEntry.getContent();
    // the entry works on a copy of the content
    lexEntry.setContent(new String(initialContent));
    LexiconEntry current = lexEntry;
    for (int pass = 0; pass < repairPasses; pass++) {
      current = xmParseAndRepairLocal(current);
    }
    if (current.isXmlValid()) {
      return current;
    }
    // not repairable: deliver the original (not XML valid) content
    current.setContent(initialContent);
    return current;
  }

  /**
   * One repair pass: parses the entry if it is not marked as XML valid. If it is still not valid
   * and the validation code is "elementNotClosed", all start tags with attributes and all end tags
   * of the failing element are removed from the content and the entry is marked as made valid.
   *
   * @param lexEntry entry which is parsed and repaired (the object itself is modified)
   * @return the entry after this pass
   * @throws ApplicationException if parsing fails with an I/O problem
   */
  private LexiconEntry xmParseAndRepairLocal(LexiconEntry lexEntry) throws ApplicationException {
    LexiconEntry entry = lexEntry;
    if (! entry.isXmlValid()) {
      entry = xmlParse(entry);
    }
    if (entry.isXmlValid()) {
      return entry;
    }
    String validationCode = entry.getValidationCode();
    if (validationCode == null || ! validationCode.equals("elementNotClosed")) {
      return entry;
    }
    String failedElement = entry.getValidationFailElementName();
    String startTagRegExpr = "<" + failedElement + " .*?>";
    String endTagRegExpr = "</" + failedElement + ">";
    String repairedContent = entry.getContent().replaceAll(startTagRegExpr, "").replaceAll(endTagRegExpr, "");
    entry.setContent(repairedContent);
    entry.setXmlMadeValid(true);
    return entry;
  }
  
  /**
   * Parses the content of the entry (wrapped into a "content" element) with a SAX parser and
   * sets the XML valid flag of the entry. If parsing fails with the message that an element
   * is not terminated by its end tag, the validation code "elementNotClosed" and the name of
   * this element (taken from between the first pair of double quotes of the message) are set.
   *
   * A SAXException without message is now handled as a plain "not valid" result; before it
   * caused a NullPointerException.
   *
   * @param lexEntry entry which is parsed (the object itself is modified)
   * @return the same entry with validation information
   * @throws ApplicationException if reading the content fails with an IOException
   */
  private LexiconEntry xmlParse(LexiconEntry lexEntry) throws ApplicationException {
    String lexEntryContent = "<content>" + lexEntry.getContent() + "</content>";
    LexEntryContentHandler lexEntryContentHandler = new LexEntryContentHandler();
    XMLReader xmlParser = new SAXParser();
    xmlParser.setContentHandler(lexEntryContentHandler);
    LexEntryErrorHandler lexEntryErrorHandler = new LexEntryErrorHandler();
    xmlParser.setErrorHandler(lexEntryErrorHandler);
    try {
      Reader reader = new StringReader(lexEntryContent);
      InputSource input = new InputSource(reader);
      xmlParser.parse(input);
      lexEntry.setXmlValid(true);
    } catch (SAXException e) {
      lexEntry.setXmlValid(false);
      String exceptionMessage = e.getMessage();
      // the kind of error is recognized by the message text of the parser
      if (exceptionMessage != null && exceptionMessage.matches("The element type .* must be terminated by the matching end-tag .*")) {
        int begin = exceptionMessage.indexOf("\"");
        if (begin != -1) {
          int end = exceptionMessage.indexOf("\"", begin + 1);
          if (end != -1) {
            String elementName = exceptionMessage.substring(begin + 1, end);
            lexEntry.setValidationCode("elementNotClosed");
            lexEntry.setValidationFailElementName(elementName);
          }
        }
      }
    } catch (IOException e) {
      throw new ApplicationException(e);
    }
    return lexEntry;
  }

  /**
   * Exports each local lexicon to the file "<lexiconName>.xml" in DATA_FILES_DIR_LEXICA.
   * The file contains name and description of the lexicon and for each database record
   * an "entry" element with the key as "form" element followed by the stored value bytes.
   * Name, description and keys are written as they are (without XML escaping).
   *
   * Each output file is now closed as soon as its lexicon is written (also in case of an error).
   * Before, only the stream of the last lexicon was closed.
   *
   * @throws ApplicationException if a lexicon could not be read or a file could not be written
   */
  private void writeLexiconsToFiles() throws ApplicationException {
    ArrayList<Lexicon> lexicons = Lexica.getInstance().getLocalLexicons();
    for (int i=0; i<lexicons.size(); i++) {
      Lexicon lexicon = lexicons.get(i);
      String lexiconName = lexicon.getName();
      HashMap<String, DatabaseEntry> lexHashMap = getWholeLexiconHashMap(lexiconName);
      File outputFile = new File(DATA_FILES_DIR_LEXICA + "/" + lexiconName + ".xml");
      BufferedOutputStream out = null;
      try {
        out = new BufferedOutputStream(new FileOutputStream(outputFile));
        write("<lexicon>\n", out);
        write("<name>" + lexiconName + "</name>\n", out);
        write("<description>" + lexicon.getDescription() + "</description>\n", out);
        write("<entries>\n", out);
        for (String lexKeyStr : lexHashMap.keySet()) {
          write("<entry>\n", out);
          write("<form>" + lexKeyStr + "</form>\n", out);
          DatabaseEntry lexValue = lexHashMap.get(lexKeyStr);
          byte[] lexValueBytes = lexValue.getData();
          write(lexValueBytes, out);
          write("</entry>\n", out);
        }
        write("</entries>\n", out);
        write("</lexicon>\n", out);
      } catch (FileNotFoundException e) {
        throw new ApplicationException(e);
      } finally {
        // always close the stream of this lexicon; a failure of close is ignored as before
        // (every write is flushed immediately, so no data is pending here)
        if (out != null) {
          try {
            out.close();
          } catch (Exception ignored) {
            // best effort
          }
        }
      }
    }
  }
  
  /**
   * Writes all given bytes to the stream and flushes the stream.
   *
   * @param inputBytes bytes which are written
   * @param out target stream
   * @throws ApplicationException if writing or flushing fails
   */
  private void write(byte[] inputBytes, BufferedOutputStream out) throws ApplicationException {
    try {
      out.write(inputBytes);
      out.flush();
    } catch (IOException ioException) {
      throw new ApplicationException(ioException);
    }
  }

  /**
   * Writes the given string encoded as UTF-8 to the stream and flushes the stream.
   *
   * @param outStr string which is written
   * @param out target stream
   * @throws ApplicationException if encoding, writing or flushing fails
   */
  private void write(String outStr, BufferedOutputStream out) throws ApplicationException {
    byte[] encoded;
    try {
      encoded = outStr.getBytes("utf-8");
    } catch (IOException ioException) {
      throw new ApplicationException(ioException);
    }
    // writing and flushing is done by the byte array variant
    write(encoded, out);
  }
  
  /**
   * Remembers the current time as begin of the operation.
   */
  private void beginOperation() {
    Date now = new Date();
    this.beginOfOperation = now;
  }

  /**
   * Remembers the current time as end of the operation.
   */
  private void endOperation() {
    Date now = new Date();
    this.endOfOperation = now;
  }

}