Mercurial > hg > mpdl-group

diff software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/lex/db/DBLexWriter.java @ 0:408254cf2f1d
Erstellung
author: Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date: Wed, 24 Nov 2010 17:24:23 +0100
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/lex/db/DBLexWriter.java	Wed Nov 24 17:24:23 2010 +0100
@@ -0,0 +1,630 @@
+package de.mpg.mpiwg.berlin.mpdl.lt.lex.db;
+
+import java.io.BufferedOutputStream;
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.io.UnsupportedEncodingException;
+import java.util.ArrayList;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+import org.xml.sax.XMLReader;
+
+import com.sleepycat.je.Cursor;
+import com.sleepycat.je.Database;
+import com.sleepycat.je.DatabaseEntry;
+import com.sleepycat.je.DatabaseException;
+import com.sleepycat.je.LockMode;
+import com.sleepycat.je.OperationStatus;
+import com.sleepycat.je.util.DbLoad;
+import com.sun.org.apache.xerces.internal.parsers.SAXParser;
+
+import de.mpg.mpiwg.berlin.mpdl.util.StringUtilEscapeChars;
+import de.mpg.mpiwg.berlin.mpdl.util.Util;
+import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
+import de.mpg.mpiwg.berlin.mpdl.general.MpdlConstants;
+import de.mpg.mpiwg.berlin.mpdl.lt.general.Transcoder;
+import de.mpg.mpiwg.berlin.mpdl.lt.lex.app.Lexica;
+import de.mpg.mpiwg.berlin.mpdl.lt.lex.app.Lexicon;
+import de.mpg.mpiwg.berlin.mpdl.lt.lex.app.LexiconEntry;
+
+public class DBLexWriter {
+  private static DBLexWriter instance;
+  private static String MPDL_DATA_DIR = MpdlConstants.MPDL_DATA_DIR;
+  private static String DATA_FILES_DIR_LEXICA = MPDL_DATA_DIR + "/dataFiles/pollux";
+  private static String DB_DIR_LEXICA = MPDL_DATA_DIR + "/dataBerkeleyDB/pollux";
+  private DbEnvLex dbEnvLexica;
+  private Date beginOfOperation;
+  private Date endOfOperation;
+  
+  /**
+   * Returns the shared writer and creates it on the first call.
+   * No synchronization is performed.
+   *
+   * @return the single DBLexWriter instance
+   */
+  public static DBLexWriter getInstance() throws ApplicationException {
+    if (instance != null) {
+      return instance;
+    }
+    instance = new DBLexWriter();
+    return instance;
+  }
+
+  /**
+   * Command line entry point: opens the lexicon environment read-write, writes all
+   * lexicons to XML files, closes the environment and prints the elapsed time.
+   *
+   * @param args not evaluated
+   */
+  public static void main(String[] args) throws ApplicationException {
+    DBLexWriter writer = getInstance();
+    writer.beginOperation();
+    System.out.print("Start ...");
+    writer.initReadWrite();
+    // Other operations that can be run here instead (currently disabled):
+    //   writer.initReadOnly();
+    //   writer.readSampleData();
+    //   writer.testTranscoder();
+    //   writer.printSizeOfAllLexicons();
+    //   writer.loadPolluxDbDumpsToDb();
+    //   writer.copyAndRepairAndTranscodeDumps();
+    writer.writeLexiconsToFiles();
+    writer.end();
+    writer.endOperation();
+    Util util = new Util();
+    Double elapsedTime = util.getSecondWithMillisecondsBetween(writer.beginOfOperation, writer.endOfOperation);
+    System.out.println("End.");
+    System.out.println("Needed time: " + elapsedTime + " seconds");
+  }
+
+  /** Creates the environment wrapper for DB_DIR_LEXICA and initializes it read-write. */
+  private void initReadWrite() throws ApplicationException {
+    this.dbEnvLexica = new DbEnvLex();
+    this.dbEnvLexica.setDataDir(DB_DIR_LEXICA);
+    this.dbEnvLexica.initReadWrite();
+  }
+  
+  /**
+   * Creates the environment wrapper for DB_DIR_LEXICA, initializes it read-only and
+   * opens one database per configured lexicon.
+   */
+  private void initReadOnly() throws ApplicationException {
+    this.dbEnvLexica = new DbEnvLex();
+    this.dbEnvLexica.setDataDir(DB_DIR_LEXICA);
+    this.dbEnvLexica.initReadOnly();
+    for (Lexicon lexicon : Lexica.getInstance().getLexicons()) {
+      dbEnvLexica.openDatabase(lexicon.getName());
+    }
+  }
+  
+  /** Loads the dump file of every configured lexicon into the database environment. */
+  private void loadPolluxDbDumpsToDb() throws ApplicationException {
+    for (Lexicon lexicon : Lexica.getInstance().getLexicons()) {
+      loadDbDumpToDb(lexicon.getName());
+    }
+  }
+  
+  /**
+   * Loads the dump file "&lt;lexiconName&gt;.dump" from DATA_FILES_DIR_LEXICA into the
+   * database "&lt;lexiconName&gt;Dump.db" of the current environment.
+   *
+   * @param lexiconName name of the lexicon whose dump is loaded
+   * @throws ApplicationException if the dump file cannot be read or the load fails
+   */
+  private void loadDbDumpToDb(String lexiconName) throws ApplicationException {
+    String dumpFileName = DATA_FILES_DIR_LEXICA + "/" + lexiconName + ".dump";
+    String dbName = lexiconName + "Dump.db";
+    BufferedReader bufferedReader = null;
+    try {
+      bufferedReader = new BufferedReader(new FileReader(dumpFileName));
+      DbLoad loader = new DbLoad();
+      loader.setEnv(dbEnvLexica.getEnv());
+      loader.setDbName(dbName);
+      loader.setInputReader(bufferedReader);
+      loader.setIgnoreUnknownConfig(true);
+      loader.load();
+    } catch (IOException e) {
+      // also covers FileNotFoundException, which is a subclass of IOException
+      throw new ApplicationException(e);
+    } catch (DatabaseException e) {
+      throw new ApplicationException(e);
+    } finally {
+      // close the reader even if the load failed, so the file handle is not leaked
+      if (bufferedReader != null) {
+        try {
+          bufferedReader.close();
+        } catch (IOException ignored) {
+          // nothing useful can be done if closing a reader fails
+        }
+      }
+    }
+  }
+  
+  /**
+   * Manual smoke test: looks up one form in four lexicons, prints what was found and
+   * then lists sample keys of two lexicons.
+   */
+  private void readSampleData() throws ApplicationException {
+    try {
+      // the returned names are not evaluated
+      dbEnvLexica.getEnv().getDatabaseNames();
+      // all lookups are done before anything is printed
+      String autenriethEntry = readEntry("autenrieth", "au)to/s");
+      String lewisShortEntry = readEntry("ls", "laudabilis");
+      String lsjEntry = readEntry("lsjUnicode", "ἄδρεπτος");
+      String salmoneEntry = readEntry("salmoneUnicode", "ءرش");
+      System.out.println("Autenrieth: autos: " + autenriethEntry);
+      System.out.println("Lewis & Short: Laudabilis: " + lewisShortEntry);
+      System.out.println("LSJ: ἄδρεπτος: " + lsjEntry);
+      System.out.println("Salmone: طب: " + salmoneEntry);
+      printSampleEntries("salmoneUnicode", 10);
+      printSampleEntries("lsjUnicode", 1000);
+    } catch (DatabaseException e) {
+      throw new ApplicationException(e);
+    }
+  }
+  
+  /**
+   * Closes, for every configured lexicon, the database of that name and its
+   * "&lt;name&gt;Dump" database, and finally the environment itself.
+   */
+  private void end() throws ApplicationException {
+    for (Lexicon lexicon : Lexica.getInstance().getLexicons()) {
+      String name = lexicon.getName();
+      dbEnvLexica.closeDatabase(name);
+      dbEnvLexica.closeDatabase(name + "Dump");
+    }
+    dbEnvLexica.close();
+  }
+
+  /**
+   * Looks up the entry stored under the given form in the given lexicon database.
+   *
+   * @param lexiconName name of the lexicon database
+   * @param formName key of the entry (encoded as UTF-8 for the lookup)
+   * @return the stored value decoded as UTF-8, or null if the key was not found
+   * @throws ApplicationException if the database access or the decoding fails
+   */
+  private String readEntry(String lexiconName, String formName) throws ApplicationException {
+    String retString = null;
+    try {
+      DatabaseEntry dbEntryKey = new DatabaseEntry(formName.getBytes("utf-8"));
+      Database lexDB = dbEnvLexica.getLexiconDB(lexiconName);
+      Cursor cursor = lexDB.openCursor(null, null);
+      try {
+        DatabaseEntry foundValue = new DatabaseEntry();
+        OperationStatus operationStatus = cursor.getSearchKey(dbEntryKey, foundValue, LockMode.DEFAULT);
+        if (operationStatus == OperationStatus.SUCCESS) {
+          retString = new String(foundValue.getData(), "utf-8");
+        }
+      } finally {
+        // release the cursor even if the lookup or the decoding failed
+        cursor.close();
+      }
+    } catch (DatabaseException e) {
+      throw new ApplicationException(e);
+    } catch (UnsupportedEncodingException e) {
+      throw new ApplicationException(e);
+    }
+    return retString;
+  }
+  
+  private void printSizeOfAllLexiconsTemp() throws ApplicationException {
+    String lexiconName = "lsj";
+    int[] sizes = getSizes(lexiconName);
+    System.out.println(lexiconName + ": " + sizes[0] + " records (" + sizes[1] + " of them are not xml valid)");
+  }
+  
+  /** Prints, for every configured lexicon, the record count and the count of not XML valid records. */
+  private void printSizeOfAllLexicons() throws ApplicationException {
+    for (Lexicon lexicon : Lexica.getInstance().getLexicons()) {
+      String name = lexicon.getName();
+      int[] counts = getSizes(name);
+      int total = counts[0];
+      int notValid = counts[1];
+      System.out.println(name + ": " + total + " records (" + notValid + " of them are not xml valid)");
+    }
+  }
+  
+  private int[] getSizes(String lexiconName) throws ApplicationException {
+    int size = 0;
+    int sizeXmlNotValidEntries = 0;
+    try {
+      dbEnvLexica.openDatabase(lexiconName);
+      Database lexDB = dbEnvLexica.getLexiconDB(lexiconName);
+      Cursor cursor = lexDB.openCursor(null, null);
+      DatabaseEntry dbEntryKey = new DatabaseEntry();
+      DatabaseEntry dbEntryValue = new DatabaseEntry();
+      OperationStatus operationStatus = cursor.getFirst(dbEntryKey, dbEntryValue, LockMode.DEFAULT);
+      while (operationStatus == OperationStatus.SUCCESS) {
+        operationStatus = cursor.getNext(dbEntryKey, dbEntryValue, LockMode.DEFAULT);
+        byte[] dbEntryKeyBytes = dbEntryKey.getData();
+        String dbEntryKeyStr = new String(dbEntryKeyBytes, "utf-8");
+        byte[] dbEntryValueBytes = dbEntryValue.getData();
+        String dbEntryValueStr = new String(dbEntryValueBytes, "utf-8");
+        int begin = dbEntryValueStr.indexOf("<repaired-entry>");
+        int end = dbEntryValueStr.indexOf("</repaired-entry>");
+        dbEntryValueStr = dbEntryValueStr.substring(begin, end) + "</repaired-entry>";
+        LexiconEntry dbLexEntry = new LexiconEntry(lexiconName, dbEntryKeyStr, dbEntryValueStr);
+        LexiconEntry xmlLexiconEntry = xmlParse(dbLexEntry);
+        if (! xmlLexiconEntry.isXmlValid()) {
+          sizeXmlNotValidEntries ++;
+        }
+        size++;
+      }
+      cursor.close();
+    } catch (DatabaseException e) {
+      throw new ApplicationException(e);
+    } catch (UnsupportedEncodingException e) {
+      throw new ApplicationException(e);
+    }
+    int[] sizes = new int[2];
+    sizes[0] = size;
+    sizes[1] = sizeXmlNotValidEntries;
+    return sizes;
+  }
+  
+  private void copyAndRepairAndTranscodeDumps() throws ApplicationException { /*
+     * Fills the database of every configured lexicon from its "<lexiconName>Dump" database:
+     * each dump value is repaired as text, parsed/repaired as XML, transcoded to Unicode
+     * when the lexicon is a Betacode or Buckwalter lexicon, and stored under the
+     * (possibly transcoded) key together with the original value and an xml-valid flag.
+     * DatabaseException and UnsupportedEncodingException are wrapped in ApplicationException.
+     */
+    try {
+      ArrayList<Lexicon> lexicons = Lexica.getInstance().getLexicons();
+      for (int i=0; i<lexicons.size(); i++) {
+        Lexicon lexicon = lexicons.get(i);
+        String lexiconName = lexicon.getName();
+        HashMap<String, DatabaseEntry> lexDumpHashMap = getWholeLexiconHashMap(lexiconName + "Dump"); // all dump entries are held in memory
+        dbEnvLexica.openDatabase(lexiconName);
+        Database lexDB = dbEnvLexica.getLexiconDB(lexiconName); // target database
+        Iterator<String> lexDumpIter = lexDumpHashMap.keySet().iterator();
+        while (lexDumpIter.hasNext()) {
+          String lexDumpKeyStr = lexDumpIter.next();
+          DatabaseEntry lexDumpValue = lexDumpHashMap.get(lexDumpKeyStr);
+          byte[] lexDumpValueBytes = lexDumpValue.getData();
+          String lexDumpValueStr = new String(lexDumpValueBytes, "utf-8"); // untouched original, stored as <original-entry>
+          String newLexValueStr = new String(lexDumpValueBytes, "utf-8"); // working copy that gets repaired
+          /* repair lsj: turn <br> and <p> into empty elements, flatten nested <G>, quote
+           * lang=greek, and drop <sense ...> / <ref ...> start tags or end tags whose
+           * counterpart is missing.
+           * NOTE(review): String.matches() is used without DOTALL, so '.' does not match line
+           * terminators; for a value that contains line breaks the "Contained" flags below are
+           * always false. Confirm that the dump values are single-line. */
+          if (lexiconName.equals("lsj")) {
+            newLexValueStr = newLexValueStr.replaceAll("<br>", "<br/>");
+            newLexValueStr = newLexValueStr.replaceAll("<p>", "<p/>");
+            String elementNameGreek = "G";
+            newLexValueStr = deleteNestedTags(elementNameGreek, newLexValueStr); // delete tags <G> and </G> inside <G>
+            newLexValueStr = newLexValueStr.replaceAll("lang=greek", "lang=\"greek\"");
+            boolean senseContained = newLexValueStr.matches(".*<sense.*>.*");
+            boolean endSenseContained = newLexValueStr.matches(".*</sense>.*");
+            if (senseContained && ! endSenseContained)
+              newLexValueStr = newLexValueStr.replaceAll("<sense .*?>", ""); // only start tags followed by a space are removed
+            else if (!senseContained && endSenseContained)
+              newLexValueStr = newLexValueStr.replaceAll("</sense>", ""); 
+            boolean refContained = newLexValueStr.matches(".*<ref.*>.*");
+            boolean endRefContained = newLexValueStr.matches(".*</ref>.*");
+            if (refContained && ! endRefContained)
+              newLexValueStr = newLexValueStr.replaceAll("<ref .*?>", ""); // only start tags followed by a space are removed
+            else if (!refContained && endRefContained)
+              newLexValueStr = newLexValueStr.replaceAll("</ref>", ""); 
+            /*
+            boolean itypeContained = newLexValueStr.matches(".*<itype.*>.*");
+            boolean endItypeContained = newLexValueStr.matches(".*</itype>.*");
+            if (itypeContained && ! endItypeContained)
+              newLexValueStr = newLexValueStr.replaceAll("<itype .*?>", ""); 
+            else if (!itypeContained && endItypeContained)
+              newLexValueStr = newLexValueStr.replaceAll("</itype>", "");
+            */ 
+          }
+          // repair cooper: drop <PB> and turn <p> into an empty element
+          if (lexiconName.equals("cooper")) {
+            newLexValueStr = newLexValueStr.replaceAll("<PB>", "");   // TODO hack
+            newLexValueStr = newLexValueStr.replaceAll("<p>", "<p/>");   // TODO hack
+          }
+          // repair baretti: turn <li> into an empty element
+          if (lexiconName.equals("baretti")) {
+            newLexValueStr = newLexValueStr.replaceAll("<li>", "<li/>");   // TODO hack
+          }
+          // repair for all lexicons: quote two unquoted attribute values, drop <*>, normalize <p />
+          newLexValueStr = newLexValueStr.replaceAll("type=style", "type=\"style\"");
+          newLexValueStr = newLexValueStr.replaceAll("type=dom", "type=\"dom\"");
+          newLexValueStr = newLexValueStr.replaceAll("<\\*>", ""); 
+          newLexValueStr = newLexValueStr.replaceAll("<p />", "<p/>");
+          LexiconEntry newLexEntryTemp = new LexiconEntry(lexiconName, lexDumpKeyStr, newLexValueStr);  // the key is not transcoded yet; the parse/repair helpers of this class only read and write the entry's content and validation state
+          LexiconEntry newLexEntry = xmlParseAndRepair(newLexEntryTemp);
+          String xmlValidString = "<xml-valid>true</xml-valid>";
+          if (! newLexEntry.isXmlValid()) {
+            xmlValidString = "<xml-valid>false</xml-valid>";
+          }
+          newLexValueStr = newLexEntry.getContent(); // xmlParseAndRepair restores the unrepaired content when the entry is still not valid
+          // transcode the Betacode lexicon entries to Unicode (key and value)
+          if (lexicon.isBetacodeLexicon()) {
+            Transcoder transcoder = Transcoder.getInstance();
+            lexDumpKeyStr = transcoder.transcodeFromBetaCode2Unicode(lexDumpKeyStr);
+            String elementName = "G";
+            if (newLexEntry.isXmlValid()) { // values that are not XML valid are stored untranscoded
+              newLexValueStr = transcodeByElementName("fromBetacode2Unicode", elementName, newLexValueStr);
+            }
+          }
+          // transcode the Buckwalter entries to Unicode (key and value)
+          if (lexicon.isBuckwalterLexicon()) {
+            Transcoder transcoder = Transcoder.getInstance();
+            lexDumpKeyStr = transcoder.transcodeFromBuckwalter2Unicode(lexDumpKeyStr);
+            String elementName = "AR";
+            if (newLexEntry.isXmlValid()) { // values that are not XML valid are stored untranscoded
+              newLexValueStr = transcodeByElementName("fromBuckwalter2Unicode", elementName, newLexValueStr);
+            }
+          }
+          /* put the entry into the database; the stored value is
+           * <content> xml-valid flag, <original-entry>, <repaired-entry> </content>.
+           * NOTE(review): the original entry is embedded without any XML escaping. */
+          newLexValueStr = "<content>" + xmlValidString + "<original-entry>" + lexDumpValueStr + "</original-entry>" + "<repaired-entry>" + newLexValueStr + "</repaired-entry>" + "</content>";
+          DatabaseEntry newLexDumpKey = new DatabaseEntry(lexDumpKeyStr.getBytes("utf-8"));
+          DatabaseEntry newLexValue = new DatabaseEntry(newLexValueStr.getBytes("utf-8"));
+          lexDB.put(null, newLexDumpKey, newLexValue);
+        }
+      }
+    } catch (DatabaseException e) {
+      throw new ApplicationException(e);
+    } catch (UnsupportedEncodingException e) {
+      throw new ApplicationException(e);
+    }
+  }
+  
+  /**
+   * Prints key and value size of the first records of a lexicon database.
+   * Records with an empty key are counted but not printed.
+   *
+   * @param lexiconName name of the lexicon database (opened by this method)
+   * @param count maximum number of records to visit
+   * @throws ApplicationException if the database access or the decoding fails
+   */
+  private void printSampleEntries(String lexiconName, int count) throws ApplicationException {
+    try {
+      int counter = 0;
+      dbEnvLexica.openDatabase(lexiconName);
+      Database lexDB = dbEnvLexica.getLexiconDB(lexiconName);
+      Cursor cursor = lexDB.openCursor(null, null);
+      try {
+        DatabaseEntry dbEntryKey = new DatabaseEntry();
+        DatabaseEntry dbEntryValue = new DatabaseEntry();
+        OperationStatus operationStatus = cursor.getFirst(dbEntryKey, dbEntryValue, LockMode.DEFAULT);
+        while (operationStatus == OperationStatus.SUCCESS && counter < count) {
+          if (dbEntryKey.getSize() > 0) {
+            String dbEntryKeyStr = new String(dbEntryKey.getData(), "utf-8");
+            System.out.println(lexiconName + ": key: " + dbEntryKeyStr + " value size: " +  dbEntryValue.getSize());
+          }
+          operationStatus = cursor.getNext(dbEntryKey, dbEntryValue, LockMode.DEFAULT);
+          counter++;
+        }
+      } finally {
+        // release the cursor even if reading or decoding failed
+        cursor.close();
+      }
+    } catch (DatabaseException e) {
+      throw new ApplicationException(e);
+    } catch (UnsupportedEncodingException e) {
+      throw new ApplicationException(e);
+    }
+  }
+
+  /**
+   * Developer scratch pad: runs the transcoder, two regular expressions and
+   * transcodeByElementName on fixed sample strings. No result is printed or checked;
+   * the method is meant to be stepped through in a debugger.
+   */
+  private void testTranscoder() throws ApplicationException {
+    // Betacode samples (testStr3 is kept for manual experiments and is not used below)
+    String testStr = "<G>hfhf fdfd<G>ei)mi/</G> (<tr>sum</tr>), Aeol. <G>e)/mmi</G> hfhfh </G><author>Sapph.</author>2.15, <author>Theoc.</author>20.32; Cret. <G>h)mi/</G> <title>GDI</title> 4959a; <per>2</per><number>sg.</number> <G>ei)=</G>, Ep. and Ion. <cit><G>ei)s</G> <author>Od.</author>17.388</cit>, al., Aeol. <G>e)/ssi</G>, Ep. and Dor. <cit><G>e)ssi/</G> <author>Il.</author>1.176</cit>, <author>Pi.</author>";
+    String testStr2 = "aaaaa <G>1111a <G>2222a</G> <G>3333a</G> 1111a</G> aaaaa bbbbb <G>1111b <G>2222b</G> <G>3333b</G> 1111b</G> bbbbb ";
+    String testStr3 = "<G>e)pano/rqwsin e)/xein</G>, opp <G>a)ni/aton ei)=nai *hi</G>3. 1165 b18. --<G>e)panorqw/seis kai boh/qeiai *rb</G>5. 1383 a20.";
+    String testStr4 = "<G>suni^hmi</G> <author>Ar.</author><title>Av.</title>946 (s. v.l.), <author>Strato Com.</author>1.3: with variation of quantity, <G>plei=ston ou)=lon i(/ei <G>[i^]</G>, i)/oulon i(/ei [i_</G>] <title>Carm.Pop.</title> 1.]:&#x2014" +
+                         ";<br><tr>release, let go</tr>, <cit><G>h(=ka ..po/das kai\\ xei=re fe/resqai</G> <author>Od.</author>12.442</cit>; <G>h(=ke fe/resqai</G> <tr>let</tr> him float" + 
+                         "off, <author>Il.</author>21.120; <tr>let fall</tr>, <G>ka\\d de\\ ka/rhtos h(=ke ko/mas</G> <tr>made</tr> his locks <tr>flow</tr> down from his head, <author>Od.<" +
+                         "/author>6.231; [<cit><G>e)qei/ras] i(/ei lo/fon a)mfi/</G> .... ggg";
+    String testStr5 = "plei=ston ou)=lon i(/ei ";
+    String testStr6 = "*a as< as as: *)a *s ss ";
+    // Buckwalter samples
+    String arabTestStr1 = "^nutaf";
+    String arabTestStr2 = "min";
+    String arabTestStr3 = "Aal-Hiyal (^qAla ^&gt;arisTwTAlys) yataEaj~aba Aal-nAs minhA &lt;im~A fy Aal-&gt;a$yA' Aal~aty taEriDu TabEAF fa-mim~A lA yuElamu Eil~atuhu wa-&lt;im~A fy Aal-&gt;a$yA' Aal-muxAlifap li-l-TabE fa-mim~A yuEmalu bi-Aal-SinAEap li-manfaEap Aal-nAs li-&gt;an~a Aal-TabyEap tulzimu &gt;abadAF jihap wAHidap wa-&gt;am~A manAfiE Aal-nAs fa-&lt;in~ahA taxtalifu &lt;ixtilAfAF kavyrAF.";
+
+    Transcoder transcoder = Transcoder.getInstance();
+    String[] betacodeSamples = { testStr4, testStr5, testStr6 };
+    for (String betacodeSample : betacodeSamples) {
+      transcoder.transcodeFromBetaCode2Unicode(betacodeSample);
+    }
+    String[] buckwalterSamples = { arabTestStr1, arabTestStr2, arabTestStr3 };
+    for (String buckwalterSample : buckwalterSamples) {
+      transcoder.transcodeFromBuckwalter2Unicode(buckwalterSample);
+    }
+
+    // first experiment: flatten nested <G> elements with a single replaceAll
+    String flattenRegExpr = "(<G>.*?)<G>(.*)(</G>){1,}(.*?</G>)";
+    testStr2.replaceAll(flattenRegExpr, "$1$2$4");
+    // second experiment: walk over all matches of a nested <G> pattern
+    String nestedRegExpr = "(<G>.*?)<G>(.*?)</G>(.*?){1,}(.*?</G>)";
+    int flags = Pattern.CASE_INSENSITIVE | Pattern.MULTILINE; // both flags enabled
+    Matcher matcher = Pattern.compile(nestedRegExpr, flags).matcher(testStr2);
+    while (matcher.find()) {
+      matcher.group(); // the matched text; only of interest in a debugger
+    }
+
+    String[] elementSamples = { testStr, "bla", "" };
+    for (String elementSample : elementSamples) {
+      transcodeByElementName("fromBetacode2Unicode", "G", elementSample);
+    }
+  }
+  
+  private String transcodeByElementName(String transcodeDirection, String elementName, String inputStr) throws ApplicationException {
+    if (inputStr == null || elementName == null)
+      return null;
+    String elemBeginTag = "<" + elementName + ">";
+    String elemEndTag = "</" + elementName + ">";
+    Transcoder transcoder = Transcoder.getInstance();
+    String outputStr = "";
+    int begin = inputStr.indexOf(elemBeginTag);
+    int end = inputStr.indexOf(elemEndTag);
+    while (begin != -1 && end != -1 && begin < end) {
+      String before = inputStr.substring(0, begin);
+      String origStr = inputStr.substring(begin + elemBeginTag.length(), end);
+      origStr = StringUtilEscapeChars.deleteSpecialXmlEntities(origStr);
+      String transcodedStr = origStr;
+      if (transcodeDirection.equals("fromBetacode2Unicode"))
+        transcodedStr = transcoder.transcodeFromBetaCode2Unicode(origStr);
+      else if (transcodeDirection.equals("fromBuckwalter2Unicode"))
+        transcodedStr = transcoder.transcodeFromBuckwalter2Unicode(origStr);
+      outputStr = outputStr + before + new String(elemBeginTag);
+      outputStr = outputStr + transcodedStr;
+      outputStr = outputStr + new String(elemEndTag);
+      inputStr = inputStr.substring(end + elemEndTag.length());
+      begin = inputStr.indexOf(elemBeginTag);
+      end = inputStr.indexOf(elemEndTag);
+    }
+    outputStr = outputStr + inputStr;
+    return outputStr;
+  }
+  
+  /**
+   * Removes begin and end tags of the given element that occur inside another element of
+   * the same name, so that the elements are no longer nested. If an element is never
+   * closed, it is extended to the end of the input and closed there.
+   * The input is returned unchanged if it contains no end tag of the element at all.
+   *
+   * @param elementName name of the element, e.g. "G"
+   * @param inputStr text to repair
+   * @return the text without nested elements of that name
+   */
+  private String deleteNestedTags(String elementName, String inputStr) {
+    String elemBeginTag = "<" + elementName + ">";
+    String elemEndTag = "</" + elementName + ">";
+    if (inputStr.indexOf(elemEndTag) == -1) {
+      return inputStr;
+    }
+    StringBuilder outputStr = new StringBuilder();
+    String rest = inputStr;
+    int begin = rest.indexOf(elemBeginTag);
+    while (begin != -1) {
+      // search the closing tag only in the text from the begin tag on, so that a stray end
+      // tag in front of the begin tag cannot be picked up
+      String fromBegin = rest.substring(begin);
+      int end = getIndexClosedTag(0, elementName, fromBegin);
+      String origStr;
+      if (end == -1) { // no end tag could be found: the element spans the rest of the text
+        origStr = fromBegin.substring(elemBeginTag.length());
+      } else {
+        origStr = fromBegin.substring(elemBeginTag.length(), end);
+      }
+      // literal replacement; the tags are not meant to be regular expressions
+      origStr = origStr.replace(elemBeginTag, "").replace(elemEndTag, "");
+      outputStr.append(rest, 0, begin).append(elemBeginTag).append(origStr).append(elemEndTag);
+      if (end == -1) {
+        // everything has been consumed; continuing would emit the remainder a second time
+        rest = "";
+        break;
+      }
+      rest = fromBegin.substring(end + elemEndTag.length());
+      begin = rest.indexOf(elemBeginTag);
+    }
+    outputStr.append(rest);
+    return outputStr.toString();
+  }
+  
+  /**
+   * Finds the end tag that closes the element starting at the given position. An end tag
+   * is skipped when a further begin tag occurs between the previously examined position
+   * and that end tag.
+   *
+   * @param begin index of the begin tag in inputStr
+   * @param elementName name of the element
+   * @param inputStr text to search
+   * @return index of the closing end tag, or -1 if none is found
+   */
+  private int getIndexClosedTag(int begin, String elementName, String inputStr) {
+    String elemBeginTag = "<" + elementName + ">";
+    String elemEndTag = "</" + elementName + ">";
+    int beginTmp = begin;
+    // only end tags behind the begin tag can close it; starting the search at index 0
+    // made substring() fail when an end tag preceded the begin tag
+    int indexEndTag = inputStr.indexOf(elemEndTag, begin + elemBeginTag.length());
+    while (indexEndTag != -1) {
+      String betweenTmpStr = inputStr.substring(beginTmp + elemBeginTag.length(), indexEndTag);
+      if (betweenTmpStr.indexOf(elemBeginTag) == -1) {
+        return indexEndTag;
+      }
+      // a nested begin tag was found: this end tag belongs to it, continue behind it
+      beginTmp = indexEndTag;
+      indexEndTag = inputStr.indexOf(elemEndTag, indexEndTag + elemEndTag.length());
+    }
+    return -1;
+  }
+  
+  /**
+   * Reads all records of a lexicon database into memory. Records with an empty key are
+   * skipped.
+   *
+   * @param lexiconName name of the lexicon database (opened by this method)
+   * @return map from the key decoded as UTF-8 to a new DatabaseEntry holding the value bytes
+   * @throws ApplicationException if the database access or the decoding fails
+   */
+  private HashMap<String, DatabaseEntry> getWholeLexiconHashMap(String lexiconName) throws ApplicationException {
+    HashMap<String, DatabaseEntry> lexHashMap = new HashMap<String, DatabaseEntry>();
+    try {
+      dbEnvLexica.openDatabase(lexiconName);
+      Database lexDB = dbEnvLexica.getLexiconDB(lexiconName);
+      Cursor cursor = lexDB.openCursor(null, null);
+      try {
+        DatabaseEntry dbEntryKey = new DatabaseEntry();
+        DatabaseEntry dbEntryValue = new DatabaseEntry();
+        OperationStatus operationStatus = cursor.getFirst(dbEntryKey, dbEntryValue, LockMode.DEFAULT);
+        while (operationStatus == OperationStatus.SUCCESS) {
+          if (dbEntryKey.getSize() > 0) {
+            String dbEntryKeyStr = new String(dbEntryKey.getData(), "utf-8");
+            DatabaseEntry newDbEntryValue = new DatabaseEntry(dbEntryValue.getData());
+            lexHashMap.put(dbEntryKeyStr, newDbEntryValue);
+          }
+          operationStatus = cursor.getNext(dbEntryKey, dbEntryValue, LockMode.DEFAULT);
+        }
+      } finally {
+        // release the cursor even if reading or decoding failed
+        cursor.close();
+      }
+    } catch (DatabaseException e) {
+      throw new ApplicationException(e);
+    } catch (UnsupportedEncodingException e) {
+      throw new ApplicationException(e);
+    }
+    return lexHashMap;
+  }
+  
+  private LexiconEntry xmlParseAndRepair(LexiconEntry lexEntry) throws ApplicationException {
+    String origLexEntryContent = lexEntry.getContent();
+    String lexEntryContent = new String(origLexEntryContent);
+    lexEntry.setContent(lexEntryContent);
+    // parse and repair: try to repair it 3 times through parsing
+    LexiconEntry retLexiconEntry = xmParseAndRepairLocal(lexEntry);
+    retLexiconEntry = xmParseAndRepairLocal(retLexiconEntry);
+    retLexiconEntry = xmParseAndRepairLocal(retLexiconEntry);
+    // if it could not be repaired the original content (which is not XML valid) is delivered
+    if (! retLexiconEntry.isXmlValid())
+      retLexiconEntry.setContent(origLexEntryContent);
+    return retLexiconEntry;
+  }
+
+  /**
+   * One repair pass: parses the entry if it is not yet marked XML valid; if the parser
+   * reported an element that is not closed, all start tags of that element that carry
+   * attributes (name followed by a space) and all its end tags are removed from the
+   * content and the entry is flagged via setXmlMadeValid(true).
+   *
+   * @param lexEntry entry to parse and repair (modified in place)
+   * @return the same entry
+   */
+  private LexiconEntry xmParseAndRepairLocal(LexiconEntry lexEntry) throws ApplicationException {
+    if (! lexEntry.isXmlValid()) {
+      lexEntry = xmlParse(lexEntry);
+    }
+    if (! lexEntry.isXmlValid() && "elementNotClosed".equals(lexEntry.getValidationCode())) {
+      // quote the name: XML names may contain characters such as '.' that are regex operators
+      String quotedElementName = Pattern.quote(String.valueOf(lexEntry.getValidationFailElementName()));
+      String lexiconEntryContent = lexEntry.getContent();
+      lexiconEntryContent = lexiconEntryContent.replaceAll("<" + quotedElementName + " .*?>", "");
+      lexiconEntryContent = lexiconEntryContent.replaceAll("</" + quotedElementName + ">", "");
+      lexEntry.setContent(lexiconEntryContent);
+      lexEntry.setXmlMadeValid(true);
+    }
+    return lexEntry;
+  }
+  
+  /**
+   * Parses the entry content (wrapped in a "content" root element) with a SAX parser and
+   * records the outcome on the entry: xmlValid is set to true or false; if the parser
+   * message reports an element without matching end tag, the validation code
+   * "elementNotClosed" and the name of that element are stored as well.
+   *
+   * @param lexEntry entry to parse (modified in place)
+   * @return the same entry
+   * @throws ApplicationException if reading the content fails with an IOException
+   */
+  private LexiconEntry xmlParse(LexiconEntry lexEntry) throws ApplicationException {
+    String lexEntryContent = "<content>" + lexEntry.getContent() + "</content>";
+    LexEntryContentHandler lexEntryContentHandler = new LexEntryContentHandler();
+    XMLReader xmlParser = new SAXParser();
+    xmlParser.setContentHandler(lexEntryContentHandler);
+    LexEntryErrorHandler lexEntryErrorHandler = new LexEntryErrorHandler();
+    xmlParser.setErrorHandler(lexEntryErrorHandler);
+    try {
+      Reader reader = new StringReader(lexEntryContent);
+      InputSource input = new InputSource(reader);
+      xmlParser.parse(input);
+      lexEntry.setXmlValid(true);
+    } catch (SAXException e) {
+      // a parse error is not fatal: it is recorded on the entry
+      lexEntry.setXmlValid(false);
+      String exceptionMessage = e.getMessage();
+      // the message may be null; then no further details can be extracted
+      if (exceptionMessage != null && exceptionMessage.matches("The element type .* must be terminated by the matching end-tag .*")) {
+        // the element name is the text between the first pair of double quotes
+        int begin = exceptionMessage.indexOf('"');
+        if (begin != -1) {
+          int end = exceptionMessage.indexOf('"', begin + 1);
+          if (end != -1) {
+            String elementName = exceptionMessage.substring(begin + 1, end);
+            lexEntry.setValidationCode("elementNotClosed");
+            lexEntry.setValidationFailElementName(elementName);
+          }
+        }
+      }
+    } catch (IOException e) {
+      throw new ApplicationException(e);
+    }
+    return lexEntry;
+  }
+
+  /**
+   * Writes every configured lexicon to the file "&lt;lexiconName&gt;.xml" in
+   * DATA_FILES_DIR_LEXICA. Each file holds the lexicon name, its description and one
+   * "entry" element per database record (the key as "form", followed by the stored value
+   * bytes as they are).
+   *
+   * @throws ApplicationException if a database cannot be read or a file cannot be written
+   */
+  private void writeLexiconsToFiles() throws ApplicationException {
+    ArrayList<Lexicon> lexicons = Lexica.getInstance().getLexicons();
+    for (int i=0; i<lexicons.size(); i++) {
+      Lexicon lexicon = lexicons.get(i);
+      String lexiconName = lexicon.getName();
+      HashMap<String, DatabaseEntry> lexHashMap = getWholeLexiconHashMap(lexiconName);
+      File outputFile = new File(DATA_FILES_DIR_LEXICA + "/" + lexiconName + ".xml");
+      BufferedOutputStream out = null;
+      try {
+        out = new BufferedOutputStream(new FileOutputStream(outputFile));
+        write("<lexicon>\n", out);
+        write("<name>" + lexiconName + "</name>\n", out);
+        write("<description>" + lexicon.getDescription() + "</description>\n", out);
+        write("<entries>\n", out);
+        for (String lexKeyStr : lexHashMap.keySet()) {
+          write("<entry>\n", out);
+          write("<form>" + lexKeyStr + "</form>\n", out);
+          DatabaseEntry lexValue = lexHashMap.get(lexKeyStr);
+          write(lexValue.getData(), out);
+          write("</entry>\n", out);
+        }
+        write("</entries>\n", out);
+        write("</lexicon>\n", out);
+      } catch (FileNotFoundException e) {
+        throw new ApplicationException(e);
+      } finally {
+        // close the stream of this lexicon before the next file is opened, so that not
+        // only the last stream gets closed
+        if (out != null) {
+          try {
+            out.close();
+          } catch (IOException ignored) {
+            // best effort: every write() call has already flushed its data
+          }
+        }
+      }
+    }
+  }
+  
+  /**
+   * Writes all given bytes to the stream and flushes it.
+   *
+   * @param inputBytes bytes to write
+   * @param out target stream
+   * @throws ApplicationException if writing or flushing fails
+   */
+  private void write(byte[] inputBytes, BufferedOutputStream out) throws ApplicationException {
+    try {
+      out.write(inputBytes);
+      out.flush();
+    } catch (IOException ioException) {
+      throw new ApplicationException(ioException);
+    }
+  }
+
+  /**
+   * Writes the given text UTF-8 encoded to the stream and flushes it.
+   *
+   * @param outStr text to write
+   * @param out target stream
+   * @throws ApplicationException if encoding, writing or flushing fails
+   */
+  private void write(String outStr, BufferedOutputStream out) throws ApplicationException {
+    byte[] encoded;
+    try {
+      encoded = outStr.getBytes("utf-8");
+    } catch (IOException encodingException) {
+      // UnsupportedEncodingException is an IOException
+      throw new ApplicationException(encodingException);
+    }
+    // the byte[] overload writes the bytes and flushes the stream
+    write(encoded, out);
+  }
+  
+  /** Remembers the current time as the start of the operation. */
+  private void beginOperation() {
+    this.beginOfOperation = new Date();
+  }
+
+  /** Remembers the current time as the end of the operation. */
+  private void endOperation() {
+    this.endOfOperation = new Date();
+  }
+
+}
\ No newline at end of file
author	Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date	Wed, 24 Nov 2010 17:24:23 +0100
parents
children