view software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/db/DBMorphSupWriter.java @ 19:4a3641ae14d2

Erstellung
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Wed, 09 Nov 2011 15:32:05 +0100
parents
children
line wrap: on
line source

package de.mpg.mpiwg.berlin.mpdl.lt.morph.db;

import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;

import com.sleepycat.je.Cursor;
import com.sleepycat.je.Database;
import com.sleepycat.je.DatabaseEntry;
import com.sleepycat.je.DatabaseException;
import com.sleepycat.je.LockMode;
import com.sleepycat.je.OperationStatus;
import com.sleepycat.je.util.DbLoad;

import de.mpg.mpiwg.berlin.mpdl.util.Util;
import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
import de.mpg.mpiwg.berlin.mpdl.lt.general.Constants;
import de.mpg.mpiwg.berlin.mpdl.lt.text.transcode.Transcoder;

/**
 * Stand-alone maintenance tool for the "donatusAdditionalSup" morphology data, which is kept in a
 * Berkeley DB Java Edition ({@code com.sleepycat.je}) environment accessed through
 * {@link DbEnvMorphSup}.
 *
 * <p>The class can load textual database dumps into the environment, print the number of records
 * of each dump database and export the dump databases as XML files. Which of these steps is
 * executed is selected by (un)commenting the corresponding calls in {@link #main(String[])}.
 *
 * <p>The singleton accessor is not synchronized; the class is written for single-threaded
 * command-line use.
 */
public class DBMorphSupWriter {
  private static DBMorphSupWriter instance;
  private static final String DATA_DIR = Constants.getInstance().getDataDir();
  private static final String DATA_FILES_DIR_DONATUS_ADD_SUP = DATA_DIR + "/dataFiles/donatusAdditionalSup";
  private static final String DB_DIR_DONATUS_ADD_SUP = DATA_DIR + "/dataFiles/donatusAdditionalSup/db";
  // base names of the dumps; the part after "cache-" is used as the language code on export
  private static final String[] DONATUS_SUP_DUMPS = {"cache-la", "cache-el", "cache-it"};
  private DbEnvMorphSup dbEnvMorphSup;
  private Date beginOfOperation;
  private Date endOfOperation;
  
  /**
   * Returns the single instance of this class, creating it on first use.
   *
   * @return the shared instance
   * @throws ApplicationException declared for callers; not thrown by the visible code path
   */
  public static DBMorphSupWriter getInstance() throws ApplicationException {
    if (instance == null) {
      instance = new DBMorphSupWriter();
    }
    return instance;
  }

  /**
   * Command-line entry point: opens the database environment, runs the enabled maintenance
   * step(s), closes the environment and prints the elapsed time.
   *
   * @param args ignored
   * @throws ApplicationException if any database or file operation fails
   */
  public static void main(String[] args) throws ApplicationException {
    DBMorphSupWriter writer = getInstance();
    writer.beginOperation();
    System.out.print("Start ...");
    writer.initReadWrite();
    // Optional steps, enable as needed:
    // writer.loadDonatusSupDbDumpsToDb();
    writer.printSizeOfAllMorphSupDBs();
    // writer.writeDonatusSupsToFiles();
    writer.end();
    writer.endOperation();
    Double elapsedTime = new Util().getSecondWithMillisecondsBetween(writer.beginOfOperation, writer.endOfOperation);
    System.out.println("End.");
    System.out.println("Needed time: " + elapsedTime + " seconds");
  }

  /**
   * Creates the database environment wrapper on the supplement database directory and
   * initializes it for read/write access.
   */
  private void initReadWrite() throws ApplicationException {
    dbEnvMorphSup = new DbEnvMorphSup();
    dbEnvMorphSup.setDataDir(DB_DIR_DONATUS_ADD_SUP);
    dbEnvMorphSup.initReadWrite();
  }
  
  /**
   * Loads every dump listed in {@code DONATUS_SUP_DUMPS} into the database environment.
   */
  private void loadDonatusSupDbDumpsToDb() throws ApplicationException {
    for (int i = 0; i < DONATUS_SUP_DUMPS.length; i++) {
      loadDbDumpToDb(DONATUS_SUP_DUMPS[i]);
    }
  }
  
  /**
   * Loads the file {@code <name>.dump} from the supplement data directory into the database
   * named {@code <name>Dump.db} using the JE {@link DbLoad} utility.
   *
   * @param donatusSupName base name of the dump (e.g. {@code cache-la})
   * @throws ApplicationException if the dump file cannot be read or the load fails
   */
  private void loadDbDumpToDb(String donatusSupName) throws ApplicationException {
    String dumpFileName = DATA_FILES_DIR_DONATUS_ADD_SUP + "/" + donatusSupName + ".dump";
    String dbName = donatusSupName + "Dump.db";
    BufferedReader dumpReader = null;
    try {
      dumpReader = new BufferedReader(new FileReader(dumpFileName));
      DbLoad loader = new DbLoad();
      loader.setEnv(dbEnvMorphSup.getEnv());
      loader.setDbName(dbName);
      loader.setInputReader(dumpReader);
      loader.setIgnoreUnknownConfig(true);
      loader.load();
    } catch (IOException e) {
      // also covers FileNotFoundException for a missing dump file
      throw new ApplicationException(e);
    } catch (DatabaseException e) {
      throw new ApplicationException(e);
    } finally {
      // release the file handle even when the load fails
      if (dumpReader != null) {
        try {
          dumpReader.close();
        } catch (IOException ignored) {
          // a failure while closing a reader cannot lose data; nothing useful to report
        }
      }
    }
  }
  
  /**
   * Closes the databases belonging to every known dump name (both the plain name and the
   * name with the {@code Dump} suffix) and then the environment itself.
   */
  private void end() throws ApplicationException {
    for (int i = 0; i < DONATUS_SUP_DUMPS.length; i++) {
      String donatusSupName = DONATUS_SUP_DUMPS[i];
      dbEnvMorphSup.closeDatabase(donatusSupName);
      dbEnvMorphSup.closeDatabase(donatusSupName + "Dump");
    }
    dbEnvMorphSup.close();
  }

  /**
   * Looks up the value stored under the given form name.
   *
   * @param morphSupName name of the database to search
   * @param formName key to look up; encoded as UTF-8 for the lookup
   * @return the stored value decoded as UTF-8, or {@code null} if the key is not present
   * @throws ApplicationException if the database access fails
   */
  private String readEntry(String morphSupName, String formName) throws ApplicationException {
    String retString = null;
    Cursor cursor = null;
    boolean completed = false;
    try {
      DatabaseEntry dbEntryKey = new DatabaseEntry(formName.getBytes("utf-8"));
      Database morphSupDB = dbEnvMorphSup.getMorphSupDB(morphSupName);
      cursor = morphSupDB.openCursor(null, null);
      DatabaseEntry foundValue = new DatabaseEntry();
      OperationStatus operationStatus = cursor.getSearchKey(dbEntryKey, foundValue, LockMode.DEFAULT);
      if (operationStatus == OperationStatus.SUCCESS) {
        retString = new String(foundValue.getData(), "utf-8");
      }
      completed = true;
    } catch (DatabaseException e) {
      throw new ApplicationException(e);
    } catch (UnsupportedEncodingException e) {
      throw new ApplicationException(e);
    } finally {
      // always release the cursor; only report a close failure if nothing else went wrong
      closeCursor(cursor, !completed);
    }
    return retString;
  }
  
  /**
   * Prints, for every known dump, the number of records in its {@code <name>Dump} database.
   */
  private void printSizeOfAllMorphSupDBs() throws ApplicationException {
    for (int i = 0; i < DONATUS_SUP_DUMPS.length; i++) {
      String donatusSupName = DONATUS_SUP_DUMPS[i];
      long size = getSizes(donatusSupName + "Dump");
      System.out.println(donatusSupName + ": " + size + " records");
    }
  }
  
  /**
   * Opens the named database and returns its record count.
   *
   * @param donatusSupName name of the database to open and count
   * @return number of records as reported by {@link Database#count()} (kept as {@code long}
   *         so that large databases are not truncated)
   * @throws ApplicationException if the database cannot be opened or counted
   */
  private long getSizes(String donatusSupName) throws ApplicationException {
    try {
      dbEnvMorphSup.openDatabase(donatusSupName);
      Database morphDB = dbEnvMorphSup.getMorphSupDB(donatusSupName);
      return morphDB.count();
    } catch (DatabaseException e) {
      throw new ApplicationException(e);
    }
  }
  
  /**
   * Reads all records of the {@code <name>Dump} database into memory.
   *
   * @param donatusSupName base name of the dump (without the {@code Dump} suffix)
   * @return map from the key (decoded as UTF-8) to a copy of the value entry; records with an
   *         empty key are skipped
   * @throws ApplicationException if the database access fails
   */
  private HashMap<String, DatabaseEntry> getWholeMorphHashMap(String donatusSupName) throws ApplicationException {
    HashMap<String, DatabaseEntry> morphHashMap = new HashMap<String, DatabaseEntry>();
    String dumpDbName = donatusSupName + "Dump";
    Cursor cursor = null;
    boolean completed = false;
    try {
      dbEnvMorphSup.openDatabase(dumpDbName);
      Database morphDB = dbEnvMorphSup.getMorphSupDB(dumpDbName);
      cursor = morphDB.openCursor(null, null);
      DatabaseEntry dbEntryKey = new DatabaseEntry();
      DatabaseEntry dbEntryValue = new DatabaseEntry();
      OperationStatus operationStatus = cursor.getFirst(dbEntryKey, dbEntryValue, LockMode.DEFAULT);
      while (operationStatus == OperationStatus.SUCCESS) {
        if (dbEntryKey.getSize() > 0) {
          String dbEntryKeyStr = new String(dbEntryKey.getData(), "utf-8");
          // store a fresh entry because dbEntryValue is reused for the next cursor step
          morphHashMap.put(dbEntryKeyStr, new DatabaseEntry(dbEntryValue.getData()));
        }
        operationStatus = cursor.getNext(dbEntryKey, dbEntryValue, LockMode.DEFAULT);
      }
      completed = true;
    } catch (DatabaseException e) {
      throw new ApplicationException(e);
    } catch (UnsupportedEncodingException e) {
      throw new ApplicationException(e);
    } finally {
      // always release the cursor; only report a close failure if nothing else went wrong
      closeCursor(cursor, !completed);
    }
    return morphHashMap;
  }

  /**
   * Closes a cursor if it was opened.
   *
   * @param cursor the cursor to close, may be {@code null}
   * @param swallowFailure {@code true} to ignore a failure of the close itself (used when an
   *        earlier exception is already propagating and must not be masked)
   * @throws ApplicationException if closing fails and {@code swallowFailure} is {@code false}
   */
  private void closeCursor(Cursor cursor, boolean swallowFailure) throws ApplicationException {
    if (cursor == null) {
      return;
    }
    try {
      cursor.close();
    } catch (DatabaseException e) {
      if (!swallowFailure) {
        throw new ApplicationException(e);
      }
    }
  }
  
  /**
   * Exports every known dump database to its own XML file
   * {@code donatus-sup-<name>.xml} in the supplement data directory.
   *
   * @throws ApplicationException if reading the database or writing a file fails
   */
  private void writeDonatusSupsToFiles() throws ApplicationException {
    for (int i = 0; i < DONATUS_SUP_DUMPS.length; i++) {
      writeDonatusSupToFile(DONATUS_SUP_DUMPS[i]);
    }
  }

  /**
   * Exports one dump database as a {@code <forms>} document with one {@code <form>} element
   * (provider, language, form name, lemma name) per record.
   *
   * <p>Form and lemma names are lower-cased; for the language {@code el} they are first
   * transcoded from Beta Code to Unicode. The characters {@code &}, {@code <} and {@code >}
   * are escaped so that the output stays well-formed XML.
   *
   * @param donatusSupName base name of the dump (e.g. {@code cache-el})
   * @throws ApplicationException if reading the database or writing the file fails
   */
  private void writeDonatusSupToFile(String donatusSupName) throws ApplicationException {
    HashMap<String, DatabaseEntry> morphHashMap = getWholeMorphHashMap(donatusSupName);
    // the language code is whatever follows the "cache-" prefix of the dump name
    String language = "unknown";
    if (donatusSupName.startsWith("cache-")) {
      language = donatusSupName.substring(6);
    }
    boolean isGreek = language.equals("el");
    File outputFile = new File(DATA_FILES_DIR_DONATUS_ADD_SUP + "/donatus-sup-" + donatusSupName + ".xml");
    BufferedOutputStream out = null;
    try {
      out = new BufferedOutputStream(new FileOutputStream(outputFile));
      write("<forms>\n", out);
      for (String morphKeyStr : morphHashMap.keySet()) {
        write("<form>\n", out);
        write("<provider>" + "donatus-sup" + "</provider>\n", out);
        write("<language>" + language + "</language>\n", out);
        String formStr = morphKeyStr;
        if (isGreek) {
          formStr = transcodeFromBetaCode2Unicode(formStr);
        }
        formStr = formStr.toLowerCase();
        write("<form-name>" + escapeXml(formStr) + "</form-name>\n", out);
        byte[] morphValueBytes = morphHashMap.get(morphKeyStr).getData();
        String lemmaForm = extractFirstLemma(new String(morphValueBytes, "utf-8"));
        if (isGreek) {
          lemmaForm = transcodeFromBetaCode2Unicode(lemmaForm);
        }
        // drop "#<digit>" markers from the lemma
        lemmaForm = lemmaForm.replaceAll("#\\d", "");
        lemmaForm = lemmaForm.toLowerCase();
        write("<lemma-name>" + escapeXml(lemmaForm) + "</lemma-name>\n", out);
        write("</form>\n", out);
      }
      write("</forms>\n", out);
      // flush explicitly so that a failure to write buffered data is reported, not lost in close()
      out.flush();
    } catch (IOException e) {
      // covers FileNotFoundException and UnsupportedEncodingException as well
      throw new ApplicationException(e);
    } finally {
      // always close the stream of this file before the next one is opened
      if (out != null) {
        try {
          out.close();
        } catch (IOException ignored) {
          // data was already flushed on the success path; on the failure path the
          // original exception is the one worth reporting
        }
      }
    }
  }

  /**
   * Extracts the first lemma from the raw value of a record.
   *
   * <p>Only the first lemma is recognized. TODO recognize all lemmas for the form.
   * The value is cut at the first tab (U+0009); if there is no tab, the literal
   * {@code XXXXXX} is appended (presumably as a marker for unexpected records - TODO confirm).
   * The result is then cut at the first vertical tab (U+000B), if any.
   *
   * @param wholeLemmaStr the decoded record value
   * @return the first lemma as described above
   */
  private String extractFirstLemma(String wholeLemmaStr) {
    String lemmaForm;
    int firstTab = wholeLemmaStr.indexOf('\u0009');
    if (firstTab != -1) {
      lemmaForm = wholeLemmaStr.substring(0, firstTab);
    } else {
      lemmaForm = wholeLemmaStr + "XXXXXX";
    }
    int firstVerticalTab = lemmaForm.indexOf('\u000B');
    if (firstVerticalTab != -1) {
      lemmaForm = lemmaForm.substring(0, firstVerticalTab);
    }
    return lemmaForm;
  }

  /**
   * Escapes the characters that are not allowed verbatim in XML element content.
   *
   * @param text the raw text
   * @return the text with {@code &}, {@code <} and {@code >} replaced by their entities
   */
  private static String escapeXml(String text) {
    StringBuilder escaped = new StringBuilder(text.length() + 16);
    for (int i = 0; i < text.length(); i++) {
      char c = text.charAt(i);
      switch (c) {
        case '&':
          escaped.append("&amp;");
          break;
        case '<':
          escaped.append("&lt;");
          break;
        case '>':
          escaped.append("&gt;");
          break;
        default:
          escaped.append(c);
      }
    }
    return escaped.toString();
  }
  
  /**
   * Writes the given bytes to the stream. The stream is not flushed here; the caller is
   * responsible for flushing or closing it.
   *
   * @throws ApplicationException if the write fails
   */
  private void write(byte[] inputBytes, BufferedOutputStream out) throws ApplicationException {
    try {
      out.write(inputBytes, 0, inputBytes.length);
    } catch (IOException e) {
      throw new ApplicationException(e);
    }
  }

  /**
   * Writes the given string, encoded as UTF-8, to the stream. The stream is not flushed here;
   * the caller is responsible for flushing or closing it.
   *
   * @throws ApplicationException if encoding or writing fails
   */
  private void write(String outStr, BufferedOutputStream out) throws ApplicationException {
    try {
      byte[] bytes = outStr.getBytes("utf-8");
      out.write(bytes, 0, bytes.length);
    } catch (IOException e) {
      throw new ApplicationException(e);
    }
  }
  
  /**
   * Converts a Beta Code string to Unicode by delegating to the shared {@link Transcoder}.
   *
   * @param inputStr the Beta Code text
   * @return the transcoder's result for {@code inputStr}
   */
  private String transcodeFromBetaCode2Unicode(String inputStr) throws ApplicationException {
    Transcoder transcoder = Transcoder.getInstance();
    return transcoder.transcodeFromBetaCode2Unicode(inputStr);
  }

  /** Records the start time of the run. */
  private void beginOperation() {
    beginOfOperation = new Date();
  }

  /** Records the end time of the run. */
  private void endOperation() {
    endOfOperation = new Date();
  }

}