diff software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/db/DBMorphSupWriter.java @ 19:4a3641ae14d2

Erstellung
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Wed, 09 Nov 2011 15:32:05 +0100
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/db/DBMorphSupWriter.java	Wed Nov 09 15:32:05 2011 +0100
@@ -0,0 +1,265 @@
+package de.mpg.mpiwg.berlin.mpdl.lt.morph.db;
+
+import java.io.BufferedOutputStream;
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.UnsupportedEncodingException;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.Iterator;
+
+import com.sleepycat.je.Cursor;
+import com.sleepycat.je.Database;
+import com.sleepycat.je.DatabaseEntry;
+import com.sleepycat.je.DatabaseException;
+import com.sleepycat.je.LockMode;
+import com.sleepycat.je.OperationStatus;
+import com.sleepycat.je.util.DbLoad;
+
+import de.mpg.mpiwg.berlin.mpdl.util.Util;
+import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
+import de.mpg.mpiwg.berlin.mpdl.lt.general.Constants;
+import de.mpg.mpiwg.berlin.mpdl.lt.text.transcode.Transcoder;
+
+/**
+ * Command-line maintenance tool for the "Donatus supplement" morphology
+ * databases kept in a Berkeley DB Java Edition environment.
+ *
+ * <p>The class offers three operations, selected by (un)commenting the
+ * corresponding call in {@link #main(String[])}:
+ * <ul>
+ *   <li>loading the dump files listed in {@code DONATUS_SUP_DUMPS} into the
+ *       database environment,</li>
+ *   <li>printing the number of records of each loaded dump database,</li>
+ *   <li>exporting every (form, first lemma) pair of each dump database to an
+ *       XML file named {@code donatus-sup-<dumpName>.xml}.</li>
+ * </ul>
+ *
+ * <p>The singleton accessor is not synchronized; the class is written for
+ * single-threaded command-line use.
+ */
+public class DBMorphSupWriter {
+  private static DBMorphSupWriter instance;
+  private static final String DATA_DIR = Constants.getInstance().getDataDir();
+  private static final String DATA_FILES_DIR_DONATUS_ADD_SUP = DATA_DIR + "/dataFiles/donatusAdditionalSup";
+  private static final String DB_DIR_DONATUS_ADD_SUP = DATA_DIR + "/dataFiles/donatusAdditionalSup/db";
+  private static final String[] DONATUS_SUP_DUMPS = {"cache-la", "cache-el", "cache-it"};
+  private DbEnvMorphSup dbEnvMorphSup;
+  private Date beginOfOperation;
+  private Date endOfOperation;
+
+  /**
+   * Returns the shared writer, creating it on first use.
+   *
+   * @return the single instance of this class
+   * @throws ApplicationException declared for callers; not thrown by the visible code
+   */
+  public static DBMorphSupWriter getInstance() throws ApplicationException {
+    if (instance == null) {
+      instance = new DBMorphSupWriter();
+    }
+    return instance;
+  }
+
+  /**
+   * Runs the currently enabled maintenance operation and prints the elapsed time.
+   *
+   * @param args ignored
+   * @throws ApplicationException if opening, reading or closing the databases fails
+   */
+  public static void main(String[] args) throws ApplicationException {
+    DBMorphSupWriter writer = getInstance();
+    writer.beginOperation();
+    System.out.print("Start ...");
+    writer.initReadWrite();
+    try {
+      // Alternative operations of this tool; enable the one that is needed.
+      // writer.loadDonatusSupDbDumpsToDb();
+      writer.printSizeOfAllMorphSupDBs();
+      // writer.writeDonatusSupsToFiles();
+    } finally {
+      // release the database environment even when the operation above fails
+      writer.end();
+    }
+    writer.endOperation();
+    Double elapsedTime = new Util().getSecondWithMillisecondsBetween(writer.beginOfOperation, writer.endOfOperation);
+    System.out.println("End.");
+    System.out.println("Needed time: " + elapsedTime + " seconds");
+  }
+
+  /**
+   * Creates the database environment wrapper, points it at the supplement
+   * database directory and initializes it for reading and writing.
+   */
+  private void initReadWrite() throws ApplicationException {
+    dbEnvMorphSup = new DbEnvMorphSup();
+    dbEnvMorphSup.setDataDir(DB_DIR_DONATUS_ADD_SUP);
+    dbEnvMorphSup.initReadWrite();
+  }
+
+  /** Loads every dump listed in {@code DONATUS_SUP_DUMPS} into the environment. */
+  private void loadDonatusSupDbDumpsToDb() throws ApplicationException {
+    for (int i = 0; i < DONATUS_SUP_DUMPS.length; i++) {
+      loadDbDumpToDb(DONATUS_SUP_DUMPS[i]);
+    }
+  }
+
+  /**
+   * Loads the file {@code <donatusSupName>.dump} from the supplement data
+   * directory into the database named {@code <donatusSupName>Dump.db}.
+   *
+   * @param donatusSupName base name of the dump, e.g. {@code cache-la}
+   * @throws ApplicationException if the dump file cannot be read or the load fails
+   */
+  private void loadDbDumpToDb(String donatusSupName) throws ApplicationException {
+    String dumpFileName = DATA_FILES_DIR_DONATUS_ADD_SUP + "/" + donatusSupName + ".dump";
+    // NOTE(review): the readers below open "<name>Dump" without the ".db" suffix;
+    // presumably DbEnvMorphSup adds it - confirm against DbEnvMorphSup.openDatabase.
+    String dbName = donatusSupName + "Dump.db";
+    BufferedReader bufferedReader = null;
+    try {
+      bufferedReader = new BufferedReader(new FileReader(dumpFileName));
+      DbLoad loader = new DbLoad();
+      loader.setEnv(dbEnvMorphSup.getEnv());
+      loader.setDbName(dbName);
+      loader.setInputReader(bufferedReader);
+      loader.setIgnoreUnknownConfig(true);
+      loader.load();
+    } catch (IOException e) {
+      // also covers FileNotFoundException for a missing dump file
+      throw new ApplicationException(e);
+    } catch (DatabaseException e) {
+      throw new ApplicationException(e);
+    } finally {
+      // close the dump file on the failure paths as well
+      if (bufferedReader != null) {
+        try {
+          bufferedReader.close();
+        } catch (IOException ignored) {
+          // nothing useful can be done when closing a reader fails
+        }
+      }
+    }
+  }
+
+  /**
+   * Closes, for every known dump name, the database of that name and its
+   * {@code "Dump"} counterpart, then closes the environment wrapper.
+   */
+  private void end() throws ApplicationException {
+    for (int i = 0; i < DONATUS_SUP_DUMPS.length; i++) {
+      String donatusSupName = DONATUS_SUP_DUMPS[i];
+      dbEnvMorphSup.closeDatabase(donatusSupName);
+      dbEnvMorphSup.closeDatabase(donatusSupName + "Dump");
+    }
+    dbEnvMorphSup.close();
+  }
+
+  /**
+   * Looks up a single form in one supplement database.
+   *
+   * @param morphSupName name of the database to search
+   * @param formName key to look up (encoded as UTF-8)
+   * @return the stored value decoded as UTF-8, or {@code null} if the key is absent
+   * @throws ApplicationException if the database access fails
+   */
+  private String readEntry(String morphSupName, String formName) throws ApplicationException {
+    String retString = null;
+    try {
+      DatabaseEntry dbEntryKey = new DatabaseEntry(formName.getBytes("utf-8"));
+      Database morphSupDB = dbEnvMorphSup.getMorphSupDB(morphSupName);
+      Cursor cursor = morphSupDB.openCursor(null, null);
+      try {
+        DatabaseEntry foundValue = new DatabaseEntry();
+        OperationStatus operationStatus = cursor.getSearchKey(dbEntryKey, foundValue, LockMode.DEFAULT);
+        if (operationStatus == OperationStatus.SUCCESS) {
+          retString = new String(foundValue.getData(), "utf-8");
+        }
+      } finally {
+        // the cursor must not stay open when the lookup or decoding throws
+        cursor.close();
+      }
+    } catch (DatabaseException e) {
+      throw new ApplicationException(e);
+    } catch (UnsupportedEncodingException e) {
+      throw new ApplicationException(e);
+    }
+    return retString;
+  }
+
+  /** Prints one line per dump with the record count of its {@code "Dump"} database. */
+  private void printSizeOfAllMorphSupDBs() throws ApplicationException {
+    for (int i = 0; i < DONATUS_SUP_DUMPS.length; i++) {
+      String donatusSupName = DONATUS_SUP_DUMPS[i];
+      long size = getSizes(donatusSupName + "Dump");
+      System.out.println(donatusSupName + ": " + size + " records");
+    }
+  }
+
+  /**
+   * Opens the given database and returns its record count.
+   *
+   * @param donatusSupName name of the database to open and count
+   * @return number of records; kept as {@code long} so large databases are not truncated
+   * @throws ApplicationException if the database cannot be opened or counted
+   */
+  private long getSizes(String donatusSupName) throws ApplicationException {
+    try {
+      dbEnvMorphSup.openDatabase(donatusSupName);
+      Database morphDB = dbEnvMorphSup.getMorphSupDB(donatusSupName);
+      return morphDB.count();
+    } catch (DatabaseException e) {
+      throw new ApplicationException(e);
+    }
+  }
+
+  /**
+   * Reads every record of the {@code <donatusSupName>Dump} database into memory.
+   * Records with an empty key are skipped.
+   *
+   * @param donatusSupName base name of the dump, e.g. {@code cache-la}
+   * @return map from the UTF-8 decoded key to a copy of the stored value entry
+   * @throws ApplicationException if the database access fails
+   */
+  private HashMap<String, DatabaseEntry> getWholeMorphHashMap(String donatusSupName) throws ApplicationException {
+    HashMap<String, DatabaseEntry> morphHashMap = new HashMap<String, DatabaseEntry>();
+    try {
+      dbEnvMorphSup.openDatabase(donatusSupName + "Dump");
+      Database morphDB = dbEnvMorphSup.getMorphSupDB(donatusSupName + "Dump");
+      Cursor cursor = morphDB.openCursor(null, null);
+      try {
+        DatabaseEntry dbEntryKey = new DatabaseEntry();
+        DatabaseEntry dbEntryValue = new DatabaseEntry();
+        OperationStatus operationStatus = cursor.getFirst(dbEntryKey, dbEntryValue, LockMode.DEFAULT);
+        while (operationStatus == OperationStatus.SUCCESS) {
+          if (dbEntryKey.getSize() > 0) {
+            String dbEntryKeyStr = new String(dbEntryKey.getData(), "utf-8");
+            // the cursor reuses dbEntryValue, so store a separate entry per record
+            morphHashMap.put(dbEntryKeyStr, new DatabaseEntry(dbEntryValue.getData()));
+          }
+          operationStatus = cursor.getNext(dbEntryKey, dbEntryValue, LockMode.DEFAULT);
+        }
+      } finally {
+        // the cursor must not stay open when iteration or decoding throws
+        cursor.close();
+      }
+    } catch (DatabaseException e) {
+      throw new ApplicationException(e);
+    } catch (UnsupportedEncodingException e) {
+      throw new ApplicationException(e);
+    }
+    return morphHashMap;
+  }
+
+  /**
+   * Exports every dump listed in {@code DONATUS_SUP_DUMPS} to its own XML file
+   * {@code donatus-sup-<dumpName>.xml} in the supplement data directory.
+   *
+   * @throws ApplicationException if reading a database or writing a file fails
+   */
+  private void writeDonatusSupsToFiles() throws ApplicationException {
+    for (int i = 0; i < DONATUS_SUP_DUMPS.length; i++) {
+      writeDonatusSupToFile(DONATUS_SUP_DUMPS[i]);
+    }
+  }
+
+  /**
+   * Writes all forms of one dump database as {@code <form>} elements and
+   * closes the output file afterwards, whether or not the export succeeded.
+   */
+  private void writeDonatusSupToFile(String donatusSupName) throws ApplicationException {
+    // read the database first so a database failure does not leave an empty file behind
+    HashMap<String, DatabaseEntry> morphHashMap = getWholeMorphHashMap(donatusSupName);
+    // the language tag is the part after "cache-" and is the same for every record
+    String language = "unknown";
+    if (donatusSupName.startsWith("cache-")) {
+      language = donatusSupName.substring(6);
+    }
+    boolean isGreek = language.equals("el");
+    File outputFile = new File(DATA_FILES_DIR_DONATUS_ADD_SUP + "/donatus-sup-" + donatusSupName + ".xml");
+    BufferedOutputStream out = null;
+    try {
+      out = new BufferedOutputStream(new FileOutputStream(outputFile));
+      write("<forms>\n", out);
+      for (String morphKeyStr : morphHashMap.keySet()) {
+        String formStr = morphKeyStr;
+        if (isGreek) {
+          formStr = transcodeFromBetaCode2Unicode(formStr);
+        }
+        formStr = formStr.toLowerCase();
+        byte[] morphValueBytes = morphHashMap.get(morphKeyStr).getData();
+        String lemmaForm = extractFirstLemma(new String(morphValueBytes, "utf-8"), isGreek);
+        write("<form>\n", out);
+        write("<provider>" + "donatus-sup" + "</provider>\n", out);
+        write("<language>" + language + "</language>\n", out);
+        // escape markup characters so that arbitrary stored text cannot break the XML
+        write("<form-name>" + escapeXml(formStr) + "</form-name>\n", out);
+        write("<lemma-name>" + escapeXml(lemmaForm) + "</lemma-name>\n", out);
+        write("</form>\n", out);
+      }
+      write("</forms>\n", out);
+    } catch (IOException e) {
+      // covers FileNotFoundException (output file) and UnsupportedEncodingException
+      throw new ApplicationException(e);
+    } finally {
+      // close this file's stream before the next dump is processed
+      if (out != null) {
+        try {
+          out.close();
+        } catch (IOException ignored) {
+          // every write() call has already flushed, so no buffered data is lost here
+        }
+      }
+    }
+  }
+
+  /**
+   * Extracts the first lemma from a stored value.
+   * Only the first lemma is recognized. TODO recognize all lemmas for the form
+   *
+   * @param wholeLemmaStr the decoded database value
+   * @param isGreek whether the lemma has to be transcoded from Beta Code to Unicode
+   * @return the lower-cased lemma with any {@code #<digit>} markers removed
+   */
+  private String extractFirstLemma(String wholeLemmaStr, boolean isGreek) throws ApplicationException {
+    char splitSymbol = '\u0009';
+    char splitSymbol2 = '\u000B';
+    String lemmaForm = wholeLemmaStr;
+    int firstIndexOfSplitSymbol = wholeLemmaStr.indexOf(splitSymbol);
+    if (firstIndexOfSplitSymbol != -1) {
+      lemmaForm = wholeLemmaStr.substring(0, firstIndexOfSplitSymbol);
+    } else {
+      // NOTE(review): looks like a marker for values without a tab separator - confirm intent
+      lemmaForm = lemmaForm + "XXXXXX";
+    }
+    int firstIndexOfSplitSymbol2 = lemmaForm.indexOf(splitSymbol2);
+    if (firstIndexOfSplitSymbol2 != -1) {
+      lemmaForm = lemmaForm.substring(0, firstIndexOfSplitSymbol2);
+    }
+    if (isGreek) {
+      lemmaForm = transcodeFromBetaCode2Unicode(lemmaForm);
+    }
+    lemmaForm = lemmaForm.replaceAll("#\\d", "");
+    return lemmaForm.toLowerCase();
+  }
+
+  /** Replaces the XML markup characters {@code &}, {@code <} and {@code >} by their entities. */
+  private static String escapeXml(String text) {
+    StringBuilder escaped = new StringBuilder(text.length() + 16);
+    for (int i = 0; i < text.length(); i++) {
+      char c = text.charAt(i);
+      switch (c) {
+        case '&':
+          escaped.append("&amp;");
+          break;
+        case '<':
+          escaped.append("&lt;");
+          break;
+        case '>':
+          escaped.append("&gt;");
+          break;
+        default:
+          escaped.append(c);
+      }
+    }
+    return escaped.toString();
+  }
+
+  /**
+   * Writes the given bytes and flushes the stream immediately.
+   *
+   * @throws ApplicationException if writing or flushing fails
+   */
+  private void write(byte[] inputBytes, BufferedOutputStream out) throws ApplicationException {
+    try {
+      out.write(inputBytes, 0, inputBytes.length);
+      out.flush();
+    } catch (IOException e) {
+      throw new ApplicationException(e);
+    }
+  }
+
+  /**
+   * Writes the given string encoded as UTF-8 and flushes the stream immediately.
+   *
+   * @throws ApplicationException if encoding, writing or flushing fails
+   */
+  private void write(String outStr, BufferedOutputStream out) throws ApplicationException {
+    byte[] bytes;
+    try {
+      bytes = outStr.getBytes("utf-8");
+    } catch (UnsupportedEncodingException e) {
+      throw new ApplicationException(e);
+    }
+    write(bytes, out);
+  }
+
+  /** Delegates to the shared {@link Transcoder} to convert Beta Code text to Unicode. */
+  private String transcodeFromBetaCode2Unicode(String inputStr) throws ApplicationException {
+    Transcoder transcoder = Transcoder.getInstance();
+    return transcoder.transcodeFromBetaCode2Unicode(inputStr);
+  }
+
+  /** Records the start time used for the elapsed-time report in {@code main}. */
+  private void beginOperation() {
+    beginOfOperation = new Date();
+  }
+
+  /** Records the end time used for the elapsed-time report in {@code main}. */
+  private void endOperation() {
+    endOfOperation = new Date();
+  }
+
+}
\ No newline at end of file