Mercurial > hg > mpdl-group
diff software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/db/DBMorphSupWriter.java @ 19:4a3641ae14d2
Erstellung
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Wed, 09 Nov 2011 15:32:05 +0100 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/db/DBMorphSupWriter.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,265 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.morph.db; + +import java.io.BufferedOutputStream; +import java.io.BufferedReader; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.FileReader; +import java.io.IOException; +import java.io.UnsupportedEncodingException; +import java.util.Date; +import java.util.HashMap; +import java.util.Iterator; + +import com.sleepycat.je.Cursor; +import com.sleepycat.je.Database; +import com.sleepycat.je.DatabaseEntry; +import com.sleepycat.je.DatabaseException; +import com.sleepycat.je.LockMode; +import com.sleepycat.je.OperationStatus; +import com.sleepycat.je.util.DbLoad; + +import de.mpg.mpiwg.berlin.mpdl.util.Util; +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.lt.general.Constants; +import de.mpg.mpiwg.berlin.mpdl.lt.text.transcode.Transcoder; + +public class DBMorphSupWriter { + private static DBMorphSupWriter instance; + private static String DATA_DIR = Constants.getInstance().getDataDir(); + private static String DATA_FILES_DIR_DONATUS_ADD_SUP = DATA_DIR + "/dataFiles/donatusAdditionalSup"; + private static String DB_DIR_DONATUS_ADD_SUP = DATA_DIR + "/dataFiles/donatusAdditionalSup/db"; + private static String[] DONATUS_SUP_DUMPS = {"cache-la", "cache-el", "cache-it"}; + private DbEnvMorphSup dbEnvMorphSup; + private Date beginOfOperation; + private Date endOfOperation; + + public static DBMorphSupWriter getInstance() throws ApplicationException { + if (instance == null) { + instance = new DBMorphSupWriter(); + } + return instance; + } + + public static void main(String[] args) throws ApplicationException { + getInstance(); + instance.beginOperation(); + System.out.print("Start ..."); + instance.initReadWrite(); + // instance.loadDonatusSupDbDumpsToDb(); + instance.printSizeOfAllMorphSupDBs(); + // instance.writeDonatusSupsToFiles(); + instance.end(); + instance.endOperation(); + Double elapsedTime = new Util().getSecondWithMillisecondsBetween(instance.beginOfOperation, instance.endOfOperation); + System.out.println("End."); + System.out.println("Needed time: " + elapsedTime + " seconds"); + } + + private void initReadWrite() throws ApplicationException { + dbEnvMorphSup = new DbEnvMorphSup(); + dbEnvMorphSup.setDataDir(DB_DIR_DONATUS_ADD_SUP); + dbEnvMorphSup.initReadWrite(); + } + + private void loadDonatusSupDbDumpsToDb() throws ApplicationException { + for (int i=0; i<DONATUS_SUP_DUMPS.length; i++) { + String donatusSupName = DONATUS_SUP_DUMPS[i]; + loadDbDumpToDb(donatusSupName); + } + } + + private void loadDbDumpToDb(String donatusSupName) throws ApplicationException { + String dumpFileName = DATA_FILES_DIR_DONATUS_ADD_SUP + "/" + donatusSupName + ".dump"; + String dbName = donatusSupName + "Dump.db"; + try { + BufferedReader bufferedReader = new BufferedReader(new FileReader(dumpFileName)); + DbLoad loader = new DbLoad(); + loader.setEnv(dbEnvMorphSup.getEnv()); + loader.setDbName(dbName); + loader.setInputReader(bufferedReader); + loader.setIgnoreUnknownConfig(true); + loader.load(); + bufferedReader.close(); + } catch (FileNotFoundException e) { + throw new ApplicationException(e); + } catch (IOException e) { + throw new ApplicationException(e); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } + } + + private void end() throws ApplicationException { + for (int i=0; i<DONATUS_SUP_DUMPS.length; i++) { + String donatusSupName = DONATUS_SUP_DUMPS[i]; + dbEnvMorphSup.closeDatabase(donatusSupName); + dbEnvMorphSup.closeDatabase(donatusSupName + "Dump"); + } + dbEnvMorphSup.close(); + } + + private String readEntry(String morphSupName, String formName) throws ApplicationException { + String retString = null; + try { + String keyStr = formName; + DatabaseEntry dbEntryKey = new DatabaseEntry(keyStr.getBytes("utf-8")); + Database morpgSupDB = dbEnvMorphSup.getMorphSupDB(morphSupName); + Cursor cursor = morpgSupDB.openCursor(null, null); + DatabaseEntry foundValue = new DatabaseEntry(); + OperationStatus operationStatus = cursor.getSearchKey(dbEntryKey, foundValue, LockMode.DEFAULT); + if (operationStatus == OperationStatus.SUCCESS) { + byte[] foundValueBytes = foundValue.getData(); + retString = new String(foundValueBytes, "utf-8"); + } + cursor.close(); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + return retString; + } + + private void printSizeOfAllMorphSupDBs() throws ApplicationException { + for (int i=0; i<DONATUS_SUP_DUMPS.length; i++) { + String donatusSupName = DONATUS_SUP_DUMPS[i]; + int size = getSizes(donatusSupName + "Dump"); + System.out.println(donatusSupName + ": " + size + " records"); + } + } + + private int getSizes(String donatusSupName) throws ApplicationException { + int size = 0; + try { + dbEnvMorphSup.openDatabase(donatusSupName); + Database morphDB = dbEnvMorphSup.getMorphSupDB(donatusSupName); + size = (int) morphDB.count(); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } + return size; + } + + private HashMap<String, DatabaseEntry> getWholeMorphHashMap(String donatusSupName) throws ApplicationException { + HashMap<String, DatabaseEntry> morphHashMap = new HashMap<String, DatabaseEntry>(); + try { + dbEnvMorphSup.openDatabase(donatusSupName + "Dump"); + Database morphDB = dbEnvMorphSup.getMorphSupDB(donatusSupName + "Dump"); + Cursor cursor = morphDB.openCursor(null, null); + DatabaseEntry dbEntryKey = new DatabaseEntry(); + DatabaseEntry dbEntryValue = new DatabaseEntry(); + OperationStatus operationStatus = cursor.getFirst(dbEntryKey, dbEntryValue, LockMode.DEFAULT); + while (operationStatus == OperationStatus.SUCCESS) { + int size = dbEntryKey.getSize(); + if (size > 0) { + byte[] dbEntryKeyBytes = dbEntryKey.getData(); + String dbEntryKeyStr = new String(dbEntryKeyBytes, "utf-8"); + DatabaseEntry newDbEntryValue = new DatabaseEntry(dbEntryValue.getData()); + morphHashMap.put(dbEntryKeyStr, newDbEntryValue); + } + operationStatus = cursor.getNext(dbEntryKey, dbEntryValue, LockMode.DEFAULT); + } + cursor.close(); + } catch (DatabaseException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } + return morphHashMap; + } + + private void writeDonatusSupsToFiles() throws ApplicationException { + BufferedReader in = null; + BufferedOutputStream out = null; + try { + for (int i=0; i<DONATUS_SUP_DUMPS.length; i++) { + String donatusSupName = DONATUS_SUP_DUMPS[i]; + HashMap<String, DatabaseEntry> morphHashMap = getWholeMorphHashMap(donatusSupName); + Iterator<String> morphDumpIter = morphHashMap.keySet().iterator(); + File outputFile = new File(DATA_FILES_DIR_DONATUS_ADD_SUP + "/donatus-sup-" + donatusSupName + ".xml"); + out = new BufferedOutputStream(new FileOutputStream(outputFile)); + write("<forms>\n", out); + while (morphDumpIter.hasNext()) { + write("<form>\n", out); + write("<provider>" + "donatus-sup" + "</provider>\n", out); + String language = "unknown"; + if (donatusSupName.startsWith("cache-")) + language = donatusSupName.substring(6); + write("<language>" + language + "</language>\n", out); + String morphKeyStr = morphDumpIter.next(); + String formStr = morphKeyStr; + if (language.equals("el")) + formStr = transcodeFromBetaCode2Unicode(formStr); + formStr = formStr.toLowerCase(); + write("<form-name>" + formStr + "</form-name>\n", out); + DatabaseEntry morphValue = morphHashMap.get(morphKeyStr); + byte[] morphValueBytes = morphValue.getData(); + String wholeLemmaStr = new String(morphValueBytes, "utf-8"); + // only first lemma is recognized TODO recognize all lemmas for the form + char splitSymbol = '\u0009'; + int firstIndexOfSplitSymbol = wholeLemmaStr.indexOf(splitSymbol); + String lemmaForm = wholeLemmaStr; + if (firstIndexOfSplitSymbol != -1) + lemmaForm = wholeLemmaStr.substring(0, firstIndexOfSplitSymbol); + else + lemmaForm = lemmaForm + "XXXXXX"; + char splitSymbol2 = '\u000B'; + int firstIndexOfSplitSymbol2 = lemmaForm.indexOf(splitSymbol2); + if (firstIndexOfSplitSymbol2 != -1) + lemmaForm = lemmaForm.substring(0, firstIndexOfSplitSymbol2); + if (language.equals("el")) + lemmaForm = transcodeFromBetaCode2Unicode(lemmaForm); + lemmaForm = lemmaForm.replaceAll("#\\d", ""); + lemmaForm = lemmaForm.toLowerCase(); + write("<lemma-name>" + lemmaForm + "</lemma-name>\n", out); + write("</form>\n", out); + } + write("</forms>\n", out); + } + } catch (FileNotFoundException e) { + throw new ApplicationException(e); + } catch (UnsupportedEncodingException e) { + throw new ApplicationException(e); + } finally { + // always close the stream + if (in != null) try { in.close(); } catch (Exception e) { } + if (out != null) try { out.close(); } catch (Exception e) { } + } + } + + private void write(byte[] inputBytes, BufferedOutputStream out) throws ApplicationException { + try { + out.write(inputBytes, 0, inputBytes.length); + out.flush(); + } catch (IOException e) { + throw new ApplicationException(e); + } + } + + private void write(String outStr, BufferedOutputStream out) throws ApplicationException { + try { + byte[] bytes = outStr.getBytes("utf-8"); + out.write(bytes, 0, bytes.length); + out.flush(); + } catch (IOException e) { + throw new ApplicationException(e); + } + } + + private String transcodeFromBetaCode2Unicode(String inputStr) throws ApplicationException { + Transcoder transcoder = Transcoder.getInstance(); + String encodedUnicodeForm = transcoder.transcodeFromBetaCode2Unicode(inputStr); + return encodedUnicodeForm; + } + + private void beginOperation() { + beginOfOperation = new Date(); + } + + private void endOperation() { + endOfOperation = new Date(); + } + +} \ No newline at end of file