Mercurial > hg > mpdl-group
view software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/db/DBMorphSupWriter.java @ 0:408254cf2f1d
Erstellung
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Wed, 24 Nov 2010 17:24:23 +0100 |
parents | |
children |
line wrap: on
line source
package de.mpg.mpiwg.berlin.mpdl.lt.morph.db; import java.io.BufferedOutputStream; import java.io.BufferedReader; import java.io.File; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.FileReader; import java.io.IOException; import java.io.UnsupportedEncodingException; import java.util.Date; import java.util.HashMap; import java.util.Iterator; import com.sleepycat.je.Cursor; import com.sleepycat.je.Database; import com.sleepycat.je.DatabaseEntry; import com.sleepycat.je.DatabaseException; import com.sleepycat.je.LockMode; import com.sleepycat.je.OperationStatus; import com.sleepycat.je.util.DbLoad; import de.mpg.mpiwg.berlin.mpdl.util.Util; import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; import de.mpg.mpiwg.berlin.mpdl.general.MpdlConstants; import de.mpg.mpiwg.berlin.mpdl.lt.general.Transcoder; public class DBMorphSupWriter { private static DBMorphSupWriter instance; private static String MPDL_DATA_DIR = MpdlConstants.MPDL_DATA_DIR; private static String DATA_FILES_DIR_DONATUS_ADD_SUP = MPDL_DATA_DIR + "/dataFiles/donatusAdditionalSup"; private static String DB_DIR_DONATUS_ADD_SUP = MPDL_DATA_DIR + "/dataFiles/donatusAdditionalSup/db"; private static String[] DONATUS_SUP_DUMPS = {"cache-la", "cache-el", "cache-it"}; private DbEnvMorphSup dbEnvMorphSup; private Date beginOfOperation; private Date endOfOperation; public static DBMorphSupWriter getInstance() throws ApplicationException { if (instance == null) { instance = new DBMorphSupWriter(); } return instance; } public static void main(String[] args) throws ApplicationException { getInstance(); instance.beginOperation(); System.out.print("Start ..."); instance.initReadWrite(); // instance.loadDonatusSupDbDumpsToDb(); instance.printSizeOfAllMorphSupDBs(); // instance.writeDonatusSupsToFiles(); instance.end(); instance.endOperation(); Double elapsedTime = new Util().getSecondWithMillisecondsBetween(instance.beginOfOperation, instance.endOfOperation); System.out.println("End."); System.out.println("Needed time: " + elapsedTime + " seconds"); } private void initReadWrite() throws ApplicationException { dbEnvMorphSup = new DbEnvMorphSup(); dbEnvMorphSup.setDataDir(DB_DIR_DONATUS_ADD_SUP); dbEnvMorphSup.initReadWrite(); } private void loadDonatusSupDbDumpsToDb() throws ApplicationException { for (int i=0; i<DONATUS_SUP_DUMPS.length; i++) { String donatusSupName = DONATUS_SUP_DUMPS[i]; loadDbDumpToDb(donatusSupName); } } private void loadDbDumpToDb(String donatusSupName) throws ApplicationException { String dumpFileName = DATA_FILES_DIR_DONATUS_ADD_SUP + "/" + donatusSupName + ".dump"; String dbName = donatusSupName + "Dump.db"; try { BufferedReader bufferedReader = new BufferedReader(new FileReader(dumpFileName)); DbLoad loader = new DbLoad(); loader.setEnv(dbEnvMorphSup.getEnv()); loader.setDbName(dbName); loader.setInputReader(bufferedReader); loader.setIgnoreUnknownConfig(true); loader.load(); bufferedReader.close(); } catch (FileNotFoundException e) { throw new ApplicationException(e); } catch (IOException e) { throw new ApplicationException(e); } catch (DatabaseException e) { throw new ApplicationException(e); } } private void end() throws ApplicationException { for (int i=0; i<DONATUS_SUP_DUMPS.length; i++) { String donatusSupName = DONATUS_SUP_DUMPS[i]; dbEnvMorphSup.closeDatabase(donatusSupName); dbEnvMorphSup.closeDatabase(donatusSupName + "Dump"); } dbEnvMorphSup.close(); } private String readEntry(String morphSupName, String formName) throws ApplicationException { String retString = null; try { String keyStr = formName; DatabaseEntry dbEntryKey = new DatabaseEntry(keyStr.getBytes("utf-8")); Database morpgSupDB = dbEnvMorphSup.getMorphSupDB(morphSupName); Cursor cursor = morpgSupDB.openCursor(null, null); DatabaseEntry foundValue = new DatabaseEntry(); OperationStatus operationStatus = cursor.getSearchKey(dbEntryKey, foundValue, LockMode.DEFAULT); if (operationStatus == OperationStatus.SUCCESS) { byte[] foundValueBytes = foundValue.getData(); retString = new String(foundValueBytes, "utf-8"); } cursor.close(); } catch (DatabaseException e) { throw new ApplicationException(e); } catch (UnsupportedEncodingException e) { throw new ApplicationException(e); } return retString; } private void printSizeOfAllMorphSupDBs() throws ApplicationException { for (int i=0; i<DONATUS_SUP_DUMPS.length; i++) { String donatusSupName = DONATUS_SUP_DUMPS[i]; int size = getSizes(donatusSupName + "Dump"); System.out.println(donatusSupName + ": " + size + " records"); } } private int getSizes(String donatusSupName) throws ApplicationException { int size = 0; try { dbEnvMorphSup.openDatabase(donatusSupName); Database morphDB = dbEnvMorphSup.getMorphSupDB(donatusSupName); size = (int) morphDB.count(); } catch (DatabaseException e) { throw new ApplicationException(e); } return size; } private HashMap<String, DatabaseEntry> getWholeMorphHashMap(String donatusSupName) throws ApplicationException { HashMap<String, DatabaseEntry> morphHashMap = new HashMap<String, DatabaseEntry>(); try { dbEnvMorphSup.openDatabase(donatusSupName + "Dump"); Database morphDB = dbEnvMorphSup.getMorphSupDB(donatusSupName + "Dump"); Cursor cursor = morphDB.openCursor(null, null); DatabaseEntry dbEntryKey = new DatabaseEntry(); DatabaseEntry dbEntryValue = new DatabaseEntry(); OperationStatus operationStatus = cursor.getFirst(dbEntryKey, dbEntryValue, LockMode.DEFAULT); while (operationStatus == OperationStatus.SUCCESS) { int size = dbEntryKey.getSize(); if (size > 0) { byte[] dbEntryKeyBytes = dbEntryKey.getData(); String dbEntryKeyStr = new String(dbEntryKeyBytes, "utf-8"); DatabaseEntry newDbEntryValue = new DatabaseEntry(dbEntryValue.getData()); morphHashMap.put(dbEntryKeyStr, newDbEntryValue); } operationStatus = cursor.getNext(dbEntryKey, dbEntryValue, LockMode.DEFAULT); } cursor.close(); } catch (DatabaseException e) { throw new ApplicationException(e); } catch (UnsupportedEncodingException e) { throw new ApplicationException(e); } return morphHashMap; } private void writeDonatusSupsToFiles() throws ApplicationException { BufferedReader in = null; BufferedOutputStream out = null; try { for (int i=0; i<DONATUS_SUP_DUMPS.length; i++) { String donatusSupName = DONATUS_SUP_DUMPS[i]; HashMap<String, DatabaseEntry> morphHashMap = getWholeMorphHashMap(donatusSupName); Iterator<String> morphDumpIter = morphHashMap.keySet().iterator(); File outputFile = new File(DATA_FILES_DIR_DONATUS_ADD_SUP + "/donatus-sup-" + donatusSupName + ".xml"); out = new BufferedOutputStream(new FileOutputStream(outputFile)); write("<forms>\n", out); while (morphDumpIter.hasNext()) { write("<form>\n", out); write("<provider>" + "donatus-sup" + "</provider>\n", out); String language = "unknown"; if (donatusSupName.startsWith("cache-")) language = donatusSupName.substring(6); write("<language>" + language + "</language>\n", out); String morphKeyStr = morphDumpIter.next(); String formStr = morphKeyStr; if (language.equals("el")) formStr = transcodeFromBetaCode2Unicode(formStr); formStr = formStr.toLowerCase(); write("<form-name>" + formStr + "</form-name>\n", out); DatabaseEntry morphValue = morphHashMap.get(morphKeyStr); byte[] morphValueBytes = morphValue.getData(); String wholeLemmaStr = new String(morphValueBytes, "utf-8"); // only first lemma is recognized TODO recognize all lemmas for the form char splitSymbol = '\u0009'; int firstIndexOfSplitSymbol = wholeLemmaStr.indexOf(splitSymbol); String lemmaForm = wholeLemmaStr; if (firstIndexOfSplitSymbol != -1) lemmaForm = wholeLemmaStr.substring(0, firstIndexOfSplitSymbol); else lemmaForm = lemmaForm + "XXXXXX"; char splitSymbol2 = '\u000B'; int firstIndexOfSplitSymbol2 = lemmaForm.indexOf(splitSymbol2); if (firstIndexOfSplitSymbol2 != -1) lemmaForm = lemmaForm.substring(0, firstIndexOfSplitSymbol2); if (language.equals("el")) lemmaForm = transcodeFromBetaCode2Unicode(lemmaForm); lemmaForm = lemmaForm.replaceAll("#\\d", ""); lemmaForm = lemmaForm.toLowerCase(); write("<lemma-name>" + lemmaForm + "</lemma-name>\n", out); write("</form>\n", out); } write("</forms>\n", out); } } catch (FileNotFoundException e) { throw new ApplicationException(e); } catch (UnsupportedEncodingException e) { throw new ApplicationException(e); } finally { // always close the stream if (in != null) try { in.close(); } catch (Exception e) { } if (out != null) try { out.close(); } catch (Exception e) { } } } private void write(byte[] inputBytes, BufferedOutputStream out) throws ApplicationException { try { out.write(inputBytes, 0, inputBytes.length); out.flush(); } catch (IOException e) { throw new ApplicationException(e); } } private void write(String outStr, BufferedOutputStream out) throws ApplicationException { try { byte[] bytes = outStr.getBytes("utf-8"); out.write(bytes, 0, bytes.length); out.flush(); } catch (IOException e) { throw new ApplicationException(e); } } private String transcodeFromBetaCode2Unicode(String inputStr) throws ApplicationException { Transcoder transcoder = Transcoder.getInstance(); String encodedUnicodeForm = transcoder.transcodeFromBetaCode2Unicode(inputStr); return encodedUnicodeForm; } private void beginOperation() { beginOfOperation = new Date(); } private void endOperation() { endOfOperation = new Date(); } }