diff software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/lex/db/LexHandler.java @ 0:408254cf2f1d

Erstellung
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Wed, 24 Nov 2010 17:24:23 +0100
parents
children 2396a569e446
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/lex/db/LexHandler.java	Wed Nov 24 17:24:23 2010 +0100
@@ -0,0 +1,175 @@
+package de.mpg.mpiwg.berlin.mpdl.lt.lex.db;
+
+import java.io.UnsupportedEncodingException;
+import java.util.ArrayList;
+import java.util.Date;
+
+import com.sleepycat.je.Cursor;
+import com.sleepycat.je.Database;
+import com.sleepycat.je.DatabaseEntry;
+import com.sleepycat.je.DatabaseException;
+import com.sleepycat.je.LockMode;
+import com.sleepycat.je.OperationStatus;
+
+import de.mpg.mpiwg.berlin.mpdl.util.Util;
+import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
+import de.mpg.mpiwg.berlin.mpdl.general.MpdlConstants;
+import de.mpg.mpiwg.berlin.mpdl.lt.general.Transcoder;
+import de.mpg.mpiwg.berlin.mpdl.lt.lex.app.Lexica;
+import de.mpg.mpiwg.berlin.mpdl.lt.lex.app.Lexicon;
+import de.mpg.mpiwg.berlin.mpdl.lt.lex.app.LexiconEntry;
+import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma;
+import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.MorphologyCache;
+
+/**
+ * Read-only access to the lexicon databases stored below
+ * {@code MpdlConstants.MPDL_EXIST_DATA_DIR + "/dataBerkeleyDB/pollux"}.
+ *
+ * <p>The handler is obtained through {@link #getInstance()}, which opens one
+ * database per lexicon known to {@link Lexica} on first use.
+ */
+public class LexHandler {
+  private static LexHandler instance;
+  private static final String MPDL_DATA_DIR = MpdlConstants.MPDL_EXIST_DATA_DIR;
+  private static final String DB_DIR_LEXICA = MPDL_DATA_DIR + "/dataBerkeleyDB/pollux";
+  private DbEnvLex dbEnvLexica;
+  private Date beginOfOperation;
+  private Date endOfOperation;
+
+  /**
+   * Returns the shared handler, creating and initializing it on first call.
+   *
+   * @return the shared, initialized handler
+   * @throws ApplicationException if the lexicon databases cannot be opened
+   */
+  public static synchronized LexHandler getInstance() throws ApplicationException {
+    if (instance == null) {
+      LexHandler handler = new LexHandler();
+      handler.initReadOnly();
+      // Publish only after initialization succeeded, so that a failed start-up
+      // does not leave a half-initialized handler behind for later callers.
+      instance = handler;
+    }
+    return instance;
+  }
+
+  /**
+   * Collects the keys under which lexicon entries exist for a word form: the
+   * form itself and the lemmas the morphology component delivers for it.
+   *
+   * <p>Every candidate key is checked individually against the lexica of the
+   * given language; a lemma is only reported if an entry for it exists. Each
+   * key is reported at most once.
+   *
+   * @param formName the word form to look up
+   * @param language the language whose lexica and morphology are consulted
+   * @param normalize passed through to the morphology lookup
+   * @return the keys that have a lexicon entry (form first, then lemmas in
+   *         morphology order), or {@code null} if there is none
+   * @throws ApplicationException if the morphology or lexicon lookup fails
+   */
+  public ArrayList<String> getLexEntryKeys(String formName, String language, boolean normalize) throws ApplicationException {
+    ArrayList<String> lexEntryKeys = new ArrayList<String>();
+    MorphologyCache morphologyCache = MorphologyCache.getInstance();
+    ArrayList<Lemma> formLemmas = morphologyCache.getLemmasByFormName(language, formName, normalize);
+    if (hasLexEntryKey(formName, language)) {
+      lexEntryKeys.add(formName);
+    }
+    if (formLemmas != null) {
+      for (Lemma lemma : formLemmas) {
+        String lemmaName = lemma.getLemmaName();
+        boolean isCandidate = lemmaName != null
+            && ! lemmaName.equals(formName)
+            && ! lexEntryKeys.contains(lemmaName);
+        // Check each lemma on its own: a hit for the form or for an earlier
+        // lemma says nothing about whether this lemma has an entry.
+        if (isCandidate && hasLexEntryKey(lemmaName, language)) {
+          lexEntryKeys.add(lemmaName);
+        }
+      }
+    }
+    if (lexEntryKeys.isEmpty()) {
+      return null;
+    }
+    return lexEntryKeys;
+  }
+
+  /**
+   * Tells whether any lexicon of the given language contains an entry for the key.
+   *
+   * @param formName the key to look up
+   * @param language the language whose lexica are searched
+   * @return {@code true} if at least one lexicon has an entry, {@code false}
+   *         otherwise (also when no lexica are delivered for the language)
+   * @throws ApplicationException if a database lookup fails
+   */
+  public boolean hasLexEntryKey(String formName, String language) throws ApplicationException {
+    ArrayList<Lexicon> lexicons = Lexica.getInstance().getLexicons(language);
+    if (lexicons == null) {
+      return false;
+    }
+    for (Lexicon lexicon : lexicons) {
+      // Only the name is needed here, so the lexicon is read directly instead of cloned.
+      if (readEntry(lexicon.getName(), formName) != null) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  /**
+   * Reads the entry stored under a key in one lexicon database.
+   *
+   * @param lexiconName name of the lexicon database
+   * @param formName the key; it is looked up as its UTF-8 bytes
+   * @return the entry built from the stored value (decoded as UTF-8), or
+   *         {@code null} if the key is not found
+   * @throws ApplicationException wrapping any database or encoding error
+   */
+  public LexiconEntry readEntry(String lexiconName, String formName) throws ApplicationException {
+    try {
+      DatabaseEntry dbEntryKey = new DatabaseEntry(formName.getBytes("utf-8"));
+      DatabaseEntry foundValue = new DatabaseEntry();
+      Database lexDB = dbEnvLexica.getLexiconDB(lexiconName);
+      Cursor cursor = lexDB.openCursor(null, null);
+      OperationStatus operationStatus;
+      try {
+        operationStatus = cursor.getSearchKey(dbEntryKey, foundValue, LockMode.DEFAULT);
+      } finally {
+        // Release the cursor even when the search throws.
+        cursor.close();
+      }
+      if (operationStatus != OperationStatus.SUCCESS) {
+        return null;
+      }
+      String content = new String(foundValue.getData(), "utf-8");
+      return new LexiconEntry(lexiconName, formName, content);
+    } catch (DatabaseException e) {
+      throw new ApplicationException(e);
+    } catch (UnsupportedEncodingException e) {
+      throw new ApplicationException(e);
+    }
+  }
+
+  /**
+   * Transcodes a string between encodings by delegating to {@link Transcoder}.
+   * Supported pairs are "buckwalter" to "unicode" and "betacode" to "unicode".
+   *
+   * @param fromEncoding name of the source encoding
+   * @param toEncoding name of the target encoding
+   * @param inputStr the text to transcode
+   * @return the transcoded text, or {@code null} if the encoding pair is not
+   *         supported (including {@code null} encoding names)
+   * @throws ApplicationException if the transcoder fails
+   */
+  public String transcode(String fromEncoding, String toEncoding, String inputStr) throws ApplicationException {
+    Transcoder transcoder = Transcoder.getInstance();
+    boolean toUnicode = "unicode".equals(toEncoding);
+    if (toUnicode && "buckwalter".equals(fromEncoding)) {
+      return transcoder.transcodeFromBuckwalter2Unicode(inputStr);
+    }
+    if (toUnicode && "betacode".equals(fromEncoding)) {
+      return transcoder.transcodeFromBetaCode2Unicode(inputStr);
+    }
+    return null;
+  }
+
+  /**
+   * Command line check: reads two sample entries, prints them and the elapsed time.
+   *
+   * @param args not used
+   * @throws ApplicationException if initialization or a lookup fails
+   */
+  public static void main(String[] args) throws ApplicationException {
+    LexHandler handler = getInstance();
+    handler.beginOperation();
+    System.out.print("Start ...");
+    try {
+      handler.readSampleData();
+    } finally {
+      // Close the databases and the environment even if the sample read fails.
+      handler.end();
+    }
+    handler.endOperation();
+    Double elapsedTime = new Util().getSecondWithMillisecondsBetween(handler.beginOfOperation, handler.endOfOperation);
+    System.out.println("End.");
+    System.out.println("Needed time: " + elapsedTime + " seconds");
+  }
+
+  /**
+   * Sets up the read-only database environment and opens one database for
+   * every lexicon known to {@link Lexica}.
+   */
+  private void initReadOnly() throws ApplicationException {
+    dbEnvLexica = new DbEnvLex();
+    dbEnvLexica.setDataDir(DB_DIR_LEXICA);
+    dbEnvLexica.initReadOnly();
+    for (Lexicon lexicon : Lexica.getInstance().getLexicons()) {
+      dbEnvLexica.openDatabase(lexicon.getName());
+    }
+  }
+
+  /** Reads one Greek and one Latin sample entry and prints their content. */
+  private void readSampleData() throws ApplicationException {
+    String l1 = readSampleContent("autenrieth", "au)to/s"); // greek: see also bonitz and lsj
+    String l2 = readSampleContent("ls", "laudabilis");  // latin
+    System.out.println("Autenrieth: autos: " + l1);
+    System.out.println("Lewis & Short: Laudabilis: " + l2);
+  }
+
+  /** Returns the content of an entry, or a placeholder text if the entry does not exist. */
+  private String readSampleContent(String lexiconName, String formName) throws ApplicationException {
+    LexiconEntry entry = readEntry(lexiconName, formName);
+    if (entry == null) {
+      return "(no entry found)";
+    }
+    return entry.getContent();
+  }
+
+  /** Closes the database of every lexicon and then the database environment. */
+  private void end() throws ApplicationException {
+    for (Lexicon lexicon : Lexica.getInstance().getLexicons()) {
+      dbEnvLexica.closeDatabase(lexicon.getName());
+    }
+    dbEnvLexica.close();
+  }
+
+  /** Records the start time of a measured operation. */
+  private void beginOperation() {
+    beginOfOperation = new Date();
+  }
+
+  /** Records the end time of a measured operation. */
+  private void endOperation() {
+    endOfOperation = new Date();
+  }
+
+}
\ No newline at end of file