view software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/lex/db/LexHandler.java @ 6:2396a569e446

new functions: externalObjects, normalizer, Unicode2Betacode
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 08 Feb 2011 14:54:09 +0100
parents 408254cf2f1d
children
line wrap: on
line source

package de.mpg.mpiwg.berlin.mpdl.lt.lex.db;

import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Date;

import com.sleepycat.je.Cursor;
import com.sleepycat.je.Database;
import com.sleepycat.je.DatabaseEntry;
import com.sleepycat.je.DatabaseException;
import com.sleepycat.je.LockMode;
import com.sleepycat.je.OperationStatus;

import de.mpg.mpiwg.berlin.mpdl.util.Util;
import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
import de.mpg.mpiwg.berlin.mpdl.general.MpdlConstants;
import de.mpg.mpiwg.berlin.mpdl.lt.general.Transcoder;
import de.mpg.mpiwg.berlin.mpdl.lt.lex.app.Lexica;
import de.mpg.mpiwg.berlin.mpdl.lt.lex.app.Lexicon;
import de.mpg.mpiwg.berlin.mpdl.lt.lex.app.LexiconEntry;
import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma;
import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.MorphologyCache;

/**
 * Read-only access to the lexica that are stored as Berkeley DB (JE) databases,
 * one database per lexicon, keyed by the UTF-8 encoded form name.
 *
 * <p>The handler is obtained through {@link #getInstance()}, which lazily creates a
 * single shared instance and opens every lexicon known to {@link Lexica}.
 */
public class LexHandler {
  private static LexHandler instance;
  private static final String MPDL_DATA_DIR = MpdlConstants.MPDL_EXIST_DATA_DIR;
  private static final String DB_DIR_LEXICA = MPDL_DATA_DIR + "/dataBerkeleyDB/pollux";
  private DbEnvLex dbEnvLexica;
  private Date beginOfOperation;
  private Date endOfOperation;

  /**
   * Returns the shared handler, creating and initializing it on first use.
   *
   * <p>The shared reference is only published after initialization has succeeded, so a
   * failed initialization does not leave a half-initialized handler behind; the next call
   * simply tries again.
   *
   * @return the shared, initialized handler
   * @throws ApplicationException if the database environment or a lexicon database cannot be opened
   */
  public static LexHandler getInstance() throws ApplicationException {
    if (instance == null) {
      LexHandler handler = new LexHandler();
      handler.initReadOnly();
      instance = handler;
    }
    return instance;
  }

  /**
   * Delivers lexical entry keys with the help of the morphology component (lexical entry of
   * the stem of the normalized word form).
   *
   * <p>The form name itself is returned first when it has a lexicon entry, followed by the
   * names of its lemmas. Every key is contained at most once, in the order of discovery.
   *
   * @param formName the word form to look up
   * @param language the language code of the word form (e.g. "de", "fr", "nl", "zh")
   * @param normalize passed through to the morphology cache lookup
   * @return the lexical entry keys, or <code>null</code> if there are none
   * @throws ApplicationException if the morphology or lexicon lookup fails
   */
  public ArrayList<String> getLexEntryKeys(String formName, String language, boolean normalize) throws ApplicationException {
    ArrayList<String> lexEntryKeys = new ArrayList<String>();
    MorphologyCache morphologyCache = MorphologyCache.getInstance();
    ArrayList<Lemma> formLemmas = morphologyCache.getLemmasByFormName(language, formName, normalize);
    boolean hasLexEntry = hasLexEntryKey(formName, language);
    if (hasLexEntry) {
      lexEntryKeys.add(formName);
    }
    if (formLemmas != null) {
      // TODO add lexica for these languages to the BerkeleyDB (and a better morphology for
      // fr and nl); afterwards remove this special case again
      boolean alwaysAddLemma = language.equals("de") || language.equals("fr") || language.equals("nl");
      for (int j = 0; j < formLemmas.size(); j++) {
        String lName = formLemmas.get(j).getLemmaName();
        // NOTE(review): once any form or lemma has an entry the flag stays set, so later
        // lemmas are added without a lookup of their own - confirm that this is intended
        if (! hasLexEntry) {
          hasLexEntry = hasLexEntryKey(lName, language);
        }
        boolean addLemma = alwaysAddLemma || (! lName.equals(formName) && hasLexEntry);
        // avoid duplicate keys (formerly a lemma could be added twice for de, fr and nl)
        if (addLemma && ! lexEntryKeys.contains(lName)) {
          lexEntryKeys.add(lName);
        }
      }
    }
    if (lexEntryKeys.isEmpty()) {
      return null;
    }
    return lexEntryKeys;
  }

  /**
   * Tests whether at least one lexicon of the given language contains an entry for the form.
   *
   * @param formName the key to look up
   * @param language the language code whose lexica are searched
   * @return <code>true</code> if an entry exists (always <code>true</code> for "zh"),
   *         <code>false</code> otherwise or if the language has no lexica
   * @throws ApplicationException if reading from a lexicon database fails
   */
  public boolean hasLexEntryKey(String formName, String language) throws ApplicationException {
    if (language.equals("zh")) {  // every single Chinese character automatically always has a lexicon entry
      return true;
    }
    ArrayList<Lexicon> statLexicons = Lexica.getInstance().getLexicons(language);
    if (statLexicons != null) {
      for (int i = 0; i < statLexicons.size(); i++) {
        Lexicon lexicon = statLexicons.get(i).clone(); // clone without lexicon entries
        LexiconEntry lexEntry = readEntry(lexicon.getName(), formName);
        if (lexEntry != null) {
          return true;
        }
      }
    }
    return false;
  }

  /**
   * Reads the entry stored under the given form name from the given lexicon database.
   *
   * @param lexiconName the name of the lexicon database
   * @param formName the key of the entry; it is looked up in its UTF-8 encoding
   * @return the entry, or <code>null</code> if the lexicon has no entry for the key
   * @throws ApplicationException if the database access or the UTF-8 conversion fails
   */
  public LexiconEntry readEntry(String lexiconName, String formName) throws ApplicationException {
    LexiconEntry retLexEntry = null;
    try {
      String dbFoundValueStr = null;
      DatabaseEntry dbEntryKey = new DatabaseEntry(formName.getBytes("utf-8"));
      Database lexDB = dbEnvLexica.getLexiconDB(lexiconName);
      Cursor cursor = lexDB.openCursor(null, null);
      try {
        DatabaseEntry foundValue = new DatabaseEntry();
        OperationStatus operationStatus = cursor.getSearchKey(dbEntryKey, foundValue, LockMode.DEFAULT);
        if (operationStatus == OperationStatus.SUCCESS) {
          dbFoundValueStr = new String(foundValue.getData(), "utf-8");
        }
      } finally {
        // always release the cursor, also when the lookup or the decoding fails
        cursor.close();
      }
      if (dbFoundValueStr != null) {
        retLexEntry = new LexiconEntry(lexiconName, formName, dbFoundValueStr);
      }
    } catch (DatabaseException e) {
      throw new ApplicationException(e);
    } catch (UnsupportedEncodingException e) {
      throw new ApplicationException(e);
    }
    return retLexEntry;
  }

  /**
   * Transcodes a string between Unicode and one of the encodings "buckwalter" or "betacode".
   *
   * @param fromEncoding the source encoding: "buckwalter", "betacode" or "unicode"
   * @param toEncoding the target encoding: "unicode", "betacode" or "buckwalter"
   * @param inputStr the string to transcode
   * @return the transcoded string, or <code>null</code> if the combination of encodings is not supported
   * @throws ApplicationException if the transcoder fails
   */
  public String transcode(String fromEncoding, String toEncoding, String inputStr) throws ApplicationException {
    String encodedStr = null;
    Transcoder transcoder = Transcoder.getInstance();
    if (fromEncoding.equals("buckwalter") && toEncoding.equals("unicode")) {
      encodedStr = transcoder.transcodeFromBuckwalter2Unicode(inputStr);
    } else if (fromEncoding.equals("betacode") && toEncoding.equals("unicode")) {
      encodedStr = transcoder.transcodeFromBetaCode2Unicode(inputStr);
    } else if (fromEncoding.equals("unicode") && toEncoding.equals("betacode")) {
      encodedStr = transcoder.transcodeFromUnicode2BetaCode(inputStr);
    } else if (fromEncoding.equals("unicode") && toEncoding.equals("buckwalter")) {
      encodedStr = transcoder.transcodeFromUnicode2Buckwalter(inputStr);
    }
    return encodedStr;
  }

  /**
   * Reads two sample entries, prints them and reports the elapsed time.
   *
   * @param args not used
   * @throws ApplicationException if the lexica cannot be opened, read or closed
   */
  public static void main(String[] args) throws ApplicationException {
    LexHandler handler = getInstance();
    handler.beginOperation();
    System.out.print("Start ...");
    try {
      handler.readSampleData();
    } finally {
      // close the databases and the environment even if reading the samples fails
      handler.end();
    }
    handler.endOperation();
    Double elapsedTime = new Util().getSecondWithMillisecondsBetween(handler.beginOfOperation, handler.endOfOperation);
    System.out.println("End.");
    System.out.println("Needed time: " + elapsedTime + " seconds");
  }

  /**
   * Opens the database environment read-only and opens the database of every known lexicon.
   */
  private void initReadOnly() throws ApplicationException {
    dbEnvLexica = new DbEnvLex();
    dbEnvLexica.setDataDir(DB_DIR_LEXICA);
    dbEnvLexica.initReadOnly();
    ArrayList<Lexicon> lexicons = Lexica.getInstance().getLexicons();
    for (int i = 0; i < lexicons.size(); i++) {
      String lexiconName = lexicons.get(i).getName();
      dbEnvLexica.openDatabase(lexiconName);
    }
  }

  /**
   * Prints one Greek and one Latin sample entry; a missing entry is printed as "null".
   */
  private void readSampleData() throws ApplicationException {
    // List<String> dbNames = dbEnvLexica.getEnv().getDatabaseNames();
    LexiconEntry greekEntry = readEntry("autenrieth", "au)to/s"); // greek: see also bonitz and lsj
    LexiconEntry latinEntry = readEntry("ls", "laudabilis");  // latin
    // readEntry delivers null for unknown keys, so guard against it before dereferencing
    String l1 = (greekEntry != null) ? greekEntry.getContent() : null;
    String l2 = (latinEntry != null) ? latinEntry.getContent() : null;
    System.out.println("Autenrieth: autos: " + l1);
    System.out.println("Lewis & Short: Laudabilis: " + l2);
  }

  /**
   * Closes the database of every known lexicon and then the database environment.
   */
  private void end() throws ApplicationException {
    ArrayList<Lexicon> lexicons = Lexica.getInstance().getLexicons();
    for (int i = 0; i < lexicons.size(); i++) {
      String lexiconName = lexicons.get(i).getName();
      dbEnvLexica.closeDatabase(lexiconName);
    }
    dbEnvLexica.close();
  }

  /** Remembers the current time as the start of the measured operation. */
  private void beginOperation() {
    beginOfOperation = new Date();
  }

  /** Remembers the current time as the end of the measured operation. */
  private void endOperation() {
    endOfOperation = new Date();
  }

}