view software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/dict/db/LexHandler.java @ 19:4a3641ae14d2

Erstellung
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Wed, 09 Nov 2011 15:32:05 +0100
parents
children 7d6d969b10cf
line wrap: on
line source

package de.mpg.mpiwg.berlin.mpdl.lt.dict.db;

import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.logging.Logger;

import com.sleepycat.je.Cursor;
import com.sleepycat.je.Database;
import com.sleepycat.je.DatabaseEntry;
import com.sleepycat.je.DatabaseException;
import com.sleepycat.je.LockMode;
import com.sleepycat.je.OperationStatus;

import de.mpg.mpiwg.berlin.mpdl.util.Util;
import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
import de.mpg.mpiwg.berlin.mpdl.lt.dict.app.Lexica;
import de.mpg.mpiwg.berlin.mpdl.lt.dict.app.Lexicon;
import de.mpg.mpiwg.berlin.mpdl.lt.dict.app.LexiconEntry;
import de.mpg.mpiwg.berlin.mpdl.lt.general.Constants;
import de.mpg.mpiwg.berlin.mpdl.lt.general.Language;
import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma;
import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.MorphologyCache;
import de.mpg.mpiwg.berlin.mpdl.lt.text.transcode.Transcoder;

public class LexHandler {
  // shared handler, created lazily by getInstance()
  private static LexHandler instance;
  private static Logger LOGGER = Logger.getLogger(LexHandler.class.getName());
  // base data directory as delivered by the application constants
  private static String DATA_DIR = Constants.getInstance().getDataDir(); 
  // directory which is handed to the lexicon database environment in initReadOnly()
  private static String DB_DIR_LEXICA = DATA_DIR + "/dataBerkeleyDB/pollux";
  // database environment through which all lexicon databases are opened, read and closed
  private DbEnvLex dbEnvLexica;
  // time stamps set by beginOperation() and endOperation(); main() uses them to report the elapsed time
  private Date beginOfOperation;
  private Date endOfOperation;
  
  /**
   * Delivers the shared handler. On first use the handler is created and
   * initialized by initReadOnly().
   *
   * NOTE(review): the lazy creation is not synchronized - confirm that the
   * first call cannot happen concurrently.
   *
   * @return the shared handler
   * @throws ApplicationException if the initialization fails
   */
  public static LexHandler getInstance() throws ApplicationException {
    if (instance == null) {
      // publish the handler only after its initialization succeeded: otherwise a failed
      // initReadOnly() would leave a half initialized handler behind which every
      // later call would deliver
      LexHandler handler = new LexHandler();
      handler.initReadOnly();
      instance = handler;
    }
    return instance;
  }

  /**
   * Closes the database of each local lexicon and afterwards the database
   * environment.
   *
   * @throws ApplicationException passed on from the lexica or the database environment
   */
  public void end() throws ApplicationException {
    for (Lexicon localLexicon : Lexica.getInstance().getLocalLexicons()) {
      dbEnvLexica.closeDatabase(localLexicon.getName());
    }
    dbEnvLexica.close();
    LOGGER.info("Lexicon db cache: closed");
  }

  /**
   * Delivers the lemmas of all word forms of a query. The query is split at
   * blanks; for each word form the morphology cache is asked. If it knows no
   * lemma for a word form, a lemma is created dynamically from the word form
   * itself so that the word form can still be looked up in the lexicon.
   *
   * @param query one or more word forms, separated by blanks
   * @param type "form": the query contains word forms, "lemma": the query contains lemma names;
   *             with every other value only dynamically created lemmas are delivered
   * @param language language which is passed to the morphology cache
   * @param normalization "none": no normalization; "norm" and every other value: with normalization
   * @return sorted lemmas, or null if the query contains no word form
   * @throws ApplicationException passed on from the morphology cache
   */
  public ArrayList<Lemma> getLemmas(String query, String type, String language, String normalization) throws ApplicationException {
    ArrayList<Lemma> lexLemmas = new ArrayList<Lemma>();
    MorphologyCache morphologyCache = MorphologyCache.getInstance();
    // only "none" switches the normalization off, every other value normalizes  // TODO reg and reg+norm
    boolean normalize = ! "none".equals(normalization);
    // get lemmas of all forms in query
    String[] queryForms = query.split(" ");
    for (int k=0; k<queryForms.length; k++) {
      String queryForm = queryForms[k];
      // leading or consecutive blanks deliver empty tokens: they are no word forms
      if (queryForm.length() == 0)
        continue;
      ArrayList<Lemma> lemmas = null;
      if ("form".equals(type)) {
        lemmas = morphologyCache.getLemmasByFormName(language, queryForm, normalize);
      } else if ("lemma".equals(type)) {
        lemmas = new ArrayList<Lemma>();
        Lemma l = morphologyCache.getLemma(language, queryForm, normalize);
        if (l != null)
          lemmas.add(l);
      }
      if (lemmas != null && ! lemmas.isEmpty()) {
        lexLemmas.addAll(lemmas);
      } else {
        Lemma l = new Lemma("created dynamically cause no lemma is available", language, queryForm); // at least the word form is added for finding it in the lexicon
        lexLemmas.add(l);
      }
    }
    Collections.sort(lexLemmas);
    if (lexLemmas.isEmpty())
      return null;
    else 
      return lexLemmas;
  }

  /**
   * Looks up the given lemmas in the lexicons and delivers each lexicon which
   * contains at least one of them, filled with the entries which were found.
   *
   * @param lexLemmas lemmas to look up; may be null (getLemmas() delivers null if it found nothing)
   * @param language language of the lemmas; selects the lexicons if no lexicon name is given
   * @param lexiconName name of the only lexicon to search in, or null to search in all lexicons of the language
   * @return sorted list of lexicon clones with the found entries; empty if nothing was found
   * @throws ApplicationException passed on from the lexica or the database access
   */
  public ArrayList<Lexicon> getLexEntries(ArrayList<Lemma> lexLemmas, String language, String lexiconName) throws ApplicationException {
    ArrayList<Lexicon> retLexicons = new ArrayList<Lexicon>();
    if (lexLemmas == null)
      return retLexicons;
    ArrayList<Lexicon> lexicons = null;
    if (lexiconName != null) {
      lexicons = new ArrayList<Lexicon>();
      Lexicon lexicon = Lexica.getInstance().getLexicon(lexiconName);
      if (lexicon != null)
        lexicons.add(lexicon);
    } else {
      lexicons = Lexica.getInstance().getLexicons(language);
    }
    if (lexicons != null) {
      for (int i=0; i<lexicons.size(); i++) {
        Lexicon lexicon = lexicons.get(i).clone(); // clone without lexicon entries
        for (int j=0; j<lexLemmas.size(); j++) {
          String lemmaName = lexLemmas.get(j).getLemmaName();
          if (Language.getInstance().isGerman(language)) {
            // german lemma names are looked up with umlauts and sharp s instead of their
            // two letter transcription. The characters are written as unicode escapes so that
            // they do not depend on the encoding with which this source file is read:
            // U+00E4 a-umlaut, U+00F6 o-umlaut, U+00FC u-umlaut, U+00DF sharp s
            lemmaName = lemmaName.replace("ae", "\u00e4");
            lemmaName = lemmaName.replace("oe", "\u00f6");
            lemmaName = lemmaName.replace("ue", "\u00fc");
            lemmaName = lemmaName.replace("ss", "\u00df");
          }
          LexiconEntry lexEntry = getEntry(lexicon, lemmaName);
          if (lexEntry != null) {
            lexicon.addEntry(lexEntry); // add entries to the cloned lexicon
          }
        }
        if (! lexicon.isEmpty())
          retLexicons.add(lexicon);
      }
    }
    Collections.sort(retLexicons);
    return retLexicons;
  }

  /**
   * Delivers the keys under which a word form can be looked up in the local
   * lexicons: the word form itself if it has a lexicon entry, and the names of
   * its lemmas as delivered by the morphology cache. Each key is delivered
   * only once.
   *
   * NOTE(review): as soon as the word form or one lemma has a lexicon entry,
   * all following lemma names are added without being checked themselves -
   * confirm that this is intended.
   *
   * @param formName word form
   * @param language language of the word form
   * @param normalize passed to the morphology cache when it is asked for the lemmas of the word form
   * @return lexicon entry keys, or null if there are none
   * @throws ApplicationException passed on from the morphology cache or the database access
   */
  public ArrayList<String> getLexEntryKeys(String formName, String language, boolean normalize) throws ApplicationException {
    ArrayList<String> lexEntryKeys = new ArrayList<String>();
    MorphologyCache morphologyCache = MorphologyCache.getInstance();
    ArrayList<Lemma> formLemmas = morphologyCache.getLemmasByFormName(language, formName, normalize);
    boolean hasLexEntry = hasLexEntryKey(formName, language);
    if (hasLexEntry)
      lexEntryKeys.add(formName);
    if (formLemmas != null) {
      // for these languages each lemma name is delivered without a check
      // TODO add lexica for these languages to the BerkeleyDB (for nl also a better morphology)
      boolean addEachLemma = language.equals("de") || language.equals("fr") || language.equals("nl");
      for (int j=0; j<formLemmas.size(); j++) {
        Lemma l = formLemmas.get(j);
        String lName = l.getLemmaName();
        if (! hasLexEntry) {
          hasLexEntry = hasLexEntryKey(lName, language);
        }
        boolean isKey = addEachLemma || (! lName.equals(formName) && hasLexEntry);
        // a lemma name could be delivered by both rules or by several lemmas: add it only once
        if (isKey && ! lexEntryKeys.contains(lName)) {
          lexEntryKeys.add(lName);
        }
      }
    }
    if(lexEntryKeys.isEmpty())
      return null;
    else
      return lexEntryKeys;
  }
  
  /**
   * Tests if one of the local lexicons of the language contains an entry with
   * the given key.
   *
   * @param formName key to look up
   * @param language language which selects the local lexicons
   * @return true if an entry was found or if the language is "zh", else false
   * @throws ApplicationException passed on from the lexica or the database access
   */
  public boolean hasLexEntryKey(String formName, String language) throws ApplicationException {
    if (language.equals("zh"))   // each chinese character always has a lexicon entry
      return true;
    ArrayList<Lexicon> statLexicons = Lexica.getInstance().getLocalLexicons(language);
    if (statLexicons == null)
      return false;
    for (int i=0; i<statLexicons.size(); i++) {
      // only the name of the lexicon is needed for the lookup, so the lexicon is not cloned
      String lexiconName = statLexicons.get(i).getName();
      if (readEntry(lexiconName, formName) != null)
        return true;
    }
    return false;
  }
  
  /**
   * Reads one page (50 entries) of each local lexicon of the language,
   * starting at the given prefix.
   *
   * @param language language which selects the local lexicons
   * @param formPrefix key prefix at which reading starts
   * @param pageNumber number of the page; page 1 covers the entries 1 to 50
   * @return clones of the lexicons for which entries were read, filled with these entries; null if there are none
   * @throws ApplicationException passed on from the lexica or the database access
   */
  public ArrayList<Lexicon> getLexEntriesBeginningWith(String language, String formPrefix, int pageNumber) throws ApplicationException {
    final int pageSize = 50;
    final int to = pageNumber * pageSize;
    final int from = to - pageSize + 1;
    ArrayList<Lexicon> statLexicons = Lexica.getInstance().getLocalLexicons(language);
    if (statLexicons == null)
      return null;
    ArrayList<Lexicon> retLexicons = null;
    for (Lexicon statLexicon : statLexicons) {
      Lexicon lexicon = statLexicon.clone(); // clone without lexicon entries
      ArrayList<LexiconEntry> lexEntries = readEntriesBeginningWith(lexicon.getName(), formPrefix, from, to);
      // TODO merge the entries and remove duplicates
      if (lexEntries == null)
        continue;
      lexicon.addEntries(lexEntries);
      if (retLexicons == null)
        retLexicons = new ArrayList<Lexicon>();
      retLexicons.add(lexicon);
    }
    return retLexicons;
  }
  
  /**
   * Reads one page (50 entries) of the given lexicon, starting at the given
   * prefix.
   *
   * @param lexiconName name of the lexicon
   * @param formPrefix key prefix at which reading starts
   * @param pageNumber number of the page; page 1 covers the entries 1 to 50
   * @return list with a clone of the lexicon, filled with the entries which were read;
   *         null if the lexicon is unknown or if no entries were read
   * @throws ApplicationException passed on from the lexica or the database access
   */
  public ArrayList<Lexicon> getLexEntriesByLexiconBeginningWith(String lexiconName, String formPrefix, int pageNumber) throws ApplicationException {
    int pageSize = 50;
    int from = (pageNumber * pageSize) - pageSize + 1;
    int to = pageNumber * pageSize;
    // the lexica deliver null for an unknown lexicon name: test this before cloning
    Lexicon knownLexicon = Lexica.getInstance().getLexicon(lexiconName);
    if (knownLexicon == null)
      return null;
    Lexicon lexicon = knownLexicon.clone();
    ArrayList<Lexicon> retLexicons = null;
    if (lexicon != null) {
      ArrayList<LexiconEntry> lexEntries = readEntriesBeginningWith(lexiconName, formPrefix, from, to);
      if (lexEntries != null) {
        lexicon.addEntries(lexEntries);
        retLexicons = new ArrayList<Lexicon>();
        retLexicons.add(lexicon);
      }
    }
    return retLexicons;
  }

  /**
   * Delivers the entry of a lexicon for a word form. A local lexicon is read
   * from the database; if it has a query url, the entry additionally gets a
   * remote url which is built from the query url and the word form (transcoded
   * to Beta Code for greek and to Buckwalter for arabic). For every other
   * lexicon its dynamic entry is delivered.
   *
   * @param lexicon lexicon to search in
   * @param formName word form to look up
   * @return the entry, or null if a local lexicon contains none
   * @throws ApplicationException passed on from the database access or the transcoder
   */
  private LexiconEntry getEntry(Lexicon lexicon, String formName) throws ApplicationException {
    if (! lexicon.isLocalLexicon())
      return lexicon.getDynamicEntry(formName);
    LexiconEntry localEntry = readEntry(lexicon.getName(), formName);
    String queryUrl = lexicon.getQueryUrl();
    if (localEntry == null || lexicon.getQueryUrl() == null)
      return localEntry;
    // the word form is appended to the query url in the transcription of the source language
    Language languages = Language.getInstance();
    String sourceLanguage = lexicon.getSourceLanguage();
    String urlForm = formName;
    if (languages.isGreek(sourceLanguage)) {
      urlForm = Transcoder.getInstance().transcodeFromUnicode2BetaCode(formName);
    } else if (languages.isArabic(sourceLanguage)) {
      urlForm = Transcoder.getInstance().transcodeFromUnicode2Buckwalter(formName);
    }
    localEntry.setRemoteUrl(queryUrl + urlForm);
    return localEntry;
  }
  
  /**
   * Reads the entry with the given key from the database of a lexicon. Key
   * and value are stored as utf-8 bytes.
   *
   * @param lexiconName name of the lexicon whose database is read
   * @param formName key of the entry
   * @return the entry, or null if the database contains no entry with this key
   * @throws ApplicationException if the database access or the utf-8 conversion fails
   */
  private LexiconEntry readEntry(String lexiconName, String formName) throws ApplicationException {
    LexiconEntry retLexEntry = null;
    try {
      String dbFoundValueStr = null;
      DatabaseEntry dbEntryKey = new DatabaseEntry(formName.getBytes("utf-8"));
      Database lexDB = dbEnvLexica.getLexiconDB(lexiconName);
      DatabaseEntry foundValue = new DatabaseEntry();
      Cursor cursor = lexDB.openCursor(null, null);
      try {
        OperationStatus operationStatus = cursor.getSearchKey(dbEntryKey, foundValue, LockMode.DEFAULT);
        if (operationStatus == OperationStatus.SUCCESS) {
          byte[] foundValueBytes = foundValue.getData();
          dbFoundValueStr = new String(foundValueBytes, "utf-8");
        }
      } finally {
        // close the cursor also if the search or the conversion fails
        cursor.close();
      }
      if (dbFoundValueStr != null) {
        retLexEntry = new LexiconEntry(lexiconName, formName, dbFoundValueStr);
      }
    } catch (DatabaseException e) {
      throw new ApplicationException(e);
    } catch (UnsupportedEncodingException e) {
      throw new ApplicationException(e);
    }
    return retLexEntry;
  }
  
  /**
   * Reads a range of consecutive entries from the database of a lexicon. The
   * cursor is positioned by a key range search with the given prefix and then
   * moved forward; the positions are counted from 1 and the entries at the
   * positions "from" to "to" are delivered.
   *
   * NOTE(review): the keys which are read are not tested against the prefix,
   * so entries behind the last key with this prefix are delivered too -
   * confirm that this is intended.
   *
   * @param lexiconName name of the lexicon whose database is read
   * @param formPrefix key prefix at which reading starts
   * @param from position of the first entry which is delivered
   * @param to position of the last entry which is delivered
   * @return the entries, or null if there are none in this range
   * @throws ApplicationException if the database access or the utf-8 conversion fails
   */
  private ArrayList<LexiconEntry> readEntriesBeginningWith(String lexiconName, String formPrefix, int from, int to) throws ApplicationException {
    ArrayList<LexiconEntry> retLexEntries = new ArrayList<LexiconEntry>();
    try {
      DatabaseEntry dbEntryKey = new DatabaseEntry(formPrefix.getBytes("utf-8"));
      Database lexDB = dbEnvLexica.getLexiconDB(lexiconName);
      DatabaseEntry foundValue = new DatabaseEntry();
      Cursor cursor = lexDB.openCursor(null, null);
      try {
        OperationStatus operationStatus = cursor.getSearchKeyRange(dbEntryKey, foundValue, LockMode.DEFAULT);
        int counter = 1;
        while (operationStatus == OperationStatus.SUCCESS && counter <= to) {
          if (counter >= from) {
            // the key entry is read back here: it is expected to hold the key at the cursor position
            String dbFoundValueStr = new String(foundValue.getData(), "utf-8");
            String dbFoundKeyStr = new String(dbEntryKey.getData(), "utf-8");
            LexiconEntry lexEntry = new LexiconEntry(lexiconName, dbFoundKeyStr, dbFoundValueStr);
            retLexEntries.add(lexEntry);
          }
          operationStatus = cursor.getNext(dbEntryKey, foundValue, LockMode.DEFAULT);
          counter++;
        }
      } finally {
        // close the cursor also if reading or the conversion fails
        cursor.close();
      }
      if (retLexEntries.isEmpty()) {
        return null;
      }
    } catch (DatabaseException e) {
      throw new ApplicationException(e);
    } catch (UnsupportedEncodingException e) {
      throw new ApplicationException(e);
    }
    return retLexEntries;
  }

  /**
   * Reads the sample data from the lexicon databases, closes them and prints
   * the time which was needed.
   *
   * @param args not used
   * @throws ApplicationException passed on from the handler
   */
  public static void main(String[] args) throws ApplicationException {
    LexHandler handler = getInstance();
    handler.beginOperation();
    System.out.print("Start ...");
    handler.readSampleData();
    handler.end();
    handler.endOperation();
    Date begin = handler.beginOfOperation;
    Date end = handler.endOfOperation;
    Double elapsedTime = new Util().getSecondWithMillisecondsBetween(begin, end);
    System.out.println("End.");
    System.out.println("Needed time: " + elapsedTime + " seconds");
  }

  /**
   * Creates the database environment on the lexica directory, initializes it
   * read only and opens the database of each local lexicon.
   *
   * @throws ApplicationException passed on from the lexica or the database environment
   */
  private void initReadOnly() throws ApplicationException {
    dbEnvLexica = new DbEnvLex();
    dbEnvLexica.setDataDir(DB_DIR_LEXICA);
    dbEnvLexica.initReadOnly();
    for (Lexicon localLexicon : Lexica.getInstance().getLocalLexicons()) {
      dbEnvLexica.openDatabase(localLexicon.getName());
    }
    LOGGER.info("Lexicon db cache: opened");
  }
  
  /**
   * Reads one greek and one latin sample entry and prints their contents.
   * For an entry which is not found "null" is printed.
   *
   * @throws ApplicationException passed on from the database access
   */
  private void readSampleData() throws ApplicationException {
    // List<String> dbNames = dbEnvLexica.getEnv().getDatabaseNames();
    LexiconEntry greekEntry = readEntry("autenrieth", "au)to/s"); // greek: see also bonitz and lsj
    LexiconEntry latinEntry = readEntry("ls", "laudabilis");  // latin
    // readEntry() delivers null if the key is not found
    String l1 = (greekEntry == null) ? null : greekEntry.getContent();
    String l2 = (latinEntry == null) ? null : latinEntry.getContent();
    System.out.println("Autenrieth: autos: " + l1);
    System.out.println("Lewis & Short: Laudabilis: " + l2);
  }
  
  /**
   * Stores the current time as the begin of the operation.
   */
  private void beginOperation() {
    Date now = new Date();
    this.beginOfOperation = now;
  }

  /**
   * Stores the current time as the end of the operation.
   */
  private void endOperation() {
    Date now = new Date();
    this.endOfOperation = now;
  }

}