/*
 * Source: software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/app/MorphologyCache.java @ 19:4a3641ae14d2
 *
 * Commit message: "Erstellung" (German for "creation")
 * Author: Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
 * Date: Wed, 09 Nov 2011 15:32:05 +0100
 * Parents: (none)
 * Children: 7d6d969b10cf
 */

package de.mpg.mpiwg.berlin.mpdl.lt.morph.app;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.Enumeration;
import java.util.Hashtable;

import java.util.logging.Logger;

import de.mpg.mpiwg.berlin.mpdl.lt.general.Language;
import de.mpg.mpiwg.berlin.mpdl.lt.general.Constants;
import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Form;
import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma;
import de.mpg.mpiwg.berlin.mpdl.lt.morph.db.DBMorphHandler;
import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.Normalizer;
import de.mpg.mpiwg.berlin.mpdl.lucene.util.LuceneUtil;
import de.mpg.mpiwg.berlin.mpdl.util.Util;
import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;

public class MorphologyCache {
  private static MorphologyCache instance;
  private static Logger LOGGER = Logger.getLogger(MorphologyCache.class.getName());
  private static String DATA_DIR = Constants.getInstance().getDataDir();
  private static String DB_DIR_DONATUS = DATA_DIR + "/dataBerkeleyDB/donatus";
  public static int QUERY_MODE = 0;
  public static int DOCUMENT_MODE = 1;
  private static int MAX_HASHTABLE_SIZE = Constants.MORPHOLOGY_CACHE_SIZE;
  protected int mode = QUERY_MODE;
  private Hashtable<String, Hashtable<String, Lemma>> forms = new Hashtable<String, Hashtable<String, Lemma>>();  // cache of forms: hashKey is formName
  private Hashtable<String, Lemma> lemmas = new Hashtable<String, Lemma>();  // cache of lemmas: hashKey is lemmaName
  private DBMorphHandler dbMorphHandlerStatic;  // handles static morph data (BerkeleyDB)
  private Date beginOfOperation;
  private Date endOfOperation;
  
  public static MorphologyCache getInstance() throws ApplicationException {
    if (instance == null) {
      instance = new MorphologyCache();
      instance.init();
    }
    return instance;
  }

  private void init() throws ApplicationException {
    instance.beginOperation();
    dbMorphHandlerStatic = new DBMorphHandler(DB_DIR_DONATUS);
    dbMorphHandlerStatic.start();
    dbMorphHandlerStatic.openDatabases();
    instance.endOperation();
    Double elapsedTime = new Util().getSecondWithMillisecondsBetween(instance.beginOfOperation, instance.endOfOperation);
    LOGGER.info("Morphology db cache: opened (needed " + elapsedTime + " seconds)");
  }
  
  public int getMode() {
    return mode;
  }
  
  public void setMode(int newMode) {
    this.mode = newMode;
  }
  
  public void end() throws ApplicationException {
    dbMorphHandlerStatic.closeDatabases();
    LOGGER.info("Morphology db cache: closed");
  }

  public ArrayList<Lemma> getLemmasByFormName(String lang, String formNameArg, boolean normalize) throws ApplicationException {
    String language = Language.getInstance().getLanguageId(lang);
    ArrayList<Lemma> retFormLemmas = null;
    String formName = formNameArg;
    if (normalize) {
      Normalizer normalizer = new Normalizer(language);
      formName = normalizer.normalize(formNameArg);
    }
    // first look in local cache
    String key = language + "###" + formName;
    Hashtable<String, Lemma> formLemmasHashtable = forms.get(key);
    if (formLemmasHashtable == null) {
      ArrayList<Lemma> dbFormLemmas = readLemmasByFormName(language, formName);
      // put lemmas into local cache
      int localHashTableSize = forms.size();
      if (localHashTableSize >= MAX_HASHTABLE_SIZE) {
        clearCache();
      }
      if (dbFormLemmas != null && ! dbFormLemmas.isEmpty()) {
        formLemmasHashtable = new Hashtable<String, Lemma>();
        for (int i=0; i<dbFormLemmas.size(); i++) {
          Lemma lemma = dbFormLemmas.get(i);
          String lemmaName = lemma.getLemmaName();
          String lemmaKey = language + "###" + lemmaName;
          Lemma localLemma = lemmas.get(lemmaKey);
          if (localLemma == null) {
            ArrayList<Form> lemmaForms = readFormsByLemmaName(language, lemmaName);
            lemma.setForms(lemmaForms);
            lemmas.put(lemmaKey, lemma);
          } else {
            lemma = localLemma;
          }
          formLemmasHashtable.put(lemmaKey, lemma);
        }
        forms.put(key, formLemmasHashtable);
      }
    } 
    retFormLemmas = new ArrayList<Lemma>();
    if (formLemmasHashtable != null) {
      Enumeration<String> formLemmasKeys = formLemmasHashtable.keys();
      while(formLemmasKeys.hasMoreElements()) {
        String lemmaKey = formLemmasKeys.nextElement();
        Lemma l = formLemmasHashtable.get(lemmaKey);
        retFormLemmas.add(l);
      }
    }
    Collections.sort(retFormLemmas);
    return retFormLemmas;
  }
  
  public Lemma getLemma(String lang, String lemmaNameArg, boolean normalize) throws ApplicationException {
    String language = Language.getInstance().getLanguageId(lang);
    String lemmaName = lemmaNameArg;
    if (normalize) {
      Normalizer normalizer = new Normalizer(language);
      lemmaName = normalizer.normalize(lemmaNameArg);
    }
    // first look in local cache
    String key = language + "###" + lemmaName;
    Lemma lemma = lemmas.get(key);
    if (lemma == null) {
      ArrayList<Form> dbLemmaForms = readFormsByLemmaName(language, lemmaName);
      if (dbLemmaForms != null && dbLemmaForms.size() > 0) {
        lemma = new Lemma();
        lemma.setLemmaName(lemmaName);
        lemma.setLanguage(language);
        lemma.setProvider(dbLemmaForms.get(0).getProvider());
        lemma.setForms(dbLemmaForms);
        lemmas.put(lemmaName, lemma);
      }
    }
    return lemma;
  }
  
  public ArrayList<Form> getFormsByLuceneQuery(String lang, String luceneQueryString, boolean normalize) throws ApplicationException {
    String language = Language.getInstance().getLanguageId(lang);
    ArrayList<Form> result = new ArrayList<Form>();
    luceneQueryString = luceneQueryString.toLowerCase();
    ArrayList<String> formsFromQuery = getVariantsFromLuceneQuery(luceneQueryString);
    if (! (formsFromQuery == null || formsFromQuery.isEmpty())) {
      for (int i=0; i<formsFromQuery.size(); i++) {
        String formStr = formsFromQuery.get(i);
        if (normalize) {
          Normalizer normalizer = new Normalizer(language);
          formStr = normalizer.normalize(formStr);
        }
        ArrayList<Lemma> formLemmas = null;
        // lemma mode: if formName contains "lemmalemma" then the lemma itself is fetched
        if (formStr.startsWith("lemmalemma")) {
          formLemmas = new ArrayList<Lemma>();
          String lemmaName = formStr.substring(10);
          Lemma lemma = getLemma(language, lemmaName, false);
          formLemmas.add(lemma);
        } else {
          formLemmas = getLemmasByFormName(language, formStr, false);
        }
        if (formLemmas != null && ! formLemmas.isEmpty()) {
          for (int j=0; j<formLemmas.size(); j++) {
            Lemma l = formLemmas.get(j);
            ArrayList<Form> lemmaForms = l.getFormsList();
            result.addAll(lemmaForms);
          }
        }
      }
    }
    return result;
  }

  public ArrayList<Lemma> getLemmasByLuceneQuery(String lang, String luceneQueryString, boolean normalize) throws ApplicationException {
    String language = Language.getInstance().getLanguageId(lang);
    Hashtable<String, Lemma> lemmas = new Hashtable<String, Lemma>();
    luceneQueryString = luceneQueryString.toLowerCase();
    ArrayList<String> formsFromQuery = getVariantsFromLuceneQuery(luceneQueryString);
    if (! (formsFromQuery == null || formsFromQuery.isEmpty())) {
      for (int i=0; i<formsFromQuery.size(); i++) {
        String formStr = formsFromQuery.get(i);
        if (normalize) {
          Normalizer normalizer = new Normalizer(language);
          formStr = normalizer.normalize(formStr);
        }
        ArrayList<Lemma> formLemmas = null;
        // lemma mode: if formName starts with "lemmalemma" then the lemma itself is fetched
        if (formStr.startsWith("lemmalemma")) {
          formLemmas = new ArrayList<Lemma>();
          String lemmaName = formStr.substring(10);
          Lemma lemma = getLemma(language, lemmaName, false);
          formLemmas.add(lemma);
        } else {
          formLemmas = getLemmasByFormName(language, formStr, false);
        }
        if (formLemmas != null) {
          for (int j=0; j<formLemmas.size(); j++) {
            Lemma lemma = formLemmas.get(j);
            lemmas.put(lemma.getLemmaName(), lemma);
          }
        }
      }
    }
    ArrayList<Lemma> result = new ArrayList<Lemma>();
    if (lemmas != null) {
      Enumeration<String> formLemmasKeys = lemmas.keys();
      while(formLemmasKeys.hasMoreElements()) {
        String lemmaKey = formLemmasKeys.nextElement();
        Lemma l = lemmas.get(lemmaKey);
        result.add(l);
      }
    }
    Collections.sort(result);
    if (result.isEmpty())
      return null;
    else 
      return result;
  }
  
  public ArrayList<String> getIndexKeysByLemmaNames(String lang, ArrayList<String> lemmaNames) throws ApplicationException {
    String language = Language.getInstance().getLanguageId(lang);
    Hashtable<String, String> indexKeys = new Hashtable<String, String>();
    for (int j=0; j<lemmaNames.size(); j++) {
      String lemmaName = lemmaNames.get(j);
      Lemma lemma = getLemma(language, lemmaName, false);
      indexKeys.put(lemmaName, lemmaName);
      if (lemma != null) {
        ArrayList<Form> lemmaForms = lemma.getFormsList();
        for (int k=0; k<lemmaForms.size(); k++) {
          Form form = lemmaForms.get(k);
          ArrayList<Lemma> fLemmas = getLemmasByFormName(language, form.getFormName(), false);
          if (fLemmas != null) {
            String indexKey = "";
            if (fLemmas.size() == 1) {
              indexKey = fLemmas.get(0).getLemmaName();
            } else {
              for (int l=0; l<fLemmas.size(); l++) {
                Lemma lem = fLemmas.get(l);
                indexKey = indexKey + "+++" + lem.getLemmaName(); 
              }
              indexKeys.put(indexKey, indexKey);
            }
          }
        }
      }
    }
    ArrayList<String> result = new ArrayList<String>();
    if (indexKeys != null) {
      Enumeration<String> indexKeysKeys = indexKeys.keys();
      while(indexKeysKeys.hasMoreElements()) {
        String indexKey = indexKeysKeys.nextElement();
        result.add(indexKey);
      }
    }
    Collections.sort(result);
    if (result.isEmpty())
      return null;
    else 
      return result;
  }
  
  private void clearCache() {
    forms = null;
    lemmas = null;
    forms = new Hashtable<String, Hashtable<String, Lemma>>();
    lemmas = new Hashtable<String, Lemma>(); 
  }

  private ArrayList<Lemma> readLemmasByFormName(String lang, String formName) throws ApplicationException {
    String language = Language.getInstance().getLanguageId(lang);
    ArrayList<Lemma> lemmasStatic = dbMorphHandlerStatic.readLemmas(language, formName);
    return lemmasStatic;
  }

  private ArrayList<Form> readFormsByLemmaName(String lang, String lemmaName) throws ApplicationException {
    String language = Language.getInstance().getLanguageId(lang);
    ArrayList<Form> formsStatic = dbMorphHandlerStatic.readForms(language, lemmaName);
    return formsStatic;
  }
    
  private ArrayList<String> getVariantsFromLuceneQuery(String queryString) {
    LuceneUtil luceneUtil = LuceneUtil.getInstance();
    ArrayList<String> variants = luceneUtil.getVariantsFromLuceneQuery(queryString);
    return variants;
  }

  private void beginOperation() {
    beginOfOperation = new Date();
  }

  private void endOperation() {
    endOfOperation = new Date();
  }
}