view software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/app/MorphologyCache.java @ 10:59ff47d1e237

TEI Unterstützung, Fehlerbehebungen, externe Objekte
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Fri, 11 Mar 2011 13:33:26 +0100
parents 408254cf2f1d
children
line wrap: on
line source

package de.mpg.mpiwg.berlin.mpdl.lt.morph.app;

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.Enumeration;
import java.util.Hashtable;

import org.apache.log4j.Logger;

import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.MpdlNormalizer;
import de.mpg.mpiwg.berlin.mpdl.lt.general.Language;
import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Form;
import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma;
import de.mpg.mpiwg.berlin.mpdl.lt.morph.db.DBMorphHandler;
import de.mpg.mpiwg.berlin.mpdl.lucene.LuceneUtil;
import de.mpg.mpiwg.berlin.mpdl.util.FileUtil;
import de.mpg.mpiwg.berlin.mpdl.util.Util;
import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
import de.mpg.mpiwg.berlin.mpdl.general.MpdlConstants;

public class MorphologyCache {
  /** Lazily created singleton instance. */
  private static MorphologyCache instance;
  private static final Logger LOGGER = Logger.getLogger(MorphologyCache.class); // Logs to EXIST_HOME/webapp/WEB-INF/logs/exist.log
  // Directory layout below the MPDL data directory
  private static final String MPDL_DATA_DIR = MpdlConstants.MPDL_EXIST_DATA_DIR;
  private static final String DB_DIR_DONATUS = MPDL_DATA_DIR + "/dataBerkeleyDB/donatus";
  private static final String DB_DIR_DYNAMIC = MPDL_DATA_DIR + "/dataBerkeleyDB/dynamic";
  private static final String DATA_FILES_DIR = MPDL_DATA_DIR + "/dataFiles";
  private static final String DATA_FILE_DYNAMIC_FORMS = DATA_FILES_DIR + "/snowball-all-forms.xml";
  // Mode constants; final so that no caller can reassign them by accident
  public static final int QUERY_MODE = 0;
  public static final int DOCUMENT_MODE = 1;
  // The caches are cleared once the forms cache holds this many entries
  private static final int MAX_HASHTABLE_SIZE = MpdlConstants.MORPHOLOGY_CACHE_SIZE;
  protected int mode = QUERY_MODE;
  private Hashtable<String, Hashtable<String, Lemma>> forms = new Hashtable<String, Hashtable<String, Lemma>>();  // cache of forms: hashKey is language + "###" + formName
  private Hashtable<String, Lemma> lemmas = new Hashtable<String, Lemma>();  // cache of lemmas: hashKey is language + "###" + lemmaName
  private DBMorphHandler dbMorphHandlerStatic;  // handles static morph data (BerkeleyDB)
  private DBMorphHandler dbMorphHandlerDynamic; // handles dynamic morph data (BerkeleyDB)
  private OutputStream outputStreamDynamicForms;  // backup file for all dynamic forms
  // Timestamps used to measure how long an operation took
  private Date beginOfOperation;
  private Date endOfOperation;
  
  /**
   * Returns the shared cache instance, creating and initializing it on first use.
   * The method is synchronized so that concurrent first calls cannot initialize
   * the cache twice. If initialization fails the instance is discarded, so that
   * a later call retries instead of handing out a half-initialized cache.
   *
   * @return the initialized singleton
   * @throws ApplicationException if initialization fails
   */
  public static synchronized MorphologyCache getInstance() throws ApplicationException {
    if (instance == null) {
      // init() accesses the static field, so it has to be assigned before the call
      instance = new MorphologyCache();
      boolean initialized = false;
      try {
        instance.init();
        initialized = true;
      } finally {
        if (! initialized) {
          instance = null;
        }
      }
    }
    return instance;
  }

  /**
   * Opens the static and the dynamic morphology databases and the backup file
   * for dynamic forms, and logs how long that took.
   * Works on <code>this</code> rather than on the static singleton field, so the
   * method no longer depends on that field having been assigned beforehand.
   *
   * @throws ApplicationException if a database or the backup file cannot be opened
   */
  private void init() throws ApplicationException {
    LOGGER.info("Mpdl: Init morphology cache ...");
    beginOperation();
    dbMorphHandlerStatic = new DBMorphHandler(DB_DIR_DONATUS);
    dbMorphHandlerStatic.start();
    dbMorphHandlerStatic.openDatabases();
    dbMorphHandlerDynamic = new DBMorphHandler(DB_DIR_DYNAMIC);
    dbMorphHandlerDynamic.start();
    dbMorphHandlerDynamic.openDatabases();
    openDynamicFormsDataFile();
    endOperation();
    Double elapsedTime = new Util().getSecondWithMillisecondsBetween(beginOfOperation, endOfOperation);
    LOGGER.info(" Needed time: " + elapsedTime + " seconds.");
  }
  
  /**
   * @return the current mode value (initially QUERY_MODE)
   */
  public int getMode() {
    return this.mode;
  }
  
  /**
   * Replaces the current mode value.
   *
   * @param newMode the mode to store
   */
  public void setMode(int newMode) {
    mode = newMode;
  }
  
  /**
   * Closes both morphology databases and finalizes the backup file of dynamic forms.
   * Every step is attempted even if an earlier one fails; if several steps fail,
   * the exception of the last failing step is the one that propagates.
   *
   * @throws ApplicationException if closing a database or the backup file fails
   */
  public void end() throws ApplicationException {
    try {
      dbMorphHandlerStatic.closeDatabases();
    } finally {
      try {
        dbMorphHandlerDynamic.closeDatabases();
      } finally {
        closeDynamicFormsDataFile();
      }
    }
  }

  /**
   * Delivers all lemmas a form belongs to, sorted by their natural order.
   * The local cache is consulted first; on a miss the lemmas are read from the
   * database, completed with their forms and cached (only if at least one lemma
   * was found).
   *
   * @param lang language, resolved to a language id via Language
   * @param formNameArg name of the form
   * @param normalize if true the form name is normalized before the lookup
   * @return the sorted lemmas; an empty list if none was found
   * @throws ApplicationException if reading from the database fails
   */
  public ArrayList<Lemma> getLemmasByFormName(String lang, String formNameArg, boolean normalize) throws ApplicationException {
    String language = Language.getInstance().getLanguageId(lang);
    String formName = normalize ? new MpdlNormalizer(language).normalize(formNameArg) : formNameArg;
    String formKey = language + "###" + formName;
    Hashtable<String, Lemma> cachedFormLemmas = forms.get(formKey);
    if (cachedFormLemmas == null) {
      // cache miss: ask the database
      ArrayList<Lemma> dbLemmas = readLemmasByFormName(language, formName);
      // a full cache is emptied before anything new is stored
      if (forms.size() >= MAX_HASHTABLE_SIZE) {
        clearCache();
      }
      if (dbLemmas != null && ! dbLemmas.isEmpty()) {
        cachedFormLemmas = new Hashtable<String, Lemma>();
        for (Lemma dbLemma : dbLemmas) {
          String lemmaName = dbLemma.getLemmaName();
          String lemmaKey = language + "###" + lemmaName;
          Lemma sharedLemma = lemmas.get(lemmaKey);
          if (sharedLemma == null) {
            // lemma not cached yet: complete it with its forms and cache it
            dbLemma.setForms(readFormsByLemmaName(language, lemmaName));
            lemmas.put(lemmaKey, dbLemma);
            sharedLemma = dbLemma;
          }
          cachedFormLemmas.put(lemmaKey, sharedLemma);
        }
        forms.put(formKey, cachedFormLemmas);
      }
    }
    ArrayList<Lemma> sortedLemmas = new ArrayList<Lemma>();
    if (cachedFormLemmas != null) {
      sortedLemmas.addAll(cachedFormLemmas.values());
    }
    Collections.sort(sortedLemmas);
    return sortedLemmas;
  }
  
  /**
   * Delivers the lemma with the given name together with its forms.
   * The local cache is consulted first; on a miss the forms are read from the
   * database and the newly built lemma is cached.
   *
   * @param lang language, resolved to a language id via Language
   * @param lemmaNameArg name of the lemma
   * @param normalize if true the lemma name is normalized before the lookup
   * @return the lemma, or null if no forms were found for it
   * @throws ApplicationException if reading from the database fails
   */
  public Lemma getLemma(String lang, String lemmaNameArg, boolean normalize) throws ApplicationException {
    String language = Language.getInstance().getLanguageId(lang);
    String lemmaName = lemmaNameArg;
    if (normalize) {
      MpdlNormalizer normalizer = new MpdlNormalizer(language);
      lemmaName = normalizer.normalize(lemmaNameArg);
    }
    // first look in local cache
    String key = language + "###" + lemmaName;
    Lemma lemma = lemmas.get(key);
    if (lemma == null) {
      ArrayList<Form> dbLemmaForms = readFormsByLemmaName(language, lemmaName);
      if (dbLemmaForms != null && ! dbLemmaForms.isEmpty()) {
        lemma = new Lemma();
        lemma.setLemmaName(lemmaName);
        lemma.setLanguage(language);
        lemma.setProvider(dbLemmaForms.get(0).getProvider());
        lemma.setForms(dbLemmaForms);
        // store under the same key that is used for the lookup above; storing under
        // the bare lemma name meant that the entry was never found again
        lemmas.put(key, lemma);
      }
    }
    return lemma;
  }
  
  /**
   * Stores a new form: writes it to the dynamic database and to the backup file
   * and updates the local cache. Forms for which isOk() is false are ignored.
   *
   * @param newFlatForm the form to insert
   * @throws ApplicationException if writing to the database or the backup file fails
   */
  public void insertFormDynamic(Form newFlatForm) throws ApplicationException {
    if (! newFlatForm.isOk()) {
      return;
    }
    String provider = newFlatForm.getProvider();
    String lang = newFlatForm.getLanguage();
    String language = Language.getInstance().getLanguageId(lang);
    String lemmaName = newFlatForm.getLemmaName();
    Lemma newFlatLemma = new Lemma(provider, language, lemmaName);
    newFlatLemma.addForm(newFlatForm);
    // write to berkeley db; there is no test if the form is already contained (has to be done before)
    writeFormLemmaDynamic(newFlatForm, newFlatLemma);
    // write to backup file
    String formsXmlStr = newFlatForm.getXmlString();
    writeToDynamicFile(formsXmlStr);
    // fill local cache with new form if it is not too full
    int localHashTableSize = forms.size();
    if (localHashTableSize >= MAX_HASHTABLE_SIZE) {
      clearCache();
    }
    String lemmaKey = language + "###" + lemmaName;
    Lemma localLemma = lemmas.get(lemmaKey);
    if (localLemma == null) {
      // NOTE(review): in this branch the forms cache is not updated - confirm that this is intended
      lemmas.put(lemmaKey, newFlatLemma);
    } else {
      localLemma.addForm(newFlatForm);
      String formName = newFlatForm.getFormName();
      String formKey = language + "###" + formName;
      Hashtable<String, Lemma> formLemmas = forms.get(formKey);
      if (formLemmas == null) {
        formLemmas = new Hashtable<String, Lemma>();
        formLemmas.put(lemmaKey, localLemma);
        forms.put(formKey, formLemmas);
      } else {
        // the inner table is keyed by lemma key; using the form key here overwrote
        // one single entry instead of adding the lemma
        formLemmas.put(lemmaKey, localLemma);
      }
    }
  }
  
  /**
   * Collects the forms of all lemmas which belong to the word variants extracted
   * from a lucene query.
   * A variant starting with "lemmalemma" is treated as a lemma name (the text
   * after that prefix); every other variant is treated as a form name.
   *
   * @param lang language, resolved to a language id via Language
   * @param luceneQueryString the lucene query; it is lower cased before it is evaluated
   * @param normalize if true each variant is normalized before the lookup
   * @return the forms found; an empty list if there are none
   * @throws ApplicationException if reading from the database fails
   */
  public ArrayList<Form> getFormsByLuceneQuery(String lang, String luceneQueryString, boolean normalize) throws ApplicationException {
    String language = Language.getInstance().getLanguageId(lang);
    ArrayList<Form> result = new ArrayList<Form>();
    luceneQueryString = luceneQueryString.toLowerCase();
    ArrayList<String> formsFromQuery = getVariantsFromLuceneQuery(luceneQueryString);
    if (formsFromQuery == null || formsFromQuery.isEmpty()) {
      return result;
    }
    for (int i=0; i<formsFromQuery.size(); i++) {
      String formStr = formsFromQuery.get(i);
      if (normalize) {
        MpdlNormalizer normalizer = new MpdlNormalizer(language);
        formStr = normalizer.normalize(formStr);
      }
      ArrayList<Lemma> formLemmas = null;
      // lemma mode: if formName starts with "lemmalemma" then the lemma itself is fetched
      if (formStr.startsWith("lemmalemma")) {
        formLemmas = new ArrayList<Lemma>();
        String lemmaName = formStr.substring(10);
        Lemma lemma = getLemma(language, lemmaName, false);
        // getLemma delivers null for an unknown lemma; adding it caused a NullPointerException below
        if (lemma != null) {
          formLemmas.add(lemma);
        }
      } else {
        formLemmas = getLemmasByFormName(language, formStr, false);
      }
      if (formLemmas != null) {
        for (int j=0; j<formLemmas.size(); j++) {
          Lemma l = formLemmas.get(j);
          ArrayList<Form> lemmaForms = l.getFormsList();
          if (lemmaForms != null) {
            result.addAll(lemmaForms);
          }
        }
      }
    }
    return result;
  }

  /**
   * Collects the lemmas which belong to the word variants extracted from a lucene
   * query. Each lemma name appears only once in the result.
   * A variant starting with "lemmalemma" is treated as a lemma name (the text
   * after that prefix); every other variant is treated as a form name.
   *
   * @param lang language, resolved to a language id via Language
   * @param luceneQueryString the lucene query; it is lower cased before it is evaluated
   * @param normalize if true each variant is normalized before the lookup
   * @return the sorted lemmas, or null if none was found
   * @throws ApplicationException if reading from the database fails
   */
  public ArrayList<Lemma> getLemmasByLuceneQuery(String lang, String luceneQueryString, boolean normalize) throws ApplicationException {
    String language = Language.getInstance().getLanguageId(lang);
    // keyed by lemma name; named differently from the lemma cache field to avoid shadowing it
    Hashtable<String, Lemma> foundLemmas = new Hashtable<String, Lemma>();
    luceneQueryString = luceneQueryString.toLowerCase();
    ArrayList<String> formsFromQuery = getVariantsFromLuceneQuery(luceneQueryString);
    if (! (formsFromQuery == null || formsFromQuery.isEmpty())) {
      for (int i=0; i<formsFromQuery.size(); i++) {
        String formStr = formsFromQuery.get(i);
        if (normalize) {
          MpdlNormalizer normalizer = new MpdlNormalizer(language);
          formStr = normalizer.normalize(formStr);
        }
        ArrayList<Lemma> formLemmas = null;
        // lemma mode: if formName starts with "lemmalemma" then the lemma itself is fetched
        if (formStr.startsWith("lemmalemma")) {
          formLemmas = new ArrayList<Lemma>();
          String lemmaName = formStr.substring(10);
          Lemma lemma = getLemma(language, lemmaName, false);
          // getLemma delivers null for an unknown lemma; adding it caused a NullPointerException below
          if (lemma != null) {
            formLemmas.add(lemma);
          }
        } else {
          formLemmas = getLemmasByFormName(language, formStr, false);
        }
        if (formLemmas != null) {
          for (int j=0; j<formLemmas.size(); j++) {
            Lemma lemma = formLemmas.get(j);
            foundLemmas.put(lemma.getLemmaName(), lemma);
          }
        }
      }
    }
    if (foundLemmas.isEmpty()) {
      return null;
    }
    ArrayList<Lemma> result = new ArrayList<Lemma>(foundLemmas.values());
    Collections.sort(result);
    return result;
  }
  
  /**
   * Delivers the index keys for the given lemma names. These are:
   * each lemma name itself, and for every form of a known lemma which belongs to
   * more than one lemma a combined key in which each lemma name is prefixed by
   * "+++" (e.g. "+++lemma1+++lemma2").
   * A form without any lemma no longer contributes an (empty) key.
   *
   * @param lang language, resolved to a language id via Language
   * @param lemmaNames names of the lemmas; may be null
   * @return the sorted, distinct index keys, or null if there are none
   * @throws ApplicationException if reading from the database fails
   */
  public ArrayList<String> getIndexKeysByLemmaNames(String lang, ArrayList<String> lemmaNames) throws ApplicationException {
    if (lemmaNames == null) {
      return null;
    }
    String language = Language.getInstance().getLanguageId(lang);
    // used as a set: key and value are identical
    Hashtable<String, String> indexKeys = new Hashtable<String, String>();
    for (int j=0; j<lemmaNames.size(); j++) {
      String lemmaName = lemmaNames.get(j);
      Lemma lemma = getLemma(language, lemmaName, false);
      indexKeys.put(lemmaName, lemmaName);
      if (lemma == null) {
        continue;
      }
      ArrayList<Form> lemmaForms = lemma.getFormsList();
      if (lemmaForms == null) {
        continue;
      }
      for (int k=0; k<lemmaForms.size(); k++) {
        Form form = lemmaForms.get(k);
        ArrayList<Lemma> fLemmas = getLemmasByFormName(language, form.getFormName(), false);
        // only forms with several lemmas produce an additional (combined) key, as before;
        // the single lemma case was computed but never stored, so it is left out here
        if (fLemmas != null && fLemmas.size() > 1) {
          StringBuilder combinedKey = new StringBuilder();
          for (int l=0; l<fLemmas.size(); l++) {
            combinedKey.append("+++").append(fLemmas.get(l).getLemmaName());
          }
          String indexKey = combinedKey.toString();
          indexKeys.put(indexKey, indexKey);
        }
      }
    }
    if (indexKeys.isEmpty()) {
      return null;
    }
    ArrayList<String> result = new ArrayList<String>(indexKeys.keySet());
    Collections.sort(result);
    return result;
  }
  
  /**
   * Empties the local cache by replacing both hashtables (forms and lemmas)
   * with new, empty ones.
   */
  private void clearCache() {
    this.forms = new Hashtable<String, Hashtable<String, Lemma>>();
    this.lemmas = new Hashtable<String, Lemma>();
  }

  /**
   * Reads the lemmas of a form from the static morphology database.
   * The dynamic database is not consulted: that lookup is set off because
   * Snowball is not used anymore.
   *
   * @param lang language, resolved to a language id via Language
   * @param formName name of the form
   * @return whatever the static database handler delivers for the form
   * @throws ApplicationException if reading from the database fails
   */
  private ArrayList<Lemma> readLemmasByFormName(String lang, String formName) throws ApplicationException {
    String languageId = Language.getInstance().getLanguageId(lang);
    return dbMorphHandlerStatic.readLemmas(languageId, formName);
  }

  /**
   * Reads the forms of a lemma from the static morphology database.
   * The dynamic database is not consulted: that lookup is set off because
   * Snowball is not used anymore.
   *
   * @param lang language, resolved to a language id via Language
   * @param lemmaName name of the lemma
   * @return whatever the static database handler delivers for the lemma
   * @throws ApplicationException if reading from the database fails
   */
  private ArrayList<Form> readFormsByLemmaName(String lang, String lemmaName) throws ApplicationException {
    String languageId = Language.getInstance().getLanguageId(lang);
    return dbMorphHandlerStatic.readForms(languageId, lemmaName);
  }
  
  /**
   * Writes a form/lemma pair to the dynamic morphology database in both
   * directions: first form to lemma, then lemma to form.
   *
   * @param newFlatForm the form to write
   * @param newFlatLemma the lemma the form belongs to
   * @throws ApplicationException if writing to the database fails
   */
  private void writeFormLemmaDynamic(Form newFlatForm, Lemma newFlatLemma) throws ApplicationException {
    DBMorphHandler dynamicHandler = this.dbMorphHandlerDynamic;
    dynamicHandler.writeFormLemma(newFlatForm, newFlatLemma);
    dynamicHandler.writeLemmaForm(newFlatLemma, newFlatForm);
  }
  
  /**
   * Prepares the backup file of the dynamic forms for appending.
   * If the data file is missing it is created from its ".empty" template. Then a
   * fresh ".tmp" copy of the data file is made, its last 9 bytes (the closing
   * "</forms>" entry) are cut off and an appending, buffered output stream is
   * opened on that copy.
   *
   * @throws ApplicationException if an IOException occurs
   */
  private void openDynamicFormsDataFile() throws ApplicationException {
    final String tmpFileName = DATA_FILE_DYNAMIC_FORMS + ".tmp";
    try {
      if (! new File(DATA_FILE_DYNAMIC_FORMS).exists()) {
        FileUtil.getInstance().copyFile(DATA_FILE_DYNAMIC_FORMS + ".empty", DATA_FILE_DYNAMIC_FORMS);
      }
      File tmpFile = new File(tmpFileName);
      tmpFile.delete();  // remove a stale copy, if there is one
      FileUtil.getInstance().copyFile(DATA_FILE_DYNAMIC_FORMS, tmpFileName);
      FileUtil.getInstance().deleteLastNBytes(tmpFile, 9);  // without last "</forms>" entry
      outputStreamDynamicForms = new BufferedOutputStream(new FileOutputStream(tmpFile, true));
    } catch (IOException e) {
      throw new ApplicationException(e);
    }
  }
  
  /**
   * Finalizes the backup file of the dynamic forms: writes the closing "</forms>"
   * entry, closes the stream and replaces the data file by the ".tmp" file.
   * Does nothing if no stream is open (never opened or already closed), so the
   * existing data file is not deleted in that case. A failing delete or rename
   * is reported instead of being ignored.
   *
   * @throws ApplicationException if writing, closing, deleting or renaming fails
   */
  private void closeDynamicFormsDataFile() throws ApplicationException {
    if (outputStreamDynamicForms == null) {
      return;
    }
    try {
      try {
        writeToDynamicFile("</forms>\n");
      } finally {
        // close the stream even if the last write failed and forget it, so that it is not used again
        OutputStream openStream = outputStreamDynamicForms;
        outputStreamDynamicForms = null;
        openStream.close();
      }
      File dataFileDynamicForms = new File(DATA_FILE_DYNAMIC_FORMS);
      File dataFileDynamicFormsTmp = new File(DATA_FILE_DYNAMIC_FORMS + ".tmp");
      if (! dataFileDynamicFormsTmp.exists()) {
        throw new IOException("Temporary file does not exist: " + dataFileDynamicFormsTmp.getPath());
      }
      // the target is removed first because renameTo may fail if it exists
      if (dataFileDynamicForms.exists() && ! dataFileDynamicForms.delete()) {
        throw new IOException("Could not delete file: " + dataFileDynamicForms.getPath());
      }
      if (! dataFileDynamicFormsTmp.renameTo(dataFileDynamicForms)) {
        throw new IOException("Could not rename file: " + dataFileDynamicFormsTmp.getPath() + " to: " + dataFileDynamicForms.getPath());
      }
    } catch (IOException e) {
      throw new ApplicationException(e);
    }
  }
  
  /**
   * Appends a string, encoded as utf-8, to the backup file of the dynamic forms
   * and flushes the stream. Does nothing if the stream is not open.
   *
   * @param outStr the text to append
   * @throws ApplicationException if an IOException occurs
   */
  private void writeToDynamicFile(String outStr) throws ApplicationException {
    if (outputStreamDynamicForms == null) {
      return;
    }
    try {
      byte[] encoded = outStr.getBytes("utf-8");
      outputStreamDynamicForms.write(encoded);
      outputStreamDynamicForms.flush();
    } catch (IOException e) {
      throw new ApplicationException(e);
    }
  }
  
  /**
   * Delegates to LuceneUtil to extract the variants from a lucene query.
   *
   * @param queryString the lucene query
   * @return the variants delivered by LuceneUtil
   */
  private ArrayList<String> getVariantsFromLuceneQuery(String queryString) {
    return LuceneUtil.getInstance().getVariantsFromLuceneQuery(queryString);
  }

  /** Remembers the current time as the begin of an operation. */
  private void beginOperation() {
    this.beginOfOperation = new Date();
  }

  /** Remembers the current time as the end of an operation. */
  private void endOperation() {
    this.endOfOperation = new Date();
  }
}