diff software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/app/MorphologyCache.java @ 19:4a3641ae14d2

Erstellung
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Wed, 09 Nov 2011 15:32:05 +0100
parents
children 7d6d969b10cf
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/app/MorphologyCache.java	Wed Nov 09 15:32:05 2011 +0100
@@ -0,0 +1,295 @@
+package de.mpg.mpiwg.berlin.mpdl.lt.morph.app;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Date;
+import java.util.Enumeration;
+import java.util.Hashtable;
+
+import java.util.logging.Logger;
+
+import de.mpg.mpiwg.berlin.mpdl.lt.general.Language;
+import de.mpg.mpiwg.berlin.mpdl.lt.general.Constants;
+import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Form;
+import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma;
+import de.mpg.mpiwg.berlin.mpdl.lt.morph.db.DBMorphHandler;
+import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.Normalizer;
+import de.mpg.mpiwg.berlin.mpdl.lucene.util.LuceneUtil;
+import de.mpg.mpiwg.berlin.mpdl.util.Util;
+import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
+
+public class MorphologyCache {
+  // Lazily created singleton; assigned by getInstance().
+  private static MorphologyCache instance;
+  private static final Logger LOGGER = Logger.getLogger(MorphologyCache.class.getName());
+  private static final String DATA_DIR = Constants.getInstance().getDataDir();
+  // Directory handed to DBMorphHandler for the static morphology data.
+  private static final String DB_DIR_DONATUS = DATA_DIR + "/dataBerkeleyDB/donatus";
+  // Mode values for getMode()/setMode(); final so they cannot be reassigned by accident.
+  public static final int QUERY_MODE = 0;
+  public static final int DOCUMENT_MODE = 1;
+  // When the forms cache holds this many entries, both caches are dropped before the next insert.
+  private static final int MAX_HASHTABLE_SIZE = Constants.MORPHOLOGY_CACHE_SIZE;
+  protected int mode = QUERY_MODE;
+  private Hashtable<String, Hashtable<String, Lemma>> forms = new Hashtable<String, Hashtable<String, Lemma>>();  // cache of forms: looked up by language + "###" + formName; value maps lemma keys to lemmas
+  private Hashtable<String, Lemma> lemmas = new Hashtable<String, Lemma>();  // cache of lemmas: looked up by language + "###" + lemmaName
+  private DBMorphHandler dbMorphHandlerStatic;  // handles static morph data (BerkeleyDB)
+  // Timestamps set by beginOperation()/endOperation(); used to log how long init() took.
+  private Date beginOfOperation;
+  private Date endOfOperation;
+  
+  public static MorphologyCache getInstance() throws ApplicationException {
+    if (instance == null) {
+      instance = new MorphologyCache();
+      instance.init();
+    }
+    return instance;
+  }
+
+  private void init() throws ApplicationException {
+    instance.beginOperation();
+    dbMorphHandlerStatic = new DBMorphHandler(DB_DIR_DONATUS);
+    dbMorphHandlerStatic.start();
+    dbMorphHandlerStatic.openDatabases();
+    instance.endOperation();
+    Double elapsedTime = new Util().getSecondWithMillisecondsBetween(instance.beginOfOperation, instance.endOfOperation);
+    LOGGER.info("Morphology db cache: opened (needed " + elapsedTime + " seconds)");
+  }
+  
+  /**
+   * @return the current mode value (initially QUERY_MODE)
+   */
+  public int getMode() {
+    return this.mode;
+  }
+  
+  /**
+   * Stores the given mode value; no validation is performed.
+   *
+   * @param newMode the mode to store (QUERY_MODE and DOCUMENT_MODE are the declared values)
+   */
+  public void setMode(int newMode) {
+    mode = newMode;
+  }
+  
+  /**
+   * Closes the databases of the morphology handler and logs that the cache was closed.
+   *
+   * @throws ApplicationException if closing the databases fails
+   */
+  public void end() throws ApplicationException {
+    this.dbMorphHandlerStatic.closeDatabases();
+    LOGGER.info("Morphology db cache: closed");
+  }
+
+  /**
+   * Returns the lemmas that a word form belongs to, sorted by their natural order.
+   * The forms cache is consulted first; on a miss the lemmas are read from the database,
+   * completed with their forms (unless the lemma is already cached) and stored in both caches.
+   * A form for which the database returns nothing is not cached.
+   *
+   * @param lang language name or id; resolved through Language.getLanguageId
+   * @param formNameArg the word form to look up
+   * @param normalize if true, the form is run through a Normalizer for the language first
+   * @return a sorted list of lemmas; empty (never null) if the form is unknown
+   * @throws ApplicationException propagated from the lookups performed here
+   */
+  public ArrayList<Lemma> getLemmasByFormName(String lang, String formNameArg, boolean normalize) throws ApplicationException {
+    String language = Language.getInstance().getLanguageId(lang);
+    String formName = normalize ? new Normalizer(language).normalize(formNameArg) : formNameArg;
+    String formKey = language + "###" + formName;
+    Hashtable<String, Lemma> lemmasOfForm = forms.get(formKey);
+    if (lemmasOfForm == null) {
+      // cache miss: ask the database
+      ArrayList<Lemma> dbLemmas = readLemmasByFormName(language, formName);
+      // drop both caches once the forms cache has reached its limit
+      if (forms.size() >= MAX_HASHTABLE_SIZE) {
+        clearCache();
+      }
+      boolean dbHasLemmas = dbLemmas != null && ! dbLemmas.isEmpty();
+      if (dbHasLemmas) {
+        lemmasOfForm = new Hashtable<String, Lemma>();
+        for (Lemma dbLemma : dbLemmas) {
+          String lemmaName = dbLemma.getLemmaName();
+          String lemmaKey = language + "###" + lemmaName;
+          Lemma entry = lemmas.get(lemmaKey);
+          if (entry == null) {
+            // lemma not cached yet: attach its forms and remember it
+            dbLemma.setForms(readFormsByLemmaName(language, lemmaName));
+            lemmas.put(lemmaKey, dbLemma);
+            entry = dbLemma;
+          }
+          lemmasOfForm.put(lemmaKey, entry);
+        }
+        forms.put(formKey, lemmasOfForm);
+      }
+    }
+    ArrayList<Lemma> sortedLemmas = new ArrayList<Lemma>();
+    if (lemmasOfForm != null) {
+      sortedLemmas.addAll(lemmasOfForm.values());
+    }
+    Collections.sort(sortedLemmas);
+    return sortedLemmas;
+  }
+  
+  /**
+   * Returns the lemma with the given name, reading its forms from the database on a cache miss.
+   * A lemma built here is stored under the same key that is used for the lookup
+   * (language + "###" + lemmaName), so later calls find it in the cache.
+   *
+   * @param lang language name or id; resolved through Language.getLanguageId
+   * @param lemmaNameArg the lemma name to look up
+   * @param normalize if true, the name is run through a Normalizer for the language first
+   * @return the cached or newly built lemma, or null if the database returns no forms for it
+   * @throws ApplicationException propagated from the lookups performed here
+   */
+  public Lemma getLemma(String lang, String lemmaNameArg, boolean normalize) throws ApplicationException {
+    String language = Language.getInstance().getLanguageId(lang);
+    String lemmaName = lemmaNameArg;
+    if (normalize) {
+      Normalizer normalizer = new Normalizer(language);
+      lemmaName = normalizer.normalize(lemmaNameArg);
+    }
+    // first look in local cache
+    String key = language + "###" + lemmaName;
+    Lemma lemma = lemmas.get(key);
+    if (lemma == null) {
+      ArrayList<Form> dbLemmaForms = readFormsByLemmaName(language, lemmaName);
+      if (dbLemmaForms != null && ! dbLemmaForms.isEmpty()) {
+        lemma = new Lemma();
+        lemma.setLemmaName(lemmaName);
+        lemma.setLanguage(language);
+        // the provider is taken from the first form
+        lemma.setProvider(dbLemmaForms.get(0).getProvider());
+        lemma.setForms(dbLemmaForms);
+        // store under the lookup key, not the bare lemma name, otherwise the entry is never hit
+        lemmas.put(key, lemma);
+      }
+    }
+    return lemma;
+  }
+  
+  /**
+   * Collects the forms of all lemmas that match the variants of a Lucene query string.
+   * The query is lower-cased and split into variants; each variant is either a form name
+   * or, when it starts with "lemmalemma", a lemma name to be fetched directly.
+   *
+   * @param lang language name or id; resolved through Language.getLanguageId
+   * @param luceneQueryString the query string; must not be null
+   * @param normalize if true, each variant is run through a Normalizer for the language first
+   * @return the forms of all matching lemmas, in encounter order (may contain duplicates); never null
+   * @throws ApplicationException propagated from the lookups performed here
+   */
+  public ArrayList<Form> getFormsByLuceneQuery(String lang, String luceneQueryString, boolean normalize) throws ApplicationException {
+    String language = Language.getInstance().getLanguageId(lang);
+    ArrayList<Form> result = new ArrayList<Form>();
+    luceneQueryString = luceneQueryString.toLowerCase();
+    ArrayList<String> formsFromQuery = getVariantsFromLuceneQuery(luceneQueryString);
+    if (formsFromQuery == null || formsFromQuery.isEmpty()) {
+      return result;
+    }
+    for (int i=0; i<formsFromQuery.size(); i++) {
+      String formStr = formsFromQuery.get(i);
+      if (normalize) {
+        Normalizer normalizer = new Normalizer(language);
+        formStr = normalizer.normalize(formStr);
+      }
+      ArrayList<Lemma> formLemmas = null;
+      // lemma mode: if formName starts with "lemmalemma" then the lemma itself is fetched
+      if (formStr.startsWith("lemmalemma")) {
+        formLemmas = new ArrayList<Lemma>();
+        String lemmaName = formStr.substring(10);
+        Lemma lemma = getLemma(language, lemmaName, false);
+        // getLemma returns null for an unknown lemma; skip it instead of storing a null entry
+        if (lemma != null) {
+          formLemmas.add(lemma);
+        }
+      } else {
+        formLemmas = getLemmasByFormName(language, formStr, false);
+      }
+      if (formLemmas != null) {
+        for (int j=0; j<formLemmas.size(); j++) {
+          ArrayList<Form> lemmaForms = formLemmas.get(j).getFormsList();
+          // defensive: a lemma may carry no forms list (Lemma internals are not visible here)
+          if (lemmaForms != null) {
+            result.addAll(lemmaForms);
+          }
+        }
+      }
+    }
+    return result;
+  }
+
+  public ArrayList<Lemma> getLemmasByLuceneQuery(String lang, String luceneQueryString, boolean normalize) throws ApplicationException {
+    String language = Language.getInstance().getLanguageId(lang);
+    Hashtable<String, Lemma> lemmas = new Hashtable<String, Lemma>();
+    luceneQueryString = luceneQueryString.toLowerCase();
+    ArrayList<String> formsFromQuery = getVariantsFromLuceneQuery(luceneQueryString);
+    if (! (formsFromQuery == null || formsFromQuery.isEmpty())) {
+      for (int i=0; i<formsFromQuery.size(); i++) {
+        String formStr = formsFromQuery.get(i);
+        if (normalize) {
+          Normalizer normalizer = new Normalizer(language);
+          formStr = normalizer.normalize(formStr);
+        }
+        ArrayList<Lemma> formLemmas = null;
+        // lemma mode: if formName starts with "lemmalemma" then the lemma itself is fetched
+        if (formStr.startsWith("lemmalemma")) {
+          formLemmas = new ArrayList<Lemma>();
+          String lemmaName = formStr.substring(10);
+          Lemma lemma = getLemma(language, lemmaName, false);
+          formLemmas.add(lemma);
+        } else {
+          formLemmas = getLemmasByFormName(language, formStr, false);
+        }
+        if (formLemmas != null) {
+          for (int j=0; j<formLemmas.size(); j++) {
+            Lemma lemma = formLemmas.get(j);
+            lemmas.put(lemma.getLemmaName(), lemma);
+          }
+        }
+      }
+    }
+    ArrayList<Lemma> result = new ArrayList<Lemma>();
+    if (lemmas != null) {
+      Enumeration<String> formLemmasKeys = lemmas.keys();
+      while(formLemmasKeys.hasMoreElements()) {
+        String lemmaKey = formLemmasKeys.nextElement();
+        Lemma l = lemmas.get(lemmaKey);
+        result.add(l);
+      }
+    }
+    Collections.sort(result);
+    if (result.isEmpty())
+      return null;
+    else 
+      return result;
+  }
+  
+  public ArrayList<String> getIndexKeysByLemmaNames(String lang, ArrayList<String> lemmaNames) throws ApplicationException {
+    String language = Language.getInstance().getLanguageId(lang);
+    Hashtable<String, String> indexKeys = new Hashtable<String, String>();
+    for (int j=0; j<lemmaNames.size(); j++) {
+      String lemmaName = lemmaNames.get(j);
+      Lemma lemma = getLemma(language, lemmaName, false);
+      indexKeys.put(lemmaName, lemmaName);
+      if (lemma != null) {
+        ArrayList<Form> lemmaForms = lemma.getFormsList();
+        for (int k=0; k<lemmaForms.size(); k++) {
+          Form form = lemmaForms.get(k);
+          ArrayList<Lemma> fLemmas = getLemmasByFormName(language, form.getFormName(), false);
+          if (fLemmas != null) {
+            String indexKey = "";
+            if (fLemmas.size() == 1) {
+              indexKey = fLemmas.get(0).getLemmaName();
+            } else {
+              for (int l=0; l<fLemmas.size(); l++) {
+                Lemma lem = fLemmas.get(l);
+                indexKey = indexKey + "+++" + lem.getLemmaName(); 
+              }
+              indexKeys.put(indexKey, indexKey);
+            }
+          }
+        }
+      }
+    }
+    ArrayList<String> result = new ArrayList<String>();
+    if (indexKeys != null) {
+      Enumeration<String> indexKeysKeys = indexKeys.keys();
+      while(indexKeysKeys.hasMoreElements()) {
+        String indexKey = indexKeysKeys.nextElement();
+        result.add(indexKey);
+      }
+    }
+    Collections.sort(result);
+    if (result.isEmpty())
+      return null;
+    else 
+      return result;
+  }
+  
+  /**
+   * Empties both caches by replacing them with fresh tables.
+   * The fields are reassigned directly, without first being set to null, so they never
+   * hold null in between.
+   */
+  private void clearCache() {
+    forms = new Hashtable<String, Hashtable<String, Lemma>>();
+    lemmas = new Hashtable<String, Lemma>();
+  }
+
+  /**
+   * Reads the lemmas of a form from the static morphology database.
+   *
+   * @param lang language name or id; resolved through Language.getLanguageId
+   * @param formName the form to look up
+   * @return whatever the database handler returns for the form
+   * @throws ApplicationException propagated from the database handler
+   */
+  private ArrayList<Lemma> readLemmasByFormName(String lang, String formName) throws ApplicationException {
+    String languageId = Language.getInstance().getLanguageId(lang);
+    return dbMorphHandlerStatic.readLemmas(languageId, formName);
+  }
+
+  /**
+   * Reads the forms of a lemma from the static morphology database.
+   *
+   * @param lang language name or id; resolved through Language.getLanguageId
+   * @param lemmaName the lemma to look up
+   * @return whatever the database handler returns for the lemma
+   * @throws ApplicationException propagated from the database handler
+   */
+  private ArrayList<Form> readFormsByLemmaName(String lang, String lemmaName) throws ApplicationException {
+    String languageId = Language.getInstance().getLanguageId(lang);
+    return dbMorphHandlerStatic.readForms(languageId, lemmaName);
+  }
+    
+  /**
+   * Delegates to LuceneUtil to split a Lucene query string into its variants.
+   *
+   * @param queryString the query string to analyse
+   * @return the variants reported by LuceneUtil
+   */
+  private ArrayList<String> getVariantsFromLuceneQuery(String queryString) {
+    return LuceneUtil.getInstance().getVariantsFromLuceneQuery(queryString);
+  }
+
+  /** Records the current time as the start of the operation being measured. */
+  private void beginOperation() {
+    this.beginOfOperation = new Date();
+  }
+
+  /** Records the current time as the end of the operation being measured. */
+  private void endOperation() {
+    this.endOfOperation = new Date();
+  }
+}
\ No newline at end of file