Mercurial > hg > mpdl-group
diff software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/app/MorphologyCache.java @ 19:4a3641ae14d2
Erstellung
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Wed, 09 Nov 2011 15:32:05 +0100 |
parents | |
children | 7d6d969b10cf |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/app/MorphologyCache.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,295 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.morph.app; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.Date; +import java.util.Enumeration; +import java.util.Hashtable; + +import java.util.logging.Logger; + +import de.mpg.mpiwg.berlin.mpdl.lt.general.Language; +import de.mpg.mpiwg.berlin.mpdl.lt.general.Constants; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Form; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.db.DBMorphHandler; +import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.Normalizer; +import de.mpg.mpiwg.berlin.mpdl.lucene.util.LuceneUtil; +import de.mpg.mpiwg.berlin.mpdl.util.Util; +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; + +public class MorphologyCache { + private static MorphologyCache instance; + private static Logger LOGGER = Logger.getLogger(MorphologyCache.class.getName()); + private static String DATA_DIR = Constants.getInstance().getDataDir(); + private static String DB_DIR_DONATUS = DATA_DIR + "/dataBerkeleyDB/donatus"; + public static int QUERY_MODE = 0; + public static int DOCUMENT_MODE = 1; + private static int MAX_HASHTABLE_SIZE = Constants.MORPHOLOGY_CACHE_SIZE; + protected int mode = QUERY_MODE; + private Hashtable<String, Hashtable<String, Lemma>> forms = new Hashtable<String, Hashtable<String, Lemma>>(); // cache of forms: hashKey is formName + private Hashtable<String, Lemma> lemmas = new Hashtable<String, Lemma>(); // cache of lemmas: hashKey is lemmaName + private DBMorphHandler dbMorphHandlerStatic; // handles static morph data (BerkeleyDB) + private Date beginOfOperation; + private Date endOfOperation; + + public static MorphologyCache getInstance() throws ApplicationException { + if (instance == null) { + instance = new MorphologyCache(); + instance.init(); + } + return instance; + } + + private void init() throws ApplicationException { + instance.beginOperation(); + dbMorphHandlerStatic = new DBMorphHandler(DB_DIR_DONATUS); + dbMorphHandlerStatic.start(); + dbMorphHandlerStatic.openDatabases(); + instance.endOperation(); + Double elapsedTime = new Util().getSecondWithMillisecondsBetween(instance.beginOfOperation, instance.endOfOperation); + LOGGER.info("Morphology db cache: opened (needed " + elapsedTime + " seconds)"); + } + + public int getMode() { + return mode; + } + + public void setMode(int newMode) { + this.mode = newMode; + } + + public void end() throws ApplicationException { + dbMorphHandlerStatic.closeDatabases(); + LOGGER.info("Morphology db cache: closed"); + } + + public ArrayList<Lemma> getLemmasByFormName(String lang, String formNameArg, boolean normalize) throws ApplicationException { + String language = Language.getInstance().getLanguageId(lang); + ArrayList<Lemma> retFormLemmas = null; + String formName = formNameArg; + if (normalize) { + Normalizer normalizer = new Normalizer(language); + formName = normalizer.normalize(formNameArg); + } + // first look in local cache + String key = language + "###" + formName; + Hashtable<String, Lemma> formLemmasHashtable = forms.get(key); + if (formLemmasHashtable == null) { + ArrayList<Lemma> dbFormLemmas = readLemmasByFormName(language, formName); + // put lemmas into local cache + int localHashTableSize = forms.size(); + if (localHashTableSize >= MAX_HASHTABLE_SIZE) { + clearCache(); + } + if (dbFormLemmas != null && ! dbFormLemmas.isEmpty()) { + formLemmasHashtable = new Hashtable<String, Lemma>(); + for (int i=0; i<dbFormLemmas.size(); i++) { + Lemma lemma = dbFormLemmas.get(i); + String lemmaName = lemma.getLemmaName(); + String lemmaKey = language + "###" + lemmaName; + Lemma localLemma = lemmas.get(lemmaKey); + if (localLemma == null) { + ArrayList<Form> lemmaForms = readFormsByLemmaName(language, lemmaName); + lemma.setForms(lemmaForms); + lemmas.put(lemmaKey, lemma); + } else { + lemma = localLemma; + } + formLemmasHashtable.put(lemmaKey, lemma); + } + forms.put(key, formLemmasHashtable); + } + } + retFormLemmas = new ArrayList<Lemma>(); + if (formLemmasHashtable != null) { + Enumeration<String> formLemmasKeys = formLemmasHashtable.keys(); + while(formLemmasKeys.hasMoreElements()) { + String lemmaKey = formLemmasKeys.nextElement(); + Lemma l = formLemmasHashtable.get(lemmaKey); + retFormLemmas.add(l); + } + } + Collections.sort(retFormLemmas); + return retFormLemmas; + } + + public Lemma getLemma(String lang, String lemmaNameArg, boolean normalize) throws ApplicationException { + String language = Language.getInstance().getLanguageId(lang); + String lemmaName = lemmaNameArg; + if (normalize) { + Normalizer normalizer = new Normalizer(language); + lemmaName = normalizer.normalize(lemmaNameArg); + } + // first look in local cache + String key = language + "###" + lemmaName; + Lemma lemma = lemmas.get(key); + if (lemma == null) { + ArrayList<Form> dbLemmaForms = readFormsByLemmaName(language, lemmaName); + if (dbLemmaForms != null && dbLemmaForms.size() > 0) { + lemma = new Lemma(); + lemma.setLemmaName(lemmaName); + lemma.setLanguage(language); + lemma.setProvider(dbLemmaForms.get(0).getProvider()); + lemma.setForms(dbLemmaForms); + lemmas.put(lemmaName, lemma); + } + } + return lemma; + } + + public ArrayList<Form> getFormsByLuceneQuery(String lang, String luceneQueryString, boolean normalize) throws ApplicationException { + String language = Language.getInstance().getLanguageId(lang); + ArrayList<Form> result = new ArrayList<Form>(); + luceneQueryString = luceneQueryString.toLowerCase(); + ArrayList<String> formsFromQuery = getVariantsFromLuceneQuery(luceneQueryString); + if (! (formsFromQuery == null || formsFromQuery.isEmpty())) { + for (int i=0; i<formsFromQuery.size(); i++) { + String formStr = formsFromQuery.get(i); + if (normalize) { + Normalizer normalizer = new Normalizer(language); + formStr = normalizer.normalize(formStr); + } + ArrayList<Lemma> formLemmas = null; + // lemma mode: if formName contains "lemmalemma" then the lemma itself is fetched + if (formStr.startsWith("lemmalemma")) { + formLemmas = new ArrayList<Lemma>(); + String lemmaName = formStr.substring(10); + Lemma lemma = getLemma(language, lemmaName, false); + formLemmas.add(lemma); + } else { + formLemmas = getLemmasByFormName(language, formStr, false); + } + if (formLemmas != null && ! formLemmas.isEmpty()) { + for (int j=0; j<formLemmas.size(); j++) { + Lemma l = formLemmas.get(j); + ArrayList<Form> lemmaForms = l.getFormsList(); + result.addAll(lemmaForms); + } + } + } + } + return result; + } + + public ArrayList<Lemma> getLemmasByLuceneQuery(String lang, String luceneQueryString, boolean normalize) throws ApplicationException { + String language = Language.getInstance().getLanguageId(lang); + Hashtable<String, Lemma> lemmas = new Hashtable<String, Lemma>(); + luceneQueryString = luceneQueryString.toLowerCase(); + ArrayList<String> formsFromQuery = getVariantsFromLuceneQuery(luceneQueryString); + if (! (formsFromQuery == null || formsFromQuery.isEmpty())) { + for (int i=0; i<formsFromQuery.size(); i++) { + String formStr = formsFromQuery.get(i); + if (normalize) { + Normalizer normalizer = new Normalizer(language); + formStr = normalizer.normalize(formStr); + } + ArrayList<Lemma> formLemmas = null; + // lemma mode: if formName starts with "lemmalemma" then the lemma itself is fetched + if (formStr.startsWith("lemmalemma")) { + formLemmas = new ArrayList<Lemma>(); + String lemmaName = formStr.substring(10); + Lemma lemma = getLemma(language, lemmaName, false); + formLemmas.add(lemma); + } else { + formLemmas = getLemmasByFormName(language, formStr, false); + } + if (formLemmas != null) { + for (int j=0; j<formLemmas.size(); j++) { + Lemma lemma = formLemmas.get(j); + lemmas.put(lemma.getLemmaName(), lemma); + } + } + } + } + ArrayList<Lemma> result = new ArrayList<Lemma>(); + if (lemmas != null) { + Enumeration<String> formLemmasKeys = lemmas.keys(); + while(formLemmasKeys.hasMoreElements()) { + String lemmaKey = formLemmasKeys.nextElement(); + Lemma l = lemmas.get(lemmaKey); + result.add(l); + } + } + Collections.sort(result); + if (result.isEmpty()) + return null; + else + return result; + } + + public ArrayList<String> getIndexKeysByLemmaNames(String lang, ArrayList<String> lemmaNames) throws ApplicationException { + String language = Language.getInstance().getLanguageId(lang); + Hashtable<String, String> indexKeys = new Hashtable<String, String>(); + for (int j=0; j<lemmaNames.size(); j++) { + String lemmaName = lemmaNames.get(j); + Lemma lemma = getLemma(language, lemmaName, false); + indexKeys.put(lemmaName, lemmaName); + if (lemma != null) { + ArrayList<Form> lemmaForms = lemma.getFormsList(); + for (int k=0; k<lemmaForms.size(); k++) { + Form form = lemmaForms.get(k); + ArrayList<Lemma> fLemmas = getLemmasByFormName(language, form.getFormName(), false); + if (fLemmas != null) { + String indexKey = ""; + if (fLemmas.size() == 1) { + indexKey = fLemmas.get(0).getLemmaName(); + } else { + for (int l=0; l<fLemmas.size(); l++) { + Lemma lem = fLemmas.get(l); + indexKey = indexKey + "+++" + lem.getLemmaName(); + } + indexKeys.put(indexKey, indexKey); + } + } + } + } + } + ArrayList<String> result = new ArrayList<String>(); + if (indexKeys != null) { + Enumeration<String> indexKeysKeys = indexKeys.keys(); + while(indexKeysKeys.hasMoreElements()) { + String indexKey = indexKeysKeys.nextElement(); + result.add(indexKey); + } + } + Collections.sort(result); + if (result.isEmpty()) + return null; + else + return result; + } + + private void clearCache() { + forms = null; + lemmas = null; + forms = new Hashtable<String, Hashtable<String, Lemma>>(); + lemmas = new Hashtable<String, Lemma>(); + } + + private ArrayList<Lemma> readLemmasByFormName(String lang, String formName) throws ApplicationException { + String language = Language.getInstance().getLanguageId(lang); + ArrayList<Lemma> lemmasStatic = dbMorphHandlerStatic.readLemmas(language, formName); + return lemmasStatic; + } + + private ArrayList<Form> readFormsByLemmaName(String lang, String lemmaName) throws ApplicationException { + String language = Language.getInstance().getLanguageId(lang); + ArrayList<Form> formsStatic = dbMorphHandlerStatic.readForms(language, lemmaName); + return formsStatic; + } + + private ArrayList<String> getVariantsFromLuceneQuery(String queryString) { + LuceneUtil luceneUtil = LuceneUtil.getInstance(); + ArrayList<String> variants = luceneUtil.getVariantsFromLuceneQuery(queryString); + return variants; + } + + private void beginOperation() { + beginOfOperation = new Date(); + } + + private void endOperation() { + endOfOperation = new Date(); + } +} \ No newline at end of file