Mercurial > hg > mpdl-group
view software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/app/MorphologyCache.java @ 20:7d6d969b10cf
little corrections
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Wed, 14 Dec 2011 12:48:43 +0100 |
parents | 4a3641ae14d2 |
children | e845310098ba |
line wrap: on
line source
package de.mpg.mpiwg.berlin.mpdl.lt.morph.app; import java.util.ArrayList; import java.util.Collections; import java.util.Date; import java.util.Enumeration; import java.util.Hashtable; import java.util.logging.Logger; import de.mpg.mpiwg.berlin.mpdl.lt.general.Language; import de.mpg.mpiwg.berlin.mpdl.lt.general.Constants; import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Form; import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma; import de.mpg.mpiwg.berlin.mpdl.lt.morph.db.DBMorphHandler; import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.Normalizer; import de.mpg.mpiwg.berlin.mpdl.lucene.util.LuceneUtil; import de.mpg.mpiwg.berlin.mpdl.util.Util; import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; public class MorphologyCache { private static MorphologyCache instance; private static Logger LOGGER = Logger.getLogger(MorphologyCache.class.getName()); private static String DATA_DIR = Constants.getInstance().getDataDir(); private static String DB_DIR_DONATUS = DATA_DIR + "/dataBerkeleyDB/donatus"; public static int QUERY_MODE = 0; public static int DOCUMENT_MODE = 1; private static int MAX_HASHTABLE_SIZE = Constants.MORPHOLOGY_CACHE_SIZE; protected int mode = QUERY_MODE; private Hashtable<String, Hashtable<String, Lemma>> forms = new Hashtable<String, Hashtable<String, Lemma>>(); // cache of forms: hashKey is formName private Hashtable<String, Lemma> lemmas = new Hashtable<String, Lemma>(); // cache of lemmas: hashKey is lemmaName private DBMorphHandler dbMorphHandlerStatic; // handles static morph data (BerkeleyDB) private Date beginOfOperation; private Date endOfOperation; public static MorphologyCache getInstance() throws ApplicationException { if (instance == null) { instance = new MorphologyCache(); instance.init(); } return instance; } private void init() throws ApplicationException { instance.beginOperation(); dbMorphHandlerStatic = new DBMorphHandler(DB_DIR_DONATUS); dbMorphHandlerStatic.start(); dbMorphHandlerStatic.openDatabases(); instance.endOperation(); Double elapsedTime = new Util().getSecondWithMillisecondsBetween(instance.beginOfOperation, instance.endOfOperation); LOGGER.info("Morphology db cache: opened (needed " + elapsedTime + " seconds)"); } public int getMode() { return mode; } public void setMode(int newMode) { this.mode = newMode; } public void end() throws ApplicationException { dbMorphHandlerStatic.closeDatabases(); LOGGER.info("Morphology db cache: closed"); } /* public ArrayList<Lemma> getLemmasByFormName(String lang, String formName, boolean normalize) throws ApplicationException { return getLemmasByFormName(lang, formName, normalize, Normalizer.DISPLAY); } */ public ArrayList<Lemma> getLemmasByFormName(String lang, String formNameArg, int normMode) throws ApplicationException { String language = Language.getInstance().getLanguageId(lang); ArrayList<Lemma> retFormLemmas = null; String formName = formNameArg; Normalizer normalizer = new Normalizer(language); normalizer.setNormMode(normMode); formName = normalizer.normalize(formNameArg); // first look in local cache String key = language + "###" + formName; Hashtable<String, Lemma> formLemmasHashtable = forms.get(key); if (formLemmasHashtable == null) { ArrayList<Lemma> dbFormLemmas = readLemmasByFormName(language, formName); // put lemmas into local cache int localHashTableSize = forms.size(); if (localHashTableSize >= MAX_HASHTABLE_SIZE) { clearCache(); } if (dbFormLemmas != null && ! dbFormLemmas.isEmpty()) { formLemmasHashtable = new Hashtable<String, Lemma>(); for (int i=0; i<dbFormLemmas.size(); i++) { Lemma lemma = dbFormLemmas.get(i); String lemmaName = lemma.getLemmaName(); String lemmaKey = language + "###" + lemmaName; Lemma localLemma = lemmas.get(lemmaKey); if (localLemma == null) { ArrayList<Form> lemmaForms = readFormsByLemmaName(language, lemmaName); lemma.setForms(lemmaForms); lemmas.put(lemmaKey, lemma); } else { lemma = localLemma; } formLemmasHashtable.put(lemmaKey, lemma); } forms.put(key, formLemmasHashtable); } } retFormLemmas = new ArrayList<Lemma>(); if (formLemmasHashtable != null) { Enumeration<String> formLemmasKeys = formLemmasHashtable.keys(); while(formLemmasKeys.hasMoreElements()) { String lemmaKey = formLemmasKeys.nextElement(); Lemma l = formLemmasHashtable.get(lemmaKey); retFormLemmas.add(l); } } Collections.sort(retFormLemmas); return retFormLemmas; } public Lemma getLemma(String lang, String lemmaNameArg, int normMode) throws ApplicationException { String language = Language.getInstance().getLanguageId(lang); String lemmaName = lemmaNameArg; Normalizer normalizer = new Normalizer(language); normalizer.setNormMode(normMode); lemmaName = normalizer.normalize(lemmaNameArg); // first look in local cache String key = language + "###" + lemmaName; Lemma lemma = lemmas.get(key); if (lemma == null) { ArrayList<Form> dbLemmaForms = readFormsByLemmaName(language, lemmaName); if (dbLemmaForms != null && dbLemmaForms.size() > 0) { lemma = new Lemma(); lemma.setLemmaName(lemmaName); lemma.setLanguage(language); lemma.setProvider(dbLemmaForms.get(0).getProvider()); lemma.setForms(dbLemmaForms); lemmas.put(lemmaName, lemma); } } return lemma; } public ArrayList<Form> getFormsByLuceneQuery(String lang, String luceneQueryString, int normMode) throws ApplicationException { String language = Language.getInstance().getLanguageId(lang); ArrayList<Form> result = new ArrayList<Form>(); luceneQueryString = luceneQueryString.toLowerCase(); ArrayList<String> formsFromQuery = getVariantsFromLuceneQuery(luceneQueryString); if (! (formsFromQuery == null || formsFromQuery.isEmpty())) { for (int i=0; i<formsFromQuery.size(); i++) { String formStr = formsFromQuery.get(i); Normalizer normalizer = new Normalizer(language); normalizer.setNormMode(normMode); formStr = normalizer.normalize(formStr); ArrayList<Lemma> formLemmas = null; // lemma mode: if formName contains "lemmalemma" then the lemma itself is fetched if (formStr.startsWith("lemmalemma")) { formLemmas = new ArrayList<Lemma>(); String lemmaName = formStr.substring(10); Lemma lemma = getLemma(language, lemmaName, Normalizer.NONE); formLemmas.add(lemma); } else { formLemmas = getLemmasByFormName(language, formStr, normMode); } if (formLemmas != null && ! formLemmas.isEmpty()) { for (int j=0; j<formLemmas.size(); j++) { Lemma l = formLemmas.get(j); ArrayList<Form> lemmaForms = l.getFormsList(); result.addAll(lemmaForms); } } } } return result; } public ArrayList<Lemma> getLemmasByLuceneQuery(String lang, String luceneQueryString, int normMode) throws ApplicationException { String language = Language.getInstance().getLanguageId(lang); Hashtable<String, Lemma> lemmas = new Hashtable<String, Lemma>(); luceneQueryString = luceneQueryString.toLowerCase(); ArrayList<String> formsFromQuery = getVariantsFromLuceneQuery(luceneQueryString); if (! (formsFromQuery == null || formsFromQuery.isEmpty())) { for (int i=0; i<formsFromQuery.size(); i++) { String formStr = formsFromQuery.get(i); Normalizer normalizer = new Normalizer(language); normalizer.setNormMode(normMode); formStr = normalizer.normalize(formStr); ArrayList<Lemma> formLemmas = null; // lemma mode: if formName starts with "lemmalemma" then the lemma itself is fetched if (formStr.startsWith("lemmalemma")) { formLemmas = new ArrayList<Lemma>(); String lemmaName = formStr.substring(10); Lemma lemma = getLemma(language, lemmaName, Normalizer.NONE); formLemmas.add(lemma); } else { formLemmas = getLemmasByFormName(language, formStr, normMode); } if (formLemmas != null) { for (int j=0; j<formLemmas.size(); j++) { Lemma lemma = formLemmas.get(j); lemmas.put(lemma.getLemmaName(), lemma); } } } } ArrayList<Lemma> result = new ArrayList<Lemma>(); if (lemmas != null) { Enumeration<String> formLemmasKeys = lemmas.keys(); while(formLemmasKeys.hasMoreElements()) { String lemmaKey = formLemmasKeys.nextElement(); Lemma l = lemmas.get(lemmaKey); result.add(l); } } Collections.sort(result); if (result.isEmpty()) return null; else return result; } public ArrayList<String> getIndexKeysByLemmaNames(String lang, ArrayList<String> lemmaNames) throws ApplicationException { String language = Language.getInstance().getLanguageId(lang); Hashtable<String, String> indexKeys = new Hashtable<String, String>(); for (int j=0; j<lemmaNames.size(); j++) { String lemmaName = lemmaNames.get(j); Lemma lemma = getLemma(language, lemmaName, Normalizer.NONE); indexKeys.put(lemmaName, lemmaName); if (lemma != null) { ArrayList<Form> lemmaForms = lemma.getFormsList(); for (int k=0; k<lemmaForms.size(); k++) { Form form = lemmaForms.get(k); ArrayList<Lemma> fLemmas = getLemmasByFormName(language, form.getFormName(), Normalizer.NONE); if (fLemmas != null) { String indexKey = ""; if (fLemmas.size() == 1) { indexKey = fLemmas.get(0).getLemmaName(); } else { for (int l=0; l<fLemmas.size(); l++) { Lemma lem = fLemmas.get(l); indexKey = indexKey + "+++" + lem.getLemmaName(); } indexKeys.put(indexKey, indexKey); } } } } } ArrayList<String> result = new ArrayList<String>(); if (indexKeys != null) { Enumeration<String> indexKeysKeys = indexKeys.keys(); while(indexKeysKeys.hasMoreElements()) { String indexKey = indexKeysKeys.nextElement(); result.add(indexKey); } } Collections.sort(result); if (result.isEmpty()) return null; else return result; } private void clearCache() { forms = null; lemmas = null; forms = new Hashtable<String, Hashtable<String, Lemma>>(); lemmas = new Hashtable<String, Lemma>(); } private ArrayList<Lemma> readLemmasByFormName(String lang, String formName) throws ApplicationException { String language = Language.getInstance().getLanguageId(lang); ArrayList<Lemma> lemmasStatic = dbMorphHandlerStatic.readLemmas(language, formName); return lemmasStatic; } private ArrayList<Form> readFormsByLemmaName(String lang, String lemmaName) throws ApplicationException { String language = Language.getInstance().getLanguageId(lang); ArrayList<Form> formsStatic = dbMorphHandlerStatic.readForms(language, lemmaName); return formsStatic; } private ArrayList<String> getVariantsFromLuceneQuery(String queryString) { LuceneUtil luceneUtil = LuceneUtil.getInstance(); ArrayList<String> variants = luceneUtil.getVariantsFromLuceneQuery(queryString); return variants; } private void beginOperation() { beginOfOperation = new Date(); } private void endOperation() { endOfOperation = new Date(); } }