Mercurial > hg > mpdl-group
view software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/app/MorphologyCache.java @ 23:e845310098ba
diverse Korrekturen
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 27 Nov 2012 12:35:19 +0100 |
parents | 7d6d969b10cf |
children |
line wrap: on
line source
package de.mpg.mpiwg.berlin.mpdl.lt.morph.app; import java.util.ArrayList; import java.util.Collections; import java.util.Date; import java.util.Enumeration; import java.util.Hashtable; import java.util.logging.Logger; import de.mpg.mpiwg.berlin.mpdl.lt.general.Language; import de.mpg.mpiwg.berlin.mpdl.lt.general.Constants; import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Form; import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma; import de.mpg.mpiwg.berlin.mpdl.lt.morph.db.DBMorphHandler; import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.Normalizer; import de.mpg.mpiwg.berlin.mpdl.lucene.util.LuceneUtil; import de.mpg.mpiwg.berlin.mpdl.util.Util; import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; public class MorphologyCache { private static MorphologyCache instance; private static Logger LOGGER = Logger.getLogger(MorphologyCache.class.getName()); private static String DATA_DIR = Constants.getInstance().getDataDir(); private static String DB_DIR_DONATUS = DATA_DIR + "/dataBerkeleyDB/donatus"; public static int QUERY_MODE = 0; public static int DOCUMENT_MODE = 1; private static long MIN_RAM = 500000000; private static int MAX_HASHTABLE_SIZE = Constants.MORPHOLOGY_CACHE_SIZE; private Date touchTimer; protected int mode = QUERY_MODE; private Hashtable<String, Hashtable<String, Lemma>> forms = new Hashtable<String, Hashtable<String, Lemma>>(); // cache of forms: hashKey is formName private Hashtable<String, Lemma> lemmas = new Hashtable<String, Lemma>(); // cache of lemmas: hashKey is lemmaName private DBMorphHandler dbMorphHandler; // handles morph data (BerkeleyDB) private Date beginOfOperation; private Date endOfOperation; public static MorphologyCache getInstance() throws ApplicationException { if (instance == null) { instance = new MorphologyCache(); instance.init(); } return instance; } private void init() throws ApplicationException { long maxMemory = Runtime.getRuntime().maxMemory(); if (maxMemory < MIN_RAM) { String message = "Morphology cache: at least " + MIN_RAM + " is needed as heap space: please start java with parameter -Xmx with more than this value)"; LOGGER.severe(message); throw new ApplicationException(message); } touchTimer = new Date(); instance.beginOperation(); dbMorphHandler = new DBMorphHandler(DB_DIR_DONATUS); dbMorphHandler.startReadOnly(); dbMorphHandler.openDatabases(); instance.endOperation(); Double elapsedTime = new Util().getSecondWithMillisecondsBetween(instance.beginOfOperation, instance.endOfOperation); LOGGER.info("Morphology cache: morphology db opened read only (needed " + elapsedTime + " seconds, heap space: " + maxMemory + " bytes)"); } public int getMode() { return mode; } public void setMode(int newMode) { this.mode = newMode; } public void end() throws ApplicationException { dbMorphHandler.closeDatabases(); LOGGER.info("Morphology cache: db closed"); forms = null; lemmas = null; dbMorphHandler = null; instance = null; } public ArrayList<Lemma> getLemmasByFormName(String lang, String formNameArg, int normMode) throws ApplicationException { String language = Language.getInstance().getLanguageId(lang); ArrayList<Lemma> retFormLemmas = null; String formName = formNameArg; Normalizer normalizer = new Normalizer(language); normalizer.setNormMode(normMode); formName = normalizer.normalize(formNameArg); // first look in local cache String key = language + "###" + formName; Hashtable<String, Lemma> formLemmasHashtable = forms.get(key); if (formLemmasHashtable == null) { ArrayList<Lemma> dbFormLemmas = readLemmasByFormName(language, formName); // put lemmas into local cache int localHashTableSize = forms.size(); Date now = new Date(); if (now.getTime() - touchTimer.getTime() > 900000) { // is true each 0,25 hours: then free memory is fetched (needs some time) touchTimer = new Date(); long freeMemory = Runtime.getRuntime().freeMemory(); LOGGER.info(touchTimer + ": Morphology cache: free memory in heap space: " + freeMemory + " bytes"); if (freeMemory < MIN_RAM || localHashTableSize >= MAX_HASHTABLE_SIZE) { // if freeMemory is less then MIN_RAM then clear cache to get some new memory clearCache(); freeMemory = Runtime.getRuntime().freeMemory(); LOGGER.info(touchTimer + ": Morphology cache: cache cleared, free memory in heap space: " + freeMemory + " bytes"); } } if (dbFormLemmas != null && ! dbFormLemmas.isEmpty()) { formLemmasHashtable = new Hashtable<String, Lemma>(); for (int i=0; i<dbFormLemmas.size(); i++) { Lemma lemma = dbFormLemmas.get(i); String lemmaName = lemma.getLemmaName(); String lemmaKey = language + "###" + lemmaName; Lemma localLemma = lemmas.get(lemmaKey); if (localLemma == null) { ArrayList<Form> lemmaForms = readFormsByLemmaName(language, lemmaName); lemma.setForms(lemmaForms); lemmas.put(lemmaKey, lemma); } else { lemma = localLemma; } formLemmasHashtable.put(lemmaKey, lemma); } forms.put(key, formLemmasHashtable); } } retFormLemmas = new ArrayList<Lemma>(); if (formLemmasHashtable != null) { Enumeration<String> formLemmasKeys = formLemmasHashtable.keys(); while(formLemmasKeys.hasMoreElements()) { String lemmaKey = formLemmasKeys.nextElement(); Lemma l = formLemmasHashtable.get(lemmaKey); retFormLemmas.add(l); } } Collections.sort(retFormLemmas); return retFormLemmas; } public Lemma getLemma(String lang, String lemmaNameArg, int normMode) throws ApplicationException { String language = Language.getInstance().getLanguageId(lang); String lemmaName = lemmaNameArg; Normalizer normalizer = new Normalizer(language); normalizer.setNormMode(normMode); lemmaName = normalizer.normalize(lemmaNameArg); // first look in local cache String key = language + "###" + lemmaName; Lemma lemma = lemmas.get(key); if (lemma == null) { ArrayList<Form> dbLemmaForms = readFormsByLemmaName(language, lemmaName); if (dbLemmaForms != null && dbLemmaForms.size() > 0) { lemma = new Lemma(); lemma.setLemmaName(lemmaName); lemma.setLanguage(language); lemma.setProvider(dbLemmaForms.get(0).getProvider()); lemma.setForms(dbLemmaForms); lemmas.put(lemmaName, lemma); } } return lemma; } public ArrayList<Form> getFormsByLuceneQuery(String lang, String luceneQueryString, int normMode) throws ApplicationException { String language = Language.getInstance().getLanguageId(lang); ArrayList<Form> result = new ArrayList<Form>(); luceneQueryString = luceneQueryString.toLowerCase(); ArrayList<String> formsFromQuery = getVariantsFromLuceneQuery(luceneQueryString); if (! (formsFromQuery == null || formsFromQuery.isEmpty())) { for (int i=0; i<formsFromQuery.size(); i++) { String formStr = formsFromQuery.get(i); Normalizer normalizer = new Normalizer(language); normalizer.setNormMode(normMode); formStr = normalizer.normalize(formStr); ArrayList<Lemma> formLemmas = null; // lemma mode: if formName contains "lemmalemma" then the lemma itself is fetched if (formStr.startsWith("lemmalemma")) { formLemmas = new ArrayList<Lemma>(); String lemmaName = formStr.substring(10); Lemma lemma = getLemma(language, lemmaName, Normalizer.NONE); formLemmas.add(lemma); } else { formLemmas = getLemmasByFormName(language, formStr, normMode); } if (formLemmas != null && ! formLemmas.isEmpty()) { for (int j=0; j<formLemmas.size(); j++) { Lemma l = formLemmas.get(j); ArrayList<Form> lemmaForms = l.getFormsList(); result.addAll(lemmaForms); } } } } return result; } public ArrayList<Lemma> getLemmasByLuceneQuery(String lang, String luceneQueryString, int normMode) throws ApplicationException { String language = Language.getInstance().getLanguageId(lang); Hashtable<String, Lemma> lemmas = new Hashtable<String, Lemma>(); luceneQueryString = luceneQueryString.toLowerCase(); ArrayList<String> formsFromQuery = getVariantsFromLuceneQuery(luceneQueryString); if (! (formsFromQuery == null || formsFromQuery.isEmpty())) { for (int i=0; i<formsFromQuery.size(); i++) { String formStr = formsFromQuery.get(i); Normalizer normalizer = new Normalizer(language); normalizer.setNormMode(normMode); formStr = normalizer.normalize(formStr); ArrayList<Lemma> formLemmas = null; // lemma mode: if formName starts with "lemmalemma" then the lemma itself is fetched if (formStr.startsWith("lemmalemma")) { formLemmas = new ArrayList<Lemma>(); String lemmaName = formStr.substring(10); Lemma lemma = getLemma(language, lemmaName, Normalizer.NONE); formLemmas.add(lemma); } else { formLemmas = getLemmasByFormName(language, formStr, normMode); } if (formLemmas != null) { for (int j=0; j<formLemmas.size(); j++) { Lemma lemma = formLemmas.get(j); lemmas.put(lemma.getLemmaName(), lemma); } } } } ArrayList<Lemma> result = new ArrayList<Lemma>(); if (lemmas != null) { Enumeration<String> formLemmasKeys = lemmas.keys(); while(formLemmasKeys.hasMoreElements()) { String lemmaKey = formLemmasKeys.nextElement(); Lemma l = lemmas.get(lemmaKey); result.add(l); } } Collections.sort(result); if (result.isEmpty()) return null; else return result; } public ArrayList<String> getIndexKeysByLemmaNames(String lang, ArrayList<String> lemmaNames) throws ApplicationException { String language = Language.getInstance().getLanguageId(lang); Hashtable<String, String> indexKeys = new Hashtable<String, String>(); for (int j=0; j<lemmaNames.size(); j++) { String lemmaName = lemmaNames.get(j); Lemma lemma = getLemma(language, lemmaName, Normalizer.NONE); indexKeys.put(lemmaName, lemmaName); if (lemma != null) { ArrayList<Form> lemmaForms = lemma.getFormsList(); for (int k=0; k<lemmaForms.size(); k++) { Form form = lemmaForms.get(k); ArrayList<Lemma> fLemmas = getLemmasByFormName(language, form.getFormName(), Normalizer.NONE); if (fLemmas != null) { String indexKey = ""; if (fLemmas.size() == 1) { indexKey = fLemmas.get(0).getLemmaName(); } else { for (int l=0; l<fLemmas.size(); l++) { Lemma lem = fLemmas.get(l); indexKey = indexKey + "+++" + lem.getLemmaName(); } indexKeys.put(indexKey, indexKey); } } } } } ArrayList<String> result = new ArrayList<String>(); if (indexKeys != null) { Enumeration<String> indexKeysKeys = indexKeys.keys(); while(indexKeysKeys.hasMoreElements()) { String indexKey = indexKeysKeys.nextElement(); result.add(indexKey); } } Collections.sort(result); if (result.isEmpty()) return null; else return result; } private void clearCache() { forms = null; lemmas = null; forms = new Hashtable<String, Hashtable<String, Lemma>>(); lemmas = new Hashtable<String, Lemma>(); } private ArrayList<Lemma> readLemmasByFormName(String lang, String formName) throws ApplicationException { String language = Language.getInstance().getLanguageId(lang); ArrayList<Lemma> lemmasStatic = dbMorphHandler.readLemmas(language, formName); return lemmasStatic; } private ArrayList<Form> readFormsByLemmaName(String lang, String lemmaName) throws ApplicationException { String language = Language.getInstance().getLanguageId(lang); ArrayList<Form> formsStatic = dbMorphHandler.readForms(language, lemmaName); return formsStatic; } private ArrayList<String> getVariantsFromLuceneQuery(String queryString) { LuceneUtil luceneUtil = LuceneUtil.getInstance(); ArrayList<String> variants = luceneUtil.getVariantsFromLuceneQuery(queryString); return variants; } private void beginOperation() { beginOfOperation = new Date(); } private void endOperation() { endOfOperation = new Date(); } }