Mercurial > hg > mpdl-group
diff software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/app/MorphologyCache.java @ 0:408254cf2f1d
Erstellung
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Wed, 24 Nov 2010 17:24:23 +0100 |
parents | |
children | 59ff47d1e237 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/app/MorphologyCache.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,402 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.morph.app; + +import java.io.BufferedOutputStream; +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStream; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Date; +import java.util.Enumeration; +import java.util.Hashtable; + +import org.apache.log4j.Logger; + +import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.MpdlNormalizer; +import de.mpg.mpiwg.berlin.mpdl.lt.general.Language; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Form; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.db.DBMorphHandler; +import de.mpg.mpiwg.berlin.mpdl.lucene.LuceneUtil; +import de.mpg.mpiwg.berlin.mpdl.util.FileUtil; +import de.mpg.mpiwg.berlin.mpdl.util.Util; +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.general.MpdlConstants; + +public class MorphologyCache { + private static MorphologyCache instance; + private static Logger LOGGER = Logger.getLogger(MorphologyCache.class); // Logs to EXIST_HOME/webapp/WEB-INF/logs/exist.log + private static String MPDL_DATA_DIR = MpdlConstants.MPDL_EXIST_DATA_DIR; + private static String DB_DIR_DONATUS = MPDL_DATA_DIR + "/dataBerkeleyDB/donatus"; + private static String DB_DIR_DYNAMIC = MPDL_DATA_DIR + "/dataBerkeleyDB/dynamic"; + private static String DATA_FILES_DIR = MPDL_DATA_DIR + "/dataFiles"; + private static String DATA_FILE_DYNAMIC_FORMS = DATA_FILES_DIR + "/snowball-all-forms.xml"; + public static int QUERY_MODE = 0; + public static int DOCUMENT_MODE = 1; + private static int MAX_HASHTABLE_SIZE = MpdlConstants.MORPHOLOGY_CACHE_SIZE; + protected int mode = QUERY_MODE; + private Hashtable<String, Hashtable<String, Lemma>> forms = new Hashtable<String, Hashtable<String, Lemma>>(); // cache of forms: hashKey is formName + private Hashtable<String, Lemma> lemmas = new Hashtable<String, Lemma>(); // cache of lemmas: hashKey is lemmaName + private DBMorphHandler dbMorphHandlerStatic; // handles static morph data (BerkeleyDB) + private DBMorphHandler dbMorphHandlerDynamic; // handles dynamic morph data (BerkeleyDB) + private OutputStream outputStreamDynamicForms; // backup file for all dynamic forms + private Date beginOfOperation; + private Date endOfOperation; + + public static MorphologyCache getInstance() throws ApplicationException { + if (instance == null) { + instance = new MorphologyCache(); + instance.init(); + } + return instance; + } + + private void init() throws ApplicationException { + LOGGER.info("Mpdl: Init morphology cache ..."); + instance.beginOperation(); + dbMorphHandlerStatic = new DBMorphHandler(DB_DIR_DONATUS); + dbMorphHandlerStatic.start(); + dbMorphHandlerStatic.openDatabases(); + dbMorphHandlerDynamic = new DBMorphHandler(DB_DIR_DYNAMIC); + dbMorphHandlerDynamic.start(); + dbMorphHandlerDynamic.openDatabases(); + openDynamicFormsDataFile(); + instance.endOperation(); + Double elapsedTime = new Util().getSecondWithMillisecondsBetween(instance.beginOfOperation, instance.endOfOperation); + LOGGER.info(" Needed time: " + elapsedTime + " seconds."); + } + + public int getMode() { + return mode; + } + + public void setMode(int newMode) { + this.mode = newMode; + } + + public void end() throws ApplicationException { + dbMorphHandlerStatic.closeDatabases(); + dbMorphHandlerDynamic.closeDatabases(); + closeDynamicFormsDataFile(); + } + + public ArrayList<Lemma> getLemmasByFormName(String lang, String formNameArg, boolean normalize) throws ApplicationException { + String language = Language.getInstance().getLanguageId(lang); + ArrayList<Lemma> retFormLemmas = null; + String formName = formNameArg; + if (normalize) { + MpdlNormalizer normalizer = new MpdlNormalizer(language); + formName = normalizer.normalize(formNameArg); + } + // first look in local cache + String key = language + "###" + formName; + Hashtable<String, Lemma> formLemmasHashtable = forms.get(key); + if (formLemmasHashtable == null) { + ArrayList<Lemma> dbFormLemmas = readLemmasByFormName(language, formName); + // put lemmas into local cache + int localHashTableSize = forms.size(); + if (localHashTableSize >= MAX_HASHTABLE_SIZE) { + clearCache(); + } + if (dbFormLemmas != null && ! dbFormLemmas.isEmpty()) { + formLemmasHashtable = new Hashtable<String, Lemma>(); + for (int i=0; i<dbFormLemmas.size(); i++) { + Lemma lemma = dbFormLemmas.get(i); + String lemmaName = lemma.getLemmaName(); + String lemmaKey = language + "###" + lemmaName; + Lemma localLemma = lemmas.get(lemmaKey); + if (localLemma == null) { + ArrayList<Form> lemmaForms = readFormsByLemmaName(language, lemmaName); + lemma.setForms(lemmaForms); + lemmas.put(lemmaKey, lemma); + } else { + lemma = localLemma; + } + formLemmasHashtable.put(lemmaKey, lemma); + } + forms.put(key, formLemmasHashtable); + } + } + retFormLemmas = new ArrayList<Lemma>(); + if (formLemmasHashtable != null) { + Enumeration<String> formLemmasKeys = formLemmasHashtable.keys(); + while(formLemmasKeys.hasMoreElements()) { + String lemmaKey = formLemmasKeys.nextElement(); + Lemma l = formLemmasHashtable.get(lemmaKey); + retFormLemmas.add(l); + } + } + Collections.sort(retFormLemmas); + return retFormLemmas; + } + + public Lemma getLemma(String lang, String lemmaNameArg, boolean normalize) throws ApplicationException { + String language = Language.getInstance().getLanguageId(lang); + String lemmaName = lemmaNameArg; + if (normalize) { + MpdlNormalizer normalizer = new MpdlNormalizer(language); + lemmaName = normalizer.normalize(lemmaNameArg); + } + // first look in local cache + String key = language + "###" + lemmaName; + Lemma lemma = lemmas.get(key); + if (lemma == null) { + ArrayList<Form> dbLemmaForms = readFormsByLemmaName(language, lemmaName); + if (dbLemmaForms != null && dbLemmaForms.size() > 0) { + lemma = new Lemma(); + lemma.setLemmaName(lemmaName); + lemma.setLanguage(language); + lemma.setProvider(dbLemmaForms.get(0).getProvider()); + lemma.setForms(dbLemmaForms); + lemmas.put(lemmaName, lemma); + } + } + return lemma; + } + + public void insertFormDynamic(Form newFlatForm) throws ApplicationException { + if (! newFlatForm.isOk()) + return; + String provider = newFlatForm.getProvider(); + String lang = newFlatForm.getLanguage(); + String language = Language.getInstance().getLanguageId(lang); + String lemmaName = newFlatForm.getLemmaName(); + Lemma newFlatLemma = new Lemma(provider, language, lemmaName); + newFlatLemma.addForm(newFlatForm); + // write to berkeley db; there is no test if the form is already contained (has to be done before) + writeFormLemmaDynamic(newFlatForm, newFlatLemma); + // write to backup file + String formsXmlStr = newFlatForm.getXmlString(); + writeToDynamicFile(formsXmlStr); + // fill local cache with new form if it is not too full + int localHashTableSize = forms.size(); + if (localHashTableSize >= MAX_HASHTABLE_SIZE) { + clearCache(); + } + String lemmaKey = language + "###" + lemmaName; + Lemma localLemma = lemmas.get(lemmaKey); + if (localLemma == null) { + lemmas.put(lemmaKey, newFlatLemma); + } else { + localLemma.addForm(newFlatForm); + String formName = newFlatForm.getFormName(); + String formKey = language + "###" + formName; + Hashtable<String, Lemma> formLemmas = forms.get(formKey); + if (formLemmas == null) { + formLemmas = new Hashtable<String, Lemma>(); + formLemmas.put(lemmaKey, localLemma); + forms.put(formKey, formLemmas); + } else { + formLemmas.put(formKey, localLemma); + } + } + } + + public ArrayList<Form> getFormsByLuceneQuery(String lang, String luceneQueryString, boolean normalize) throws ApplicationException { + String language = Language.getInstance().getLanguageId(lang); + ArrayList<Form> result = new ArrayList<Form>(); + luceneQueryString = luceneQueryString.toLowerCase(); + ArrayList<String> formsFromQuery = getVariantsFromLuceneQuery(luceneQueryString); + if (! (formsFromQuery == null || formsFromQuery.isEmpty())) { + for (int i=0; i<formsFromQuery.size(); i++) { + String formStr = formsFromQuery.get(i); + if (normalize) { + MpdlNormalizer normalizer = new MpdlNormalizer(language); + formStr = normalizer.normalize(formStr); + } + ArrayList<Lemma> formLemmas = null; + // lemma mode: if formName contains "lemmalemma" then the lemma itself is fetched + if (formStr.startsWith("lemmalemma")) { + formLemmas = new ArrayList<Lemma>(); + String lemmaName = formStr.substring(10); + Lemma lemma = getLemma(language, lemmaName, false); + formLemmas.add(lemma); + } else { + formLemmas = getLemmasByFormName(language, formStr, false); + } + if (formLemmas != null && ! formLemmas.isEmpty()) { + for (int j=0; j<formLemmas.size(); j++) { + Lemma l = formLemmas.get(j); + ArrayList<Form> lemmaForms = l.getFormsList(); + result.addAll(lemmaForms); + } + } + } + } + return result; + } + + public ArrayList<Lemma> getLemmasByLuceneQuery(String lang, String luceneQueryString, boolean normalize) throws ApplicationException { + String language = Language.getInstance().getLanguageId(lang); + Hashtable<String, Lemma> lemmas = new Hashtable<String, Lemma>(); + luceneQueryString = luceneQueryString.toLowerCase(); + ArrayList<String> formsFromQuery = getVariantsFromLuceneQuery(luceneQueryString); + if (! (formsFromQuery == null || formsFromQuery.isEmpty())) { + for (int i=0; i<formsFromQuery.size(); i++) { + String formStr = formsFromQuery.get(i); + if (normalize) { + MpdlNormalizer normalizer = new MpdlNormalizer(language); + formStr = normalizer.normalize(formStr); + } + ArrayList<Lemma> formLemmas = null; + // lemma mode: if formName contains "lemmalemma" then the lemma itself is fetched + if (formStr.startsWith("lemmalemma")) { + formLemmas = new ArrayList<Lemma>(); + String lemmaName = formStr.substring(10); + Lemma lemma = getLemma(language, lemmaName, false); + formLemmas.add(lemma); + } else { + formLemmas = getLemmasByFormName(language, formStr, false); + } + if (formLemmas != null) { + for (int j=0; j<formLemmas.size(); j++) { + Lemma lemma = formLemmas.get(j); + lemmas.put(lemma.getLemmaName(), lemma); + } + } + } + } + ArrayList<Lemma> result = new ArrayList<Lemma>(); + if (lemmas != null) { + Enumeration<String> formLemmasKeys = lemmas.keys(); + while(formLemmasKeys.hasMoreElements()) { + String lemmaKey = formLemmasKeys.nextElement(); + Lemma l = lemmas.get(lemmaKey); + result.add(l); + } + } + Collections.sort(result); + if (result.isEmpty()) + return null; + else + return result; + } + + public ArrayList<String> getIndexKeysByLemmaNames(String lang, ArrayList<String> lemmaNames) throws ApplicationException { + String language = Language.getInstance().getLanguageId(lang); + Hashtable<String, String> indexKeys = new Hashtable<String, String>(); + for (int j=0; j<lemmaNames.size(); j++) { + String lemmaName = lemmaNames.get(j); + Lemma lemma = getLemma(language, lemmaName, false); + indexKeys.put(lemmaName, lemmaName); + if (lemma != null) { + ArrayList<Form> lemmaForms = lemma.getFormsList(); + for (int k=0; k<lemmaForms.size(); k++) { + Form form = lemmaForms.get(k); + ArrayList<Lemma> fLemmas = getLemmasByFormName(language, form.getFormName(), false); + if (fLemmas != null) { + String indexKey = ""; + if (fLemmas.size() == 1) { + indexKey = fLemmas.get(0).getLemmaName(); + } else { + for (int l=0; l<fLemmas.size(); l++) { + Lemma lem = fLemmas.get(l); + indexKey = indexKey + "+++" + lem.getLemmaName(); + } + indexKeys.put(indexKey, indexKey); + } + } + } + } + } + ArrayList<String> result = new ArrayList<String>(); + if (indexKeys != null) { + Enumeration<String> indexKeysKeys = indexKeys.keys(); + while(indexKeysKeys.hasMoreElements()) { + String indexKey = indexKeysKeys.nextElement(); + result.add(indexKey); + } + } + Collections.sort(result); + if (result.isEmpty()) + return null; + else + return result; + } + + private void clearCache() { + forms = null; + lemmas = null; + forms = new Hashtable<String, Hashtable<String, Lemma>>(); + lemmas = new Hashtable<String, Lemma>(); + } + + private ArrayList<Lemma> readLemmasByFormName(String lang, String formName) throws ApplicationException { + String language = Language.getInstance().getLanguageId(lang); + ArrayList<Lemma> lemmasStatic = dbMorphHandlerStatic.readLemmas(language, formName); + ArrayList<Lemma> lemmasDynamic = dbMorphHandlerDynamic.readLemmas(language, formName); + lemmasStatic.addAll(lemmasDynamic); + return lemmasStatic; + } + + private ArrayList<Form> readFormsByLemmaName(String lang, String lemmaName) throws ApplicationException { + String language = Language.getInstance().getLanguageId(lang); + ArrayList<Form> formsStatic = dbMorphHandlerStatic.readForms(language, lemmaName); + ArrayList<Form> formsDynamic = dbMorphHandlerDynamic.readForms(language, lemmaName); + formsStatic.addAll(formsDynamic); + return formsStatic; + } + + private void writeFormLemmaDynamic(Form newFlatForm, Lemma newFlatLemma) throws ApplicationException { + dbMorphHandlerDynamic.writeFormLemma(newFlatForm, newFlatLemma); + dbMorphHandlerDynamic.writeLemmaForm(newFlatLemma, newFlatForm); + } + + private void openDynamicFormsDataFile() throws ApplicationException { + try { + File dataFileDynamicForms = new File(DATA_FILE_DYNAMIC_FORMS); + if (! dataFileDynamicForms.exists()) { + FileUtil.getInstance().copyFile(DATA_FILE_DYNAMIC_FORMS + ".empty", DATA_FILE_DYNAMIC_FORMS); + } + File dataFileDynamicFormsTmp = new File(DATA_FILE_DYNAMIC_FORMS + ".tmp"); + dataFileDynamicFormsTmp.delete(); + FileUtil.getInstance().copyFile(DATA_FILE_DYNAMIC_FORMS, DATA_FILE_DYNAMIC_FORMS + ".tmp"); + FileUtil.getInstance().deleteLastNBytes(dataFileDynamicFormsTmp, 9); // without last "</forms>" entry + FileOutputStream dataFileOutputStreamDynamicForms = new FileOutputStream(dataFileDynamicFormsTmp, true); + outputStreamDynamicForms = new BufferedOutputStream(dataFileOutputStreamDynamicForms); + } catch (IOException e) { + throw new ApplicationException(e); + } + } + + private void closeDynamicFormsDataFile() throws ApplicationException { + try { + writeToDynamicFile("</forms>\n"); + if (outputStreamDynamicForms != null) + outputStreamDynamicForms.close(); + File dataFileDynamicForms = new File(DATA_FILE_DYNAMIC_FORMS); + File dataFileDynamicFormsTmp = new File(DATA_FILE_DYNAMIC_FORMS + ".tmp"); + dataFileDynamicForms.delete(); + dataFileDynamicFormsTmp.renameTo(new File(DATA_FILE_DYNAMIC_FORMS)); + } catch (IOException e) { + throw new ApplicationException(e); + } + } + + private void writeToDynamicFile(String outStr) throws ApplicationException { + try { + if (outputStreamDynamicForms != null) { + byte[] bytes = outStr.getBytes("utf-8"); + outputStreamDynamicForms.write(bytes, 0, bytes.length); + outputStreamDynamicForms.flush(); + } + } catch (IOException e) { + throw new ApplicationException(e); + } + } + + private ArrayList<String> getVariantsFromLuceneQuery(String queryString) { + LuceneUtil luceneUtil = LuceneUtil.getInstance(); + ArrayList<String> variants = luceneUtil.getVariantsFromLuceneQuery(queryString); + return variants; + } + + private void beginOperation() { + beginOfOperation = new Date(); + } + + private void endOperation() { + endOfOperation = new Date(); + } +} \ No newline at end of file