Mercurial > hg > mpdl-group
view software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/morph/app/MorphologyCache.java @ 10:59ff47d1e237
TEI Unterstützung, Fehlerbehebungen, externe Objekte
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Fri, 11 Mar 2011 13:33:26 +0100 |
parents | 408254cf2f1d |
children |
line wrap: on
line source
package de.mpg.mpiwg.berlin.mpdl.lt.morph.app; import java.io.BufferedOutputStream; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStream; import java.util.ArrayList; import java.util.Collections; import java.util.Date; import java.util.Enumeration; import java.util.Hashtable; import org.apache.log4j.Logger; import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.MpdlNormalizer; import de.mpg.mpiwg.berlin.mpdl.lt.general.Language; import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Form; import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma; import de.mpg.mpiwg.berlin.mpdl.lt.morph.db.DBMorphHandler; import de.mpg.mpiwg.berlin.mpdl.lucene.LuceneUtil; import de.mpg.mpiwg.berlin.mpdl.util.FileUtil; import de.mpg.mpiwg.berlin.mpdl.util.Util; import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; import de.mpg.mpiwg.berlin.mpdl.general.MpdlConstants; public class MorphologyCache { private static MorphologyCache instance; private static Logger LOGGER = Logger.getLogger(MorphologyCache.class); // Logs to EXIST_HOME/webapp/WEB-INF/logs/exist.log private static String MPDL_DATA_DIR = MpdlConstants.MPDL_EXIST_DATA_DIR; private static String DB_DIR_DONATUS = MPDL_DATA_DIR + "/dataBerkeleyDB/donatus"; private static String DB_DIR_DYNAMIC = MPDL_DATA_DIR + "/dataBerkeleyDB/dynamic"; private static String DATA_FILES_DIR = MPDL_DATA_DIR + "/dataFiles"; private static String DATA_FILE_DYNAMIC_FORMS = DATA_FILES_DIR + "/snowball-all-forms.xml"; public static int QUERY_MODE = 0; public static int DOCUMENT_MODE = 1; private static int MAX_HASHTABLE_SIZE = MpdlConstants.MORPHOLOGY_CACHE_SIZE; protected int mode = QUERY_MODE; private Hashtable<String, Hashtable<String, Lemma>> forms = new Hashtable<String, Hashtable<String, Lemma>>(); // cache of forms: hashKey is formName private Hashtable<String, Lemma> lemmas = new Hashtable<String, Lemma>(); // cache of lemmas: hashKey is lemmaName private DBMorphHandler dbMorphHandlerStatic; // handles 
static morph data (BerkeleyDB) private DBMorphHandler dbMorphHandlerDynamic; // handles dynamic morph data (BerkeleyDB) private OutputStream outputStreamDynamicForms; // backup file for all dynamic forms private Date beginOfOperation; private Date endOfOperation; public static MorphologyCache getInstance() throws ApplicationException { if (instance == null) { instance = new MorphologyCache(); instance.init(); } return instance; } private void init() throws ApplicationException { LOGGER.info("Mpdl: Init morphology cache ..."); instance.beginOperation(); dbMorphHandlerStatic = new DBMorphHandler(DB_DIR_DONATUS); dbMorphHandlerStatic.start(); dbMorphHandlerStatic.openDatabases(); dbMorphHandlerDynamic = new DBMorphHandler(DB_DIR_DYNAMIC); dbMorphHandlerDynamic.start(); dbMorphHandlerDynamic.openDatabases(); openDynamicFormsDataFile(); instance.endOperation(); Double elapsedTime = new Util().getSecondWithMillisecondsBetween(instance.beginOfOperation, instance.endOfOperation); LOGGER.info(" Needed time: " + elapsedTime + " seconds."); } public int getMode() { return mode; } public void setMode(int newMode) { this.mode = newMode; } public void end() throws ApplicationException { dbMorphHandlerStatic.closeDatabases(); dbMorphHandlerDynamic.closeDatabases(); closeDynamicFormsDataFile(); } public ArrayList<Lemma> getLemmasByFormName(String lang, String formNameArg, boolean normalize) throws ApplicationException { String language = Language.getInstance().getLanguageId(lang); ArrayList<Lemma> retFormLemmas = null; String formName = formNameArg; if (normalize) { MpdlNormalizer normalizer = new MpdlNormalizer(language); formName = normalizer.normalize(formNameArg); } // first look in local cache String key = language + "###" + formName; Hashtable<String, Lemma> formLemmasHashtable = forms.get(key); if (formLemmasHashtable == null) { ArrayList<Lemma> dbFormLemmas = readLemmasByFormName(language, formName); // put lemmas into local cache int localHashTableSize = forms.size(); if 
(localHashTableSize >= MAX_HASHTABLE_SIZE) { clearCache(); } if (dbFormLemmas != null && ! dbFormLemmas.isEmpty()) { formLemmasHashtable = new Hashtable<String, Lemma>(); for (int i=0; i<dbFormLemmas.size(); i++) { Lemma lemma = dbFormLemmas.get(i); String lemmaName = lemma.getLemmaName(); String lemmaKey = language + "###" + lemmaName; Lemma localLemma = lemmas.get(lemmaKey); if (localLemma == null) { ArrayList<Form> lemmaForms = readFormsByLemmaName(language, lemmaName); lemma.setForms(lemmaForms); lemmas.put(lemmaKey, lemma); } else { lemma = localLemma; } formLemmasHashtable.put(lemmaKey, lemma); } forms.put(key, formLemmasHashtable); } } retFormLemmas = new ArrayList<Lemma>(); if (formLemmasHashtable != null) { Enumeration<String> formLemmasKeys = formLemmasHashtable.keys(); while(formLemmasKeys.hasMoreElements()) { String lemmaKey = formLemmasKeys.nextElement(); Lemma l = formLemmasHashtable.get(lemmaKey); retFormLemmas.add(l); } } Collections.sort(retFormLemmas); return retFormLemmas; } public Lemma getLemma(String lang, String lemmaNameArg, boolean normalize) throws ApplicationException { String language = Language.getInstance().getLanguageId(lang); String lemmaName = lemmaNameArg; if (normalize) { MpdlNormalizer normalizer = new MpdlNormalizer(language); lemmaName = normalizer.normalize(lemmaNameArg); } // first look in local cache String key = language + "###" + lemmaName; Lemma lemma = lemmas.get(key); if (lemma == null) { ArrayList<Form> dbLemmaForms = readFormsByLemmaName(language, lemmaName); if (dbLemmaForms != null && dbLemmaForms.size() > 0) { lemma = new Lemma(); lemma.setLemmaName(lemmaName); lemma.setLanguage(language); lemma.setProvider(dbLemmaForms.get(0).getProvider()); lemma.setForms(dbLemmaForms); lemmas.put(lemmaName, lemma); } } return lemma; } public void insertFormDynamic(Form newFlatForm) throws ApplicationException { if (! 
newFlatForm.isOk()) return; String provider = newFlatForm.getProvider(); String lang = newFlatForm.getLanguage(); String language = Language.getInstance().getLanguageId(lang); String lemmaName = newFlatForm.getLemmaName(); Lemma newFlatLemma = new Lemma(provider, language, lemmaName); newFlatLemma.addForm(newFlatForm); // write to berkeley db; there is no test if the form is already contained (has to be done before) writeFormLemmaDynamic(newFlatForm, newFlatLemma); // write to backup file String formsXmlStr = newFlatForm.getXmlString(); writeToDynamicFile(formsXmlStr); // fill local cache with new form if it is not too full int localHashTableSize = forms.size(); if (localHashTableSize >= MAX_HASHTABLE_SIZE) { clearCache(); } String lemmaKey = language + "###" + lemmaName; Lemma localLemma = lemmas.get(lemmaKey); if (localLemma == null) { lemmas.put(lemmaKey, newFlatLemma); } else { localLemma.addForm(newFlatForm); String formName = newFlatForm.getFormName(); String formKey = language + "###" + formName; Hashtable<String, Lemma> formLemmas = forms.get(formKey); if (formLemmas == null) { formLemmas = new Hashtable<String, Lemma>(); formLemmas.put(lemmaKey, localLemma); forms.put(formKey, formLemmas); } else { formLemmas.put(formKey, localLemma); } } } public ArrayList<Form> getFormsByLuceneQuery(String lang, String luceneQueryString, boolean normalize) throws ApplicationException { String language = Language.getInstance().getLanguageId(lang); ArrayList<Form> result = new ArrayList<Form>(); luceneQueryString = luceneQueryString.toLowerCase(); ArrayList<String> formsFromQuery = getVariantsFromLuceneQuery(luceneQueryString); if (! 
(formsFromQuery == null || formsFromQuery.isEmpty())) { for (int i=0; i<formsFromQuery.size(); i++) { String formStr = formsFromQuery.get(i); if (normalize) { MpdlNormalizer normalizer = new MpdlNormalizer(language); formStr = normalizer.normalize(formStr); } ArrayList<Lemma> formLemmas = null; // lemma mode: if formName contains "lemmalemma" then the lemma itself is fetched if (formStr.startsWith("lemmalemma")) { formLemmas = new ArrayList<Lemma>(); String lemmaName = formStr.substring(10); Lemma lemma = getLemma(language, lemmaName, false); formLemmas.add(lemma); } else { formLemmas = getLemmasByFormName(language, formStr, false); } if (formLemmas != null && ! formLemmas.isEmpty()) { for (int j=0; j<formLemmas.size(); j++) { Lemma l = formLemmas.get(j); ArrayList<Form> lemmaForms = l.getFormsList(); result.addAll(lemmaForms); } } } } return result; } public ArrayList<Lemma> getLemmasByLuceneQuery(String lang, String luceneQueryString, boolean normalize) throws ApplicationException { String language = Language.getInstance().getLanguageId(lang); Hashtable<String, Lemma> lemmas = new Hashtable<String, Lemma>(); luceneQueryString = luceneQueryString.toLowerCase(); ArrayList<String> formsFromQuery = getVariantsFromLuceneQuery(luceneQueryString); if (! 
(formsFromQuery == null || formsFromQuery.isEmpty())) { for (int i=0; i<formsFromQuery.size(); i++) { String formStr = formsFromQuery.get(i); if (normalize) { MpdlNormalizer normalizer = new MpdlNormalizer(language); formStr = normalizer.normalize(formStr); } ArrayList<Lemma> formLemmas = null; // lemma mode: if formName contains "lemmalemma" then the lemma itself is fetched if (formStr.startsWith("lemmalemma")) { formLemmas = new ArrayList<Lemma>(); String lemmaName = formStr.substring(10); Lemma lemma = getLemma(language, lemmaName, false); formLemmas.add(lemma); } else { formLemmas = getLemmasByFormName(language, formStr, false); } if (formLemmas != null) { for (int j=0; j<formLemmas.size(); j++) { Lemma lemma = formLemmas.get(j); lemmas.put(lemma.getLemmaName(), lemma); } } } } ArrayList<Lemma> result = new ArrayList<Lemma>(); if (lemmas != null) { Enumeration<String> formLemmasKeys = lemmas.keys(); while(formLemmasKeys.hasMoreElements()) { String lemmaKey = formLemmasKeys.nextElement(); Lemma l = lemmas.get(lemmaKey); result.add(l); } } Collections.sort(result); if (result.isEmpty()) return null; else return result; } public ArrayList<String> getIndexKeysByLemmaNames(String lang, ArrayList<String> lemmaNames) throws ApplicationException { String language = Language.getInstance().getLanguageId(lang); Hashtable<String, String> indexKeys = new Hashtable<String, String>(); for (int j=0; j<lemmaNames.size(); j++) { String lemmaName = lemmaNames.get(j); Lemma lemma = getLemma(language, lemmaName, false); indexKeys.put(lemmaName, lemmaName); if (lemma != null) { ArrayList<Form> lemmaForms = lemma.getFormsList(); for (int k=0; k<lemmaForms.size(); k++) { Form form = lemmaForms.get(k); ArrayList<Lemma> fLemmas = getLemmasByFormName(language, form.getFormName(), false); if (fLemmas != null) { String indexKey = ""; if (fLemmas.size() == 1) { indexKey = fLemmas.get(0).getLemmaName(); } else { for (int l=0; l<fLemmas.size(); l++) { Lemma lem = fLemmas.get(l); indexKey = 
indexKey + "+++" + lem.getLemmaName(); } indexKeys.put(indexKey, indexKey); } } } } } ArrayList<String> result = new ArrayList<String>(); if (indexKeys != null) { Enumeration<String> indexKeysKeys = indexKeys.keys(); while(indexKeysKeys.hasMoreElements()) { String indexKey = indexKeysKeys.nextElement(); result.add(indexKey); } } Collections.sort(result); if (result.isEmpty()) return null; else return result; } private void clearCache() { forms = null; lemmas = null; forms = new Hashtable<String, Hashtable<String, Lemma>>(); lemmas = new Hashtable<String, Lemma>(); } private ArrayList<Lemma> readLemmasByFormName(String lang, String formName) throws ApplicationException { String language = Language.getInstance().getLanguageId(lang); ArrayList<Lemma> lemmasStatic = dbMorphHandlerStatic.readLemmas(language, formName); // is set off because Snowball is not used anymore // ArrayList<Lemma> lemmasDynamic = dbMorphHandlerDynamic.readLemmas(language, formName); // lemmasStatic.addAll(lemmasDynamic); return lemmasStatic; } private ArrayList<Form> readFormsByLemmaName(String lang, String lemmaName) throws ApplicationException { String language = Language.getInstance().getLanguageId(lang); ArrayList<Form> formsStatic = dbMorphHandlerStatic.readForms(language, lemmaName); // is set off because Snowball is not used anymore // ArrayList<Form> formsDynamic = dbMorphHandlerDynamic.readForms(language, lemmaName); // formsStatic.addAll(formsDynamic); return formsStatic; } private void writeFormLemmaDynamic(Form newFlatForm, Lemma newFlatLemma) throws ApplicationException { dbMorphHandlerDynamic.writeFormLemma(newFlatForm, newFlatLemma); dbMorphHandlerDynamic.writeLemmaForm(newFlatLemma, newFlatForm); } private void openDynamicFormsDataFile() throws ApplicationException { try { File dataFileDynamicForms = new File(DATA_FILE_DYNAMIC_FORMS); if (! 
dataFileDynamicForms.exists()) { FileUtil.getInstance().copyFile(DATA_FILE_DYNAMIC_FORMS + ".empty", DATA_FILE_DYNAMIC_FORMS); } File dataFileDynamicFormsTmp = new File(DATA_FILE_DYNAMIC_FORMS + ".tmp"); dataFileDynamicFormsTmp.delete(); FileUtil.getInstance().copyFile(DATA_FILE_DYNAMIC_FORMS, DATA_FILE_DYNAMIC_FORMS + ".tmp"); FileUtil.getInstance().deleteLastNBytes(dataFileDynamicFormsTmp, 9); // without last "</forms>" entry FileOutputStream dataFileOutputStreamDynamicForms = new FileOutputStream(dataFileDynamicFormsTmp, true); outputStreamDynamicForms = new BufferedOutputStream(dataFileOutputStreamDynamicForms); } catch (IOException e) { throw new ApplicationException(e); } } private void closeDynamicFormsDataFile() throws ApplicationException { try { writeToDynamicFile("</forms>\n"); if (outputStreamDynamicForms != null) outputStreamDynamicForms.close(); File dataFileDynamicForms = new File(DATA_FILE_DYNAMIC_FORMS); File dataFileDynamicFormsTmp = new File(DATA_FILE_DYNAMIC_FORMS + ".tmp"); dataFileDynamicForms.delete(); dataFileDynamicFormsTmp.renameTo(new File(DATA_FILE_DYNAMIC_FORMS)); } catch (IOException e) { throw new ApplicationException(e); } } private void writeToDynamicFile(String outStr) throws ApplicationException { try { if (outputStreamDynamicForms != null) { byte[] bytes = outStr.getBytes("utf-8"); outputStreamDynamicForms.write(bytes, 0, bytes.length); outputStreamDynamicForms.flush(); } } catch (IOException e) { throw new ApplicationException(e); } } private ArrayList<String> getVariantsFromLuceneQuery(String queryString) { LuceneUtil luceneUtil = LuceneUtil.getInstance(); ArrayList<String> variants = luceneUtil.getVariantsFromLuceneQuery(queryString); return variants; } private void beginOperation() { beginOfOperation = new Date(); } private void endOperation() { endOfOperation = new Date(); } }