Mercurial > hg > mpdl-group
view software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/dict/db/LexHandler.java @ 19:4a3641ae14d2
Erstellung
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Wed, 09 Nov 2011 15:32:05 +0100 |
parents | |
children | 7d6d969b10cf |
line wrap: on
line source
package de.mpg.mpiwg.berlin.mpdl.lt.dict.db; import java.io.UnsupportedEncodingException; import java.util.ArrayList; import java.util.Collections; import java.util.Date; import java.util.logging.Logger; import com.sleepycat.je.Cursor; import com.sleepycat.je.Database; import com.sleepycat.je.DatabaseEntry; import com.sleepycat.je.DatabaseException; import com.sleepycat.je.LockMode; import com.sleepycat.je.OperationStatus; import de.mpg.mpiwg.berlin.mpdl.util.Util; import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; import de.mpg.mpiwg.berlin.mpdl.lt.dict.app.Lexica; import de.mpg.mpiwg.berlin.mpdl.lt.dict.app.Lexicon; import de.mpg.mpiwg.berlin.mpdl.lt.dict.app.LexiconEntry; import de.mpg.mpiwg.berlin.mpdl.lt.general.Constants; import de.mpg.mpiwg.berlin.mpdl.lt.general.Language; import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma; import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.MorphologyCache; import de.mpg.mpiwg.berlin.mpdl.lt.text.transcode.Transcoder; public class LexHandler { private static LexHandler instance; private static Logger LOGGER = Logger.getLogger(LexHandler.class.getName()); private static String DATA_DIR = Constants.getInstance().getDataDir(); private static String DB_DIR_LEXICA = DATA_DIR + "/dataBerkeleyDB/pollux"; private DbEnvLex dbEnvLexica; private Date beginOfOperation; private Date endOfOperation; public static LexHandler getInstance() throws ApplicationException { if (instance == null) { instance = new LexHandler(); instance.initReadOnly(); } return instance; } public void end() throws ApplicationException { ArrayList<Lexicon> lexicons = Lexica.getInstance().getLocalLexicons(); for (int i=0; i<lexicons.size(); i++) { Lexicon lexicon = lexicons.get(i); String lexiconName = lexicon.getName(); dbEnvLexica.closeDatabase(lexiconName); } dbEnvLexica.close(); LOGGER.info("Lexicon db cache: closed"); } /** * @param query * @param type * @param language * @param normalization * @return lemmas * @throws ApplicationException */ public ArrayList<Lemma> getLemmas(String query, String type, String language, String normalization) throws ApplicationException { ArrayList<Lemma> lexLemmas = new ArrayList<Lemma>(); // get lemmas of all forms in query MorphologyCache morphologyCache = MorphologyCache.getInstance(); String[] queryForms = query.split(" "); for (int k=0; k<queryForms.length; k++) { String queryForm = queryForms[k]; ArrayList<Lemma> lemmas = null; if (type.equals("form")) { if (normalization.equals("norm")) lemmas = morphologyCache.getLemmasByFormName(language, queryForm, true); else if (normalization.equals("none")) lemmas = morphologyCache.getLemmasByFormName(language, queryForm, false); else lemmas = morphologyCache.getLemmasByFormName(language, queryForm, true); // TODO reg and reg+norm } else if (type.equals("lemma")) { lemmas = new ArrayList<Lemma>(); Lemma l = null; if (normalization.equals("norm")) l = morphologyCache.getLemma(language, queryForm, true); else if (normalization.equals("none")) l = morphologyCache.getLemma(language, queryForm, false); else l = morphologyCache.getLemma(language, queryForm, true); if (l != null) lemmas.add(l); } if (lemmas != null && ! lemmas.isEmpty()) { lexLemmas.addAll(lemmas); } else { Lemma l = new Lemma("created dynamically cause no lemma is available", language, queryForm); // at least the word form is added for finding it in the lexicon lexLemmas.add(l); } } Collections.sort(lexLemmas); if (lexLemmas.isEmpty()) return null; else return lexLemmas; } public ArrayList<Lexicon> getLexEntries(ArrayList<Lemma> lexLemmas, String language, String lexiconName) throws ApplicationException { ArrayList<Lexicon> retLexicons = new ArrayList<Lexicon>(); ArrayList<Lexicon> lexicons = Lexica.getInstance().getLexicons(language); if (lexiconName != null) { lexicons = new ArrayList<Lexicon>(); Lexicon lexicon = Lexica.getInstance().getLexicon(lexiconName); if (lexicon != null) lexicons.add(lexicon); } if (lexicons != null) { for (int i=0; i<lexicons.size(); i++) { Lexicon lexicon = lexicons.get(i).clone(); // clone without lexicon entries for (int j=0; j<lexLemmas.size(); j++) { String lemmaName = lexLemmas.get(j).getLemmaName(); if (Language.getInstance().isGerman(language) && lemmaName.contains("ae")) lemmaName = lemmaName.replaceAll("ae", "Š"); if (Language.getInstance().isGerman(language) && lemmaName.contains("oe")) lemmaName = lemmaName.replaceAll("oe", "š"); if (Language.getInstance().isGerman(language) && lemmaName.contains("ue")) lemmaName = lemmaName.replaceAll("ue", "Ÿ"); if (Language.getInstance().isGerman(language) && lemmaName.contains("ss")) lemmaName = lemmaName.replaceAll("ss", "§"); LexiconEntry lexEntry = getEntry(lexicon, lemmaName); if (lexEntry != null) { lexicon.addEntry(lexEntry); // add entries to the cloned lexicon } } if (! lexicon.isEmpty()) retLexicons.add(lexicon); } } Collections.sort(retLexicons); return retLexicons; } /** * * @param formName * @param language * @return delivers lexical entries by the help of the morphology component (lexical entry of the stem of the normalized word form) * @throws ApplicationException */ public ArrayList<String> getLexEntryKeys(String formName, String language, boolean normalize) throws ApplicationException { ArrayList<String> lexEntryKeys = new ArrayList<String>(); MorphologyCache morphologyCache = MorphologyCache.getInstance(); ArrayList<Lemma> formLemmas = morphologyCache.getLemmasByFormName(language, formName, normalize); boolean hasLexEntry = false; hasLexEntry = hasLexEntryKey(formName, language); if (hasLexEntry) lexEntryKeys.add(formName); if (formLemmas != null) { for (int j=0; j<formLemmas.size(); j++) { Lemma l = formLemmas.get(j); String lName = l.getLemmaName(); if (! hasLexEntry) { hasLexEntry = hasLexEntryKey(lName, language); } if (language.equals("de") || language.equals("fr") || language.equals("nl")) // TODO Lexika fŸr diese Sprachen in BerkeleyDB einbringen (fŸr nl auch eine bessere Morph.) lexEntryKeys.add(lName); if (! lName.equals(formName) && hasLexEntry) { lexEntryKeys.add(lName); } } } if(lexEntryKeys.isEmpty()) return null; else return lexEntryKeys; } public boolean hasLexEntryKey(String formName, String language) throws ApplicationException { boolean hasLexEntry = false; if (language.equals("zh")) // each chinese character always has a lexicon entry return true; ArrayList<Lexicon> statLexicons = Lexica.getInstance().getLocalLexicons(language); if (statLexicons != null) { for (int i=0; i<statLexicons.size(); i++) { Lexicon lexicon = statLexicons.get(i).clone(); // clone without lexicon entries LexiconEntry lexEntry = readEntry(lexicon.getName(), formName); if (lexEntry != null) { return true; } } } return hasLexEntry; } public ArrayList<Lexicon> getLexEntriesBeginningWith(String language, String formPrefix, int pageNumber) throws ApplicationException { int pageSize = 50; int from = (pageNumber * pageSize) - pageSize + 1; int to = pageNumber * pageSize; ArrayList<Lexicon> statLexicons = Lexica.getInstance().getLocalLexicons(language); ArrayList<Lexicon> retLexicons = null; if (statLexicons != null) { for (int i=0; i<statLexicons.size(); i++) { Lexicon lexicon = statLexicons.get(i).clone(); // clone without lexicon entries String lexiconName = lexicon.getName(); ArrayList<LexiconEntry> lexEntries = readEntriesBeginningWith(lexiconName, formPrefix, from, to); // TODO merge the entries and remove duplicates if (lexEntries != null) { lexicon.addEntries(lexEntries); if (retLexicons == null) retLexicons = new ArrayList<Lexicon>(); retLexicons.add(lexicon); } } } return retLexicons; } public ArrayList<Lexicon> getLexEntriesByLexiconBeginningWith(String lexiconName, String formPrefix, int pageNumber) throws ApplicationException { int pageSize = 50; int from = (pageNumber * pageSize) - pageSize + 1; int to = pageNumber * pageSize; Lexicon lexicon = Lexica.getInstance().getLexicon(lexiconName).clone(); ArrayList<Lexicon> retLexicons = null; if (lexicon != null) { ArrayList<LexiconEntry> lexEntries = readEntriesBeginningWith(lexiconName, formPrefix, from, to); if (lexEntries != null) { lexicon.addEntries(lexEntries); retLexicons = new ArrayList<Lexicon>(); retLexicons.add(lexicon); } } return retLexicons; } private LexiconEntry getEntry(Lexicon lexicon, String formName) throws ApplicationException { LexiconEntry lexEntry = null; if (lexicon.isLocalLexicon()) { lexEntry = readEntry(lexicon.getName(), formName); String lexiconQueryUrl = lexicon.getQueryUrl(); if (lexEntry != null && lexicon.getQueryUrl() != null) { String language = lexicon.getSourceLanguage(); if (Language.getInstance().isGreek(language)) { formName = Transcoder.getInstance().transcodeFromUnicode2BetaCode(formName); } else if (Language.getInstance().isArabic(language)) { formName = Transcoder.getInstance().transcodeFromUnicode2Buckwalter(formName); } lexEntry.setRemoteUrl(lexiconQueryUrl + formName); } } else { lexEntry = lexicon.getDynamicEntry(formName); } return lexEntry; } private LexiconEntry readEntry(String lexiconName, String formName) throws ApplicationException { LexiconEntry retLexEntry = null; try { String dbFoundValueStr = null; String keyStr = formName; DatabaseEntry dbEntryKey = new DatabaseEntry(keyStr.getBytes("utf-8")); Database lexDB = dbEnvLexica.getLexiconDB(lexiconName); Cursor cursor = lexDB.openCursor(null, null); DatabaseEntry foundValue = new DatabaseEntry(); OperationStatus operationStatus = cursor.getSearchKey(dbEntryKey, foundValue, LockMode.DEFAULT); if (operationStatus == OperationStatus.SUCCESS) { byte[] foundValueBytes = foundValue.getData(); dbFoundValueStr = new String(foundValueBytes, "utf-8"); } cursor.close(); if (dbFoundValueStr != null) { retLexEntry = new LexiconEntry(lexiconName, formName, dbFoundValueStr); } } catch (DatabaseException e) { throw new ApplicationException(e); } catch (UnsupportedEncodingException e) { throw new ApplicationException(e); } return retLexEntry; } private ArrayList<LexiconEntry> readEntriesBeginningWith(String lexiconName, String formPrefix, int from, int to) throws ApplicationException { ArrayList<LexiconEntry> retLexEntries = new ArrayList<LexiconEntry>();; try { String dbFoundValueStr = null; String keyStr = formPrefix; DatabaseEntry dbEntryKey = new DatabaseEntry(keyStr.getBytes("utf-8")); Database lexDB = dbEnvLexica.getLexiconDB(lexiconName); Cursor cursor = lexDB.openCursor(null, null); DatabaseEntry foundValue = new DatabaseEntry(); OperationStatus operationStatus = cursor.getSearchKeyRange(dbEntryKey, foundValue, LockMode.DEFAULT); int counter = 1; while (operationStatus == OperationStatus.SUCCESS && counter <= to) { if (counter >= from) { byte[] foundValueBytes = foundValue.getData(); dbFoundValueStr = new String(foundValueBytes, "utf-8"); byte[] foundKeyBytes = dbEntryKey.getData(); String dbFoundKeyStr = new String(foundKeyBytes, "utf-8"); LexiconEntry lexEntry = new LexiconEntry(lexiconName, dbFoundKeyStr, dbFoundValueStr); retLexEntries.add(lexEntry); } operationStatus = cursor.getNext(dbEntryKey, foundValue, LockMode.DEFAULT); counter++; } cursor.close(); if (retLexEntries.isEmpty()) { return null; } } catch (DatabaseException e) { throw new ApplicationException(e); } catch (UnsupportedEncodingException e) { throw new ApplicationException(e); } return retLexEntries; } public static void main(String[] args) throws ApplicationException { getInstance(); instance.beginOperation(); System.out.print("Start ..."); instance.readSampleData(); instance.end(); instance.endOperation(); Double elapsedTime = new Util().getSecondWithMillisecondsBetween(instance.beginOfOperation, instance.endOfOperation); System.out.println("End."); System.out.println("Needed time: " + elapsedTime + " seconds"); } private void initReadOnly() throws ApplicationException { dbEnvLexica = new DbEnvLex(); dbEnvLexica.setDataDir(DB_DIR_LEXICA); dbEnvLexica.initReadOnly(); ArrayList<Lexicon> lexicons = Lexica.getInstance().getLocalLexicons(); for (int i=0; i<lexicons.size(); i++) { Lexicon lexicon = lexicons.get(i); String lexiconName = lexicon.getName(); dbEnvLexica.openDatabase(lexiconName); } LOGGER.info("Lexicon db cache: opened"); } private void readSampleData() throws ApplicationException { // List<String> dbNames = dbEnvLexica.getEnv().getDatabaseNames(); String l1 = readEntry("autenrieth", "au)to/s").getContent(); // greek: see also bonitz and lsj String l2 = readEntry("ls", "laudabilis").getContent(); // latin System.out.println("Autenrieth: autos: " + l1); System.out.println("Lewis & Short: Laudabilis: " + l2); } private void beginOperation() { beginOfOperation = new Date(); } private void endOperation() { endOfOperation = new Date(); } }