Mercurial > hg > mpdl-group
diff software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/lex/db/LexHandler.java @ 0:408254cf2f1d
Erstellung
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Wed, 24 Nov 2010 17:24:23 +0100 |
parents | |
children | 2396a569e446 |
line wrap: on
line diff
package de.mpg.mpiwg.berlin.mpdl.lt.lex.db;

import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Date;

import com.sleepycat.je.Cursor;
import com.sleepycat.je.Database;
import com.sleepycat.je.DatabaseEntry;
import com.sleepycat.je.DatabaseException;
import com.sleepycat.je.LockMode;
import com.sleepycat.je.OperationStatus;

import de.mpg.mpiwg.berlin.mpdl.util.Util;
import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
import de.mpg.mpiwg.berlin.mpdl.general.MpdlConstants;
import de.mpg.mpiwg.berlin.mpdl.lt.general.Transcoder;
import de.mpg.mpiwg.berlin.mpdl.lt.lex.app.Lexica;
import de.mpg.mpiwg.berlin.mpdl.lt.lex.app.Lexicon;
import de.mpg.mpiwg.berlin.mpdl.lt.lex.app.LexiconEntry;
import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma;
import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.MorphologyCache;

/**
 * Lazily created singleton that looks up lexicon entries in the Berkeley DB
 * (com.sleepycat.je) databases located under {@code DB_DIR_LEXICA}.
 *
 * One database per lexicon known to {@link Lexica} is opened through
 * {@code DbEnvLex} when the singleton is first requested. Entries are keyed by
 * the UTF-8 bytes of the form name and hold the UTF-8 encoded entry content.
 *
 * Note: {@link #getInstance()} is not synchronized.
 */
public class LexHandler {
  private static LexHandler instance;
  private static final String MPDL_DATA_DIR = MpdlConstants.MPDL_EXIST_DATA_DIR;
  private static final String DB_DIR_LEXICA = MPDL_DATA_DIR + "/dataBerkeleyDB/pollux";
  private DbEnvLex dbEnvLexica;
  private Date beginOfOperation;
  private Date endOfOperation;

  /**
   * Returns the shared handler, creating and initializing it on first use.
   *
   * The handler is published only after its initialization succeeded, so a failed
   * initialization is retried on the next call instead of handing out a handler
   * without an opened database environment.
   *
   * @return the shared handler
   * @throws ApplicationException if the database environment or a lexicon database cannot be opened
   */
  public static LexHandler getInstance() throws ApplicationException {
    if (instance == null) {
      LexHandler handler = new LexHandler();
      handler.initReadOnly();
      instance = handler;
    }
    return instance;
  }

  /**
   * Collects the keys under which lexicon entries exist for a word form: the form
   * itself (if it has an entry) followed by each lemma of the form, as delivered by
   * the morphology component, that has an entry of its own.
   *
   * Every candidate key is checked individually and each key is returned at most once.
   *
   * @param formName the word form to look up
   * @param language the language used to select the lemmas and the lexicons
   * @param normalize passed through to {@code MorphologyCache.getLemmasByFormName}
   * @return the keys that have a lexicon entry, or {@code null} if there is none
   * @throws ApplicationException if a lookup fails
   */
  public ArrayList<String> getLexEntryKeys(String formName, String language, boolean normalize) throws ApplicationException {
    ArrayList<String> lexEntryKeys = new ArrayList<String>();
    MorphologyCache morphologyCache = MorphologyCache.getInstance();
    ArrayList<Lemma> formLemmas = morphologyCache.getLemmasByFormName(language, formName, normalize);
    if (hasLexEntryKey(formName, language)) {
      lexEntryKeys.add(formName);
    }
    if (formLemmas != null) {
      for (Lemma lemma : formLemmas) {
        String lemmaName = lemma.getLemmaName();
        // skip keys that are already collected (this also covers a lemma equal to the form)
        if (lemmaName == null || lemmaName.equals(formName) || lexEntryKeys.contains(lemmaName)) {
          continue;
        }
        if (hasLexEntryKey(lemmaName, language)) {
          lexEntryKeys.add(lemmaName);
        }
      }
    }
    // callers get null (not an empty list) when nothing was found
    if (lexEntryKeys.isEmpty()) {
      return null;
    }
    return lexEntryKeys;
  }

  /**
   * Tells whether at least one lexicon of the given language contains an entry for the key.
   *
   * @param formName the key to look up
   * @param language the language whose lexicons are searched
   * @return {@code true} if an entry was found; {@code false} otherwise, also when
   *         no lexicons are delivered for the language
   * @throws ApplicationException if a lookup fails
   */
  public boolean hasLexEntryKey(String formName, String language) throws ApplicationException {
    ArrayList<Lexicon> statLexicons = Lexica.getInstance().getLexicons(language);
    if (statLexicons == null) {
      return false;
    }
    for (Lexicon lexicon : statLexicons) {
      // only the name is needed, so the lexicon is used directly instead of a clone
      if (readEntry(lexicon.getName(), formName) != null) {
        return true;
      }
    }
    return false;
  }

  /**
   * Reads the entry stored under the given key in one lexicon database.
   *
   * @param lexiconName name of the lexicon database
   * @param formName the key; its UTF-8 bytes are used as database key
   * @return the entry, or {@code null} if the key is not found
   * @throws ApplicationException wrapping any database or encoding failure
   */
  public LexiconEntry readEntry(String lexiconName, String formName) throws ApplicationException {
    try {
      DatabaseEntry dbEntryKey = new DatabaseEntry(formName.getBytes("utf-8"));
      DatabaseEntry foundValue = new DatabaseEntry();
      Database lexDB = dbEnvLexica.getLexiconDB(lexiconName);
      String dbFoundValueStr = null;
      Cursor cursor = lexDB.openCursor(null, null);
      try {
        OperationStatus operationStatus = cursor.getSearchKey(dbEntryKey, foundValue, LockMode.DEFAULT);
        if (operationStatus == OperationStatus.SUCCESS) {
          dbFoundValueStr = new String(foundValue.getData(), "utf-8");
        }
      } finally {
        // release the cursor even if the search or the decoding fails
        cursor.close();
      }
      if (dbFoundValueStr == null) {
        return null;
      }
      return new LexiconEntry(lexiconName, formName, dbFoundValueStr);
    } catch (DatabaseException e) {
      throw new ApplicationException(e);
    } catch (UnsupportedEncodingException e) {
      throw new ApplicationException(e);
    }
  }

  /**
   * Transcodes a string to Unicode by delegating to {@link Transcoder}.
   *
   * Supported pairs are "buckwalter" to "unicode" and "betacode" to "unicode".
   *
   * @param fromEncoding name of the source encoding
   * @param toEncoding name of the target encoding
   * @param inputStr the string to transcode
   * @return the transcoded string, or {@code null} if the encoding pair is not supported
   *         (including {@code null} encoding names)
   * @throws ApplicationException if the transcoder fails
   */
  public String transcode(String fromEncoding, String toEncoding, String inputStr) throws ApplicationException {
    Transcoder transcoder = Transcoder.getInstance();
    // constant-first comparisons: a null encoding name is just another unsupported pair
    if (! "unicode".equals(toEncoding)) {
      return null;
    }
    if ("buckwalter".equals(fromEncoding)) {
      return transcoder.transcodeFromBuckwalter2Unicode(inputStr);
    }
    if ("betacode".equals(fromEncoding)) {
      return transcoder.transcodeFromBetaCode2Unicode(inputStr);
    }
    return null;
  }

  /**
   * Small manual check: reads two sample entries, closes the databases and prints
   * the elapsed time.
   *
   * @param args not used
   * @throws ApplicationException if opening, reading or closing the databases fails
   */
  public static void main(String[] args) throws ApplicationException {
    LexHandler handler = getInstance();
    handler.beginOperation();
    System.out.print("Start ...");
    try {
      handler.readSampleData();
    } finally {
      // close the databases even if reading the samples fails
      handler.end();
    }
    handler.endOperation();
    Double elapsedTime = new Util().getSecondWithMillisecondsBetween(handler.beginOfOperation, handler.endOfOperation);
    System.out.println("End.");
    System.out.println("Needed time: " + elapsedTime + " seconds");
  }

  /**
   * Sets up the database environment on {@code DB_DIR_LEXICA} and opens one
   * database per lexicon delivered by {@link Lexica}.
   */
  private void initReadOnly() throws ApplicationException {
    dbEnvLexica = new DbEnvLex();
    dbEnvLexica.setDataDir(DB_DIR_LEXICA);
    dbEnvLexica.initReadOnly();
    ArrayList<Lexicon> lexicons = Lexica.getInstance().getLexicons();
    for (Lexicon lexicon : lexicons) {
      dbEnvLexica.openDatabase(lexicon.getName());
    }
  }

  /**
   * Reads and prints one Greek and one Latin sample entry; prints "null" as content
   * for a sample that is not found.
   */
  private void readSampleData() throws ApplicationException {
    String l1 = contentOf(readEntry("autenrieth", "au)to/s")); // greek: see also bonitz and lsj
    String l2 = contentOf(readEntry("ls", "laudabilis")); // latin
    System.out.println("Autenrieth: autos: " + l1);
    System.out.println("Lewis & Short: Laudabilis: " + l2);
  }

  /** Returns the content of the entry, or {@code null} if there is no entry. */
  private static String contentOf(LexiconEntry entry) {
    if (entry == null) {
      return null;
    }
    return entry.getContent();
  }

  /**
   * Closes every lexicon database and the database environment. If this handler is
   * the published singleton it is unpublished, so that a later
   * {@link #getInstance()} creates a freshly initialized handler.
   */
  private void end() throws ApplicationException {
    ArrayList<Lexicon> lexicons = Lexica.getInstance().getLexicons();
    for (Lexicon lexicon : lexicons) {
      dbEnvLexica.closeDatabase(lexicon.getName());
    }
    dbEnvLexica.close();
    if (instance == this) {
      instance = null;
    }
  }

  /** Remembers the current time as start of the measured operation. */
  private void beginOperation() {
    beginOfOperation = new Date();
  }

  /** Remembers the current time as end of the measured operation. */
  private void endOperation() {
    endOfOperation = new Date();
  }

}