Mercurial > hg > mpdl-group
diff software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/dict/db/LexHandler.java @ 20:7d6d969b10cf
little corrections
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Wed, 14 Dec 2011 12:48:43 +0100 |
parents | 4a3641ae14d2 |
children | e845310098ba |
line wrap: on
line diff
--- a/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/dict/db/LexHandler.java Wed Nov 09 15:32:05 2011 +0100 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/dict/db/LexHandler.java Wed Dec 14 12:48:43 2011 +0100 @@ -60,7 +60,7 @@ * @return lemmas * @throws ApplicationException */ - public ArrayList<Lemma> getLemmas(String query, String type, String language, String normalization) throws ApplicationException { + public ArrayList<Lemma> getLemmas(String query, String type, String language, int normMode) throws ApplicationException { ArrayList<Lemma> lexLemmas = new ArrayList<Lemma>(); // get lemmas of all forms in query MorphologyCache morphologyCache = MorphologyCache.getInstance(); @@ -69,21 +69,10 @@ String queryForm = queryForms[k]; ArrayList<Lemma> lemmas = null; if (type.equals("form")) { - if (normalization.equals("norm")) - lemmas = morphologyCache.getLemmasByFormName(language, queryForm, true); - else if (normalization.equals("none")) - lemmas = morphologyCache.getLemmasByFormName(language, queryForm, false); - else - lemmas = morphologyCache.getLemmasByFormName(language, queryForm, true); // TODO reg and reg+norm + lemmas = morphologyCache.getLemmasByFormName(language, queryForm, normMode); } else if (type.equals("lemma")) { lemmas = new ArrayList<Lemma>(); - Lemma l = null; - if (normalization.equals("norm")) - l = morphologyCache.getLemma(language, queryForm, true); - else if (normalization.equals("none")) - l = morphologyCache.getLemma(language, queryForm, false); - else - l = morphologyCache.getLemma(language, queryForm, true); + Lemma l = morphologyCache.getLemma(language, queryForm, normMode); if (l != null) lemmas.add(l); } @@ -101,7 +90,7 @@ return lexLemmas; } - public ArrayList<Lexicon> getLexEntries(ArrayList<Lemma> lexLemmas, String language, String lexiconName) throws ApplicationException { + public ArrayList<Lexicon> getLexEntries(ArrayList<Lemma> lexLemmas, String language, String lexiconName, String query) throws ApplicationException { ArrayList<Lexicon> retLexicons = new ArrayList<Lexicon>(); ArrayList<Lexicon> lexicons = Lexica.getInstance().getLexicons(language); if (lexiconName != null) { @@ -115,19 +104,25 @@ Lexicon lexicon = lexicons.get(i).clone(); // clone without lexicon entries for (int j=0; j<lexLemmas.size(); j++) { String lemmaName = lexLemmas.get(j).getLemmaName(); - if (Language.getInstance().isGerman(language) && lemmaName.contains("ae")) - lemmaName = lemmaName.replaceAll("ae", "Š"); - if (Language.getInstance().isGerman(language) && lemmaName.contains("oe")) - lemmaName = lemmaName.replaceAll("oe", "š"); - if (Language.getInstance().isGerman(language) && lemmaName.contains("ue")) - lemmaName = lemmaName.replaceAll("ue", "Ÿ"); - if (Language.getInstance().isGerman(language) && lemmaName.contains("ss")) - lemmaName = lemmaName.replaceAll("ss", "§"); LexiconEntry lexEntry = getEntry(lexicon, lemmaName); if (lexEntry != null) { lexicon.addEntry(lexEntry); // add entries to the cloned lexicon } } + if (Language.getInstance().isGerman(language) && query != null) { + String[] lexFormNames = query.split(" "); + for (int j=0; j<lexFormNames.length; j++) { + String lexFormName = lexFormNames[j]; + LexiconEntry lexEntry = lexicon.getEntry(lexFormName); + if (lexEntry == null) { + LexiconEntry newLexEntry = new LexiconEntry(lexiconName, lexFormName, null); + String lexiconQueryUrl = lexicon.getQueryUrl(); + String remoteUrl = lexiconQueryUrl + lexFormName; + newLexEntry.setRemoteUrl(remoteUrl); + lexicon.addEntry(newLexEntry); + } + } + } if (! lexicon.isEmpty()) retLexicons.add(lexicon); } @@ -143,10 +138,10 @@ * @return delivers lexical entries by the help of the morphology component (lexical entry of the stem of the normalized word form) * @throws ApplicationException */ - public ArrayList<String> getLexEntryKeys(String formName, String language, boolean normalize) throws ApplicationException { + public ArrayList<String> getLexEntryKeys(String formName, String language, int normMode) throws ApplicationException { ArrayList<String> lexEntryKeys = new ArrayList<String>(); MorphologyCache morphologyCache = MorphologyCache.getInstance(); - ArrayList<Lemma> formLemmas = morphologyCache.getLemmasByFormName(language, formName, normalize); + ArrayList<Lemma> formLemmas = morphologyCache.getLemmasByFormName(language, formName, normMode); boolean hasLexEntry = false; hasLexEntry = hasLexEntryKey(formName, language); if (hasLexEntry) @@ -158,7 +153,7 @@ if (! hasLexEntry) { hasLexEntry = hasLexEntryKey(lName, language); } - if (language.equals("de") || language.equals("fr") || language.equals("nl")) // TODO Lexika fŸr diese Sprachen in BerkeleyDB einbringen (fŸr nl auch eine bessere Morph.) + if (language.equals("de") || language.equals("fr") || language.equals("nl")) // TODO Lexika für diese Sprachen in BerkeleyDB einbringen (für nl auch eine bessere Morph.) lexEntryKeys.add(lName); if (! lName.equals(formName) && hasLexEntry) { lexEntryKeys.add(lName); @@ -188,8 +183,7 @@ return hasLexEntry; } - public ArrayList<Lexicon> getLexEntriesBeginningWith(String language, String formPrefix, int pageNumber) throws ApplicationException { - int pageSize = 50; + public ArrayList<Lexicon> getLexEntriesBeginningWith(String language, String formPrefix, int pageNumber, int pageSize) throws ApplicationException { int from = (pageNumber * pageSize) - pageSize + 1; int to = pageNumber * pageSize; ArrayList<Lexicon> statLexicons = Lexica.getInstance().getLocalLexicons(language); @@ -211,8 +205,7 @@ return retLexicons; } - public ArrayList<Lexicon> getLexEntriesByLexiconBeginningWith(String lexiconName, String formPrefix, int pageNumber) throws ApplicationException { - int pageSize = 50; + public ArrayList<Lexicon> getLexEntriesByLexiconBeginningWith(String lexiconName, String formPrefix, int pageNumber, int pageSize) throws ApplicationException { int from = (pageNumber * pageSize) - pageSize + 1; int to = pageNumber * pageSize; Lexicon lexicon = Lexica.getInstance().getLexicon(lexiconName).clone(); @@ -265,6 +258,7 @@ cursor.close(); if (dbFoundValueStr != null) { retLexEntry = new LexiconEntry(lexiconName, formName, dbFoundValueStr); + retLexEntry = correct(retLexEntry); // correct errors: e.g. in lsj some html entities are not correct } } catch (DatabaseException e) { throw new ApplicationException(e); @@ -292,6 +286,7 @@ byte[] foundKeyBytes = dbEntryKey.getData(); String dbFoundKeyStr = new String(foundKeyBytes, "utf-8"); LexiconEntry lexEntry = new LexiconEntry(lexiconName, dbFoundKeyStr, dbFoundValueStr); + lexEntry = correct(lexEntry); // correct errors: e.g. in lsj some html entities are not correct retLexEntries.add(lexEntry); } operationStatus = cursor.getNext(dbEntryKey, foundValue, LockMode.DEFAULT); @@ -309,6 +304,17 @@ return retLexEntries; } + private LexiconEntry correct(LexiconEntry lexEntry) { + String lexiconName = lexEntry.getLexiconName(); + String content = lexEntry.getContent(); + if (content != null && content.contains("&#") && lexiconName.equals("lsj")) { // errors in greek lexicon lsj + content = content.replaceAll("&#\u03C7", "&#x"); // html entity: replace greek Minuskel Chi by "x" + content = content.replaceAll("&#x[^0-9]{4};", ""); // html entity: remove entity if special greek not hex characters appear + lexEntry.setContent(content); + } + return lexEntry; + } + public static void main(String[] args) throws ApplicationException { getInstance(); instance.beginOperation();