diff software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/dict/db/LexHandler.java @ 20:7d6d969b10cf

little corrections
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Wed, 14 Dec 2011 12:48:43 +0100
parents 4a3641ae14d2
children e845310098ba
line wrap: on
line diff
--- a/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/dict/db/LexHandler.java	Wed Nov 09 15:32:05 2011 +0100
+++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/dict/db/LexHandler.java	Wed Dec 14 12:48:43 2011 +0100
@@ -60,7 +60,7 @@
    * @return lemmas
    * @throws ApplicationException
    */
-  public ArrayList<Lemma> getLemmas(String query, String type, String language, String normalization) throws ApplicationException {
+  public ArrayList<Lemma> getLemmas(String query, String type, String language, int normMode) throws ApplicationException {
     ArrayList<Lemma> lexLemmas = new ArrayList<Lemma>();
     // get lemmas of all forms in query
     MorphologyCache morphologyCache = MorphologyCache.getInstance();
@@ -69,21 +69,10 @@
       String queryForm = queryForms[k];
       ArrayList<Lemma> lemmas = null;
       if (type.equals("form")) {
-        if (normalization.equals("norm"))
-          lemmas = morphologyCache.getLemmasByFormName(language, queryForm, true);
-        else if (normalization.equals("none"))
-          lemmas = morphologyCache.getLemmasByFormName(language, queryForm, false);
-        else 
-          lemmas = morphologyCache.getLemmasByFormName(language, queryForm, true);  // TODO reg and reg+norm
+        lemmas = morphologyCache.getLemmasByFormName(language, queryForm, normMode);
       } else if (type.equals("lemma")) {
         lemmas = new ArrayList<Lemma>();
-        Lemma l = null;
-        if (normalization.equals("norm"))
-          l = morphologyCache.getLemma(language, queryForm, true);
-        else if (normalization.equals("none"))
-          l = morphologyCache.getLemma(language, queryForm, false);
-        else 
-          l = morphologyCache.getLemma(language, queryForm, true);
+        Lemma l = morphologyCache.getLemma(language, queryForm, normMode);
         if (l != null)
           lemmas.add(l);
       }
@@ -101,7 +90,7 @@
       return lexLemmas;
   }
 
-  public ArrayList<Lexicon> getLexEntries(ArrayList<Lemma> lexLemmas, String language, String lexiconName) throws ApplicationException {
+  public ArrayList<Lexicon> getLexEntries(ArrayList<Lemma> lexLemmas, String language, String lexiconName, String query) throws ApplicationException {
     ArrayList<Lexicon> retLexicons = new ArrayList<Lexicon>();
     ArrayList<Lexicon> lexicons = Lexica.getInstance().getLexicons(language);
     if (lexiconName != null) {
@@ -115,19 +104,25 @@
         Lexicon lexicon = lexicons.get(i).clone(); // clone without lexicon entries
         for (int j=0; j<lexLemmas.size(); j++) {
           String lemmaName = lexLemmas.get(j).getLemmaName();
-          if (Language.getInstance().isGerman(language) && lemmaName.contains("ae"))
-            lemmaName = lemmaName.replaceAll("ae", "Š");
-          if (Language.getInstance().isGerman(language) && lemmaName.contains("oe"))
-            lemmaName = lemmaName.replaceAll("oe", "š");
-          if (Language.getInstance().isGerman(language) && lemmaName.contains("ue"))
-            lemmaName = lemmaName.replaceAll("ue", "Ÿ");
-          if (Language.getInstance().isGerman(language) && lemmaName.contains("ss"))
-            lemmaName = lemmaName.replaceAll("ss", "§");
           LexiconEntry lexEntry = getEntry(lexicon, lemmaName);
           if (lexEntry != null) {
             lexicon.addEntry(lexEntry); // add entries to the cloned lexicon
           }
         }
+        if (Language.getInstance().isGerman(language) && query != null) {
+          String[] lexFormNames = query.split(" ");
+          for (int j=0; j<lexFormNames.length; j++) {
+            String lexFormName = lexFormNames[j];          
+            LexiconEntry lexEntry = lexicon.getEntry(lexFormName);
+            if (lexEntry == null) {
+              LexiconEntry newLexEntry = new LexiconEntry(lexiconName, lexFormName, null);
+              String lexiconQueryUrl = lexicon.getQueryUrl();
+              String remoteUrl = lexiconQueryUrl + lexFormName;
+              newLexEntry.setRemoteUrl(remoteUrl);
+              lexicon.addEntry(newLexEntry);
+            }
+          }
+        }
         if (! lexicon.isEmpty())
           retLexicons.add(lexicon);
       }
@@ -143,10 +138,10 @@
    * @return delivers lexical entries by the help of the morphology component (lexical entry of the stem of the normalized word form)
    * @throws ApplicationException
    */
-  public ArrayList<String> getLexEntryKeys(String formName, String language, boolean normalize) throws ApplicationException {
+  public ArrayList<String> getLexEntryKeys(String formName, String language, int normMode) throws ApplicationException {
     ArrayList<String> lexEntryKeys = new ArrayList<String>();
     MorphologyCache morphologyCache = MorphologyCache.getInstance();
-    ArrayList<Lemma> formLemmas = morphologyCache.getLemmasByFormName(language, formName, normalize);
+    ArrayList<Lemma> formLemmas = morphologyCache.getLemmasByFormName(language, formName, normMode);
     boolean hasLexEntry = false;
     hasLexEntry = hasLexEntryKey(formName, language);
     if (hasLexEntry)
@@ -158,7 +153,7 @@
         if (! hasLexEntry) {
           hasLexEntry = hasLexEntryKey(lName, language);
         }
-        if (language.equals("de") || language.equals("fr") || language.equals("nl"))   // TODO Lexika fŸr diese Sprachen in BerkeleyDB einbringen (fŸr nl auch eine bessere Morph.)
+        if (language.equals("de") || language.equals("fr") || language.equals("nl"))   // TODO Lexika für diese Sprachen in BerkeleyDB einbringen (für nl auch eine bessere Morph.)
           lexEntryKeys.add(lName);
         if (! lName.equals(formName) && hasLexEntry) {
           lexEntryKeys.add(lName);
@@ -188,8 +183,7 @@
     return hasLexEntry;
   }
   
-  public ArrayList<Lexicon> getLexEntriesBeginningWith(String language, String formPrefix, int pageNumber) throws ApplicationException {
-    int pageSize = 50;
+  public ArrayList<Lexicon> getLexEntriesBeginningWith(String language, String formPrefix, int pageNumber, int pageSize) throws ApplicationException {
     int from = (pageNumber * pageSize) - pageSize + 1;
     int to = pageNumber * pageSize;
     ArrayList<Lexicon> statLexicons = Lexica.getInstance().getLocalLexicons(language);
@@ -211,8 +205,7 @@
     return retLexicons;
   }
   
-  public ArrayList<Lexicon> getLexEntriesByLexiconBeginningWith(String lexiconName, String formPrefix, int pageNumber) throws ApplicationException {
-    int pageSize = 50;
+  public ArrayList<Lexicon> getLexEntriesByLexiconBeginningWith(String lexiconName, String formPrefix, int pageNumber, int pageSize) throws ApplicationException {
     int from = (pageNumber * pageSize) - pageSize + 1;
     int to = pageNumber * pageSize;
     Lexicon lexicon = Lexica.getInstance().getLexicon(lexiconName).clone();
@@ -265,6 +258,7 @@
       cursor.close();
       if (dbFoundValueStr != null) {
         retLexEntry = new LexiconEntry(lexiconName, formName, dbFoundValueStr);
+        retLexEntry = correct(retLexEntry);  // correct errors: e.g. in lsj some html entities are not correct
       }
     } catch (DatabaseException e) {
       throw new ApplicationException(e);
@@ -292,6 +286,7 @@
           byte[] foundKeyBytes = dbEntryKey.getData();
           String dbFoundKeyStr = new String(foundKeyBytes, "utf-8");
           LexiconEntry lexEntry = new LexiconEntry(lexiconName, dbFoundKeyStr, dbFoundValueStr);
+          lexEntry = correct(lexEntry);  // correct errors: e.g. in lsj some html entities are not correct
           retLexEntries.add(lexEntry);
         }
         operationStatus = cursor.getNext(dbEntryKey, foundValue, LockMode.DEFAULT);
@@ -309,6 +304,17 @@
     return retLexEntries;
   }
 
+  /**
+   * Repairs known defects in the content of a lexicon entry. Only entries of the
+   * lexicon "lsj" whose content contains "&#" are touched; all others are returned unchanged.
+   * Two repairs are applied: a Greek small letter chi (U+03C7) directly following "&#" is
+   * replaced by "x", and character references of the form "&#x....;" whose four-character
+   * body contains no decimal digit and is not a valid hexadecimal number are removed.
+   * @param lexEntry entry to repair; its content is modified in place
+   * @return the given entry (same instance)
+   */
+  private LexiconEntry correct(LexiconEntry lexEntry) {
+    String lexiconName = lexEntry.getLexiconName();
+    String content = lexEntry.getContent();
+    // constant-first equals: an entry without a lexicon name is simply left untouched
+    if (content != null && content.contains("&#") && "lsj".equals(lexiconName)) {  // errors in greek lexicon lsj
+      // html entity: replace greek small letter chi by "x" (literal replacement, no regex needed)
+      content = content.replace("&#\u03C7", "&#x");
+      // html entity: remove the entity if its body consists of four non-digit characters,
+      // but keep well-formed references made up of hex letters only (e.g. "&#xfeff;")
+      content = content.replaceAll("&#x(?![0-9a-fA-F]{4};)[^0-9]{4};", "");
+      lexEntry.setContent(content);
+    }
+    return lexEntry;
+  }
+
   public static void main(String[] args) throws ApplicationException {
     getInstance();
     instance.beginOperation();