diff software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlStemmer.java @ 0:408254cf2f1d

Erstellung
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Wed, 24 Nov 2010 17:24:23 +0100
parents
children 59ff47d1e237
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlStemmer.java	Wed Nov 24 17:24:23 2010 +0100
@@ -0,0 +1,159 @@
+package de.mpg.mpiwg.berlin.mpdl.lt.analyzer;
+
+import java.util.ArrayList;
+
+import org.apache.log4j.Logger;
+
+import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
+import de.mpg.mpiwg.berlin.mpdl.general.MpdlConstants;
+import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Form;
+import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma;
+import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.MorphologyCache;
+
+/**
+ * Reduces a term to its lemma: first via the MorphologyCache and, if that
+ * yields nothing, via the Snowball stemmer of the current language.
+ */
+public class MpdlStemmer {
+  private static final Logger LOGGER = Logger.getLogger(MpdlStemmer.class);
+  private static final String LEMMA_PREFIX = "lemmalemma";  // a term starting with this is already a lemma
+  private static final String LEMMA_SEPARATOR = "+++";      // precedes each lemma name of an ambiguous form
+  private String language = MpdlConstants.DEFAULT_LANGUAGE;
+
+  /** Sets the language code used for the cache lookup and for choosing the Snowball stemmer. */
+  protected void setLanguage(String language) {
+    this.language = language;
+  }
+
+  /**
+   * Used for indexing documents and for querying.
+   * @param term the word form to stem; it is lower-cased before any lookup
+   * @return the term itself if it starts with "lemmalemma"; the lemma name if the cache
+   *         knows exactly one lemma; all lemma names, each preceded by "+++"
+   *         (e.g. "+++edo+++sum"), if it knows several; otherwise the Snowball stem;
+   *         null if term is null
+   */
+  protected String stem(String term) {
+    if (term == null)
+      return null;
+    // fixed locale: the result must not depend on the JVM default locale (e.g. Turkish dotless i)
+    term = term.toLowerCase(java.util.Locale.ENGLISH);
+    // special case: term is already lemma: begins with "lemmalemma"
+    if (term.startsWith(LEMMA_PREFIX))
+      return term;
+    String stem = stemByMorphologyCache(term);
+    // if not found in MorphologyCache use Snowball
+    if (stem == null) {
+      stem = stemBySnowball(term, language);
+      // only a real reduction whose stem is not too short (> 2 characters) goes into the dynamic morphology cache
+      if ((! stem.equals(term)) && stem.length() > 2)
+        cacheSnowballStem(term, stem);
+    }
+    return stem;
+  }
+
+  /** Looks the term up in the MorphologyCache; returns null if no lemma is found or the lookup fails. */
+  private String stemByMorphologyCache(String term) {
+    ArrayList<Lemma> lemmas = null;
+    try {
+      lemmas = MorphologyCache.getInstance().getLemmasByFormName(language, term, false);  // do not normalize again, already done
+    } catch (ApplicationException e) {
+      // best effort: the caller falls back to Snowball, but the failure is no longer invisible
+      LOGGER.debug("MorphologyCache: lookup failed for term \"" + term + "\": " + e.getMessage(), e);
+    }
+    if (lemmas == null || lemmas.isEmpty())
+      return null;
+    if (lemmas.size() == 1)
+      return lemmas.get(0).getLemmaName();
+    StringBuilder joined = new StringBuilder();
+    for (Lemma lemma : lemmas)
+      joined.append(LEMMA_SEPARATOR).append(lemma.getLemmaName());  // e.g. "+++edo+++sum"
+    return joined.toString();
+  }
+
+  /** Adds a Snowball result to the dynamic morphology cache if the cache is in document mode. */
+  private void cacheSnowballStem(String term, String stem) {
+    try {
+      MorphologyCache morphologyCache = MorphologyCache.getInstance();
+      if (morphologyCache.getMode() == MorphologyCache.DOCUMENT_MODE) {
+        Form newForm = new Form("snowball", language, term);
+        newForm.setLemmaName(stem);
+        morphologyCache.insertFormDynamic(newForm);
+      }
+    } catch (ApplicationException e) {
+      LOGGER.warn("MorphologyCache: an exception was caught while indexing a document: " + e.getMessage(), e);
+    }
+  }
+
+  /**
+   * Stems the term with the Snowball stemmer of the given language.
+   * @return the Snowball stem, or the unchanged term if the language is null or has no stemmer here
+   */
+  private String stemBySnowball(String term, String language) {
+    String stem = null;
+    // constant-first equals: a null language reaches the final else instead of throwing
+    if ("de".equals(language)) {
+      net.sf.snowball.ext.GermanStemmer stemmer = new net.sf.snowball.ext.GermanStemmer();
+      stemmer.setCurrent(term);
+      stemmer.stem();
+      stem = stemmer.getCurrent();
+    } else if ("en".equals(language)) {
+      net.sf.snowball.ext.EnglishStemmer stemmer = new net.sf.snowball.ext.EnglishStemmer();
+      stemmer.setCurrent(term);
+      stemmer.stem();
+      stem = stemmer.getCurrent();
+    } else if ("nl".equals(language)) {
+      net.sf.snowball.ext.DutchStemmer stemmer = new net.sf.snowball.ext.DutchStemmer();
+      stemmer.setCurrent(term);
+      stemmer.stem();
+      stem = stemmer.getCurrent();
+    } else if ("fi".equals(language)) {
+      net.sf.snowball.ext.FinnishStemmer stemmer = new net.sf.snowball.ext.FinnishStemmer();
+      stemmer.setCurrent(term);
+      stemmer.stem();
+      stem = stemmer.getCurrent();
+    } else if ("fr".equals(language)) {
+      net.sf.snowball.ext.FrenchStemmer stemmer = new net.sf.snowball.ext.FrenchStemmer();
+      stemmer.setCurrent(term);
+      stemmer.stem();
+      stem = stemmer.getCurrent();
+    } else if ("it".equals(language)) {
+      net.sf.snowball.ext.ItalianStemmer stemmer = new net.sf.snowball.ext.ItalianStemmer();
+      stemmer.setCurrent(term);
+      stemmer.stem();
+      stem = stemmer.getCurrent();
+    } else if ("no".equals(language)) {
+      net.sf.snowball.ext.NorwegianStemmer stemmer = new net.sf.snowball.ext.NorwegianStemmer();
+      stemmer.setCurrent(term);
+      stemmer.stem();
+      stem = stemmer.getCurrent();
+    } else if ("pt".equals(language)) {
+      net.sf.snowball.ext.PortugueseStemmer stemmer = new net.sf.snowball.ext.PortugueseStemmer();
+      stemmer.setCurrent(term);
+      stemmer.stem();
+      stem = stemmer.getCurrent();
+    } else if ("ru".equals(language)) {
+      net.sf.snowball.ext.RussianStemmer stemmer = new net.sf.snowball.ext.RussianStemmer();
+      stemmer.setCurrent(term);
+      stemmer.stem();
+      stem = stemmer.getCurrent();
+    } else if ("es".equals(language)) {
+      net.sf.snowball.ext.SpanishStemmer stemmer = new net.sf.snowball.ext.SpanishStemmer();
+      stemmer.setCurrent(term);
+      stemmer.stem();
+      stem = stemmer.getCurrent();
+    } else if ("sv".equals(language)) {
+      net.sf.snowball.ext.SwedishStemmer stemmer = new net.sf.snowball.ext.SwedishStemmer();
+      stemmer.setCurrent(term);
+      stemmer.stem();
+      stem = stemmer.getCurrent();
+    } else {
+      stem = term; // if no language fits deliver the term itself as the stem form
+    }
+    return stem;
+  }
+
+  // NOTE: a commented-out stemByLanguageStemmers(term, language) used to follow here. It mapped "br", "de", "fr",
+  // "nl" and "ru" to BrazilianStemmer, GermanStemmer, FrenchStemmer, DutchStemmer and RussianStemmer and returned
+  // the term unchanged for every other language. TODO provide other languages
+}