diff software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/analysis/DonatusStemmer.java @ 0:408254cf2f1d

Erstellung
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Wed, 24 Nov 2010 17:24:23 +0100
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/analysis/DonatusStemmer.java	Wed Nov 24 17:24:23 2010 +0100
@@ -0,0 +1,146 @@
+package de.mpg.mpiwg.berlin.mpdl.donatus.analysis;
+
+import org.apache.log4j.Logger;
+
+import de.mpg.mpiwg.berlin.mpdl.donatus.xmlrpc.DonatusCache;
+import de.mpg.mpiwg.berlin.mpdl.donatus.xmlrpc.DonatusConstants;
+import de.mpg.mpiwg.berlin.mpdl.donatus.xmlrpc.DonatusLemma;
+import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
+
+/**
+ * Reduces terms to their stem (base form) for indexing documents and for querying: a lemma
+ * delivered by the DonatusCache is preferred, otherwise a Snowball stemmer is used.
+ */
+public class DonatusStemmer {
+  private static final Logger LOGGER = Logger.getLogger(DonatusStemmer.class);
+
+  private String language = DonatusConstants.DEFAULT_LANGUAGE;
+
+  /** Sets the language code (e.g. "de") used for the cache lookup and the Snowball stemmer. */
+  protected void setLanguage(String language) {
+    this.language = language;
+  }
+
+  /**
+   * Delivers the stem of a term; used for indexing documents and for querying.
+   *
+   * <p>The term is lower-cased first. If the DonatusCache delivers a lemma form for it,
+   * that form is the result. Otherwise the Snowball stem is the result.</p>
+   *
+   * @param term the word form to stem; must not be null
+   * @return the lemma form or the Snowball stem of the lower-cased term
+   */
+  protected String stem(String term) {
+    // Locale.ROOT: lower-casing must not depend on the JVM default locale (under a Turkish
+    // locale "I" would become a dotless i, so indexed and queried terms could differ)
+    String lowerCaseTerm = term.toLowerCase(java.util.Locale.ROOT);
+    String stem = lookupLemmaForm(lowerCaseTerm);
+    // if not found by Donatus try to use Snowball (or later other language specific stemmers)
+    if (stem == null) {
+      stem = stemBySnowball(lowerCaseTerm, language);
+      cacheSnowballVariant(lowerCaseTerm, stem);
+    }
+    // TODO if Snowball is too bad (for some languages) use the Lucene analyzer stemmers
+    return stem;
+  }
+
+  /** Returns the lemma form the DonatusCache delivers for the term, or null if there is none. */
+  private String lookupLemmaForm(String term) {
+    try {
+      DonatusLemma donatusLemma = DonatusCache.getInstance().getLemmaByVariantForm(language, term);
+      return (donatusLemma == null) ? null : donatusLemma.getForm();
+    } catch (ApplicationException e) {
+      // a failing cache must not disturb stemming: note it and let the caller use Snowball
+      LOGGER.debug("DonatusCache: an exception was caught while looking up a lemma: " + e.getMessage(), e);
+      return null;
+    }
+  }
+
+  /**
+   * Adds the Snowball stem as a variant to the lemmas in the cache, if it is not equal to the
+   * term, is longer than 2 characters and the cache is in document mode.
+   */
+  private void cacheSnowballVariant(String term, String stem) {
+    if (stem.equals(term) || stem.length() <= 2) {
+      return;
+    }
+    try {
+      DonatusCache donatusCache = DonatusCache.getInstance();
+      if (donatusCache.getMode() == DonatusCache.DOCUMENT_MODE) {
+        donatusCache.addVariant(language, stem, DonatusConstants.TYPE_SNOWBALL, term);
+      }
+    } catch (ApplicationException e) {
+      LOGGER.warn("DonatusCache: an exception was caught while indexing a document: " + e.getMessage(), e);
+    }
+  }
+
+  /**
+   * Stems the term with the Snowball stemmer of the given language.
+   *
+   * @param term the word form to stem
+   * @param language language code; null is treated like a language without stemmer
+   * @return the Snowball stem, or the term itself if no stemmer fits the language
+   */
+  private String stemBySnowball(String term, String language) {
+    String stem = null;
+    if ("de".equals(language)) {
+      net.sf.snowball.ext.GermanStemmer stemmer = new net.sf.snowball.ext.GermanStemmer();
+      stemmer.setCurrent(term);
+      stemmer.stem();
+      stem = stemmer.getCurrent();
+    } else if ("en".equals(language)) {
+      net.sf.snowball.ext.EnglishStemmer stemmer = new net.sf.snowball.ext.EnglishStemmer();
+      stemmer.setCurrent(term);
+      stemmer.stem();
+      stem = stemmer.getCurrent();
+    } else if ("nl".equals(language)) {
+      net.sf.snowball.ext.DutchStemmer stemmer = new net.sf.snowball.ext.DutchStemmer();
+      stemmer.setCurrent(term);
+      stemmer.stem();
+      stem = stemmer.getCurrent();
+    } else if ("fi".equals(language)) {
+      net.sf.snowball.ext.FinnishStemmer stemmer = new net.sf.snowball.ext.FinnishStemmer();
+      stemmer.setCurrent(term);
+      stemmer.stem();
+      stem = stemmer.getCurrent();
+    } else if ("fr".equals(language)) {
+      net.sf.snowball.ext.FrenchStemmer stemmer = new net.sf.snowball.ext.FrenchStemmer();
+      stemmer.setCurrent(term);
+      stemmer.stem();
+      stem = stemmer.getCurrent();
+    } else if ("it".equals(language)) {
+      net.sf.snowball.ext.ItalianStemmer stemmer = new net.sf.snowball.ext.ItalianStemmer();
+      stemmer.setCurrent(term);
+      stemmer.stem();
+      stem = stemmer.getCurrent();
+    } else if ("no".equals(language)) {
+      net.sf.snowball.ext.NorwegianStemmer stemmer = new net.sf.snowball.ext.NorwegianStemmer();
+      stemmer.setCurrent(term);
+      stemmer.stem();
+      stem = stemmer.getCurrent();
+    } else if ("pt".equals(language)) {
+      net.sf.snowball.ext.PortugueseStemmer stemmer = new net.sf.snowball.ext.PortugueseStemmer();
+      stemmer.setCurrent(term);
+      stemmer.stem();
+      stem = stemmer.getCurrent();
+    } else if ("ru".equals(language)) {
+      net.sf.snowball.ext.RussianStemmer stemmer = new net.sf.snowball.ext.RussianStemmer();
+      stemmer.setCurrent(term);
+      stemmer.stem();
+      stem = stemmer.getCurrent();
+    } else if ("es".equals(language)) {
+      net.sf.snowball.ext.SpanishStemmer stemmer = new net.sf.snowball.ext.SpanishStemmer();
+      stemmer.setCurrent(term);
+      stemmer.stem();
+      stem = stemmer.getCurrent();
+    } else if ("sv".equals(language)) {
+      net.sf.snowball.ext.SwedishStemmer stemmer = new net.sf.snowball.ext.SwedishStemmer();
+      stemmer.setCurrent(term);
+      stemmer.stem();
+      stem = stemmer.getCurrent();
+    } else {
+      stem = term; // if no language fits deliver the term itself as the stem form
+    }
+    return stem;
+  }
+}