view software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/analysis/DonatusStemmer.java @ 0:408254cf2f1d

Erstellung
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Wed, 24 Nov 2010 17:24:23 +0100
parents
children
line wrap: on
line source

package de.mpg.mpiwg.berlin.mpdl.donatus.analysis;

import org.apache.log4j.Logger;

import de.mpg.mpiwg.berlin.mpdl.donatus.xmlrpc.DonatusCache;
import de.mpg.mpiwg.berlin.mpdl.donatus.xmlrpc.DonatusConstants;
import de.mpg.mpiwg.berlin.mpdl.donatus.xmlrpc.DonatusLemma;
import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;

/**
 * Reduces terms to a stem (base form) for indexing and querying.
 *
 * <p>The stem is looked up in the {@link DonatusCache} first; when the cache
 * yields no lemma form, a Snowball stemmer selected by the configured language
 * is used as a fallback.
 */
public class DonatusStemmer {
  /** Shared logger, obtained once instead of on every failed cache access. */
  private static final Logger LOGGER = Logger.getLogger(DonatusStemmer.class);

  /** Language code used for the cache lookup and for choosing the Snowball stemmer. */
  private String language = DonatusConstants.DEFAULT_LANGUAGE;

  /**
   * Sets the language code used by subsequent {@link #stem(String)} calls.
   *
   * @param language the language code, e.g. {@code "de"} or {@code "en"}
   */
  protected void setLanguage(String language) {
    this.language = language;
  }
  
  /**
   * Used for indexing documents and for querying.
   *
   * <p>The term is lower-cased, then resolved through the {@link DonatusCache}.
   * If the cache delivers no lemma form, the Snowball stemmer for the current
   * language is applied, and (in document mode only) the resulting variant is
   * added to the cache.
   *
   * @param term the term to stem; may be {@code null}
   * @return the stem of the lower-cased term, or {@code null} if {@code term} is {@code null}
   */
  protected String stem(String term) {
    if (term == null) {
      return null;
    }
    // Lower-case independently of the JVM default locale: the no-argument
    // toLowerCase() would e.g. map "I" to a dotless "ı" under a Turkish locale
    // and thereby produce different cache keys and stems on different machines.
    String lowerCaseTerm = term.toLowerCase(java.util.Locale.ROOT);
    // try to find the stem by the DonatusCache
    String stem = lookupLemmaForm(lowerCaseTerm);
    // if not found by Donatus try to use Snowball (or later other language specific stemmers)
    if (stem == null) {
      stem = stemBySnowball(lowerCaseTerm, language);
      rememberSnowballVariant(lowerCaseTerm, stem);
    }
    /* TODO if Snowball is too bad (for some languages) use Lucene analyzers
    if (stem == null) {
      stem = stemByLanguageStemmers(term, this.language);
    }
    */
    return stem;
  }

  /**
   * Looks up the lemma form of a variant form in the DonatusCache.
   *
   * @param term the lower-cased term
   * @return the lemma form, or {@code null} if no lemma (or no form) was found
   *         or the cache access failed
   */
  private String lookupLemmaForm(String term) {
    DonatusLemma donatusLemma = null;
    try {
      DonatusCache donatusCache = DonatusCache.getInstance();
      donatusLemma = donatusCache.getLemmaByVariantForm(language, term);
    } catch (ApplicationException e) {
      // Best effort: a failing cache must not disturb stemming (the caller falls
      // back to Snowball), but leave a trace instead of dropping the error silently.
      LOGGER.debug("DonatusCache: lemma lookup failed for term \"" + term + "\": " + e.getMessage(), e);
    }
    if (donatusLemma == null) {
      return null;
    }
    return donatusLemma.getForm();
  }

  /**
   * Adds a Snowball stem as a variant to the lemmas in the cache.
   *
   * <p>Nothing is added when the stem equals the term, when the stem is too
   * short (2 characters or fewer), or when the cache is not in document mode.
   *
   * @param term the lower-cased term
   * @param stem the Snowball stem computed for {@code term}
   */
  private void rememberSnowballVariant(String term, String stem) {
    if (stem.equals(term) || stem.length() <= 2) {
      return;
    }
    try {
      DonatusCache donatusCache = DonatusCache.getInstance();
      if (donatusCache.getMode() == DonatusCache.DOCUMENT_MODE) {
        donatusCache.addVariant(language, stem, DonatusConstants.TYPE_SNOWBALL, term);
      }
    } catch (ApplicationException e) {
      LOGGER.warn("DonatusCache: an exception was caught while indexing a document: " + e.getMessage(), e);
    }
  }

  /**
   * Stems a term with the Snowball stemmer that belongs to the given language.
   *
   * <p>Supported language codes: de, en, nl, fi, fr, it, no, pt, ru, es, sv.
   * The comparisons are written constant-first so that a {@code null} language
   * is treated like any other unsupported language.
   *
   * @param term the lower-cased term
   * @param language the language code selecting the stemmer
   * @return the Snowball stem, or {@code term} itself if the language is not supported
   */
  private String stemBySnowball(String term, String language) {
    String stem = null;
    if ("de".equals(language)) {
      net.sf.snowball.ext.GermanStemmer stemmer = new net.sf.snowball.ext.GermanStemmer();
      stemmer.setCurrent(term); 
      stemmer.stem();
      stem = stemmer.getCurrent();
    } else if ("en".equals(language)) {
      net.sf.snowball.ext.EnglishStemmer stemmer = new net.sf.snowball.ext.EnglishStemmer();
      stemmer.setCurrent(term);
      stemmer.stem();
      stem = stemmer.getCurrent();
    } else if ("nl".equals(language)) {
      net.sf.snowball.ext.DutchStemmer stemmer = new net.sf.snowball.ext.DutchStemmer();
      stemmer.setCurrent(term);
      stemmer.stem();
      stem = stemmer.getCurrent();
    } else if ("fi".equals(language)) {
      net.sf.snowball.ext.FinnishStemmer stemmer = new net.sf.snowball.ext.FinnishStemmer();
      stemmer.setCurrent(term);
      stemmer.stem();
      stem = stemmer.getCurrent();
    } else if ("fr".equals(language)) {
      net.sf.snowball.ext.FrenchStemmer stemmer = new net.sf.snowball.ext.FrenchStemmer();
      stemmer.setCurrent(term);
      stemmer.stem();
      stem = stemmer.getCurrent();
    } else if ("it".equals(language)) {
      net.sf.snowball.ext.ItalianStemmer stemmer = new net.sf.snowball.ext.ItalianStemmer();
      stemmer.setCurrent(term);
      stemmer.stem();
      stem = stemmer.getCurrent();
    } else if ("no".equals(language)) {
      net.sf.snowball.ext.NorwegianStemmer stemmer = new net.sf.snowball.ext.NorwegianStemmer();
      stemmer.setCurrent(term);
      stemmer.stem();
      stem = stemmer.getCurrent();
    } else if ("pt".equals(language)) {
      net.sf.snowball.ext.PortugueseStemmer stemmer = new net.sf.snowball.ext.PortugueseStemmer();
      stemmer.setCurrent(term);
      stemmer.stem();
      stem = stemmer.getCurrent();
    } else if ("ru".equals(language)) {
      net.sf.snowball.ext.RussianStemmer stemmer = new net.sf.snowball.ext.RussianStemmer();
      stemmer.setCurrent(term);
      stemmer.stem();
      stem = stemmer.getCurrent();
    } else if ("es".equals(language)) {
      net.sf.snowball.ext.SpanishStemmer stemmer = new net.sf.snowball.ext.SpanishStemmer();
      stemmer.setCurrent(term);
      stemmer.stem();
      stem = stemmer.getCurrent();
    } else if ("sv".equals(language)) {
      net.sf.snowball.ext.SwedishStemmer stemmer = new net.sf.snowball.ext.SwedishStemmer();
      stemmer.setCurrent(term);
      stemmer.stem();
      stem = stemmer.getCurrent();
    } else {
      stem = term; // if no language fits deliver the term itself as the stem form
    }
    return stem;
  }

  /*
  private String stemByLanguageStemmers(String term, String language) {
    // TODO provide other languages
    String stem = null;
    if (language.equals("br")) {
      BrazilianStemmer stemmer = new BrazilianStemmer();
      stem = stemmer.stem(term);
    } else if (language.equals("de")) {
      GermanStemmer stemmer = new GermanStemmer();
      stem = stemmer.stem(term);
    } else if (language.equals("fr")) {
      FrenchStemmer stemmer = new FrenchStemmer();
      stem = stemmer.stem(term);
    } else if (language.equals("nl")) {
      DutchStemmer stemmer = new DutchStemmer();
      stem = stemmer.stem(term);
    } else if (language.equals("ru")) {
      RussianStemmer stemmer = new RussianStemmer();
      stem = stemmer.stem(term);
    } else {
      stem = term; // if no language fits deliver the term itself as the stem form
    }
    return stem;
  }
  */
}