view software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlStemmer.java @ 0:408254cf2f1d

Erstellung
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Wed, 24 Nov 2010 17:24:23 +0100
parents
children 59ff47d1e237
line wrap: on
line source

package de.mpg.mpiwg.berlin.mpdl.lt.analyzer;

import java.util.ArrayList;
import java.util.Locale;

import org.apache.log4j.Logger;

import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
import de.mpg.mpiwg.berlin.mpdl.general.MpdlConstants;
import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Form;
import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma;
import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.MorphologyCache;

/**
 * Reduces a term to its stem (lemma) form.
 *
 * <p>The stem is looked up in the {@link MorphologyCache} first; if no lemma is
 * found there, a language specific Snowball stemmer is used as a fallback.
 */
public class MpdlStemmer {
  private static final Logger LOGGER = Logger.getLogger(MpdlStemmer.class);

  /** Prefix of terms which already are lemmas and are returned without stemming. */
  private static final String LEMMA_PREFIX = "lemmalemma";

  /** Separator written in front of each lemma name if a form has several lemmas. */
  private static final String LEMMA_SEPARATOR = "+++";

  private String language = MpdlConstants.DEFAULT_LANGUAGE;

  /**
   * Sets the language used for the morphology lookup and the Snowball fallback.
   *
   * @param language language code, e.g. "de" or "en"
   */
  protected void setLanguage(String language) {
    this.language = language;
  }

  /**
   * Used for indexing documents and for querying.
   *
   * <p>The term is lower-cased first. A term beginning with "lemmalemma" is
   * regarded as a lemma already and returned as is. Otherwise the lemmas of the
   * term are looked up in the MorphologyCache: a single lemma is returned by its
   * name, several lemmas are returned as one string in which each lemma name is
   * preceded by "+++" (e.g. "+++edo+++sum"). If the cache delivers no stem, the
   * Snowball stemmer of the current language is used; a Snowball stem which
   * differs from the term and is longer than 2 characters is additionally
   * offered to the dynamic morphology cache.
   *
   * @param term the (already normalized) term to stem
   * @return the stem of the term; the lower-cased term itself if no stem could
   *         be determined
   */
  protected String stem(String term) {
    // Locale.ROOT: the result must not depend on the default locale of the JVM
    // (e.g. the Turkish dotless i), otherwise index and query terms may differ
    term = term.toLowerCase(Locale.ROOT);
    // special case: term is already lemma: begins with "lemmalemma"
    if (term.startsWith(LEMMA_PREFIX)) {
      return term;
    }
    // try to find the stem by the MorphologyCache
    String stem = stemByMorphologyCache(term);
    // if not found in MorphologyCache use Snowball
    if (stem == null) {
      stem = stemBySnowball(term, language);
      // only a real stem (not equal to the term) which is not too short
      // (> 2 characters) is worth to be remembered
      if (!stem.equals(term) && stem.length() > 2) {
        insertSnowballFormIntoCache(term, stem);
      }
    }
    return stem;
  }

  /**
   * Looks up the lemmas of the term in the MorphologyCache.
   *
   * @param term the lower-cased term
   * @return the lemma name if exactly one lemma was found, the "+++" separated
   *         lemma names if several lemmas were found, or null if nothing was
   *         found or the lookup failed
   */
  private String stemByMorphologyCache(String term) {
    ArrayList<Lemma> lemmas = null;
    try {
      MorphologyCache morphologyCache = MorphologyCache.getInstance();
      lemmas = morphologyCache.getLemmasByFormName(language, term, false);  // do not normalize again, already done
    } catch (ApplicationException e) {
      // best effort: stemming goes on with Snowball, but the failure is no longer hidden
      LOGGER.warn("MorphologyCache: an exception was caught while looking up the lemmas of a term: " + e.getMessage(), e);
    }
    if (lemmas == null || lemmas.isEmpty()) {
      return null;
    }
    if (lemmas.size() == 1) {
      return lemmas.get(0).getLemmaName();
    }
    StringBuilder joinedLemmaNames = new StringBuilder();
    for (Lemma lemma : lemmas) {
      joinedLemmaNames.append(LEMMA_SEPARATOR).append(lemma.getLemmaName());  // e.g. "+++edo+++sum"
    }
    return joinedLemmaNames.toString();
  }

  /**
   * Adds the Snowball stem of the term as a new form to the dynamic morphology
   * cache if the cache is in document mode. A failure is logged and does not
   * interrupt stemming.
   *
   * @param term the lower-cased term (the form name)
   * @param stem the Snowball stem of the term (the lemma name)
   */
  private void insertSnowballFormIntoCache(String term, String stem) {
    try {
      MorphologyCache morphologyCache = MorphologyCache.getInstance();
      if (morphologyCache.getMode() == MorphologyCache.DOCUMENT_MODE) {
        Form newForm = new Form("snowball", language, term);
        newForm.setLemmaName(stem);
        morphologyCache.insertFormDynamic(newForm);
      }
    } catch (ApplicationException e) {
      LOGGER.warn("MorphologyCache: an exception was caught while indexing a document: " + e.getMessage(), e);
    }
  }

  /**
   * Stems the term by the Snowball stemmer of the given language.
   *
   * @param term the term to stem
   * @param language language code; if it is null or no Snowball stemmer is
   *        configured for it, the term itself is returned
   * @return the Snowball stem or the term itself
   */
  private String stemBySnowball(String term, String language) {
    // NOTE(review): stem() is called on the concrete stemmer type in each branch;
    // presumably it is not declared on a common base class in this Snowball
    // version - verify before merging the branches.
    // The literal is the receiver of equals() so that a null language ends in the last branch.
    String stem = null;
    if ("de".equals(language)) {
      net.sf.snowball.ext.GermanStemmer stemmer = new net.sf.snowball.ext.GermanStemmer();
      stemmer.setCurrent(term);
      stemmer.stem();
      stem = stemmer.getCurrent();
    } else if ("en".equals(language)) {
      net.sf.snowball.ext.EnglishStemmer stemmer = new net.sf.snowball.ext.EnglishStemmer();
      stemmer.setCurrent(term);
      stemmer.stem();
      stem = stemmer.getCurrent();
    } else if ("nl".equals(language)) {
      net.sf.snowball.ext.DutchStemmer stemmer = new net.sf.snowball.ext.DutchStemmer();
      stemmer.setCurrent(term);
      stemmer.stem();
      stem = stemmer.getCurrent();
    } else if ("fi".equals(language)) {
      net.sf.snowball.ext.FinnishStemmer stemmer = new net.sf.snowball.ext.FinnishStemmer();
      stemmer.setCurrent(term);
      stemmer.stem();
      stem = stemmer.getCurrent();
    } else if ("fr".equals(language)) {
      net.sf.snowball.ext.FrenchStemmer stemmer = new net.sf.snowball.ext.FrenchStemmer();
      stemmer.setCurrent(term);
      stemmer.stem();
      stem = stemmer.getCurrent();
    } else if ("it".equals(language)) {
      net.sf.snowball.ext.ItalianStemmer stemmer = new net.sf.snowball.ext.ItalianStemmer();
      stemmer.setCurrent(term);
      stemmer.stem();
      stem = stemmer.getCurrent();
    } else if ("no".equals(language)) {
      net.sf.snowball.ext.NorwegianStemmer stemmer = new net.sf.snowball.ext.NorwegianStemmer();
      stemmer.setCurrent(term);
      stemmer.stem();
      stem = stemmer.getCurrent();
    } else if ("pt".equals(language)) {
      net.sf.snowball.ext.PortugueseStemmer stemmer = new net.sf.snowball.ext.PortugueseStemmer();
      stemmer.setCurrent(term);
      stemmer.stem();
      stem = stemmer.getCurrent();
    } else if ("ru".equals(language)) {
      net.sf.snowball.ext.RussianStemmer stemmer = new net.sf.snowball.ext.RussianStemmer();
      stemmer.setCurrent(term);
      stemmer.stem();
      stem = stemmer.getCurrent();
    } else if ("es".equals(language)) {
      net.sf.snowball.ext.SpanishStemmer stemmer = new net.sf.snowball.ext.SpanishStemmer();
      stemmer.setCurrent(term);
      stemmer.stem();
      stem = stemmer.getCurrent();
    } else if ("sv".equals(language)) {
      net.sf.snowball.ext.SwedishStemmer stemmer = new net.sf.snowball.ext.SwedishStemmer();
      stemmer.setCurrent(term);
      stemmer.stem();
      stem = stemmer.getCurrent();
    } else {
      stem = term; // if no language fits deliver the term itself as the stem form
    }
    return stem;
  }

  /*
  private String stemByLanguageStemmers(String term, String language) {
    // TODO provide other languages
    String stem = null;
    if (language.equals("br")) {
      BrazilianStemmer stemmer = new BrazilianStemmer();
      stem = stemmer.stem(term);
    } else if (language.equals("de")) {
      GermanStemmer stemmer = new GermanStemmer();
      stem = stemmer.stem(term);
    } else if (language.equals("fr")) {
      FrenchStemmer stemmer = new FrenchStemmer();
      stem = stemmer.stem(term);
    } else if (language.equals("nl")) {
      DutchStemmer stemmer = new DutchStemmer();
      stem = stemmer.stem(term);
    } else if (language.equals("ru")) {
      RussianStemmer stemmer = new RussianStemmer();
      stem = stemmer.stem(term);
    } else {
      stem = term; // if no language fits deliver the term itself as the stem form
    }
    return stem;
  }
  */
}