Mercurial > hg > mpdl-group
diff software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlStemmer.java @ 0:408254cf2f1d
Erstellung
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Wed, 24 Nov 2010 17:24:23 +0100 |
parents | |
children | 59ff47d1e237 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlStemmer.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,159 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.analyzer; + +import java.util.ArrayList; + +import org.apache.log4j.Logger; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.general.MpdlConstants; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Form; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.MorphologyCache; + +public class MpdlStemmer { + private String language = MpdlConstants.DEFAULT_LANGUAGE; + + protected void setLanguage(String language) { + this.language = language; + } + + /** + * Used for indexing documents and for querying + * @param term + * @return + */ + protected String stem(String term) { + String stem = null; + term = term.toLowerCase(); + // special case: term is already lemma: begins with "lemmalemma" + if (term.startsWith("lemmalemma")) + return term; + // try to find the stem by the MorphologyCache + ArrayList<Lemma> lemmas = null; + try { + MorphologyCache morphologyCache = MorphologyCache.getInstance(); + lemmas = morphologyCache.getLemmasByFormName(language, term, false); // do not normalize again, already done + } catch (ApplicationException e) { + // nothing, do not disturb + } + if (lemmas != null && ! lemmas.isEmpty()) { + if (lemmas.size() == 1) { + stem = lemmas.get(0).getLemmaName(); + } else { + stem = ""; + for (int i=0; i<lemmas.size(); i++) { + Lemma lemma = lemmas.get(i); + String lemmaName = lemma.getLemmaName(); + stem = stem + "+++" + lemmaName; // e.g. "+++edo+++sum" + } + } + } + // if not found in MorphologyCache use Snowball + if (stem == null) { + stem = stemBySnowball(term, language); + // if term is not equal to the base form and also the stem is not too short (> 2 characters) then add this Snowball form to the dynamic morphology cache + if ((! stem.equals(term)) && stem.length() > 2) { + try { + MorphologyCache morphologyCache = MorphologyCache.getInstance(); + if (morphologyCache.getMode() == MorphologyCache.DOCUMENT_MODE) { + Form newForm = new Form("snowball", language, term); + newForm.setLemmaName(stem); + morphologyCache.insertFormDynamic(newForm); + } + } catch (ApplicationException e) { + Logger.getLogger(MpdlStemmer.class).warn("MorphologyCache: an exception was caught while indexing a document: " + e.getMessage(), e); + } + } + } + return stem; + } + + private String stemBySnowball(String term, String language) { + String stem = null; + if (language.equals("de")) { + net.sf.snowball.ext.GermanStemmer stemmer = new net.sf.snowball.ext.GermanStemmer(); + stemmer.setCurrent(term); + stemmer.stem(); + stem = stemmer.getCurrent(); + } else if (language.equals("en")) { + net.sf.snowball.ext.EnglishStemmer stemmer = new net.sf.snowball.ext.EnglishStemmer(); + stemmer.setCurrent(term); + stemmer.stem(); + stem = stemmer.getCurrent(); + } else if (language.equals("nl")) { + net.sf.snowball.ext.DutchStemmer stemmer = new net.sf.snowball.ext.DutchStemmer(); + stemmer.setCurrent(term); + stemmer.stem(); + stem = stemmer.getCurrent(); + } else if (language.equals("fi")) { + net.sf.snowball.ext.FinnishStemmer stemmer = new net.sf.snowball.ext.FinnishStemmer(); + stemmer.setCurrent(term); + stemmer.stem(); + stem = stemmer.getCurrent(); + } else if (language.equals("fr")) { + net.sf.snowball.ext.FrenchStemmer stemmer = new net.sf.snowball.ext.FrenchStemmer(); + stemmer.setCurrent(term); + stemmer.stem(); + stem = stemmer.getCurrent(); + } else if (language.equals("it")) { + net.sf.snowball.ext.ItalianStemmer stemmer = new net.sf.snowball.ext.ItalianStemmer(); + stemmer.setCurrent(term); + stemmer.stem(); + stem = stemmer.getCurrent(); + } else if (language.equals("no")) { + net.sf.snowball.ext.NorwegianStemmer stemmer = new net.sf.snowball.ext.NorwegianStemmer(); + stemmer.setCurrent(term); + stemmer.stem(); + stem = stemmer.getCurrent(); + } else if (language.equals("pt")) { + net.sf.snowball.ext.PortugueseStemmer stemmer = new net.sf.snowball.ext.PortugueseStemmer(); + stemmer.setCurrent(term); + stemmer.stem(); + stem = stemmer.getCurrent(); + } else if (language.equals("ru")) { + net.sf.snowball.ext.RussianStemmer stemmer = new net.sf.snowball.ext.RussianStemmer(); + stemmer.setCurrent(term); + stemmer.stem(); + stem = stemmer.getCurrent(); + } else if (language.equals("es")) { + net.sf.snowball.ext.SpanishStemmer stemmer = new net.sf.snowball.ext.SpanishStemmer(); + stemmer.setCurrent(term); + stemmer.stem(); + stem = stemmer.getCurrent(); + } else if (language.equals("sv")) { + net.sf.snowball.ext.SwedishStemmer stemmer = new net.sf.snowball.ext.SwedishStemmer(); + stemmer.setCurrent(term); + stemmer.stem(); + stem = stemmer.getCurrent(); + } else { + stem = term; // if no language fits deliver the term itself as the stem form + } + return stem; + } + + /* + private String stemByLanguageStemmers(String term, String language) { + // TODO provide other languages + String stem = null; + if (language.equals("br")) { + BrazilianStemmer stemmer = new BrazilianStemmer(); + stem = stemmer.stem(term); + } else if (language.equals("de")) { + GermanStemmer stemmer = new GermanStemmer(); + stem = stemmer.stem(term); + } else if (language.equals("fr")) { + FrenchStemmer stemmer = new FrenchStemmer(); + stem = stemmer.stem(term); + } else if (language.equals("nl")) { + DutchStemmer stemmer = new DutchStemmer(); + stem = stemmer.stem(term); + } else if (language.equals("ru")) { + RussianStemmer stemmer = new RussianStemmer(); + stem = stemmer.stem(term); + } else { + stem = term; // if no language fits deliver the term itself as the stem form + } + return stem; + } + */ +}