Mercurial > hg > mpdl-group
view software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlStemmer.java @ 10:59ff47d1e237
TEI Unterstützung, Fehlerbehebungen, externe Objekte
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Fri, 11 Mar 2011 13:33:26 +0100 |
parents | 408254cf2f1d |
children |
line wrap: on
line source
package de.mpg.mpiwg.berlin.mpdl.lt.analyzer;

import java.util.ArrayList;
import java.util.Locale;

import org.apache.log4j.Logger;

import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
import de.mpg.mpiwg.berlin.mpdl.general.MpdlConstants;
import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Form;
import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma;
import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.MorphologyCache;

/**
 * Maps a term to its stem by looking up the term's lemmas in the
 * {@link MorphologyCache}. If no lemma is found the (lower-cased) term itself
 * is used as the stem.
 */
public class MpdlStemmer {
  private static final Logger LOGGER = Logger.getLogger(MpdlStemmer.class);

  /** Separator placed in front of every lemma name when a term has several lemmas. */
  private static final String LEMMA_SEPARATOR = "+++";

  /** Prefix marking a term that already is a lemma and must not be stemmed again. */
  private static final String LEMMA_PREFIX = "lemmalemma";

  private String language = MpdlConstants.DEFAULT_LANGUAGE;

  /**
   * Sets the language that is passed to the morphology lookup.
   *
   * @param language the language of the terms to be stemmed
   */
  protected void setLanguage(String language) {
    this.language = language;
  }

  /**
   * Used for indexing documents and for querying.
   *
   * <ul>
   * <li>A term starting with "lemmalemma" is returned lower-cased and otherwise unchanged.</li>
   * <li>If exactly one lemma is found, its lemma name is the stem.</li>
   * <li>If several lemmas are found, the stem is the concatenation of all lemma names,
   *     each prefixed by "+++" (e.g. "+++edo+++sum").</li>
   * <li>If no lemma is found, or the lookup fails, the lower-cased term is the stem.</li>
   * </ul>
   *
   * @param term the term to stem; must not be null
   * @return the stem of the term
   */
  protected String stem(String term) {
    // Lower-case with a fixed locale: the default-locale variant gives different results
    // on JVMs running e.g. with a Turkish locale ("I" -> dotless i), which would make
    // index time and query time stems disagree between machines.
    term = term.toLowerCase(Locale.ENGLISH);
    // special case: term is already lemma: begins with "lemmalemma"
    if (term.startsWith(LEMMA_PREFIX)) {
      return term;
    }
    // try to find the stem by the MorphologyCache
    ArrayList<Lemma> lemmas = null;
    try {
      MorphologyCache morphologyCache = MorphologyCache.getInstance();
      lemmas = morphologyCache.getLemmasByFormName(language, term, false);  // do not normalize again, already done
    } catch (ApplicationException e) {
      // Best effort: a failing lookup must not disturb stemming, the term itself is used
      // below. Only report it at debug level so that the failure is not lost completely.
      if (LOGGER.isDebugEnabled()) {
        LOGGER.debug("MorphologyCache: lemma lookup failed for term \"" + term + "\": " + e.getMessage(), e);
      }
    }
    String stem = null;
    if (lemmas != null && !lemmas.isEmpty()) {
      if (lemmas.size() == 1) {
        stem = lemmas.get(0).getLemmaName();
      } else {
        StringBuilder joined = new StringBuilder();
        for (Lemma lemma : lemmas) {
          joined.append(LEMMA_SEPARATOR).append(lemma.getLemmaName());  // e.g. "+++edo+++sum"
        }
        stem = joined.toString();
      }
    }
    // if not found then use the term itself as the stem
    if (stem == null) {
      stem = term;
      /* Snowball stemming (disabled): if not found in MorphologyCache use Snowball
      stem = stemBySnowball(term, language);
      // if term is not equal to the base form and also the stem is not too short (> 2 characters) then add this Snowball form to the dynamic morphology cache
      if ((! stem.equals(term)) && stem.length() > 2) {
        try {
          MorphologyCache morphologyCache = MorphologyCache.getInstance();
          if (morphologyCache.getMode() == MorphologyCache.DOCUMENT_MODE) {
            Form newForm = new Form("snowball", language, term);
            newForm.setLemmaName(stem);
            morphologyCache.insertFormDynamic(newForm);
          }
        } catch (ApplicationException e) {
          Logger.getLogger(MpdlStemmer.class).warn("MorphologyCache: an exception was caught while indexing a document: " + e.getMessage(), e);
        }
      }
      */
    }
    return stem;
  }

  /**
   * Stems the term with the Snowball stemmer of the given language. Currently only
   * referenced from the disabled Snowball fallback in {@link #stem(String)}.
   *
   * @param term the term to stem
   * @param language language code ("de", "en", "nl", "fi", "fr", "it", "no", "pt", "ru", "es", "sv")
   * @return the Snowball stem, or the term itself if no stemmer fits the language
   */
  private String stemBySnowball(String term, String language) {
    String stem = null;
    // Constant-first equals: a null language falls through to the "no language fits" branch.
    if ("de".equals(language)) {
      net.sf.snowball.ext.GermanStemmer stemmer = new net.sf.snowball.ext.GermanStemmer();
      stemmer.setCurrent(term);
      stemmer.stem();
      stem = stemmer.getCurrent();
    } else if ("en".equals(language)) {
      net.sf.snowball.ext.EnglishStemmer stemmer = new net.sf.snowball.ext.EnglishStemmer();
      stemmer.setCurrent(term);
      stemmer.stem();
      stem = stemmer.getCurrent();
    } else if ("nl".equals(language)) {
      net.sf.snowball.ext.DutchStemmer stemmer = new net.sf.snowball.ext.DutchStemmer();
      stemmer.setCurrent(term);
      stemmer.stem();
      stem = stemmer.getCurrent();
    } else if ("fi".equals(language)) {
      net.sf.snowball.ext.FinnishStemmer stemmer = new net.sf.snowball.ext.FinnishStemmer();
      stemmer.setCurrent(term);
      stemmer.stem();
      stem = stemmer.getCurrent();
    } else if ("fr".equals(language)) {
      net.sf.snowball.ext.FrenchStemmer stemmer = new net.sf.snowball.ext.FrenchStemmer();
      stemmer.setCurrent(term);
      stemmer.stem();
      stem = stemmer.getCurrent();
    } else if ("it".equals(language)) {
      net.sf.snowball.ext.ItalianStemmer stemmer = new net.sf.snowball.ext.ItalianStemmer();
      stemmer.setCurrent(term);
      stemmer.stem();
      stem = stemmer.getCurrent();
    } else if ("no".equals(language)) {
      net.sf.snowball.ext.NorwegianStemmer stemmer = new net.sf.snowball.ext.NorwegianStemmer();
      stemmer.setCurrent(term);
      stemmer.stem();
      stem = stemmer.getCurrent();
    } else if ("pt".equals(language)) {
      net.sf.snowball.ext.PortugueseStemmer stemmer = new net.sf.snowball.ext.PortugueseStemmer();
      stemmer.setCurrent(term);
      stemmer.stem();
      stem = stemmer.getCurrent();
    } else if ("ru".equals(language)) {
      net.sf.snowball.ext.RussianStemmer stemmer = new net.sf.snowball.ext.RussianStemmer();
      stemmer.setCurrent(term);
      stemmer.stem();
      stem = stemmer.getCurrent();
    } else if ("es".equals(language)) {
      net.sf.snowball.ext.SpanishStemmer stemmer = new net.sf.snowball.ext.SpanishStemmer();
      stemmer.setCurrent(term);
      stemmer.stem();
      stem = stemmer.getCurrent();
    } else if ("sv".equals(language)) {
      net.sf.snowball.ext.SwedishStemmer stemmer = new net.sf.snowball.ext.SwedishStemmer();
      stemmer.setCurrent(term);
      stemmer.stem();
      stem = stemmer.getCurrent();
    } else {
      stem = term; // if no language fits deliver the term itself as the stem form
    }
    return stem;
  }

  /*
  private String stemByLanguageStemmers(String term, String language) {
    // TODO provide other languages
    String stem = null;
    if (language.equals("br")) {
      BrazilianStemmer stemmer = new BrazilianStemmer();
      stem = stemmer.stem(term);
    } else if (language.equals("de")) {
      GermanStemmer stemmer = new GermanStemmer();
      stem = stemmer.stem(term);
    } else if (language.equals("fr")) {
      FrenchStemmer stemmer = new FrenchStemmer();
      stem = stemmer.stem(term);
    } else if (language.equals("nl")) {
      DutchStemmer stemmer = new DutchStemmer();
      stem = stemmer.stem(term);
    } else if (language.equals("ru")) {
      RussianStemmer stemmer = new RussianStemmer();
      stem = stemmer.stem(term);
    } else {
      stem = term; // if no language fits deliver the term itself as the stem form
    }
    return stem;
  }
  */
}