Mercurial > hg > mpdl-group
view software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/analysis/DonatusStemmer.java @ 0:408254cf2f1d
Erstellung
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Wed, 24 Nov 2010 17:24:23 +0100 |
parents | |
children |
line wrap: on
line source
package de.mpg.mpiwg.berlin.mpdl.donatus.analysis; import org.apache.log4j.Logger; import de.mpg.mpiwg.berlin.mpdl.donatus.xmlrpc.DonatusCache; import de.mpg.mpiwg.berlin.mpdl.donatus.xmlrpc.DonatusConstants; import de.mpg.mpiwg.berlin.mpdl.donatus.xmlrpc.DonatusLemma; import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; public class DonatusStemmer { private String language = DonatusConstants.DEFAULT_LANGUAGE; protected void setLanguage(String language) { this.language = language; } /** * Used for indexing documents and for querying * @param term * @return */ protected String stem(String term) { String stem = null; term = term.toLowerCase(); // try to find the stem by the DonatusCache DonatusLemma donatusLemma = null; try { DonatusCache donatusCache = DonatusCache.getInstance(); donatusLemma = donatusCache.getLemmaByVariantForm(language, term); } catch (ApplicationException e) { // nothing, do not disturb } if (donatusLemma != null) stem = donatusLemma.getForm(); // if not found by Donatus try to use Snowball (or later other language specific stemmers) if (stem == null) { stem = stemBySnowball(term, language); // if term is not equal to the base form and also the stem is not too short (> 2 characters) then add this Snowball variant to the lemmas in cache if ((! stem.equals(term)) && stem.length() > 2) { try { DonatusCache donatusCache = DonatusCache.getInstance(); if (donatusCache.getMode() == DonatusCache.DOCUMENT_MODE) { donatusCache.addVariant(language, stem, DonatusConstants.TYPE_SNOWBALL, term); } } catch (ApplicationException e) { Logger.getLogger(DonatusStemmer.class).warn("DonatusCache: an exception was caught while indexing a document: " + e.getMessage(), e); } } } /* TODO if Snowball is too bad (for some languages) use Lucene analyzers if (stem == null) { stem = stemByLanguageStemmers(term, this.language); } */ return stem; } private String stemBySnowball(String term, String language) { String stem = null; if (language.equals("de")) { net.sf.snowball.ext.GermanStemmer stemmer = new net.sf.snowball.ext.GermanStemmer(); stemmer.setCurrent(term); stemmer.stem(); stem = stemmer.getCurrent(); } else if (language.equals("en")) { net.sf.snowball.ext.EnglishStemmer stemmer = new net.sf.snowball.ext.EnglishStemmer(); stemmer.setCurrent(term); stemmer.stem(); stem = stemmer.getCurrent(); } else if (language.equals("nl")) { net.sf.snowball.ext.DutchStemmer stemmer = new net.sf.snowball.ext.DutchStemmer(); stemmer.setCurrent(term); stemmer.stem(); stem = stemmer.getCurrent(); } else if (language.equals("fi")) { net.sf.snowball.ext.FinnishStemmer stemmer = new net.sf.snowball.ext.FinnishStemmer(); stemmer.setCurrent(term); stemmer.stem(); stem = stemmer.getCurrent(); } else if (language.equals("fr")) { net.sf.snowball.ext.FrenchStemmer stemmer = new net.sf.snowball.ext.FrenchStemmer(); stemmer.setCurrent(term); stemmer.stem(); stem = stemmer.getCurrent(); } else if (language.equals("it")) { net.sf.snowball.ext.ItalianStemmer stemmer = new net.sf.snowball.ext.ItalianStemmer(); stemmer.setCurrent(term); stemmer.stem(); stem = stemmer.getCurrent(); } else if (language.equals("no")) { net.sf.snowball.ext.NorwegianStemmer stemmer = new net.sf.snowball.ext.NorwegianStemmer(); stemmer.setCurrent(term); stemmer.stem(); stem = stemmer.getCurrent(); } else if (language.equals("pt")) { net.sf.snowball.ext.PortugueseStemmer stemmer = new net.sf.snowball.ext.PortugueseStemmer(); stemmer.setCurrent(term); stemmer.stem(); stem = stemmer.getCurrent(); } else if (language.equals("ru")) { net.sf.snowball.ext.RussianStemmer stemmer = new net.sf.snowball.ext.RussianStemmer(); stemmer.setCurrent(term); stemmer.stem(); stem = stemmer.getCurrent(); } else if (language.equals("es")) { net.sf.snowball.ext.SpanishStemmer stemmer = new net.sf.snowball.ext.SpanishStemmer(); stemmer.setCurrent(term); stemmer.stem(); stem = stemmer.getCurrent(); } else if (language.equals("sv")) { net.sf.snowball.ext.SwedishStemmer stemmer = new net.sf.snowball.ext.SwedishStemmer(); stemmer.setCurrent(term); stemmer.stem(); stem = stemmer.getCurrent(); } else { stem = term; // if no language fits deliver the term itself as the stem form } return stem; } /* private String stemByLanguageStemmers(String term, String language) { // TODO provide other languages String stem = null; if (language.equals("br")) { BrazilianStemmer stemmer = new BrazilianStemmer(); stem = stemmer.stem(term); } else if (language.equals("de")) { GermanStemmer stemmer = new GermanStemmer(); stem = stemmer.stem(term); } else if (language.equals("fr")) { FrenchStemmer stemmer = new FrenchStemmer(); stem = stemmer.stem(term); } else if (language.equals("nl")) { DutchStemmer stemmer = new DutchStemmer(); stem = stemmer.stem(term); } else if (language.equals("ru")) { RussianStemmer stemmer = new RussianStemmer(); stem = stemmer.stem(term); } else { stem = term; // if no language fits deliver the term itself as the stem form } return stem; } */ }