Mercurial > hg > mpdl-group
diff software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/analysis/DonatusStemmer.java @ 0:408254cf2f1d
Erstellung
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Wed, 24 Nov 2010 17:24:23 +0100 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/analysis/DonatusStemmer.java Wed Nov 24 17:24:23 2010 +0100 @@ -0,0 +1,146 @@ +package de.mpg.mpiwg.berlin.mpdl.donatus.analysis; + +import org.apache.log4j.Logger; + +import de.mpg.mpiwg.berlin.mpdl.donatus.xmlrpc.DonatusCache; +import de.mpg.mpiwg.berlin.mpdl.donatus.xmlrpc.DonatusConstants; +import de.mpg.mpiwg.berlin.mpdl.donatus.xmlrpc.DonatusLemma; +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; + +public class DonatusStemmer { + private String language = DonatusConstants.DEFAULT_LANGUAGE; + + protected void setLanguage(String language) { + this.language = language; + } + + /** + * Used for indexing documents and for querying + * @param term + * @return + */ + protected String stem(String term) { + String stem = null; + term = term.toLowerCase(); + // try to find the stem by the DonatusCache + DonatusLemma donatusLemma = null; + try { + DonatusCache donatusCache = DonatusCache.getInstance(); + donatusLemma = donatusCache.getLemmaByVariantForm(language, term); + } catch (ApplicationException e) { + // nothing, do not disturb + } + if (donatusLemma != null) + stem = donatusLemma.getForm(); + // if not found by Donatus try to use Snowball (or later other language specific stemmers) + if (stem == null) { + stem = stemBySnowball(term, language); + // if term is not equal to the base form and also the stem is not too short (> 2 characters) then add this Snowball variant to the lemmas in cache + if ((! stem.equals(term)) && stem.length() > 2) { + try { + DonatusCache donatusCache = DonatusCache.getInstance(); + if (donatusCache.getMode() == DonatusCache.DOCUMENT_MODE) { + donatusCache.addVariant(language, stem, DonatusConstants.TYPE_SNOWBALL, term); + } + } catch (ApplicationException e) { + Logger.getLogger(DonatusStemmer.class).warn("DonatusCache: an exception was caught while indexing a document: " + e.getMessage(), e); + } + } + } + /* TODO if Snowball is too bad (for some languages) use Lucene analyzers + if (stem == null) { + stem = stemByLanguageStemmers(term, this.language); + } + */ + return stem; + } + + private String stemBySnowball(String term, String language) { + String stem = null; + if (language.equals("de")) { + net.sf.snowball.ext.GermanStemmer stemmer = new net.sf.snowball.ext.GermanStemmer(); + stemmer.setCurrent(term); + stemmer.stem(); + stem = stemmer.getCurrent(); + } else if (language.equals("en")) { + net.sf.snowball.ext.EnglishStemmer stemmer = new net.sf.snowball.ext.EnglishStemmer(); + stemmer.setCurrent(term); + stemmer.stem(); + stem = stemmer.getCurrent(); + } else if (language.equals("nl")) { + net.sf.snowball.ext.DutchStemmer stemmer = new net.sf.snowball.ext.DutchStemmer(); + stemmer.setCurrent(term); + stemmer.stem(); + stem = stemmer.getCurrent(); + } else if (language.equals("fi")) { + net.sf.snowball.ext.FinnishStemmer stemmer = new net.sf.snowball.ext.FinnishStemmer(); + stemmer.setCurrent(term); + stemmer.stem(); + stem = stemmer.getCurrent(); + } else if (language.equals("fr")) { + net.sf.snowball.ext.FrenchStemmer stemmer = new net.sf.snowball.ext.FrenchStemmer(); + stemmer.setCurrent(term); + stemmer.stem(); + stem = stemmer.getCurrent(); + } else if (language.equals("it")) { + net.sf.snowball.ext.ItalianStemmer stemmer = new net.sf.snowball.ext.ItalianStemmer(); + stemmer.setCurrent(term); + stemmer.stem(); + stem = stemmer.getCurrent(); + } else if (language.equals("no")) { + net.sf.snowball.ext.NorwegianStemmer stemmer = new net.sf.snowball.ext.NorwegianStemmer(); + stemmer.setCurrent(term); + stemmer.stem(); + stem = stemmer.getCurrent(); + } else if (language.equals("pt")) { + net.sf.snowball.ext.PortugueseStemmer stemmer = new net.sf.snowball.ext.PortugueseStemmer(); + stemmer.setCurrent(term); + stemmer.stem(); + stem = stemmer.getCurrent(); + } else if (language.equals("ru")) { + net.sf.snowball.ext.RussianStemmer stemmer = new net.sf.snowball.ext.RussianStemmer(); + stemmer.setCurrent(term); + stemmer.stem(); + stem = stemmer.getCurrent(); + } else if (language.equals("es")) { + net.sf.snowball.ext.SpanishStemmer stemmer = new net.sf.snowball.ext.SpanishStemmer(); + stemmer.setCurrent(term); + stemmer.stem(); + stem = stemmer.getCurrent(); + } else if (language.equals("sv")) { + net.sf.snowball.ext.SwedishStemmer stemmer = new net.sf.snowball.ext.SwedishStemmer(); + stemmer.setCurrent(term); + stemmer.stem(); + stem = stemmer.getCurrent(); + } else { + stem = term; // if no language fits deliver the term itself as the stem form + } + return stem; + } + + /* + private String stemByLanguageStemmers(String term, String language) { + // TODO provide other languages + String stem = null; + if (language.equals("br")) { + BrazilianStemmer stemmer = new BrazilianStemmer(); + stem = stemmer.stem(term); + } else if (language.equals("de")) { + GermanStemmer stemmer = new GermanStemmer(); + stem = stemmer.stem(term); + } else if (language.equals("fr")) { + FrenchStemmer stemmer = new FrenchStemmer(); + stem = stemmer.stem(term); + } else if (language.equals("nl")) { + DutchStemmer stemmer = new DutchStemmer(); + stem = stemmer.stem(term); + } else if (language.equals("ru")) { + RussianStemmer stemmer = new RussianStemmer(); + stem = stemmer.stem(term); + } else { + stem = term; // if no language fits deliver the term itself as the stem form + } + return stem; + } + */ +}