Mercurial > hg > mpdl-group
comparison software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/donatus/analysis/DonatusStemmer.java @ 0:408254cf2f1d
Erstellung
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Wed, 24 Nov 2010 17:24:23 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:408254cf2f1d |
---|---|
1 package de.mpg.mpiwg.berlin.mpdl.donatus.analysis; | |
2 | |
3 import org.apache.log4j.Logger; | |
4 | |
5 import de.mpg.mpiwg.berlin.mpdl.donatus.xmlrpc.DonatusCache; | |
6 import de.mpg.mpiwg.berlin.mpdl.donatus.xmlrpc.DonatusConstants; | |
7 import de.mpg.mpiwg.berlin.mpdl.donatus.xmlrpc.DonatusLemma; | |
8 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; | |
9 | |
10 public class DonatusStemmer { | |
11 private String language = DonatusConstants.DEFAULT_LANGUAGE; | |
12 | |
13 protected void setLanguage(String language) { | |
14 this.language = language; | |
15 } | |
16 | |
17 /** | |
18 * Used for indexing documents and for querying | |
19 * @param term | |
20 * @return | |
21 */ | |
22 protected String stem(String term) { | |
23 String stem = null; | |
24 term = term.toLowerCase(); | |
25 // try to find the stem by the DonatusCache | |
26 DonatusLemma donatusLemma = null; | |
27 try { | |
28 DonatusCache donatusCache = DonatusCache.getInstance(); | |
29 donatusLemma = donatusCache.getLemmaByVariantForm(language, term); | |
30 } catch (ApplicationException e) { | |
31 // nothing, do not disturb | |
32 } | |
33 if (donatusLemma != null) | |
34 stem = donatusLemma.getForm(); | |
35 // if not found by Donatus try to use Snowball (or later other language specific stemmers) | |
36 if (stem == null) { | |
37 stem = stemBySnowball(term, language); | |
38 // if term is not equal to the base form and also the stem is not too short (> 2 characters) then add this Snowball variant to the lemmas in cache | |
39 if ((! stem.equals(term)) && stem.length() > 2) { | |
40 try { | |
41 DonatusCache donatusCache = DonatusCache.getInstance(); | |
42 if (donatusCache.getMode() == DonatusCache.DOCUMENT_MODE) { | |
43 donatusCache.addVariant(language, stem, DonatusConstants.TYPE_SNOWBALL, term); | |
44 } | |
45 } catch (ApplicationException e) { | |
46 Logger.getLogger(DonatusStemmer.class).warn("DonatusCache: an exception was caught while indexing a document: " + e.getMessage(), e); | |
47 } | |
48 } | |
49 } | |
50 /* TODO if Snowball is too bad (for some languages) use Lucene analyzers | |
51 if (stem == null) { | |
52 stem = stemByLanguageStemmers(term, this.language); | |
53 } | |
54 */ | |
55 return stem; | |
56 } | |
57 | |
58 private String stemBySnowball(String term, String language) { | |
59 String stem = null; | |
60 if (language.equals("de")) { | |
61 net.sf.snowball.ext.GermanStemmer stemmer = new net.sf.snowball.ext.GermanStemmer(); | |
62 stemmer.setCurrent(term); | |
63 stemmer.stem(); | |
64 stem = stemmer.getCurrent(); | |
65 } else if (language.equals("en")) { | |
66 net.sf.snowball.ext.EnglishStemmer stemmer = new net.sf.snowball.ext.EnglishStemmer(); | |
67 stemmer.setCurrent(term); | |
68 stemmer.stem(); | |
69 stem = stemmer.getCurrent(); | |
70 } else if (language.equals("nl")) { | |
71 net.sf.snowball.ext.DutchStemmer stemmer = new net.sf.snowball.ext.DutchStemmer(); | |
72 stemmer.setCurrent(term); | |
73 stemmer.stem(); | |
74 stem = stemmer.getCurrent(); | |
75 } else if (language.equals("fi")) { | |
76 net.sf.snowball.ext.FinnishStemmer stemmer = new net.sf.snowball.ext.FinnishStemmer(); | |
77 stemmer.setCurrent(term); | |
78 stemmer.stem(); | |
79 stem = stemmer.getCurrent(); | |
80 } else if (language.equals("fr")) { | |
81 net.sf.snowball.ext.FrenchStemmer stemmer = new net.sf.snowball.ext.FrenchStemmer(); | |
82 stemmer.setCurrent(term); | |
83 stemmer.stem(); | |
84 stem = stemmer.getCurrent(); | |
85 } else if (language.equals("it")) { | |
86 net.sf.snowball.ext.ItalianStemmer stemmer = new net.sf.snowball.ext.ItalianStemmer(); | |
87 stemmer.setCurrent(term); | |
88 stemmer.stem(); | |
89 stem = stemmer.getCurrent(); | |
90 } else if (language.equals("no")) { | |
91 net.sf.snowball.ext.NorwegianStemmer stemmer = new net.sf.snowball.ext.NorwegianStemmer(); | |
92 stemmer.setCurrent(term); | |
93 stemmer.stem(); | |
94 stem = stemmer.getCurrent(); | |
95 } else if (language.equals("pt")) { | |
96 net.sf.snowball.ext.PortugueseStemmer stemmer = new net.sf.snowball.ext.PortugueseStemmer(); | |
97 stemmer.setCurrent(term); | |
98 stemmer.stem(); | |
99 stem = stemmer.getCurrent(); | |
100 } else if (language.equals("ru")) { | |
101 net.sf.snowball.ext.RussianStemmer stemmer = new net.sf.snowball.ext.RussianStemmer(); | |
102 stemmer.setCurrent(term); | |
103 stemmer.stem(); | |
104 stem = stemmer.getCurrent(); | |
105 } else if (language.equals("es")) { | |
106 net.sf.snowball.ext.SpanishStemmer stemmer = new net.sf.snowball.ext.SpanishStemmer(); | |
107 stemmer.setCurrent(term); | |
108 stemmer.stem(); | |
109 stem = stemmer.getCurrent(); | |
110 } else if (language.equals("sv")) { | |
111 net.sf.snowball.ext.SwedishStemmer stemmer = new net.sf.snowball.ext.SwedishStemmer(); | |
112 stemmer.setCurrent(term); | |
113 stemmer.stem(); | |
114 stem = stemmer.getCurrent(); | |
115 } else { | |
116 stem = term; // if no language fits deliver the term itself as the stem form | |
117 } | |
118 return stem; | |
119 } | |
120 | |
121 /* | |
122 private String stemByLanguageStemmers(String term, String language) { | |
123 // TODO provide other languages | |
124 String stem = null; | |
125 if (language.equals("br")) { | |
126 BrazilianStemmer stemmer = new BrazilianStemmer(); | |
127 stem = stemmer.stem(term); | |
128 } else if (language.equals("de")) { | |
129 GermanStemmer stemmer = new GermanStemmer(); | |
130 stem = stemmer.stem(term); | |
131 } else if (language.equals("fr")) { | |
132 FrenchStemmer stemmer = new FrenchStemmer(); | |
133 stem = stemmer.stem(term); | |
134 } else if (language.equals("nl")) { | |
135 DutchStemmer stemmer = new DutchStemmer(); | |
136 stem = stemmer.stem(term); | |
137 } else if (language.equals("ru")) { | |
138 RussianStemmer stemmer = new RussianStemmer(); | |
139 stem = stemmer.stem(term); | |
140 } else { | |
141 stem = term; // if no language fits deliver the term itself as the stem form | |
142 } | |
143 return stem; | |
144 } | |
145 */ | |
146 } |