Mercurial > hg > mpdl-group
view software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizerAnalyzer.java @ 0:408254cf2f1d
Erstellung
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Wed, 24 Nov 2010 17:24:23 +0100 |
parents | |
children | 2396a569e446 |
line wrap: on
line source
package de.mpg.mpiwg.berlin.mpdl.lt.analyzer;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;

import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
import de.mpg.mpiwg.berlin.mpdl.general.MpdlConstants;

/**
 * Lucene analyzer that tokenizes text with an {@link MpdlTokenizer}, passes the
 * tokens through an {@link MpdlFilter} and finally lower-cases them.
 *
 * <p>The same filter chain is used both for the standard Lucene entry point
 * ({@link #tokenStream(String, Reader)}) and for the convenience method
 * {@link #getToken(String)}, which collects all tokens of a string into a list.
 */
public class MpdlTokenizerAnalyzer extends Analyzer {

  /**
   * Language handed to the tokenizer. Initialised to
   * {@link MpdlConstants#DEFAULT_LANGUAGE}; both constructors overwrite it.
   */
  protected String language = MpdlConstants.DEFAULT_LANGUAGE;

  /** Normalizer handed to the tokenizer. */
  protected MpdlNormalizer normalizer = null;

  /**
   * Creates an analyzer for the given language using a default normalizer,
   * i.e. a new {@link MpdlNormalizer} constructed for that language.
   *
   * @param language language passed to the normalizer and the tokenizer
   */
  public MpdlTokenizerAnalyzer(String language) {
    this.language = language;
    this.normalizer = new MpdlNormalizer(language); // default normalizer
  }

  /**
   * Creates an analyzer for the given language using a caller-supplied normalizer.
   *
   * @param normalizer normalizer passed to the tokenizer
   * @param language   language passed to the tokenizer
   */
  public MpdlTokenizerAnalyzer(MpdlNormalizer normalizer, String language) {
    this.language = language;
    this.normalizer = normalizer;
  }

  /**
   * Builds the analysis chain for the given reader.
   *
   * @param fieldName name of the field being analyzed; not used by this analyzer
   * @param reader    source of the text to tokenize
   * @return token stream: tokenizer, then {@link MpdlFilter}, then {@link LowerCaseFilter}
   */
  @Override
  public TokenStream tokenStream(String fieldName, Reader reader) {
    return createTokenStream(reader);
  }

  /**
   * Tokenizes the given string with this analyzer's filter chain and returns
   * all resulting tokens in stream order.
   *
   * @param inputString text to tokenize; {@code null} yields an empty list
   * @return list of all tokens produced for the input, never {@code null}
   * @throws ApplicationException if reading from or closing the token stream
   *         fails with an {@link IOException}
   */
  public ArrayList<Token> getToken(String inputString) throws ApplicationException {
    ArrayList<Token> tokens = new ArrayList<Token>();
    if (inputString == null) {
      // new StringReader(null) would fail with a NullPointerException; there is
      // simply nothing to tokenize.
      return tokens;
    }
    try {
      TokenStream stream = createTokenStream(new StringReader(inputString));
      try {
        for (Token t = stream.next(); t != null; t = stream.next()) {
          tokens.add(t);
        }
      } finally {
        // Release the stream (and the reader underneath it) even if next() failed.
        stream.close();
      }
    } catch (IOException e) {
      throw new ApplicationException(e);
    }
    return tokens;
  }

  /**
   * Single place where the filter chain is assembled, so that
   * {@link #tokenStream(String, Reader)} and {@link #getToken(String)} cannot
   * drift apart.
   */
  private TokenStream createTokenStream(Reader reader) {
    TokenStream result = new MpdlTokenizer(reader, language, normalizer);
    result = new MpdlFilter(result); // filter to remove the hyphen in a token etc.
    result = new LowerCaseFilter(result);
    return result;
  }
}