Mercurial > hg > mpdl-group
diff software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizerAnalyzer.java @ 0:408254cf2f1d
Erstellung
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Wed, 24 Nov 2010 17:24:23 +0100 |
parents | |
children | 2396a569e446 |
line wrap: on
line diff
package de.mpg.mpiwg.berlin.mpdl.lt.analyzer;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;

import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
import de.mpg.mpiwg.berlin.mpdl.general.MpdlConstants;

/**
 * Lucene analyzer that tokenizes text with an {@link MpdlTokenizer}, passes the
 * tokens through an {@link MpdlFilter} and finally lower-cases them.
 *
 * <p>The tokenizer is configured with a language and an {@link MpdlNormalizer};
 * both are fixed when the analyzer is constructed.
 */
public class MpdlTokenizerAnalyzer extends Analyzer {
  /** Language handed to the tokenizer; replaced by the constructor argument. */
  protected String language = MpdlConstants.DEFAULT_LANGUAGE;
  /** Normalizer handed to the tokenizer. */
  protected MpdlNormalizer normalizer = null;

  /**
   * Creates an analyzer for the given language using a default normalizer,
   * i.e. a new {@link MpdlNormalizer} constructed for that language.
   *
   * @param language language passed to the tokenizer and the default normalizer
   */
  public MpdlTokenizerAnalyzer(String language) {
    this.language = language;
    this.normalizer = new MpdlNormalizer(language); // default normalizer
  }

  /**
   * Creates an analyzer that uses the supplied normalizer.
   *
   * @param normalizer normalizer passed to the tokenizer
   * @param language   language passed to the tokenizer
   */
  public MpdlTokenizerAnalyzer(MpdlNormalizer normalizer, String language) {
    this.language = language;
    this.normalizer = normalizer;
  }

  /**
   * Builds the token stream for the given reader.
   *
   * @param fieldName name of the field being analyzed (not used by this analyzer)
   * @param reader    source of the text to tokenize
   * @return the filtered, lower-cased token stream
   */
  @Override
  public TokenStream tokenStream(String fieldName, Reader reader) {
    return buildTokenStream(reader);
  }

  /**
   * Tokenizes the given string and collects all resulting tokens.
   *
   * @param inputString text to tokenize; must not be {@code null}
   * @return all tokens produced by the analysis chain, in stream order
   * @throws ApplicationException if reading from or closing the token stream
   *                              fails with an {@link IOException}
   */
  public ArrayList<Token> getToken(String inputString) throws ApplicationException {
    ArrayList<Token> tokens = new ArrayList<Token>();
    try {
      TokenStream stream = buildTokenStream(new StringReader(inputString));
      try {
        // next() returns null once the stream is exhausted
        for (Token current = stream.next(); current != null; current = stream.next()) {
          tokens.add(current);
        }
      } finally {
        // release the stream even when reading fails part-way through
        stream.close();
      }
    } catch (IOException e) {
      throw new ApplicationException(e);
    }
    return tokens;
  }

  /**
   * Assembles the analysis chain shared by {@link #tokenStream(String, Reader)}
   * and {@link #getToken(String)}: tokenizer, then MpdlFilter, then lower-casing.
   */
  private TokenStream buildTokenStream(Reader reader) {
    TokenStream chain = new MpdlTokenizer(reader, language, normalizer);
    chain = new MpdlFilter(chain); // filter to remove the hyphen in a token etc.
    chain = new LowerCaseFilter(chain);
    return chain;
  }

}