Mercurial > hg > mpdl-group
view software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizerAnalyzer.java @ 6:2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 08 Feb 2011 14:54:09 +0100 |
parents | 408254cf2f1d |
children | 257f67be5c00 |
line wrap: on
line source
package de.mpg.mpiwg.berlin.mpdl.lt.analyzer; import java.io.IOException; import java.io.Reader; import java.io.StringReader; import java.util.ArrayList; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.LowerCaseFilter; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; import de.mpg.mpiwg.berlin.mpdl.general.MpdlConstants; public class MpdlTokenizerAnalyzer extends Analyzer { protected String language = MpdlConstants.DEFAULT_LANGUAGE; protected MpdlNormalizer normalizer = null; private boolean regWithoutSemicolon = false; // hack: in some cases there are words with a semicolon, then the normalization should be without semicolon public MpdlTokenizerAnalyzer(String language) { this.language = language; this.normalizer = new MpdlNormalizer(language); // default normalizer } public MpdlTokenizerAnalyzer(MpdlNormalizer normalizer, String language) { this.language = language; this.normalizer = normalizer; } public void setRegWithoutSemicolon(boolean regWithoutSemicolon) { this.regWithoutSemicolon = regWithoutSemicolon; } public boolean isRegWithoutSemicolon() { return regWithoutSemicolon; } public TokenStream tokenStream(String fieldName, Reader reader) { MpdlTokenizer tmpTokenizer = new MpdlTokenizer(reader, language, normalizer); tmpTokenizer.setRegWithoutSemicolon(regWithoutSemicolon); // hack: feel free to remove it later TokenStream result = (TokenStream) tmpTokenizer; result = new MpdlFilter(result); // filter to remove the hyphen in a token etc. result = new LowerCaseFilter(result); return result; } public ArrayList<Token> getToken(String inputString) throws ApplicationException { ArrayList<Token> token = new ArrayList<Token>(); try { Reader reader = new StringReader(inputString); MpdlTokenizer tmpTokenizer = new MpdlTokenizer(reader, language, normalizer); tmpTokenizer.setRegWithoutSemicolon(regWithoutSemicolon); // hack: feel free to remove it later TokenStream result = (TokenStream) tmpTokenizer; result = new MpdlFilter(result); // filter to remove the hyphen in a token etc. result = new LowerCaseFilter(result); Token t = result.next(); while (t != null) { token.add(t); t = result.next(); } } catch (IOException e) { throw new ApplicationException(e); } return token; } }