Mercurial > hg > mpdl-group
diff software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizerAnalyzer.java @ 6:2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 08 Feb 2011 14:54:09 +0100 |
parents | 408254cf2f1d |
children | 257f67be5c00 |
line wrap: on
line diff
```diff
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizerAnalyzer.java	Tue Feb 08 14:36:38 2011 +0100
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizerAnalyzer.java	Tue Feb 08 14:54:09 2011 +0100
@@ -16,6 +16,7 @@
 public class MpdlTokenizerAnalyzer extends Analyzer {
   protected String language = MpdlConstants.DEFAULT_LANGUAGE;
   protected MpdlNormalizer normalizer = null;
+  private boolean regWithoutSemicolon = false; // hack: in some cases there are words with a semicolon, then the normalization should be without semicolon
 
   public MpdlTokenizerAnalyzer(String language) {
     this.language = language;
@@ -27,8 +28,18 @@
     this.normalizer = normalizer;
   }
 
+  public void setRegWithoutSemicolon(boolean regWithoutSemicolon) {
+    this.regWithoutSemicolon = regWithoutSemicolon;
+  }
+
+  public boolean isRegWithoutSemicolon() {
+    return regWithoutSemicolon;
+  }
+
   public TokenStream tokenStream(String fieldName, Reader reader) {
-    TokenStream result = new MpdlTokenizer(reader, language, normalizer);
+    MpdlTokenizer tmpTokenizer = new MpdlTokenizer(reader, language, normalizer);
+    tmpTokenizer.setRegWithoutSemicolon(regWithoutSemicolon); // hack: feel free to remove it later
+    TokenStream result = (TokenStream) tmpTokenizer;
     result = new MpdlFilter(result); // filter to remove the hyphen in a token etc.
     result = new LowerCaseFilter(result);
     return result;
@@ -38,7 +49,9 @@
     ArrayList<Token> token = new ArrayList<Token>();
     try {
       Reader reader = new StringReader(inputString);
-      TokenStream result = new MpdlTokenizer(reader, language, normalizer);
+      MpdlTokenizer tmpTokenizer = new MpdlTokenizer(reader, language, normalizer);
+      tmpTokenizer.setRegWithoutSemicolon(regWithoutSemicolon); // hack: feel free to remove it later
+      TokenStream result = (TokenStream) tmpTokenizer;
       result = new MpdlFilter(result); // filter to remove the hyphen in a token etc.
       result = new LowerCaseFilter(result);
       Token t = result.next();
```