comparison software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizerAnalyzer.java @ 6:2396a569e446

new functions: externalObjects, normalizer, Unicode2Betacode
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 08 Feb 2011 14:54:09 +0100
parents 408254cf2f1d
children 257f67be5c00
comparison
equal deleted inserted replaced
5:94305c504178 6:2396a569e446
14 import de.mpg.mpiwg.berlin.mpdl.general.MpdlConstants; 14 import de.mpg.mpiwg.berlin.mpdl.general.MpdlConstants;
15 15
16 public class MpdlTokenizerAnalyzer extends Analyzer { 16 public class MpdlTokenizerAnalyzer extends Analyzer {
17 protected String language = MpdlConstants.DEFAULT_LANGUAGE; 17 protected String language = MpdlConstants.DEFAULT_LANGUAGE;
18 protected MpdlNormalizer normalizer = null; 18 protected MpdlNormalizer normalizer = null;
19 private boolean regWithoutSemicolon = false; // hack: in some cases there are words with a semicolon, then the normalization should be without semicolon
19 20
20 public MpdlTokenizerAnalyzer(String language) { 21 public MpdlTokenizerAnalyzer(String language) {
21 this.language = language; 22 this.language = language;
22 this.normalizer = new MpdlNormalizer(language); // default normalizer 23 this.normalizer = new MpdlNormalizer(language); // default normalizer
23 } 24 }
25 public MpdlTokenizerAnalyzer(MpdlNormalizer normalizer, String language) { 26 public MpdlTokenizerAnalyzer(MpdlNormalizer normalizer, String language) {
26 this.language = language; 27 this.language = language;
27 this.normalizer = normalizer; 28 this.normalizer = normalizer;
28 } 29 }
29 30
31 public void setRegWithoutSemicolon(boolean regWithoutSemicolon) {
32 this.regWithoutSemicolon = regWithoutSemicolon;
33 }
34
35 public boolean isRegWithoutSemicolon() {
36 return regWithoutSemicolon;
37 }
38
30 public TokenStream tokenStream(String fieldName, Reader reader) { 39 public TokenStream tokenStream(String fieldName, Reader reader) {
31 TokenStream result = new MpdlTokenizer(reader, language, normalizer); 40 MpdlTokenizer tmpTokenizer = new MpdlTokenizer(reader, language, normalizer);
41 tmpTokenizer.setRegWithoutSemicolon(regWithoutSemicolon); // hack: feel free to remove it later
42 TokenStream result = (TokenStream) tmpTokenizer;
32 result = new MpdlFilter(result); // filter to remove the hyphen in a token etc. 43 result = new MpdlFilter(result); // filter to remove the hyphen in a token etc.
33 result = new LowerCaseFilter(result); 44 result = new LowerCaseFilter(result);
34 return result; 45 return result;
35 } 46 }
36 47
37 public ArrayList<Token> getToken(String inputString) throws ApplicationException { 48 public ArrayList<Token> getToken(String inputString) throws ApplicationException {
38 ArrayList<Token> token = new ArrayList<Token>(); 49 ArrayList<Token> token = new ArrayList<Token>();
39 try { 50 try {
40 Reader reader = new StringReader(inputString); 51 Reader reader = new StringReader(inputString);
41 TokenStream result = new MpdlTokenizer(reader, language, normalizer); 52 MpdlTokenizer tmpTokenizer = new MpdlTokenizer(reader, language, normalizer);
53 tmpTokenizer.setRegWithoutSemicolon(regWithoutSemicolon); // hack: feel free to remove it later
54 TokenStream result = (TokenStream) tmpTokenizer;
42 result = new MpdlFilter(result); // filter to remove the hyphen in a token etc. 55 result = new MpdlFilter(result); // filter to remove the hyphen in a token etc.
43 result = new LowerCaseFilter(result); 56 result = new LowerCaseFilter(result);
44 Token t = result.next(); 57 Token t = result.next();
45 while (t != null) { 58 while (t != null) {
46 token.add(t); 59 token.add(t);