Mercurial > hg > mpdl-group
comparison software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizerAnalyzer.java @ 6:2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 08 Feb 2011 14:54:09 +0100 |
parents | 408254cf2f1d |
children | 257f67be5c00 |
Comparison legend: equal, deleted, inserted, replaced
5:94305c504178 | 6:2396a569e446 |
---|---|
14 import de.mpg.mpiwg.berlin.mpdl.general.MpdlConstants; | 14 import de.mpg.mpiwg.berlin.mpdl.general.MpdlConstants; |
15 | 15 |
16 public class MpdlTokenizerAnalyzer extends Analyzer { | 16 public class MpdlTokenizerAnalyzer extends Analyzer { |
17 protected String language = MpdlConstants.DEFAULT_LANGUAGE; | 17 protected String language = MpdlConstants.DEFAULT_LANGUAGE; |
18 protected MpdlNormalizer normalizer = null; | 18 protected MpdlNormalizer normalizer = null; |
19 private boolean regWithoutSemicolon = false; // hack: in some cases there are words with a semicolon, then the normalization should be without semicolon | |
19 | 20 |
20 public MpdlTokenizerAnalyzer(String language) { | 21 public MpdlTokenizerAnalyzer(String language) { |
21 this.language = language; | 22 this.language = language; |
22 this.normalizer = new MpdlNormalizer(language); // default normalizer | 23 this.normalizer = new MpdlNormalizer(language); // default normalizer |
23 } | 24 } |
25 public MpdlTokenizerAnalyzer(MpdlNormalizer normalizer, String language) { | 26 public MpdlTokenizerAnalyzer(MpdlNormalizer normalizer, String language) { |
26 this.language = language; | 27 this.language = language; |
27 this.normalizer = normalizer; | 28 this.normalizer = normalizer; |
28 } | 29 } |
29 | 30 |
31 public void setRegWithoutSemicolon(boolean regWithoutSemicolon) { | |
32 this.regWithoutSemicolon = regWithoutSemicolon; | |
33 } | |
34 | |
35 public boolean isRegWithoutSemicolon() { | |
36 return regWithoutSemicolon; | |
37 } | |
38 | |
30 public TokenStream tokenStream(String fieldName, Reader reader) { | 39 public TokenStream tokenStream(String fieldName, Reader reader) { |
31 TokenStream result = new MpdlTokenizer(reader, language, normalizer); | 40 MpdlTokenizer tmpTokenizer = new MpdlTokenizer(reader, language, normalizer); |
41 tmpTokenizer.setRegWithoutSemicolon(regWithoutSemicolon); // hack: feel free to remove it later | |
42 TokenStream result = (TokenStream) tmpTokenizer; | |
32 result = new MpdlFilter(result); // filter to remove the hyphen in a token etc. | 43 result = new MpdlFilter(result); // filter to remove the hyphen in a token etc. |
33 result = new LowerCaseFilter(result); | 44 result = new LowerCaseFilter(result); |
34 return result; | 45 return result; |
35 } | 46 } |
36 | 47 |
37 public ArrayList<Token> getToken(String inputString) throws ApplicationException { | 48 public ArrayList<Token> getToken(String inputString) throws ApplicationException { |
38 ArrayList<Token> token = new ArrayList<Token>(); | 49 ArrayList<Token> token = new ArrayList<Token>(); |
39 try { | 50 try { |
40 Reader reader = new StringReader(inputString); | 51 Reader reader = new StringReader(inputString); |
41 TokenStream result = new MpdlTokenizer(reader, language, normalizer); | 52 MpdlTokenizer tmpTokenizer = new MpdlTokenizer(reader, language, normalizer); |
53 tmpTokenizer.setRegWithoutSemicolon(regWithoutSemicolon); // hack: feel free to remove it later | |
54 TokenStream result = (TokenStream) tmpTokenizer; | |
42 result = new MpdlFilter(result); // filter to remove the hyphen in a token etc. | 55 result = new MpdlFilter(result); // filter to remove the hyphen in a token etc. |
43 result = new LowerCaseFilter(result); | 56 result = new LowerCaseFilter(result); |
44 Token t = result.next(); | 57 Token t = result.next(); |
45 while (t != null) { | 58 while (t != null) { |
46 token.add(t); | 59 token.add(t); |