comparison software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizerAnalyzer.java @ 16:257f67be5c00

diverse Fehlerbehebungen
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 27 Sep 2011 16:40:57 +0200
parents 2396a569e446
children
comparison
equal deleted inserted replaced
15:e99964f390e4 16:257f67be5c00
14 import de.mpg.mpiwg.berlin.mpdl.general.MpdlConstants; 14 import de.mpg.mpiwg.berlin.mpdl.general.MpdlConstants;
15 15
16 public class MpdlTokenizerAnalyzer extends Analyzer { 16 public class MpdlTokenizerAnalyzer extends Analyzer {
17 protected String language = MpdlConstants.DEFAULT_LANGUAGE; 17 protected String language = MpdlConstants.DEFAULT_LANGUAGE;
18 protected MpdlNormalizer normalizer = null; 18 protected MpdlNormalizer normalizer = null;
19 private boolean regWithoutSemicolon = false; // hack: in some cases there are words with a semicolon, then the normalization should be without semicolon
20 19
21 public MpdlTokenizerAnalyzer(String language) { 20 public MpdlTokenizerAnalyzer(String language) {
22 this.language = language; 21 this.language = language;
23 this.normalizer = new MpdlNormalizer(language); // default normalizer 22 this.normalizer = new MpdlNormalizer(language); // default normalizer
24 } 23 }
26 public MpdlTokenizerAnalyzer(MpdlNormalizer normalizer, String language) { 25 public MpdlTokenizerAnalyzer(MpdlNormalizer normalizer, String language) {
27 this.language = language; 26 this.language = language;
28 this.normalizer = normalizer; 27 this.normalizer = normalizer;
29 } 28 }
30 29
31 public void setRegWithoutSemicolon(boolean regWithoutSemicolon) {
32 this.regWithoutSemicolon = regWithoutSemicolon;
33 }
34
35 public boolean isRegWithoutSemicolon() {
36 return regWithoutSemicolon;
37 }
38
39 public TokenStream tokenStream(String fieldName, Reader reader) { 30 public TokenStream tokenStream(String fieldName, Reader reader) {
40 MpdlTokenizer tmpTokenizer = new MpdlTokenizer(reader, language, normalizer); 31 MpdlTokenizer tmpTokenizer = new MpdlTokenizer(reader, language, normalizer);
41 tmpTokenizer.setRegWithoutSemicolon(regWithoutSemicolon); // hack: feel free to remove it later
42 TokenStream result = (TokenStream) tmpTokenizer; 32 TokenStream result = (TokenStream) tmpTokenizer;
43 result = new MpdlFilter(result); // filter to remove the hyphen in a token etc. 33 result = new MpdlFilter(result); // filter to remove the hyphen in a token etc.
44 result = new LowerCaseFilter(result); 34 result = new LowerCaseFilter(result);
45 return result; 35 return result;
46 } 36 }
48 public ArrayList<Token> getToken(String inputString) throws ApplicationException { 38 public ArrayList<Token> getToken(String inputString) throws ApplicationException {
49 ArrayList<Token> token = new ArrayList<Token>(); 39 ArrayList<Token> token = new ArrayList<Token>();
50 try { 40 try {
51 Reader reader = new StringReader(inputString); 41 Reader reader = new StringReader(inputString);
52 MpdlTokenizer tmpTokenizer = new MpdlTokenizer(reader, language, normalizer); 42 MpdlTokenizer tmpTokenizer = new MpdlTokenizer(reader, language, normalizer);
53 tmpTokenizer.setRegWithoutSemicolon(regWithoutSemicolon); // hack: feel free to remove it later
54 TokenStream result = (TokenStream) tmpTokenizer; 43 TokenStream result = (TokenStream) tmpTokenizer;
55 result = new MpdlFilter(result); // filter to remove the hyphen in a token etc. 44 result = new MpdlFilter(result); // filter to remove the hyphen in a token etc.
56 result = new LowerCaseFilter(result); 45 result = new LowerCaseFilter(result);
57 Token t = result.next(); 46 Token t = result.next();
58 while (t != null) { 47 while (t != null) {