Mercurial > hg > mpdl-group
comparison software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizerAnalyzer.java @ 16:257f67be5c00
diverse Fehlerbehebungen
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 27 Sep 2011 16:40:57 +0200 |
parents | 2396a569e446 |
children |
comparison
legend: equal, deleted, inserted, replaced
15:e99964f390e4 | 16:257f67be5c00 |
---|---|
14 import de.mpg.mpiwg.berlin.mpdl.general.MpdlConstants; | 14 import de.mpg.mpiwg.berlin.mpdl.general.MpdlConstants; |
15 | 15 |
16 public class MpdlTokenizerAnalyzer extends Analyzer { | 16 public class MpdlTokenizerAnalyzer extends Analyzer { |
17 protected String language = MpdlConstants.DEFAULT_LANGUAGE; | 17 protected String language = MpdlConstants.DEFAULT_LANGUAGE; |
18 protected MpdlNormalizer normalizer = null; | 18 protected MpdlNormalizer normalizer = null; |
19 private boolean regWithoutSemicolon = false; // hack: in some cases there are words with a semicolon, then the normalization should be without semicolon | |
20 | 19 |
21 public MpdlTokenizerAnalyzer(String language) { | 20 public MpdlTokenizerAnalyzer(String language) { |
22 this.language = language; | 21 this.language = language; |
23 this.normalizer = new MpdlNormalizer(language); // default normalizer | 22 this.normalizer = new MpdlNormalizer(language); // default normalizer |
24 } | 23 } |
26 public MpdlTokenizerAnalyzer(MpdlNormalizer normalizer, String language) { | 25 public MpdlTokenizerAnalyzer(MpdlNormalizer normalizer, String language) { |
27 this.language = language; | 26 this.language = language; |
28 this.normalizer = normalizer; | 27 this.normalizer = normalizer; |
29 } | 28 } |
30 | 29 |
31 public void setRegWithoutSemicolon(boolean regWithoutSemicolon) { | |
32 this.regWithoutSemicolon = regWithoutSemicolon; | |
33 } | |
34 | |
35 public boolean isRegWithoutSemicolon() { | |
36 return regWithoutSemicolon; | |
37 } | |
38 | |
39 public TokenStream tokenStream(String fieldName, Reader reader) { | 30 public TokenStream tokenStream(String fieldName, Reader reader) { |
40 MpdlTokenizer tmpTokenizer = new MpdlTokenizer(reader, language, normalizer); | 31 MpdlTokenizer tmpTokenizer = new MpdlTokenizer(reader, language, normalizer); |
41 tmpTokenizer.setRegWithoutSemicolon(regWithoutSemicolon); // hack: feel free to remove it later | |
42 TokenStream result = (TokenStream) tmpTokenizer; | 32 TokenStream result = (TokenStream) tmpTokenizer; |
43 result = new MpdlFilter(result); // filter to remove the hyphen in a token etc. | 33 result = new MpdlFilter(result); // filter to remove the hyphen in a token etc. |
44 result = new LowerCaseFilter(result); | 34 result = new LowerCaseFilter(result); |
45 return result; | 35 return result; |
46 } | 36 } |
48 public ArrayList<Token> getToken(String inputString) throws ApplicationException { | 38 public ArrayList<Token> getToken(String inputString) throws ApplicationException { |
49 ArrayList<Token> token = new ArrayList<Token>(); | 39 ArrayList<Token> token = new ArrayList<Token>(); |
50 try { | 40 try { |
51 Reader reader = new StringReader(inputString); | 41 Reader reader = new StringReader(inputString); |
52 MpdlTokenizer tmpTokenizer = new MpdlTokenizer(reader, language, normalizer); | 42 MpdlTokenizer tmpTokenizer = new MpdlTokenizer(reader, language, normalizer); |
53 tmpTokenizer.setRegWithoutSemicolon(regWithoutSemicolon); // hack: feel free to remove it later | |
54 TokenStream result = (TokenStream) tmpTokenizer; | 43 TokenStream result = (TokenStream) tmpTokenizer; |
55 result = new MpdlFilter(result); // filter to remove the hyphen in a token etc. | 44 result = new MpdlFilter(result); // filter to remove the hyphen in a token etc. |
56 result = new LowerCaseFilter(result); | 45 result = new LowerCaseFilter(result); |
57 Token t = result.next(); | 46 Token t = result.next(); |
58 while (t != null) { | 47 while (t != null) { |