Mercurial > hg > mpdl-group
comparison software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizerAnalyzer.java @ 0:408254cf2f1d
Erstellung
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Wed, 24 Nov 2010 17:24:23 +0100 |
parents | |
children | 2396a569e446 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:408254cf2f1d |
---|---|
1 package de.mpg.mpiwg.berlin.mpdl.lt.analyzer; | |
2 | |
3 import java.io.IOException; | |
4 import java.io.Reader; | |
5 import java.io.StringReader; | |
6 import java.util.ArrayList; | |
7 | |
8 import org.apache.lucene.analysis.Analyzer; | |
9 import org.apache.lucene.analysis.LowerCaseFilter; | |
10 import org.apache.lucene.analysis.Token; | |
11 import org.apache.lucene.analysis.TokenStream; | |
12 | |
13 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; | |
14 import de.mpg.mpiwg.berlin.mpdl.general.MpdlConstants; | |
15 | |
16 public class MpdlTokenizerAnalyzer extends Analyzer { | |
17 protected String language = MpdlConstants.DEFAULT_LANGUAGE; | |
18 protected MpdlNormalizer normalizer = null; | |
19 | |
20 public MpdlTokenizerAnalyzer(String language) { | |
21 this.language = language; | |
22 this.normalizer = new MpdlNormalizer(language); // default normalizer | |
23 } | |
24 | |
25 public MpdlTokenizerAnalyzer(MpdlNormalizer normalizer, String language) { | |
26 this.language = language; | |
27 this.normalizer = normalizer; | |
28 } | |
29 | |
30 public TokenStream tokenStream(String fieldName, Reader reader) { | |
31 TokenStream result = new MpdlTokenizer(reader, language, normalizer); | |
32 result = new MpdlFilter(result); // filter to remove the hyphen in a token etc. | |
33 result = new LowerCaseFilter(result); | |
34 return result; | |
35 } | |
36 | |
37 public ArrayList<Token> getToken(String inputString) throws ApplicationException { | |
38 ArrayList<Token> token = new ArrayList<Token>(); | |
39 try { | |
40 Reader reader = new StringReader(inputString); | |
41 TokenStream result = new MpdlTokenizer(reader, language, normalizer); | |
42 result = new MpdlFilter(result); // filter to remove the hyphen in a token etc. | |
43 result = new LowerCaseFilter(result); | |
44 Token t = result.next(); | |
45 while (t != null) { | |
46 token.add(t); | |
47 t = result.next(); | |
48 } | |
49 } catch (IOException e) { | |
50 throw new ApplicationException(e); | |
51 } | |
52 return token; | |
53 } | |
54 | |
55 } |