annotate software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizerAnalyzer.java @ 16:257f67be5c00

diverse Fehlerbehebungen
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 27 Sep 2011 16:40:57 +0200
parents 2396a569e446
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
1 package de.mpg.mpiwg.berlin.mpdl.lt.analyzer;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
2
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
3 import java.io.IOException;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
4 import java.io.Reader;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
5 import java.io.StringReader;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
6 import java.util.ArrayList;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
7
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
8 import org.apache.lucene.analysis.Analyzer;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
9 import org.apache.lucene.analysis.LowerCaseFilter;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
10 import org.apache.lucene.analysis.Token;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
11 import org.apache.lucene.analysis.TokenStream;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
12
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
13 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
14 import de.mpg.mpiwg.berlin.mpdl.general.MpdlConstants;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
15
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
16 public class MpdlTokenizerAnalyzer extends Analyzer {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
17 protected String language = MpdlConstants.DEFAULT_LANGUAGE;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
18 protected MpdlNormalizer normalizer = null;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
19
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
20 public MpdlTokenizerAnalyzer(String language) {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
21 this.language = language;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
22 this.normalizer = new MpdlNormalizer(language); // default normalizer
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
23 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
24
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
25 public MpdlTokenizerAnalyzer(MpdlNormalizer normalizer, String language) {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
26 this.language = language;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
27 this.normalizer = normalizer;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
28 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
29
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
30 public TokenStream tokenStream(String fieldName, Reader reader) {
6
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
31 MpdlTokenizer tmpTokenizer = new MpdlTokenizer(reader, language, normalizer);
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
32 TokenStream result = (TokenStream) tmpTokenizer;
0
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
33 result = new MpdlFilter(result); // filter to remove the hyphen in a token etc.
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
34 result = new LowerCaseFilter(result);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
35 return result;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
36 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
37
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
38 public ArrayList<Token> getToken(String inputString) throws ApplicationException {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
39 ArrayList<Token> token = new ArrayList<Token>();
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
40 try {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
41 Reader reader = new StringReader(inputString);
6
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
42 MpdlTokenizer tmpTokenizer = new MpdlTokenizer(reader, language, normalizer);
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
43 TokenStream result = (TokenStream) tmpTokenizer;
0
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
44 result = new MpdlFilter(result); // filter to remove the hyphen in a token etc.
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
45 result = new LowerCaseFilter(result);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
46 Token t = result.next();
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
47 while (t != null) {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
48 token.add(t);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
49 t = result.next();
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
50 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
51 } catch (IOException e) {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
52 throw new ApplicationException(e);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
53 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
54 return token;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
55 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
56
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
57 }