comparison software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizer.java @ 0:408254cf2f1d

Erstellung
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Wed, 24 Nov 2010 17:24:23 +0100
parents
children 2396a569e446
comparison
equal deleted inserted replaced
-1:000000000000 0:408254cf2f1d
1 package de.mpg.mpiwg.berlin.mpdl.lt.analyzer;
2
3 import java.io.IOException;
4 import java.io.Reader;
5
6 import org.apache.lucene.analysis.Token;
7 import org.apache.lucene.analysis.Tokenizer;
8
9 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
10
11 public class MpdlTokenizer extends Tokenizer {
12 private static final int MAX_WORD_LEN = 255;
13 private static final int IO_BUFFER_SIZE = 1024;
14 private String language; // TODO make the tokenizer language dependent
15 private int offset = 0, bufferIndex = 0, dataLen = 0;
16 private char[] buffer = new char[MAX_WORD_LEN];
17 private char[] ioBuffer = new char[IO_BUFFER_SIZE];
18 private MpdlNormalizer normalizer;
19
20 public MpdlTokenizer(Reader input, String language) {
21 super(input);
22 this.language = language;
23 }
24
25 public MpdlTokenizer(Reader input, String language, MpdlNormalizer normalizer) {
26 super(input);
27 this.language = language;
28 this.normalizer = normalizer;
29 }
30
31 /** Returns true iff a character should be included in a token. This
32 * tokenizer generates as tokens adjacent sequences of characters which
33 * satisfy this predicate. Characters for which this is false are used to
34 * define token boundaries and are not included in tokens. */
35 protected boolean isTokenChar(char c) {
36 boolean isTokenChar = true;
37 switch (c) {
38 case ' ': isTokenChar = false; break;
39 case '.': isTokenChar = false; break;
40 case ',': isTokenChar = false; break;
41 case '!': isTokenChar = false; break;
42 case '?': isTokenChar = false; break;
43 case ';': isTokenChar = false; break;
44 case ':': isTokenChar = false; break;
45 case '(': isTokenChar = false; break;
46 case ')': isTokenChar = false; break;
47 case '[': isTokenChar = false; break;
48 case ']': isTokenChar = false; break;
49 case '<': isTokenChar = false; break;
50 case '>': isTokenChar = false; break;
51 case '&': isTokenChar = false; break;
52 case '+': isTokenChar = false; break;
53 case '"': isTokenChar = false; break;
54 case '\'': isTokenChar = false; break;
55 // case '\t': isTokenChar = false; break;
56 // case '\n': isTokenChar = false; break; // do not break words which are on another line
57 }
58 return isTokenChar;
59 }
60
61 /** Called on each token character to normalize it before it is added to the
62 * token. The default implementation does nothing. Subclasses may use this
63 * to, e.g., lowercase tokens. */
64 protected char normalize(char c) {
65 return c;
66 }
67
68 /** Returns the next token in the stream, or null at EOS. */
69 public final Token next() throws IOException {
70 int length = 0;
71 int start = offset;
72 while (true) {
73 final char c;
74 offset++;
75 if (bufferIndex >= dataLen) {
76 dataLen = input.read(ioBuffer);
77 bufferIndex = 0;
78 }
79 if (dataLen == -1) {
80 if (length > 0)
81 break;
82 else
83 return null;
84 } else {
85 c = ioBuffer[bufferIndex++];
86 }
87 if (isTokenChar(c)) { // if it's a token char
88 if (length == 0) // start of token
89 start = offset - 1;
90 buffer[length++] = normalize(c); // buffer it, normalized
91 if (length == MAX_WORD_LEN) // buffer overflow!
92 break;
93 } else if (length > 0) // at non-Letter w/ chars
94 break; // return 'em
95 }
96 Token newToken = new Token(start, start + length);
97 newToken.setTermBuffer(buffer, 0, length);
98 if (normalizer != null) {
99 char[] termBuffer = newToken.termBuffer();
100 int termBufferLength = newToken.termLength();
101 String tokenText = new String(termBuffer, 0, termBufferLength);
102 try {
103 String normalizedTokenText = normalizer.normalize(tokenText);
104 int normalizedTokenTextLength = normalizedTokenText.length();
105 char[] normalizedTokenTextBuffer = normalizedTokenText.toCharArray();
106 newToken.setTermBuffer(normalizedTokenTextBuffer, 0, normalizedTokenTextLength);
107 } catch (ApplicationException e) {
108 throw new IOException(e);
109 }
110 }
111 return newToken;
112 }
113 }