Mercurial > hg > mpdl-group
comparison software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizer.java @ 0:408254cf2f1d
Erstellung
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Wed, 24 Nov 2010 17:24:23 +0100 |
parents | |
children | 2396a569e446 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:408254cf2f1d |
---|---|
1 package de.mpg.mpiwg.berlin.mpdl.lt.analyzer; | |
2 | |
3 import java.io.IOException; | |
4 import java.io.Reader; | |
5 | |
6 import org.apache.lucene.analysis.Token; | |
7 import org.apache.lucene.analysis.Tokenizer; | |
8 | |
9 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; | |
10 | |
11 public class MpdlTokenizer extends Tokenizer { | |
12 private static final int MAX_WORD_LEN = 255; | |
13 private static final int IO_BUFFER_SIZE = 1024; | |
14 private String language; // TODO make the tokenizer language dependent | |
15 private int offset = 0, bufferIndex = 0, dataLen = 0; | |
16 private char[] buffer = new char[MAX_WORD_LEN]; | |
17 private char[] ioBuffer = new char[IO_BUFFER_SIZE]; | |
18 private MpdlNormalizer normalizer; | |
19 | |
20 public MpdlTokenizer(Reader input, String language) { | |
21 super(input); | |
22 this.language = language; | |
23 } | |
24 | |
25 public MpdlTokenizer(Reader input, String language, MpdlNormalizer normalizer) { | |
26 super(input); | |
27 this.language = language; | |
28 this.normalizer = normalizer; | |
29 } | |
30 | |
31 /** Returns true iff a character should be included in a token. This | |
32 * tokenizer generates as tokens adjacent sequences of characters which | |
33 * satisfy this predicate. Characters for which this is false are used to | |
34 * define token boundaries and are not included in tokens. */ | |
35 protected boolean isTokenChar(char c) { | |
36 boolean isTokenChar = true; | |
37 switch (c) { | |
38 case ' ': isTokenChar = false; break; | |
39 case '.': isTokenChar = false; break; | |
40 case ',': isTokenChar = false; break; | |
41 case '!': isTokenChar = false; break; | |
42 case '?': isTokenChar = false; break; | |
43 case ';': isTokenChar = false; break; | |
44 case ':': isTokenChar = false; break; | |
45 case '(': isTokenChar = false; break; | |
46 case ')': isTokenChar = false; break; | |
47 case '[': isTokenChar = false; break; | |
48 case ']': isTokenChar = false; break; | |
49 case '<': isTokenChar = false; break; | |
50 case '>': isTokenChar = false; break; | |
51 case '&': isTokenChar = false; break; | |
52 case '+': isTokenChar = false; break; | |
53 case '"': isTokenChar = false; break; | |
54 case '\'': isTokenChar = false; break; | |
55 // case '\t': isTokenChar = false; break; | |
56 // case '\n': isTokenChar = false; break; // do not break words which are on another line | |
57 } | |
58 return isTokenChar; | |
59 } | |
60 | |
61 /** Called on each token character to normalize it before it is added to the | |
62 * token. The default implementation does nothing. Subclasses may use this | |
63 * to, e.g., lowercase tokens. */ | |
64 protected char normalize(char c) { | |
65 return c; | |
66 } | |
67 | |
68 /** Returns the next token in the stream, or null at EOS. */ | |
69 public final Token next() throws IOException { | |
70 int length = 0; | |
71 int start = offset; | |
72 while (true) { | |
73 final char c; | |
74 offset++; | |
75 if (bufferIndex >= dataLen) { | |
76 dataLen = input.read(ioBuffer); | |
77 bufferIndex = 0; | |
78 } | |
79 if (dataLen == -1) { | |
80 if (length > 0) | |
81 break; | |
82 else | |
83 return null; | |
84 } else { | |
85 c = ioBuffer[bufferIndex++]; | |
86 } | |
87 if (isTokenChar(c)) { // if it's a token char | |
88 if (length == 0) // start of token | |
89 start = offset - 1; | |
90 buffer[length++] = normalize(c); // buffer it, normalized | |
91 if (length == MAX_WORD_LEN) // buffer overflow! | |
92 break; | |
93 } else if (length > 0) // at non-Letter w/ chars | |
94 break; // return 'em | |
95 } | |
96 Token newToken = new Token(start, start + length); | |
97 newToken.setTermBuffer(buffer, 0, length); | |
98 if (normalizer != null) { | |
99 char[] termBuffer = newToken.termBuffer(); | |
100 int termBufferLength = newToken.termLength(); | |
101 String tokenText = new String(termBuffer, 0, termBufferLength); | |
102 try { | |
103 String normalizedTokenText = normalizer.normalize(tokenText); | |
104 int normalizedTokenTextLength = normalizedTokenText.length(); | |
105 char[] normalizedTokenTextBuffer = normalizedTokenText.toCharArray(); | |
106 newToken.setTermBuffer(normalizedTokenTextBuffer, 0, normalizedTokenTextLength); | |
107 } catch (ApplicationException e) { | |
108 throw new IOException(e); | |
109 } | |
110 } | |
111 return newToken; | |
112 } | |
113 } |