Mercurial > hg > mpdl-group
annotate software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizer.java @ 16:257f67be5c00
diverse Fehlerbehebungen
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 27 Sep 2011 16:40:57 +0200 |
parents | 5df60f24e997 |
children |
rev | line source |
---|---|
0 | 1 package de.mpg.mpiwg.berlin.mpdl.lt.analyzer; |
2 | |
3 import java.io.IOException; | |
4 import java.io.Reader; | |
5 | |
6 import org.apache.lucene.analysis.Token; | |
7 import org.apache.lucene.analysis.Tokenizer; | |
8 | |
9 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; | |
10 | |
11 public class MpdlTokenizer extends Tokenizer { | |
12 private static final int MAX_WORD_LEN = 255; | |
13 private static final int IO_BUFFER_SIZE = 1024; | |
6
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
14 private static String SPECIAL_NOT_WORD_DELIM_SYMBOL = new Character('\u2424').toString(); |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
15 private boolean isInNotWordDelimMode = false; |
0 | 16 private int offset = 0, bufferIndex = 0, dataLen = 0; |
17 private char[] buffer = new char[MAX_WORD_LEN]; | |
18 private char[] ioBuffer = new char[IO_BUFFER_SIZE]; | |
19 private MpdlNormalizer normalizer; | |
6
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
20 private String language; |
0 | 21 |
22 public MpdlTokenizer(Reader input, String language) { | |
23 super(input); | |
24 this.language = language; | |
25 } | |
26 | |
27 public MpdlTokenizer(Reader input, String language, MpdlNormalizer normalizer) { | |
28 super(input); | |
29 this.language = language; | |
30 this.normalizer = normalizer; | |
31 } | |
32 | |
33 /** Returns true iff a character should be included in a token. This | |
34 * tokenizer generates as tokens adjacent sequences of characters which | |
35 * satisfy this predicate. Characters for which this is false are used to | |
36 * define token boundaries and are not included in tokens. */ | |
37 protected boolean isTokenChar(char c) { | |
38 boolean isTokenChar = true; | |
39 switch (c) { | |
40 case ' ': isTokenChar = false; break; | |
41 case '.': isTokenChar = false; break; | |
42 case ',': isTokenChar = false; break; | |
43 case '!': isTokenChar = false; break; | |
44 case '?': isTokenChar = false; break; | |
45 case ';': isTokenChar = false; break; | |
46 case ':': isTokenChar = false; break; | |
47 case '(': isTokenChar = false; break; | |
48 case ')': isTokenChar = false; break; | |
49 case '[': isTokenChar = false; break; | |
50 case ']': isTokenChar = false; break; | |
14
5df60f24e997
diverse Fehlerbehebungen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
6
diff
changeset
|
51 case '{': isTokenChar = false; break; |
5df60f24e997
diverse Fehlerbehebungen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
6
diff
changeset
|
52 case '}': isTokenChar = false; break; |
0 | 53 case '<': isTokenChar = false; break; |
54 case '>': isTokenChar = false; break; | |
14
5df60f24e997
diverse Fehlerbehebungen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
6
diff
changeset
|
55 case '/': isTokenChar = false; break; |
5df60f24e997
diverse Fehlerbehebungen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
6
diff
changeset
|
56 case '=': isTokenChar = false; break; |
0 | 57 case '&': isTokenChar = false; break; |
58 case '+': isTokenChar = false; break; | |
14
5df60f24e997
diverse Fehlerbehebungen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
6
diff
changeset
|
59 case '#': isTokenChar = false; break; |
0 | 60 case '"': isTokenChar = false; break; |
6
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
61 case '„': isTokenChar = false; break; |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
62 case '“': isTokenChar = false; break; |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
63 case '«': isTokenChar = false; break; |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
64 case '»': isTokenChar = false; break; |
0 | 65 case '\'': isTokenChar = false; break; |
6
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
66 case '\t': isTokenChar = false; break; // do not break words which have tabs in it |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
67 case '\n': isTokenChar = false; break; // do not break words which are on another line |
14
5df60f24e997
diverse Fehlerbehebungen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
6
diff
changeset
|
68 case '\u2425': isTokenChar = false; break; // special char for marking xml elements |
0 | 69 } |
70 return isTokenChar; | |
71 } | |
6
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
72 |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
73 protected boolean isTokenCharInNotWordDelimMode(char c) { |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
74 boolean isTokenCharInNotWordDelimMode = false; |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
75 if (isInNotWordDelimMode) { |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
76 switch (c) { |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
77 case ' ': isTokenCharInNotWordDelimMode = true; break; |
14
5df60f24e997
diverse Fehlerbehebungen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
6
diff
changeset
|
78 case '-': isTokenCharInNotWordDelimMode = true; break; |
6
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
79 case '\t': isTokenCharInNotWordDelimMode = true; break; |
14
5df60f24e997
diverse Fehlerbehebungen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
6
diff
changeset
|
80 case '\n': isTokenCharInNotWordDelimMode = true; break; |
6
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
81 } |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
82 } |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
83 return isTokenCharInNotWordDelimMode; |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
84 } |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
85 |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
86 protected boolean isSpecialNotWordDelimSymbol(char c) { |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
87 boolean isSpecialNotWordDelimSymbol = false; |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
88 switch (c) { |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
89 case '\u2424': isSpecialNotWordDelimSymbol = true; break; // unicode character for newline |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
90 } |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
91 return isSpecialNotWordDelimSymbol; |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
92 } |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
93 |
0 | 94 |
95 /** Called on each token character to normalize it before it is added to the | |
96 * token. The default implementation does nothing. Subclasses may use this | |
97 * to, e.g., lowercase tokens. */ | |
98 protected char normalize(char c) { | |
99 return c; | |
100 } | |
101 | |
102 /** Returns the next token in the stream, or null at EOS. */ | |
103 public final Token next() throws IOException { | |
6
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
104 if (language != null && language.equals("zh")) |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
105 return nextChinese(); |
0 | 106 int length = 0; |
107 int start = offset; | |
108 while (true) { | |
109 final char c; | |
110 offset++; | |
111 if (bufferIndex >= dataLen) { | |
112 dataLen = input.read(ioBuffer); | |
113 bufferIndex = 0; | |
114 } | |
115 if (dataLen == -1) { | |
116 if (length > 0) | |
117 break; | |
118 else | |
119 return null; | |
120 } else { | |
121 c = ioBuffer[bufferIndex++]; | |
122 } | |
6
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
123 if (isInNotWordDelimMode && isTokenChar(c) && (! isSpecialNotWordDelimSymbol(c))) { |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
124 isInNotWordDelimMode = false; |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
125 } |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
126 if (isSpecialNotWordDelimSymbol(c)) { |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
127 isInNotWordDelimMode = true; |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
128 } |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
129 if (isTokenChar(c) || isTokenCharInNotWordDelimMode(c)) { // if it's a token char |
0 | 130 if (length == 0) // start of token |
131 start = offset - 1; | |
132 buffer[length++] = normalize(c); // buffer it, normalized | |
133 if (length == MAX_WORD_LEN) // buffer overflow! | |
134 break; | |
135 } else if (length > 0) // at non-Letter w/ chars | |
136 break; // return 'em | |
137 } | |
6
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
138 isInNotWordDelimMode = false; |
0 | 139 Token newToken = new Token(start, start + length); |
140 newToken.setTermBuffer(buffer, 0, length); | |
6
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
141 removeSpecialSymbols(newToken); // remove some special symbols in token (e.g. symbol for word delimiting xml elements) |
0 | 142 if (normalizer != null) { |
143 char[] termBuffer = newToken.termBuffer(); | |
144 int termBufferLength = newToken.termLength(); | |
145 String tokenText = new String(termBuffer, 0, termBufferLength); | |
146 try { | |
147 String normalizedTokenText = normalizer.normalize(tokenText); | |
148 int normalizedTokenTextLength = normalizedTokenText.length(); | |
149 char[] normalizedTokenTextBuffer = normalizedTokenText.toCharArray(); | |
150 newToken.setTermBuffer(normalizedTokenTextBuffer, 0, normalizedTokenTextLength); | |
151 } catch (ApplicationException e) { | |
152 throw new IOException(e); | |
153 } | |
154 } | |
155 return newToken; | |
156 } | |
6
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
157 |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
158 private Token removeSpecialSymbols(Token token) { |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
159 char[] termBuffer = token.termBuffer(); |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
160 int termBufferLength = token.termLength(); |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
161 String tokenText = new String(termBuffer, 0, termBufferLength); |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
162 String newTokenText = tokenText.replaceAll(SPECIAL_NOT_WORD_DELIM_SYMBOL, ""); // a symbol which marks word delimiting xml elements |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
163 int newTokenTextLength = newTokenText.length(); |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
164 char[] newTokenTextBuffer = newTokenText.toCharArray(); |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
165 token.setTermBuffer(newTokenTextBuffer, 0, newTokenTextLength); |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
166 return token; |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
167 } |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
168 |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
169 |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
170 |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
171 /* |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
172 * chinese Tokenizer: taken from org.apache.lucene.analysis.cn.ChineseTokenizer |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
173 * |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
174 */ |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
175 private int length; |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
176 private int start; |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
177 |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
178 private final void push(char c) { |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
179 if (length == 0) start = offset-1; // start of token |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
180 buffer[length++] = Character.toLowerCase(c); // buffer it |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
181 } |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
182 |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
183 private final Token flush() { |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
184 if (length>0) { |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
185 return new Token(new String(buffer, 0, length), start, start+length); |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
186 } |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
187 else |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
188 return null; |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
189 } |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
190 |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
191 public final Token nextChinese() throws IOException { |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
192 length = 0; |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
193 start = offset; |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
194 while (true) { |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
195 final char c; |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
196 offset++; |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
197 if (bufferIndex >= dataLen) { |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
198 dataLen = input.read(ioBuffer); |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
199 bufferIndex = 0; |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
200 } |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
201 if (dataLen == -1) |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
202 return flush(); |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
203 else |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
204 c = ioBuffer[bufferIndex++]; |
14
5df60f24e997
diverse Fehlerbehebungen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
6
diff
changeset
|
205 int charType = Character.getType(c); |
5df60f24e997
diverse Fehlerbehebungen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
6
diff
changeset
|
206 switch(charType) { |
6
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
207 case Character.DECIMAL_DIGIT_NUMBER: |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
208 case Character.LOWERCASE_LETTER: |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
209 case Character.UPPERCASE_LETTER: |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
210 push(c); |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
211 if (length == MAX_WORD_LEN) |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
212 return flush(); |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
213 break; |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
214 case Character.OTHER_LETTER: |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
215 if (length>0) { |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
216 bufferIndex--; |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
217 offset--; |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
218 return flush(); |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
219 } |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
220 push(c); |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
221 return flush(); |
14
5df60f24e997
diverse Fehlerbehebungen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
6
diff
changeset
|
222 case Character.SURROGATE: // neu eingefügt: Lösung von Ticket 121/117: Erkennung von Codepoints über FFFF |
5df60f24e997
diverse Fehlerbehebungen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
6
diff
changeset
|
223 push(c); |
5df60f24e997
diverse Fehlerbehebungen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
6
diff
changeset
|
224 if (length == MAX_WORD_LEN) |
5df60f24e997
diverse Fehlerbehebungen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
6
diff
changeset
|
225 return flush(); |
5df60f24e997
diverse Fehlerbehebungen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
6
diff
changeset
|
226 break; |
6
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
227 default: |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
228 if (length>0) |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
229 return flush(); |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
230 break; |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
231 } |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
232 } |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
233 } |
0 | 234 } |