annotate software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizer.java @ 16:257f67be5c00

diverse Fehlerbehebungen
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 27 Sep 2011 16:40:57 +0200
parents 5df60f24e997
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
1 package de.mpg.mpiwg.berlin.mpdl.lt.analyzer;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
2
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
3 import java.io.IOException;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
4 import java.io.Reader;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
5
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
6 import org.apache.lucene.analysis.Token;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
7 import org.apache.lucene.analysis.Tokenizer;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
8
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
9 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
10
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
11 public class MpdlTokenizer extends Tokenizer {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
12 private static final int MAX_WORD_LEN = 255;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
13 private static final int IO_BUFFER_SIZE = 1024;
6
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
14 private static String SPECIAL_NOT_WORD_DELIM_SYMBOL = new Character('\u2424').toString();
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
15 private boolean isInNotWordDelimMode = false;
0
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
16 private int offset = 0, bufferIndex = 0, dataLen = 0;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
17 private char[] buffer = new char[MAX_WORD_LEN];
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
18 private char[] ioBuffer = new char[IO_BUFFER_SIZE];
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
19 private MpdlNormalizer normalizer;
6
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
20 private String language;
0
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
21
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
22 public MpdlTokenizer(Reader input, String language) {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
23 super(input);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
24 this.language = language;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
25 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
26
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
27 public MpdlTokenizer(Reader input, String language, MpdlNormalizer normalizer) {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
28 super(input);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
29 this.language = language;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
30 this.normalizer = normalizer;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
31 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
32
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
33 /** Returns true iff a character should be included in a token. This
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
34 * tokenizer generates as tokens adjacent sequences of characters which
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
35 * satisfy this predicate. Characters for which this is false are used to
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
36 * define token boundaries and are not included in tokens. */
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
37 protected boolean isTokenChar(char c) {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
38 boolean isTokenChar = true;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
39 switch (c) {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
40 case ' ': isTokenChar = false; break;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
41 case '.': isTokenChar = false; break;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
42 case ',': isTokenChar = false; break;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
43 case '!': isTokenChar = false; break;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
44 case '?': isTokenChar = false; break;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
45 case ';': isTokenChar = false; break;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
46 case ':': isTokenChar = false; break;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
47 case '(': isTokenChar = false; break;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
48 case ')': isTokenChar = false; break;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
49 case '[': isTokenChar = false; break;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
50 case ']': isTokenChar = false; break;
14
5df60f24e997 diverse Fehlerbehebungen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 6
diff changeset
51 case '{': isTokenChar = false; break;
5df60f24e997 diverse Fehlerbehebungen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 6
diff changeset
52 case '}': isTokenChar = false; break;
0
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
53 case '<': isTokenChar = false; break;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
54 case '>': isTokenChar = false; break;
14
5df60f24e997 diverse Fehlerbehebungen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 6
diff changeset
55 case '/': isTokenChar = false; break;
5df60f24e997 diverse Fehlerbehebungen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 6
diff changeset
56 case '=': isTokenChar = false; break;
0
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
57 case '&': isTokenChar = false; break;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
58 case '+': isTokenChar = false; break;
14
5df60f24e997 diverse Fehlerbehebungen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 6
diff changeset
59 case '#': isTokenChar = false; break;
0
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
60 case '"': isTokenChar = false; break;
6
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
61 case '„': isTokenChar = false; break;
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
62 case '“': isTokenChar = false; break;
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
63 case '«': isTokenChar = false; break;
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
64 case '»': isTokenChar = false; break;
0
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
65 case '\'': isTokenChar = false; break;
6
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
66 case '\t': isTokenChar = false; break; // do not break words which have tabs in it
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
67 case '\n': isTokenChar = false; break; // do not break words which are on another line
14
5df60f24e997 diverse Fehlerbehebungen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 6
diff changeset
68 case '\u2425': isTokenChar = false; break; // special char for marking xml elements
0
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
69 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
70 return isTokenChar;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
71 }
6
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
72
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
73 protected boolean isTokenCharInNotWordDelimMode(char c) {
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
74 boolean isTokenCharInNotWordDelimMode = false;
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
75 if (isInNotWordDelimMode) {
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
76 switch (c) {
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
77 case ' ': isTokenCharInNotWordDelimMode = true; break;
14
5df60f24e997 diverse Fehlerbehebungen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 6
diff changeset
78 case '-': isTokenCharInNotWordDelimMode = true; break;
6
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
79 case '\t': isTokenCharInNotWordDelimMode = true; break;
14
5df60f24e997 diverse Fehlerbehebungen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 6
diff changeset
80 case '\n': isTokenCharInNotWordDelimMode = true; break;
6
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
81 }
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
82 }
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
83 return isTokenCharInNotWordDelimMode;
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
84 }
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
85
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
86 protected boolean isSpecialNotWordDelimSymbol(char c) {
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
87 boolean isSpecialNotWordDelimSymbol = false;
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
88 switch (c) {
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
89 case '\u2424': isSpecialNotWordDelimSymbol = true; break; // unicode character for newline
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
90 }
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
91 return isSpecialNotWordDelimSymbol;
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
92 }
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
93
0
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
94
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
95 /** Called on each token character to normalize it before it is added to the
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
96 * token. The default implementation does nothing. Subclasses may use this
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
97 * to, e.g., lowercase tokens. */
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
98 protected char normalize(char c) {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
99 return c;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
100 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
101
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
102 /** Returns the next token in the stream, or null at EOS. */
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
103 public final Token next() throws IOException {
6
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
104 if (language != null && language.equals("zh"))
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
105 return nextChinese();
0
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
106 int length = 0;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
107 int start = offset;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
108 while (true) {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
109 final char c;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
110 offset++;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
111 if (bufferIndex >= dataLen) {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
112 dataLen = input.read(ioBuffer);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
113 bufferIndex = 0;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
114 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
115 if (dataLen == -1) {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
116 if (length > 0)
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
117 break;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
118 else
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
119 return null;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
120 } else {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
121 c = ioBuffer[bufferIndex++];
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
122 }
6
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
123 if (isInNotWordDelimMode && isTokenChar(c) && (! isSpecialNotWordDelimSymbol(c))) {
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
124 isInNotWordDelimMode = false;
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
125 }
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
126 if (isSpecialNotWordDelimSymbol(c)) {
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
127 isInNotWordDelimMode = true;
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
128 }
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
129 if (isTokenChar(c) || isTokenCharInNotWordDelimMode(c)) { // if it's a token char
0
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
130 if (length == 0) // start of token
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
131 start = offset - 1;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
132 buffer[length++] = normalize(c); // buffer it, normalized
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
133 if (length == MAX_WORD_LEN) // buffer overflow!
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
134 break;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
135 } else if (length > 0) // at non-Letter w/ chars
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
136 break; // return 'em
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
137 }
6
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
138 isInNotWordDelimMode = false;
0
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
139 Token newToken = new Token(start, start + length);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
140 newToken.setTermBuffer(buffer, 0, length);
6
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
141 removeSpecialSymbols(newToken); // remove some special symbols in token (e.g. symbol for word delimiting xml elements)
0
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
142 if (normalizer != null) {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
143 char[] termBuffer = newToken.termBuffer();
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
144 int termBufferLength = newToken.termLength();
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
145 String tokenText = new String(termBuffer, 0, termBufferLength);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
146 try {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
147 String normalizedTokenText = normalizer.normalize(tokenText);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
148 int normalizedTokenTextLength = normalizedTokenText.length();
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
149 char[] normalizedTokenTextBuffer = normalizedTokenText.toCharArray();
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
150 newToken.setTermBuffer(normalizedTokenTextBuffer, 0, normalizedTokenTextLength);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
151 } catch (ApplicationException e) {
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
152 throw new IOException(e);
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
153 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
154 }
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
155 return newToken;
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
156 }
6
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
157
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
158 private Token removeSpecialSymbols(Token token) {
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
159 char[] termBuffer = token.termBuffer();
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
160 int termBufferLength = token.termLength();
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
161 String tokenText = new String(termBuffer, 0, termBufferLength);
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
162 String newTokenText = tokenText.replaceAll(SPECIAL_NOT_WORD_DELIM_SYMBOL, ""); // a symbol which marks word delimiting xml elements
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
163 int newTokenTextLength = newTokenText.length();
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
164 char[] newTokenTextBuffer = newTokenText.toCharArray();
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
165 token.setTermBuffer(newTokenTextBuffer, 0, newTokenTextLength);
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
166 return token;
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
167 }
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
168
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
169
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
170
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
171 /*
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
172 * chinese Tokenizer: taken from org.apache.lucene.analysis.cn.ChineseTokenizer
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
173 *
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
174 */
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
175 private int length;
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
176 private int start;
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
177
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
178 private final void push(char c) {
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
179 if (length == 0) start = offset-1; // start of token
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
180 buffer[length++] = Character.toLowerCase(c); // buffer it
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
181 }
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
182
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
183 private final Token flush() {
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
184 if (length>0) {
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
185 return new Token(new String(buffer, 0, length), start, start+length);
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
186 }
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
187 else
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
188 return null;
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
189 }
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
190
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
191 public final Token nextChinese() throws IOException {
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
192 length = 0;
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
193 start = offset;
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
194 while (true) {
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
195 final char c;
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
196 offset++;
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
197 if (bufferIndex >= dataLen) {
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
198 dataLen = input.read(ioBuffer);
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
199 bufferIndex = 0;
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
200 }
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
201 if (dataLen == -1)
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
202 return flush();
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
203 else
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
204 c = ioBuffer[bufferIndex++];
14
5df60f24e997 diverse Fehlerbehebungen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 6
diff changeset
205 int charType = Character.getType(c);
5df60f24e997 diverse Fehlerbehebungen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 6
diff changeset
206 switch(charType) {
6
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
207 case Character.DECIMAL_DIGIT_NUMBER:
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
208 case Character.LOWERCASE_LETTER:
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
209 case Character.UPPERCASE_LETTER:
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
210 push(c);
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
211 if (length == MAX_WORD_LEN)
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
212 return flush();
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
213 break;
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
214 case Character.OTHER_LETTER:
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
215 if (length>0) {
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
216 bufferIndex--;
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
217 offset--;
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
218 return flush();
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
219 }
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
220 push(c);
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
221 return flush();
14
5df60f24e997 diverse Fehlerbehebungen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 6
diff changeset
222 case Character.SURROGATE: // neu eingefügt: Lösung von Ticket 121/117: Erkennung von Codepoints über FFFF
5df60f24e997 diverse Fehlerbehebungen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 6
diff changeset
223 push(c);
5df60f24e997 diverse Fehlerbehebungen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 6
diff changeset
224 if (length == MAX_WORD_LEN)
5df60f24e997 diverse Fehlerbehebungen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 6
diff changeset
225 return flush();
5df60f24e997 diverse Fehlerbehebungen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 6
diff changeset
226 break;
6
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
227 default:
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
228 if (length>0)
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
229 return flush();
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
230 break;
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
231 }
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
232 }
2396a569e446 new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 0
diff changeset
233 }
0
408254cf2f1d Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
234 }