annotate software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/TokenizerNew.java @ 23:e845310098ba

diverse Korrekturen
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 27 Nov 2012 12:35:19 +0100
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
23
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
1 package de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
2
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
3 import java.io.IOException;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
4 import java.io.StringReader;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
5 import java.util.ArrayList;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
6
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
7 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
8 import de.mpg.mpiwg.berlin.mpdl.lt.general.Language;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
9 import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.Normalizer;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
10
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
11 /**
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
12 * own simple implementation of Tokenizer
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
13 * @author jwillenborg
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
14 *
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
15 */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
16 public class TokenizerNew {
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
17 private StringReader input;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
18 private String language = "eng"; // default: english
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
19 private Normalizer normalizer;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
20 private ArrayList<Token> tokens;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
21
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
22 public TokenizerNew(StringReader input, String language) {
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
23 this.input = input;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
24 String[] normFunctions = {"norm"};
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
25 this.language = language;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
26 this.normalizer = new Normalizer(normFunctions, language);
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
27 }
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
28
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
29 public ArrayList<Token> tokenize() throws ApplicationException {
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
30 if (Language.getInstance().isChinese(language))
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
31 return tokenizeChinese();
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
32 else
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
33 return tokenizeAll();
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
34 }
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
35
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
36 private ArrayList<Token> tokenizeAll() throws ApplicationException {
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
37 tokens = new ArrayList<Token>();
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
38 try {
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
39 int cInt = -1;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
40 StringBuilder tokenStr = new StringBuilder();
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
41 int pos = 0;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
42 boolean isTokenChar = false; // last state
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
43 int tokenStart = -1;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
44 int tokenEnd = -1;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
45 while ((cInt = input.read()) != -1) {
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
46 char c = (char) cInt;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
47 if (isTokenChar(c)) {
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
48 if (! isTokenChar) {
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
49 tokenStr = new StringBuilder();
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
50 tokenStart = pos;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
51 }
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
52 tokenStr.append(c);
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
53 isTokenChar = true;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
54 } else {
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
55 if (isTokenChar) {
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
56 tokenEnd = pos - 1;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
57 String tStr = tokenStr.toString();
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
58 addToken(tokenStart, tokenEnd, tStr);
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
59 }
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
60 isTokenChar = false;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
61 }
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
62 pos++;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
63 }
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
64 // add last token if last char of input is a token char
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
65 if (isTokenChar) {
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
66 tokenEnd = pos - 1;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
67 String tStr = tokenStr.toString();
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
68 addToken(tokenStart, tokenEnd, tStr);
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
69 }
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
70 input.close();
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
71 } catch (IOException e) {
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
72 throw new ApplicationException(e);
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
73 }
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
74 if (tokens.isEmpty())
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
75 return null;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
76 else
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
77 return tokens;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
78 }
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
79
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
80 /**
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
81 * each token character is a single token
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
82 * @return
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
83 * @throws ApplicationException
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
84 */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
85 private ArrayList<Token> tokenizeChinese() throws ApplicationException {
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
86 tokens = new ArrayList<Token>();
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
87 try {
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
88 int cInt = -1;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
89 int pos = 0;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
90 while ((cInt = input.read()) != -1) {
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
91 char c = (char) cInt;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
92 if (isTokenChar(c)) {
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
93 String tStr = String.valueOf(c);
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
94 addToken(pos, pos + 1, tStr);
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
95 }
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
96 pos++;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
97 }
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
98 input.close();
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
99 } catch (IOException e) {
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
100 throw new ApplicationException(e);
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
101 }
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
102 if (tokens.isEmpty())
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
103 return null;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
104 else
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
105 return tokens;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
106 }
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
107
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
108 private boolean isTokenChar(char c) {
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
109 boolean isTokenChar = true;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
110 switch (c) {
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
111 case ' ': isTokenChar = false; break;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
112 case '.': isTokenChar = false; break;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
113 case ',': isTokenChar = false; break;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
114 case '!': isTokenChar = false; break;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
115 case '?': isTokenChar = false; break;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
116 case ';': isTokenChar = false; break;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
117 case ':': isTokenChar = false; break;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
118 case '(': isTokenChar = false; break;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
119 case ')': isTokenChar = false; break;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
120 case '[': isTokenChar = false; break;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
121 case ']': isTokenChar = false; break;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
122 case '{': isTokenChar = false; break;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
123 case '}': isTokenChar = false; break;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
124 case '<': isTokenChar = false; break;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
125 case '>': isTokenChar = false; break;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
126 case '/': isTokenChar = false; break;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
127 case '=': isTokenChar = false; break;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
128 case '&': isTokenChar = false; break;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
129 case '+': isTokenChar = false; break;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
130 case '#': isTokenChar = false; break;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
131 case '"': isTokenChar = false; break;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
132 case '„': isTokenChar = false; break;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
133 case '“': isTokenChar = false; break;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
134 case '«': isTokenChar = false; break;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
135 case '»': isTokenChar = false; break;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
136 case '\'': isTokenChar = false; break;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
137 case '\t': isTokenChar = false; break; // do not break words which have tabs in it
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
138 case '\n': isTokenChar = false; break; // do not break words which are on another line
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
139 case '\u2425': isTokenChar = false; break; // special char for marking xml elements
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
140 }
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
141 return isTokenChar;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
142 }
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
143
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
144 private void addToken(int startPos, int endPos, String tokenStr) throws ApplicationException {
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
145 tokenStr = removeElementMarks(tokenStr);
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
146 tokenStr = removeSpecialSymbols(tokenStr); // e.g. "-" is deleted so that the normalizer works properly
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
147 String tStrNormed = normalizer.normalize(tokenStr);
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
148 Token token = new Token(startPos, endPos, tokenStr);
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
149 token.setContentNorm(tStrNormed);
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
150 tokens.add(token);
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
151 }
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
152
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
153 private String removeElementMarks(String inputStr) {
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
154 String retStr = inputStr.replaceAll("\u2424|\u2425", "");
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
155 return retStr;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
156 }
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
157
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
158 private String removeSpecialSymbols(String inputStr) {
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
159 String retStr = inputStr.replaceAll(" |\n|\t|-|\u00AD", ""); // blank, newline, tab, minus, soft hyphen
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
160 return retStr;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
161 }
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
162 }