comparison software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/TokenizerNew.java @ 23:e845310098ba

diverse Korrekturen
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 27 Nov 2012 12:35:19 +0100
parents
children
comparison
equal deleted inserted replaced
22:6a45a982c333 23:e845310098ba
1 package de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize;
2
3 import java.io.IOException;
4 import java.io.StringReader;
5 import java.util.ArrayList;
6
7 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
8 import de.mpg.mpiwg.berlin.mpdl.lt.general.Language;
9 import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.Normalizer;
10
11 /**
12 * own simple implementation of Tokenizer
13 * @author jwillenborg
14 *
15 */
16 public class TokenizerNew {
17 private StringReader input;
18 private String language = "eng"; // default: english
19 private Normalizer normalizer;
20 private ArrayList<Token> tokens;
21
22 public TokenizerNew(StringReader input, String language) {
23 this.input = input;
24 String[] normFunctions = {"norm"};
25 this.language = language;
26 this.normalizer = new Normalizer(normFunctions, language);
27 }
28
29 public ArrayList<Token> tokenize() throws ApplicationException {
30 if (Language.getInstance().isChinese(language))
31 return tokenizeChinese();
32 else
33 return tokenizeAll();
34 }
35
36 private ArrayList<Token> tokenizeAll() throws ApplicationException {
37 tokens = new ArrayList<Token>();
38 try {
39 int cInt = -1;
40 StringBuilder tokenStr = new StringBuilder();
41 int pos = 0;
42 boolean isTokenChar = false; // last state
43 int tokenStart = -1;
44 int tokenEnd = -1;
45 while ((cInt = input.read()) != -1) {
46 char c = (char) cInt;
47 if (isTokenChar(c)) {
48 if (! isTokenChar) {
49 tokenStr = new StringBuilder();
50 tokenStart = pos;
51 }
52 tokenStr.append(c);
53 isTokenChar = true;
54 } else {
55 if (isTokenChar) {
56 tokenEnd = pos - 1;
57 String tStr = tokenStr.toString();
58 addToken(tokenStart, tokenEnd, tStr);
59 }
60 isTokenChar = false;
61 }
62 pos++;
63 }
64 // add last token if last char of input is a token char
65 if (isTokenChar) {
66 tokenEnd = pos - 1;
67 String tStr = tokenStr.toString();
68 addToken(tokenStart, tokenEnd, tStr);
69 }
70 input.close();
71 } catch (IOException e) {
72 throw new ApplicationException(e);
73 }
74 if (tokens.isEmpty())
75 return null;
76 else
77 return tokens;
78 }
79
80 /**
81 * each token character is a single token
82 * @return
83 * @throws ApplicationException
84 */
85 private ArrayList<Token> tokenizeChinese() throws ApplicationException {
86 tokens = new ArrayList<Token>();
87 try {
88 int cInt = -1;
89 int pos = 0;
90 while ((cInt = input.read()) != -1) {
91 char c = (char) cInt;
92 if (isTokenChar(c)) {
93 String tStr = String.valueOf(c);
94 addToken(pos, pos + 1, tStr);
95 }
96 pos++;
97 }
98 input.close();
99 } catch (IOException e) {
100 throw new ApplicationException(e);
101 }
102 if (tokens.isEmpty())
103 return null;
104 else
105 return tokens;
106 }
107
108 private boolean isTokenChar(char c) {
109 boolean isTokenChar = true;
110 switch (c) {
111 case ' ': isTokenChar = false; break;
112 case '.': isTokenChar = false; break;
113 case ',': isTokenChar = false; break;
114 case '!': isTokenChar = false; break;
115 case '?': isTokenChar = false; break;
116 case ';': isTokenChar = false; break;
117 case ':': isTokenChar = false; break;
118 case '(': isTokenChar = false; break;
119 case ')': isTokenChar = false; break;
120 case '[': isTokenChar = false; break;
121 case ']': isTokenChar = false; break;
122 case '{': isTokenChar = false; break;
123 case '}': isTokenChar = false; break;
124 case '<': isTokenChar = false; break;
125 case '>': isTokenChar = false; break;
126 case '/': isTokenChar = false; break;
127 case '=': isTokenChar = false; break;
128 case '&': isTokenChar = false; break;
129 case '+': isTokenChar = false; break;
130 case '#': isTokenChar = false; break;
131 case '"': isTokenChar = false; break;
132 case '„': isTokenChar = false; break;
133 case '“': isTokenChar = false; break;
134 case '«': isTokenChar = false; break;
135 case '»': isTokenChar = false; break;
136 case '\'': isTokenChar = false; break;
137 case '\t': isTokenChar = false; break; // do not break words which have tabs in it
138 case '\n': isTokenChar = false; break; // do not break words which are on another line
139 case '\u2425': isTokenChar = false; break; // special char for marking xml elements
140 }
141 return isTokenChar;
142 }
143
144 private void addToken(int startPos, int endPos, String tokenStr) throws ApplicationException {
145 tokenStr = removeElementMarks(tokenStr);
146 tokenStr = removeSpecialSymbols(tokenStr); // e.g. "-" is deleted so that the normalizer works properly
147 String tStrNormed = normalizer.normalize(tokenStr);
148 Token token = new Token(startPos, endPos, tokenStr);
149 token.setContentNorm(tStrNormed);
150 tokens.add(token);
151 }
152
153 private String removeElementMarks(String inputStr) {
154 String retStr = inputStr.replaceAll("\u2424|\u2425", "");
155 return retStr;
156 }
157
158 private String removeSpecialSymbols(String inputStr) {
159 String retStr = inputStr.replaceAll(" |\n|\t|-|\u00AD", ""); // blank, newline, tab, minus, soft hyphen
160 return retStr;
161 }
162 }