Mercurial > hg > mpdl-group
comparison software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/TokenizerNew.java @ 23:e845310098ba
diverse Korrekturen
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 27 Nov 2012 12:35:19 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
22:6a45a982c333 | 23:e845310098ba |
---|---|
1 package de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize; | |
2 | |
3 import java.io.IOException; | |
4 import java.io.StringReader; | |
5 import java.util.ArrayList; | |
6 | |
7 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; | |
8 import de.mpg.mpiwg.berlin.mpdl.lt.general.Language; | |
9 import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.Normalizer; | |
10 | |
11 /** | |
12 * own simple implementation of Tokenizer | |
13 * @author jwillenborg | |
14 * | |
15 */ | |
16 public class TokenizerNew { | |
17 private StringReader input; | |
18 private String language = "eng"; // default: english | |
19 private Normalizer normalizer; | |
20 private ArrayList<Token> tokens; | |
21 | |
22 public TokenizerNew(StringReader input, String language) { | |
23 this.input = input; | |
24 String[] normFunctions = {"norm"}; | |
25 this.language = language; | |
26 this.normalizer = new Normalizer(normFunctions, language); | |
27 } | |
28 | |
29 public ArrayList<Token> tokenize() throws ApplicationException { | |
30 if (Language.getInstance().isChinese(language)) | |
31 return tokenizeChinese(); | |
32 else | |
33 return tokenizeAll(); | |
34 } | |
35 | |
36 private ArrayList<Token> tokenizeAll() throws ApplicationException { | |
37 tokens = new ArrayList<Token>(); | |
38 try { | |
39 int cInt = -1; | |
40 StringBuilder tokenStr = new StringBuilder(); | |
41 int pos = 0; | |
42 boolean isTokenChar = false; // last state | |
43 int tokenStart = -1; | |
44 int tokenEnd = -1; | |
45 while ((cInt = input.read()) != -1) { | |
46 char c = (char) cInt; | |
47 if (isTokenChar(c)) { | |
48 if (! isTokenChar) { | |
49 tokenStr = new StringBuilder(); | |
50 tokenStart = pos; | |
51 } | |
52 tokenStr.append(c); | |
53 isTokenChar = true; | |
54 } else { | |
55 if (isTokenChar) { | |
56 tokenEnd = pos - 1; | |
57 String tStr = tokenStr.toString(); | |
58 addToken(tokenStart, tokenEnd, tStr); | |
59 } | |
60 isTokenChar = false; | |
61 } | |
62 pos++; | |
63 } | |
64 // add last token if last char of input is a token char | |
65 if (isTokenChar) { | |
66 tokenEnd = pos - 1; | |
67 String tStr = tokenStr.toString(); | |
68 addToken(tokenStart, tokenEnd, tStr); | |
69 } | |
70 input.close(); | |
71 } catch (IOException e) { | |
72 throw new ApplicationException(e); | |
73 } | |
74 if (tokens.isEmpty()) | |
75 return null; | |
76 else | |
77 return tokens; | |
78 } | |
79 | |
80 /** | |
81 * each token character is a single token | |
82 * @return | |
83 * @throws ApplicationException | |
84 */ | |
85 private ArrayList<Token> tokenizeChinese() throws ApplicationException { | |
86 tokens = new ArrayList<Token>(); | |
87 try { | |
88 int cInt = -1; | |
89 int pos = 0; | |
90 while ((cInt = input.read()) != -1) { | |
91 char c = (char) cInt; | |
92 if (isTokenChar(c)) { | |
93 String tStr = String.valueOf(c); | |
94 addToken(pos, pos + 1, tStr); | |
95 } | |
96 pos++; | |
97 } | |
98 input.close(); | |
99 } catch (IOException e) { | |
100 throw new ApplicationException(e); | |
101 } | |
102 if (tokens.isEmpty()) | |
103 return null; | |
104 else | |
105 return tokens; | |
106 } | |
107 | |
108 private boolean isTokenChar(char c) { | |
109 boolean isTokenChar = true; | |
110 switch (c) { | |
111 case ' ': isTokenChar = false; break; | |
112 case '.': isTokenChar = false; break; | |
113 case ',': isTokenChar = false; break; | |
114 case '!': isTokenChar = false; break; | |
115 case '?': isTokenChar = false; break; | |
116 case ';': isTokenChar = false; break; | |
117 case ':': isTokenChar = false; break; | |
118 case '(': isTokenChar = false; break; | |
119 case ')': isTokenChar = false; break; | |
120 case '[': isTokenChar = false; break; | |
121 case ']': isTokenChar = false; break; | |
122 case '{': isTokenChar = false; break; | |
123 case '}': isTokenChar = false; break; | |
124 case '<': isTokenChar = false; break; | |
125 case '>': isTokenChar = false; break; | |
126 case '/': isTokenChar = false; break; | |
127 case '=': isTokenChar = false; break; | |
128 case '&': isTokenChar = false; break; | |
129 case '+': isTokenChar = false; break; | |
130 case '#': isTokenChar = false; break; | |
131 case '"': isTokenChar = false; break; | |
132 case '„': isTokenChar = false; break; | |
133 case '“': isTokenChar = false; break; | |
134 case '«': isTokenChar = false; break; | |
135 case '»': isTokenChar = false; break; | |
136 case '\'': isTokenChar = false; break; | |
137 case '\t': isTokenChar = false; break; // do not break words which have tabs in it | |
138 case '\n': isTokenChar = false; break; // do not break words which are on another line | |
139 case '\u2425': isTokenChar = false; break; // special char for marking xml elements | |
140 } | |
141 return isTokenChar; | |
142 } | |
143 | |
144 private void addToken(int startPos, int endPos, String tokenStr) throws ApplicationException { | |
145 tokenStr = removeElementMarks(tokenStr); | |
146 tokenStr = removeSpecialSymbols(tokenStr); // e.g. "-" is deleted so that the normalizer works properly | |
147 String tStrNormed = normalizer.normalize(tokenStr); | |
148 Token token = new Token(startPos, endPos, tokenStr); | |
149 token.setContentNorm(tStrNormed); | |
150 tokens.add(token); | |
151 } | |
152 | |
153 private String removeElementMarks(String inputStr) { | |
154 String retStr = inputStr.replaceAll("\u2424|\u2425", ""); | |
155 return retStr; | |
156 } | |
157 | |
158 private String removeSpecialSymbols(String inputStr) { | |
159 String retStr = inputStr.replaceAll(" |\n|\t|-|\u00AD", ""); // blank, newline, tab, minus, soft hyphen | |
160 return retStr; | |
161 } | |
162 } |