Mercurial > hg > mpdl-group
view software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/TokenizerNew.java @ 23:e845310098ba
diverse Korrekturen
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 27 Nov 2012 12:35:19 +0100 |
parents | |
children |
line wrap: on
line source
package de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.regex.Pattern;

import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
import de.mpg.mpiwg.berlin.mpdl.lt.general.Language;
import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.Normalizer;

/**
 * Own simple implementation of a tokenizer.
 *
 * <p>Reads the input character by character and splits it at a fixed set of
 * delimiter characters (see {@link #isTokenChar(char)}). Every token is cleaned
 * (element marks, hyphens and soft hyphens are removed) and additionally stored
 * in normalized form.
 *
 * <p>The reader handed to the constructor is closed when tokenization finishes,
 * so an instance is meant to be used for a single {@link #tokenize()} call.
 *
 * @author jwillenborg
 */
public class TokenizerNew {
  /** Marker characters U+2424 and U+2425 which are stripped from every token. */
  private static final Pattern ELEMENT_MARKS = Pattern.compile("\u2424|\u2425");

  /** Blank, newline, tab, minus and soft hyphen; compiled once instead of once per token. */
  private static final Pattern SPECIAL_SYMBOLS = Pattern.compile(" |\n|\t|-|\u00AD");

  private StringReader input;
  // The initializer is always overwritten by the constructor argument.
  private String language = "eng";  // default: english
  private Normalizer normalizer;
  private ArrayList<Token> tokens;

  /**
   * Creates a tokenizer for the given input.
   *
   * @param input reader delivering the text to tokenize; it is closed by {@link #tokenize()}
   * @param language language identifier, passed to the normalizer and used to select
   *     the tokenization strategy
   */
  public TokenizerNew(StringReader input, String language) {
    this.input = input;
    String[] normFunctions = {"norm"};
    this.language = language;
    this.normalizer = new Normalizer(normFunctions, language);
  }

  /**
   * Tokenizes the whole input.
   *
   * <p>For Chinese input every token character becomes its own token; for all other
   * languages a token is a maximal run of token characters.
   *
   * @return the tokens in input order, or {@code null} if the input contains no token
   * @throws ApplicationException if reading the input or normalizing a token fails
   */
  public ArrayList<Token> tokenize() throws ApplicationException {
    if (Language.getInstance().isChinese(language)) {
      return tokenizeChinese();
    }
    return tokenizeAll();
  }

  /**
   * Builds one token per maximal run of token characters.
   *
   * <p>Start and end positions are zero-based character offsets into the input;
   * the end position is the offset of the last character of the run (inclusive).
   *
   * @return the tokens, or {@code null} if none were found
   * @throws ApplicationException if reading the input or normalizing a token fails
   */
  private ArrayList<Token> tokenizeAll() throws ApplicationException {
    tokens = new ArrayList<Token>();
    try {
      StringBuilder tokenStr = new StringBuilder();
      boolean inToken = false;  // true while the previously read char belonged to a token
      int tokenStart = -1;
      int pos = 0;
      int cInt;
      while ((cInt = input.read()) != -1) {
        char c = (char) cInt;
        if (isTokenChar(c)) {
          if (!inToken) {
            // first char of a new token
            tokenStr.setLength(0);
            tokenStart = pos;
            inToken = true;
          }
          tokenStr.append(c);
        } else if (inToken) {
          // delimiter directly after a token: the token ended one char before
          addToken(tokenStart, pos - 1, tokenStr.toString());
          inToken = false;
        }
        pos++;
      }
      // add last token if last char of input is a token char
      if (inToken) {
        addToken(tokenStart, pos - 1, tokenStr.toString());
      }
    } catch (IOException e) {
      throw new ApplicationException(e);
    } finally {
      // close the reader on the error paths as well
      input.close();
    }
    return tokensOrNull();
  }

  /**
   * Each token character is a single token.
   *
   * <p>NOTE(review): the end position passed here is {@code pos + 1}, whereas
   * {@link #tokenizeAll()} passes the inclusive offset of the last character.
   * The behavior is kept unchanged; confirm against the users of {@code Token}
   * which convention is intended.
   *
   * @return the tokens, or {@code null} if none were found
   * @throws ApplicationException if reading the input or normalizing a token fails
   */
  private ArrayList<Token> tokenizeChinese() throws ApplicationException {
    tokens = new ArrayList<Token>();
    try {
      int pos = 0;
      int cInt;
      while ((cInt = input.read()) != -1) {
        char c = (char) cInt;
        if (isTokenChar(c)) {
          addToken(pos, pos + 1, String.valueOf(c));
        }
        pos++;
      }
    } catch (IOException e) {
      throw new ApplicationException(e);
    } finally {
      // close the reader on the error paths as well
      input.close();
    }
    return tokensOrNull();
  }

  /**
   * Returns the collected tokens, or {@code null} when the list is empty.
   */
  private ArrayList<Token> tokensOrNull() {
    return tokens.isEmpty() ? null : tokens;
  }

  /**
   * Decides whether a character is part of a token.
   *
   * @param c the character to test
   * @return {@code false} for the delimiter characters listed below, {@code true} otherwise
   */
  private boolean isTokenChar(char c) {
    switch (c) {
      // whitespace: blanks, tabs and newlines end the current token
      case ' ':
      case '\t':
      case '\n':
      // punctuation
      case '.':
      case ',':
      case '!':
      case '?':
      case ';':
      case ':':
      // brackets
      case '(':
      case ')':
      case '[':
      case ']':
      case '{':
      case '}':
      case '<':
      case '>':
      // other symbols
      case '/':
      case '=':
      case '&':
      case '+':
      case '#':
      // quotation marks
      case '"':
      case '„':
      case '“':
      case '«':
      case '»':
      case '\'':
      // special char for marking xml elements
      case '\u2425':
        return false;
      default:
        return true;
    }
  }

  /**
   * Cleans the token string, normalizes it and appends the resulting token to the list.
   *
   * <p>The stored token content is the cleaned string, so its length may differ from
   * the span given by the positions.
   *
   * @param startPos start position of the token in the input
   * @param endPos end position of the token in the input
   * @param tokenStr raw token string as read from the input
   * @throws ApplicationException if normalization fails
   */
  private void addToken(int startPos, int endPos, String tokenStr) throws ApplicationException {
    tokenStr = removeElementMarks(tokenStr);
    tokenStr = removeSpecialSymbols(tokenStr);  // e.g. "-" is deleted so that the normalizer works properly
    String tStrNormed = normalizer.normalize(tokenStr);
    Token token = new Token(startPos, endPos, tokenStr);
    token.setContentNorm(tStrNormed);
    tokens.add(token);
  }

  /**
   * Removes the element mark characters U+2424 and U+2425.
   */
  private String removeElementMarks(String inputStr) {
    return ELEMENT_MARKS.matcher(inputStr).replaceAll("");
  }

  /**
   * Removes blank, newline, tab, minus and soft hyphen.
   */
  private String removeSpecialSymbols(String inputStr) {
    return SPECIAL_SYMBOLS.matcher(inputStr).replaceAll("");
  }
}