Mercurial > hg > mpdl-group
diff software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/TokenizerNew.java @ 23:e845310098ba
diverse Korrekturen
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 27 Nov 2012 12:35:19 +0100 |
parents | |
children |
line wrap: on
line diff
package de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;

import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
import de.mpg.mpiwg.berlin.mpdl.lt.general.Language;
import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.Normalizer;

/**
 * Simple character-based tokenizer.
 *
 * <p>The input is read one UTF-16 code unit at a time. Every character listed in
 * {@link #DELIMITERS} ends the current token; every other character belongs to a token.
 * For languages that {@code Language} reports as Chinese, each non-delimiter character
 * becomes a token of its own.
 *
 * <p>The reader handed to the constructor is consumed and closed by {@link #tokenize()},
 * so an instance can be used for a single tokenization run only.
 *
 * @author jwillenborg
 */
public class TokenizerNew {
  /**
   * Characters that separate tokens: blank, punctuation, brackets, quotation marks
   * (U+201E, U+201C, U+00AB, U+00BB written as escapes so that the class does not depend
   * on the source file encoding), tab, line feed, carriage return and U+2425, the special
   * character used for marking xml elements.
   */
  private static final String DELIMITERS =
      " .,!?;:()[]{}<>/=&+#\"\u201E\u201C\u00AB\u00BB'\t\n\r\u2425";

  /** Element mark characters (U+2424, U+2425) that are stripped from token text. */
  private static final String ELEMENT_MARKS = "\u2424\u2425";

  /** Blank, newline, tab, minus and soft hyphen: stripped so that the normalizer works properly. */
  private static final String SPECIAL_SYMBOLS = " \n\t-\u00AD";

  private final StringReader input;
  private final String language;
  private final Normalizer normalizer;
  private ArrayList<Token> tokens;

  /**
   * Creates a tokenizer for the given input.
   *
   * @param input reader over the text to tokenize; it is closed by {@link #tokenize()}
   * @param language language identifier, passed to the normalizer and used to select the
   *     tokenization strategy
   */
  public TokenizerNew(StringReader input, String language) {
    this.input = input;
    String[] normFunctions = {"norm"};
    this.language = language;
    this.normalizer = new Normalizer(normFunctions, language);
  }

  /**
   * Tokenizes the whole input and closes the reader.
   *
   * @return the tokens in input order, or {@code null} if the input contains no token
   * @throws ApplicationException if reading the input fails or adding a token fails
   */
  public ArrayList<Token> tokenize() throws ApplicationException {
    if (Language.getInstance().isChinese(language)) {
      return tokenizeChinese();
    }
    return tokenizeAll();
  }

  /**
   * Splits the input into maximal runs of non-delimiter characters.
   *
   * <p>Each token is added with the offset of its first character and the offset of its
   * last character (inclusive), both counted in characters read from the input.
   *
   * @return the tokens, or {@code null} if none were found
   * @throws ApplicationException if reading the input fails or adding a token fails
   */
  private ArrayList<Token> tokenizeAll() throws ApplicationException {
    tokens = new ArrayList<Token>();
    try {
      StringBuilder current = new StringBuilder();
      boolean inToken = false; // true while the previous character was a token character
      int tokenStart = -1;
      int pos = 0;
      int cInt;
      while ((cInt = input.read()) != -1) {
        char c = (char) cInt;
        if (isTokenChar(c)) {
          if (!inToken) {
            current.setLength(0);
            tokenStart = pos;
            inToken = true;
          }
          current.append(c);
        } else if (inToken) {
          // the delimiter at pos ends the token, so its last character is at pos - 1
          addToken(tokenStart, pos - 1, current.toString());
          inToken = false;
        }
        pos++;
      }
      // add the last token if the input ends with a token character
      if (inToken) {
        addToken(tokenStart, pos - 1, current.toString());
      }
    } catch (IOException e) {
      throw new ApplicationException(e);
    } finally {
      // close on every path, not only after a successful run
      input.close();
    }
    return tokens.isEmpty() ? null : tokens;
  }

  /**
   * Turns each non-delimiter character into a single token.
   *
   * <p>NOTE(review): the end offset passed here is {@code pos + 1}, whereas
   * {@link #tokenizeAll()} passes the inclusive offset of the last character. The original
   * values are kept because callers may rely on them; confirm which convention
   * {@code Token} expects.
   *
   * <p>NOTE(review): the input is processed per UTF-16 code unit, so a character outside
   * the Basic Multilingual Plane is split into two tokens.
   *
   * @return the tokens, or {@code null} if none were found
   * @throws ApplicationException if reading the input fails or adding a token fails
   */
  private ArrayList<Token> tokenizeChinese() throws ApplicationException {
    tokens = new ArrayList<Token>();
    try {
      int pos = 0;
      int cInt;
      while ((cInt = input.read()) != -1) {
        char c = (char) cInt;
        if (isTokenChar(c)) {
          addToken(pos, pos + 1, String.valueOf(c));
        }
        pos++;
      }
    } catch (IOException e) {
      throw new ApplicationException(e);
    } finally {
      // close on every path, not only after a successful run
      input.close();
    }
    return tokens.isEmpty() ? null : tokens;
  }

  /**
   * Tells whether a character belongs to a token.
   *
   * @param c the character to test
   * @return {@code false} if {@code c} is one of {@link #DELIMITERS}, {@code true} otherwise
   */
  private boolean isTokenChar(char c) {
    return DELIMITERS.indexOf(c) < 0;
  }

  /**
   * Cleans the raw token text, normalizes it and appends the resulting token to the list.
   *
   * <p>The token is added even if cleaning leaves an empty string (e.g. for a raw token
   * that consists of "-" only).
   *
   * @param startPos start offset of the raw token in the input
   * @param endPos end offset of the raw token in the input
   * @param tokenStr raw token text as read from the input
   * @throws ApplicationException if thrown by the normalizer
   */
  private void addToken(int startPos, int endPos, String tokenStr) throws ApplicationException {
    tokenStr = removeElementMarks(tokenStr);
    tokenStr = removeSpecialSymbols(tokenStr); // e.g. "-" is deleted so that the normalizer works properly
    String tStrNormed = normalizer.normalize(tokenStr);
    Token token = new Token(startPos, endPos, tokenStr);
    token.setContentNorm(tStrNormed);
    tokens.add(token);
  }

  /** Removes the element mark characters U+2424 and U+2425 from the given string. */
  private String removeElementMarks(String inputStr) {
    return stripChars(inputStr, ELEMENT_MARKS);
  }

  /** Removes blank, newline, tab, minus and soft hyphen from the given string. */
  private String removeSpecialSymbols(String inputStr) {
    return stripChars(inputStr, SPECIAL_SYMBOLS);
  }

  /**
   * Returns a copy of {@code text} without any character contained in {@code unwanted}.
   * Plain character filtering avoids compiling a regular expression for every token.
   */
  private static String stripChars(String text, String unwanted) {
    StringBuilder kept = new StringBuilder(text.length());
    for (int i = 0; i < text.length(); i++) {
      char c = text.charAt(i);
      if (unwanted.indexOf(c) < 0) {
        kept.append(c);
      }
    }
    return kept.toString();
  }
}