Mercurial > hg > mpdl-group
view software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizer.java @ 6:2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
author: Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date: Tue, 08 Feb 2011 14:54:09 +0100
parents: 408254cf2f1d
children: 5df60f24e997
line wrap: on
line source
package de.mpg.mpiwg.berlin.mpdl.lt.analyzer;

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;

import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;

/**
 * Tokenizer that splits character input on a fixed set of punctuation and
 * whitespace delimiters, with two special behaviors:
 * <ul>
 *   <li>the character U+2424 (SYMBOL FOR NEWLINE), used here to mark
 *       word-delimiting XML elements, switches the tokenizer into a mode in
 *       which space/tab/newline do NOT end the current token; the symbol
 *       itself is stripped from the emitted token;</li>
 *   <li>for language {@code "zh"} a per-character Chinese tokenization is
 *       used instead (adapted from Lucene's {@code ChineseTokenizer}).</li>
 * </ul>
 * If an {@link MpdlNormalizer} was supplied, each token's text is normalized
 * before it is returned.
 */
public class MpdlTokenizer extends Tokenizer {
  private static final int MAX_WORD_LEN = 255;
  private static final int IO_BUFFER_SIZE = 1024;
  // U+2424 as a String, used to strip the word-delimiter marker out of tokens.
  private static final String SPECIAL_NOT_WORD_DELIM_SYMBOL = String.valueOf('\u2424');
  // All characters that end a token in normal mode (see isTokenChar).
  private static final String DELIMITERS = " .,!?;:()[]<>&+\"\u201e\u201c\u00ab\u00bb'\t\n";

  // hack: in some cases there are words with a semicolon, then the
  // normalization should be without semicolon
  private boolean regWithoutSemicolon = false;
  // true while scanning past a U+2424 marker: whitespace is then part of the token
  private boolean isInNotWordDelimMode = false;
  private int offset = 0, bufferIndex = 0, dataLen = 0;
  private char[] buffer = new char[MAX_WORD_LEN];
  private char[] ioBuffer = new char[IO_BUFFER_SIZE];
  private MpdlNormalizer normalizer;  // optional; null means "no normalization"
  private String language;            // ISO code; "zh" selects the Chinese path

  /**
   * Creates a tokenizer without normalization.
   *
   * @param input    the character stream to tokenize
   * @param language language code of the input; {@code "zh"} enables the
   *                 Chinese per-character tokenizer
   */
  public MpdlTokenizer(Reader input, String language) {
    super(input);
    this.language = language;
  }

  /**
   * Creates a tokenizer that normalizes each token with the given normalizer.
   *
   * @param input      the character stream to tokenize
   * @param language   language code of the input
   * @param normalizer applied to every token's text in {@link #next()}
   */
  public MpdlTokenizer(Reader input, String language, MpdlNormalizer normalizer) {
    super(input);
    this.language = language;
    this.normalizer = normalizer;
  }

  /** Enables/disables the "keep semicolons inside tokens" regularization hack. */
  public void setRegWithoutSemicolon(boolean regWithoutSemicolon) {
    this.regWithoutSemicolon = regWithoutSemicolon;
  }

  /** @return whether semicolons are kept inside tokens (regularization hack) */
  public boolean isRegWithoutSemicolon() {
    return regWithoutSemicolon;
  }

  /**
   * Returns true iff a character should be included in a token. This
   * tokenizer generates as tokens adjacent sequences of characters which
   * satisfy this predicate. Characters for which this is false are used to
   * define token boundaries and are not included in tokens.
   *
   * Note: tab and newline ARE delimiters here; words crossing lines are only
   * kept together via the U+2424 mode (see isTokenCharInNotWordDelimMode).
   */
  protected boolean isTokenChar(char c) {
    // hack: special case for regularization and normalization; feel free to
    // remove it later
    if (isRegWithoutSemicolon() && c == ';')
      return true;
    return DELIMITERS.indexOf(c) < 0;
  }

  /**
   * In "not word delimiting" mode (entered after U+2424), whitespace does not
   * terminate the token: space, tab and newline are treated as token chars.
   */
  protected boolean isTokenCharInNotWordDelimMode(char c) {
    if (!isInNotWordDelimMode)
      return false;
    return c == ' ' || c == '\t' || c == '\n';
  }

  /** @return true iff {@code c} is U+2424, the word-delimiter-element marker */
  protected boolean isSpecialNotWordDelimSymbol(char c) {
    return c == '\u2424';
  }

  /**
   * Called on each token character to normalize it before it is added to the
   * token. The default implementation does nothing. Subclasses may use this
   * to, e.g., lowercase tokens.
   */
  protected char normalize(char c) {
    return c;
  }

  /** Returns the next token in the stream, or null at EOS. */
  public final Token next() throws IOException {
    if ("zh".equals(language))
      return nextChinese();
    int length = 0;
    int start = offset;
    while (true) {
      final char c;
      offset++;
      if (bufferIndex >= dataLen) {
        // refill the I/O buffer; dataLen becomes -1 at end of stream
        dataLen = input.read(ioBuffer);
        bufferIndex = 0;
      }
      if (dataLen == -1) {
        if (length > 0)
          break;          // emit the token collected so far
        else
          return null;    // end of stream, nothing pending
      } else {
        c = ioBuffer[bufferIndex++];
      }
      // an ordinary token char (not the marker itself) ends the special mode
      if (isInNotWordDelimMode && isTokenChar(c) && !isSpecialNotWordDelimSymbol(c)) {
        isInNotWordDelimMode = false;
      }
      if (isSpecialNotWordDelimSymbol(c)) {
        isInNotWordDelimMode = true;
      }
      if (isTokenChar(c) || isTokenCharInNotWordDelimMode(c)) {
        if (length == 0)             // start of token
          start = offset - 1;
        buffer[length++] = normalize(c);  // buffer it, normalized
        if (length == MAX_WORD_LEN)  // buffer overflow!
          break;
      } else if (length > 0) {       // at non-letter with chars pending
        break;                       // return them
      }
    }
    isInNotWordDelimMode = false;
    Token newToken = new Token(start, start + length);
    newToken.setTermBuffer(buffer, 0, length);
    // remove special symbols in token (e.g. the marker for word-delimiting
    // XML elements); NOTE(review): the token's start/end offsets are not
    // adjusted after removal — confirm downstream consumers tolerate this
    removeSpecialSymbols(newToken);
    if (normalizer != null) {
      char[] termBuffer = newToken.termBuffer();
      int termBufferLength = newToken.termLength();
      String tokenText = new String(termBuffer, 0, termBufferLength);
      try {
        String normalizedTokenText = normalizer.normalize(tokenText);
        int normalizedTokenTextLength = normalizedTokenText.length();
        char[] normalizedTokenTextBuffer = normalizedTokenText.toCharArray();
        newToken.setTermBuffer(normalizedTokenTextBuffer, 0, normalizedTokenTextLength);
      } catch (ApplicationException e) {
        throw new IOException(e);  // preserve the cause for callers
      }
    }
    return newToken;
  }

  /**
   * Strips every occurrence of the U+2424 marker from the token's term text.
   * Uses literal String.replace (the original replaceAll compiled the symbol
   * as a regex on every token, which was unnecessary).
   *
   * @param token token to clean in place
   * @return the same token instance, for chaining
   */
  private Token removeSpecialSymbols(Token token) {
    char[] termBuffer = token.termBuffer();
    int termBufferLength = token.termLength();
    String tokenText = new String(termBuffer, 0, termBufferLength);
    // marker for word-delimiting XML elements
    String newTokenText = tokenText.replace(SPECIAL_NOT_WORD_DELIM_SYMBOL, "");
    int newTokenTextLength = newTokenText.length();
    char[] newTokenTextBuffer = newTokenText.toCharArray();
    token.setTermBuffer(newTokenTextBuffer, 0, newTokenTextLength);
    return token;
  }

  /*
   * Chinese tokenizer: taken from org.apache.lucene.analysis.cn.ChineseTokenizer
   */
  private int length;
  private int start;

  /** Appends {@code c} (lowercased) to the pending Chinese token buffer. */
  private final void push(char c) {
    if (length == 0)
      start = offset - 1;                      // start of token
    buffer[length++] = Character.toLowerCase(c);  // buffer it
  }

  /** @return the pending Chinese token, or null if none is buffered */
  private final Token flush() {
    if (length > 0) {
      return new Token(new String(buffer, 0, length), start, start + length);
    } else
      return null;
  }

  /**
   * Chinese tokenization: digits and Latin letters are grouped into runs,
   * while each CJK ideograph (Character.OTHER_LETTER) becomes its own token.
   *
   * @return the next token, or null at EOS
   */
  public final Token nextChinese() throws IOException {
    length = 0;
    start = offset;
    while (true) {
      final char c;
      offset++;
      if (bufferIndex >= dataLen) {
        dataLen = input.read(ioBuffer);
        bufferIndex = 0;
      }
      if (dataLen == -1)
        return flush();
      else
        c = ioBuffer[bufferIndex++];
      switch (Character.getType(c)) {
        case Character.DECIMAL_DIGIT_NUMBER:
        case Character.LOWERCASE_LETTER:
        case Character.UPPERCASE_LETTER:
          push(c);
          if (length == MAX_WORD_LEN)
            return flush();
          break;
        case Character.OTHER_LETTER:
          if (length > 0) {
            // a pending alphanumeric run ends here; re-read this char next call
            bufferIndex--;
            offset--;
            return flush();
          }
          push(c);
          return flush();  // one token per ideograph
        default:
          if (length > 0)
            return flush();
          break;
      }
    }
  }
}