Mercurial > hg > mpdl-group
diff software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizer.java @ 6:2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 08 Feb 2011 14:54:09 +0100 |
parents | 408254cf2f1d |
children | 5df60f24e997 |
line wrap: on
line diff
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizer.java Tue Feb 08 14:36:38 2011 +0100 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizer.java Tue Feb 08 14:54:09 2011 +0100 @@ -11,11 +11,14 @@ public class MpdlTokenizer extends Tokenizer { private static final int MAX_WORD_LEN = 255; private static final int IO_BUFFER_SIZE = 1024; - private String language; // TODO make the tokenizer language dependent + private static String SPECIAL_NOT_WORD_DELIM_SYMBOL = new Character('\u2424').toString(); + private boolean regWithoutSemicolon = false; // hack: in some cases there are words with a semicolon, then the normalization should be without semicolon + private boolean isInNotWordDelimMode = false; private int offset = 0, bufferIndex = 0, dataLen = 0; private char[] buffer = new char[MAX_WORD_LEN]; private char[] ioBuffer = new char[IO_BUFFER_SIZE]; private MpdlNormalizer normalizer; + private String language; public MpdlTokenizer(Reader input, String language) { super(input); @@ -28,12 +31,22 @@ this.normalizer = normalizer; } + public void setRegWithoutSemicolon(boolean regWithoutSemicolon) { + this.regWithoutSemicolon = regWithoutSemicolon; + } + + public boolean isRegWithoutSemicolon() { + return regWithoutSemicolon; + } + /** Returns true iff a character should be included in a token. This * tokenizer generates as tokens adjacent sequences of characters which * satisfy this predicate. Characters for which this is false are used to * define token boundaries and are not included in tokens. */ protected boolean isTokenChar(char c) { boolean isTokenChar = true; + if (isRegWithoutSemicolon() && c == ';') // hack: special case for regularization and normalization; feel free to remove it later + return true; switch (c) { case ' ': isTokenChar = false; break; case '.': isTokenChar = false; break; @@ -51,12 +64,37 @@ case '&': isTokenChar = false; break; case '+': isTokenChar = false; break; case '"': isTokenChar = false; break; + case '„': isTokenChar = false; break; + case '“': isTokenChar = false; break; + case '«': isTokenChar = false; break; + case '»': isTokenChar = false; break; case '\'': isTokenChar = false; break; - // case '\t': isTokenChar = false; break; - // case '\n': isTokenChar = false; break; // do not break words which are on another line + case '\t': isTokenChar = false; break; // do not break words which have tabs in it + case '\n': isTokenChar = false; break; // do not break words which are on another line } return isTokenChar; } + + protected boolean isTokenCharInNotWordDelimMode(char c) { + boolean isTokenCharInNotWordDelimMode = false; + if (isInNotWordDelimMode) { + switch (c) { + case ' ': isTokenCharInNotWordDelimMode = true; break; + case '\t': isTokenCharInNotWordDelimMode = true; break; + case '\n': isTokenCharInNotWordDelimMode = true; break; + } + } + return isTokenCharInNotWordDelimMode; + } + + protected boolean isSpecialNotWordDelimSymbol(char c) { + boolean isSpecialNotWordDelimSymbol = false; + switch (c) { + case '\u2424': isSpecialNotWordDelimSymbol = true; break; // unicode character for newline + } + return isSpecialNotWordDelimSymbol; + } + /** Called on each token character to normalize it before it is added to the * token. The default implementation does nothing. Subclasses may use this @@ -67,6 +105,8 @@ /** Returns the next token in the stream, or null at EOS. */ public final Token next() throws IOException { + if (language != null && language.equals("zh")) + return nextChinese(); int length = 0; int start = offset; while (true) { @@ -84,7 +124,13 @@ } else { c = ioBuffer[bufferIndex++]; } - if (isTokenChar(c)) { // if it's a token char + if (isInNotWordDelimMode && isTokenChar(c) && (! isSpecialNotWordDelimSymbol(c))) { + isInNotWordDelimMode = false; + } + if (isSpecialNotWordDelimSymbol(c)) { + isInNotWordDelimMode = true; + } + if (isTokenChar(c) || isTokenCharInNotWordDelimMode(c)) { // if it's a token char if (length == 0) // start of token start = offset - 1; buffer[length++] = normalize(c); // buffer it, normalized @@ -93,8 +139,10 @@ } else if (length > 0) // at non-Letter w/ chars break; // return 'em } + isInNotWordDelimMode = false; Token newToken = new Token(start, start + length); newToken.setTermBuffer(buffer, 0, length); + removeSpecialSymbols(newToken); // remove some special symbols in token (e.g. symbol for word delimiting xml elements) if (normalizer != null) { char[] termBuffer = newToken.termBuffer(); int termBufferLength = newToken.termLength(); @@ -110,4 +158,75 @@ } return newToken; } + + private Token removeSpecialSymbols(Token token) { + char[] termBuffer = token.termBuffer(); + int termBufferLength = token.termLength(); + String tokenText = new String(termBuffer, 0, termBufferLength); + String newTokenText = tokenText.replaceAll(SPECIAL_NOT_WORD_DELIM_SYMBOL, ""); // a symbol which marks word delimiting xml elements + int newTokenTextLength = newTokenText.length(); + char[] newTokenTextBuffer = newTokenText.toCharArray(); + token.setTermBuffer(newTokenTextBuffer, 0, newTokenTextLength); + return token; + } + + + + /* + * chinese Tokenizer: taken from org.apache.lucene.analysis.cn.ChineseTokenizer + * + */ + private int length; + private int start; + + private final void push(char c) { + if (length == 0) start = offset-1; // start of token + buffer[length++] = Character.toLowerCase(c); // buffer it + } + + private final Token flush() { + if (length>0) { + return new Token(new String(buffer, 0, length), start, start+length); + } + else + return null; + } + + public final Token nextChinese() throws IOException { + length = 0; + start = offset; + while (true) { + final char c; + offset++; + if (bufferIndex >= dataLen) { + dataLen = input.read(ioBuffer); + bufferIndex = 0; + } + if (dataLen == -1) + return flush(); + else + c = ioBuffer[bufferIndex++]; + switch(Character.getType(c)) { + case Character.DECIMAL_DIGIT_NUMBER: + case Character.LOWERCASE_LETTER: + case Character.UPPERCASE_LETTER: + push(c); + if (length == MAX_WORD_LEN) + return flush(); + break; + case Character.OTHER_LETTER: + if (length>0) { + bufferIndex--; + offset--; + return flush(); + } + push(c); + return flush(); + default: + if (length>0) + return flush(); + break; + } + } + } }