software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/Tokenizer.java @ 19:4a3641ae14d2
Creation
author:   Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date:     Wed, 09 Nov 2011 15:32:05 +0100
parents:
children: e845310098ba
package de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize;

import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;

import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.CharacterUtils;
import org.apache.lucene.util.CharacterUtils.CharacterBuffer;
import org.apache.lucene.util.Version;

import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
import de.mpg.mpiwg.berlin.mpdl.lt.general.Language;
import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.Normalizer;

public class Tokenizer extends org.apache.lucene.analysis.Tokenizer {
  // variables are copied from Lucene 3.4 CharTokenizer
  private int offset = 0, bufferIndex = 0, dataLen = 0, finalOffset = 0;
  private static final int MAX_WORD_LEN = 4096;  // old value was 255
  private static final int IO_BUFFER_SIZE = 4096;
  private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  private CharacterUtils charUtils = CharacterUtils.getInstance(Version.LUCENE_34);
  private CharacterBuffer ioBuffer = CharacterUtils.newCharacterBuffer(IO_BUFFER_SIZE);
  // application variables
  private String language = "eng";  // default: English
  private String[] normFunctions = {"norm"};  // default: use the "norm" function
  private Normalizer normalizer;

  public Tokenizer(Reader input) {
    super(input);
  }

  public Tokenizer(AttributeSource source, Reader input) {
    super(source, input);
  }

  public void setLanguage(String lang) {
    this.language = Language.getInstance().getLanguageId(lang);
  }

  public void setNormFunctions(String[] normFunctions) {
    this.normFunctions = normFunctions;
  }

  public ArrayList<Token> getTokens() throws ApplicationException {
    if (Language.getInstance().isChinese(language)) {
      return getTokensByChineseTokenizer(input, normFunctions);
    }
    ArrayList<Token> tokens = new ArrayList<Token>();
    try {
      reset(input);
      CharTermAttribute charTermAttribute = getAttribute(CharTermAttribute.class);
      OffsetAttribute offsetAttribute = getAttribute(OffsetAttribute.class);
      while (incrementToken()) {
        String term = charTermAttribute.toString();
        int start = offsetAttribute.startOffset();
        int end = offsetAttribute.endOffset();
        String normedTerm = normalizer.normalize(term);
        Token token = new Token(start, end, normedTerm);
        tokens.add(token);
      }
      end();    // TODO needed?
      close();  // TODO needed?
    } catch (IOException e) {
      throw new ApplicationException(e);
    }
    return tokens;
  }

  /** Returns true iff a character should be included in a token. */
  protected boolean isTokenChar(int codepoint) {
    boolean isTokenChar = true;
    char c = (char) codepoint;
    switch (c) {
      case ' ': isTokenChar = false; break;
      case '.': isTokenChar = false; break;
      case ',': isTokenChar = false; break;
      case '!': isTokenChar = false; break;
      case '?': isTokenChar = false; break;
      case ';': isTokenChar = false; break;
      case ':': isTokenChar = false; break;
      case '(': isTokenChar = false; break;
      case ')': isTokenChar = false; break;
      case '[': isTokenChar = false; break;
      case ']': isTokenChar = false; break;
      case '{': isTokenChar = false; break;
      case '}': isTokenChar = false; break;
      case '<': isTokenChar = false; break;
      case '>': isTokenChar = false; break;
      case '/': isTokenChar = false; break;
      case '=': isTokenChar = false; break;
      case '&': isTokenChar = false; break;
      case '+': isTokenChar = false; break;
      case '#': isTokenChar = false; break;
      case '"': isTokenChar = false; break;
      case 'ã': isTokenChar = false; break;
      case 'Ò': isTokenChar = false; break;
      case 'Ç': isTokenChar = false; break;
      case 'È': isTokenChar = false; break;
      case '\'': isTokenChar = false; break;
      case '\t': isTokenChar = false; break;      // a tab ends the current token
      case '\n': isTokenChar = false; break;      // a newline ends the current token
      case '\u2425': isTokenChar = false; break;  // special char for marking XML elements
    }
    return isTokenChar;
  }

  /**
   * Called on each token character to normalize it before it is added to the
   * token. The default implementation does nothing. Subclasses may use this
   * to, e.g., lowercase tokens.
   */
  protected char normalize(char c) {
    return c;
  }

  protected int normalize(int c) {
    return c;
  }

  /*
   * Code is copied from Lucene 3.4 CharTokenizer.incrementToken()
   * @see org.apache.lucene.analysis.TokenStream#incrementToken()
   */
  public boolean incrementToken() throws IOException {
    clearAttributes();
    int length = 0;
    int start = -1; // this variable is always initialized
    char[] buffer = termAtt.buffer();
    while (true) {
      if (bufferIndex >= dataLen) {
        offset += dataLen;
        if (!charUtils.fill(ioBuffer, input)) { // read supplementary char aware with CharacterUtils
          dataLen = 0; // so next offset += dataLen won't decrement offset
          if (length > 0) {
            break;
          } else {
            finalOffset = correctOffset(offset);
            return false;
          }
        }
        dataLen = ioBuffer.getLength();
        bufferIndex = 0;
      }
      // use CharacterUtils here to support < 3.1 UTF-16 code unit behavior if the char based methods are gone
      int c = charUtils.codePointAt(ioBuffer.getBuffer(), bufferIndex);
      bufferIndex += Character.charCount(c);
      if (isTokenChar(c)) { // if it's a token char
        if (length == 0) { // start of token
          start = offset + bufferIndex - 1;
        } else if (length >= buffer.length - 1) { // check if a supplementary could run out of bounds
          buffer = termAtt.resizeBuffer(2 + length); // make sure a supplementary fits in the buffer
        }
        length += Character.toChars(normalize(c), buffer, length); // buffer it, normalized
        if (length >= MAX_WORD_LEN) // buffer overflow! make sure to check for >=, a surrogate pair could break the == test
          break;
      } else if (length > 0) { // at non-Letter w/ chars
        break; // return 'em
      }
    }
    termAtt.setLength(length);
    offsetAtt.setOffset(correctOffset(start), finalOffset = correctOffset(start + length));
    return true;
  }

  /*
   * Code is copied from Lucene 3.4 CharTokenizer.end()
   * @see org.apache.lucene.analysis.TokenStream#end()
   */
  @Override
  public final void end() {
    // set final offset
    offsetAtt.setOffset(finalOffset, finalOffset);
  }

  /*
   * Code is copied from Lucene 3.4 CharTokenizer.reset()
   * @see org.apache.lucene.analysis.Tokenizer#reset(java.io.Reader)
   */
  @Override
  public void reset(Reader input) throws IOException {
    super.reset(input);
    bufferIndex = 0;
    offset = 0;
    dataLen = 0;
    finalOffset = 0;
    ioBuffer.reset(); // make sure to reset the IO buffer!!
    this.normalizer = new Normalizer(normFunctions, language);
  }

  private ArrayList<Token> getTokensByChineseTokenizer(Reader input, String[] normFunctions) throws ApplicationException {
    // StandardTokenizer is recommended instead of ChineseTokenizer, which is deprecated
    StandardTokenizer chineseTokenizer = new StandardTokenizer(Version.LUCENE_34, input);
    ArrayList<Token> tokens = new ArrayList<Token>();
    try {
      reset(input);
      chineseTokenizer.reset(input);
      CharTermAttribute charTermAttribute = chineseTokenizer.getAttribute(CharTermAttribute.class);
      OffsetAttribute offsetAttribute = chineseTokenizer.getAttribute(OffsetAttribute.class);
      while (chineseTokenizer.incrementToken()) {
        String term = charTermAttribute.toString();
        String normedTerm = normalizer.normalize(term);
        int start = offsetAttribute.startOffset();
        int end = offsetAttribute.endOffset();
        Token token = new Token(start, end, normedTerm);
        tokens.add(token);
      }
      chineseTokenizer.end();    // TODO needed?
      chineseTokenizer.close();  // TODO needed?
      end();    // TODO needed?
      close();  // TODO needed?
    } catch (IOException e) {
      throw new ApplicationException(e);
    }
    return tokens;
  }
}
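
A minimal usage sketch (hypothetical driver, not part of this repository): the tokenizer wraps a Reader, is configured through setLanguage() and setNormFunctions(), and getTokens() returns the normalized tokens with their character offsets. The language id mapping and the Token accessors are assumptions based on the calls visible in this file.

import java.io.StringReader;
import java.util.ArrayList;
import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.Token;
import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.Tokenizer;

public class TokenizerDemo { // hypothetical class, for illustration only
  public static void main(String[] args) throws Exception {
    Tokenizer tokenizer = new Tokenizer(new StringReader("Hello, world: a small test."));
    tokenizer.setLanguage("en");                        // assumed to map to "eng" via Language.getLanguageId()
    tokenizer.setNormFunctions(new String[] {"norm"});  // same as the class default
    ArrayList<Token> tokens = tokenizer.getTokens();
    for (Token token : tokens) {
      // Token stores start offset, end offset, and the normalized term
      // (see the constructor call above); its accessors are not shown in this file.
      System.out.println(token);
    }
  }
}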