diff software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/Tokenizer.java @ 23:e845310098ba
Various corrections ("diverse Korrekturen")
| author   | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
|----------|----------------------------------------------------|
| date     | Tue, 27 Nov 2012 12:35:19 +0100                    |
| parents  | 4a3641ae14d2                                       |
| children |                                                    |
--- a/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/Tokenizer.java	Wed Dec 14 13:57:09 2011 +0100
+++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/Tokenizer.java	Tue Nov 27 12:35:19 2012 +0100
@@ -17,13 +17,13 @@
 import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.Normalizer;
 
 public class Tokenizer extends org.apache.lucene.analysis.Tokenizer {
-  // variables are copied from Lucene 3.4. CharTokenizer
+  // variables are copied from Lucene 3.6. CharTokenizer
   private int offset = 0, bufferIndex = 0, dataLen = 0, finalOffset = 0;
   private static int MAX_WORD_LEN = 4096; // old value was 255
   private static int IO_BUFFER_SIZE = 4096;
   private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
-  private CharacterUtils charUtils = CharacterUtils.getInstance(Version.LUCENE_34);
+  private CharacterUtils charUtils = CharacterUtils.getInstance(Version.LUCENE_35);
   private CharacterBuffer ioBuffer = CharacterUtils.newCharacterBuffer(IO_BUFFER_SIZE);
   // application variables
   private String language = "eng"; // default: english
@@ -58,14 +58,17 @@
       OffsetAttribute offsetAttribute = getAttribute(OffsetAttribute.class);
       while (incrementToken()) {
         String term = charTermAttribute.toString();
+        term = removeElementMarksAndSpecialSymbols(term); // e.g. also "-" is deleted so that the normalizer works properly
         int start = offsetAttribute.startOffset();
         int end = offsetAttribute.endOffset();
-        String normedTerm = normalizer.normalize(term);
-        Token token = new Token(start, end, normedTerm);
+        Token token = new Token(start, end, term);
+        String wordForm = token.getContentOrig(); // word form (lower case)
+        String normedTerm = normalizer.normalize(wordForm);
+        token.setContentNorm(normedTerm);
         tokens.add(token);
       }
-      end(); // TODO needed ?
-      close(); // TODO needed ?
+      end();
+      close(); // close the input reader
     } catch (IOException e) {
       throw new ApplicationException(e);
     }
@@ -98,10 +101,10 @@
       case '+': isTokenChar = false; break;
       case '#': isTokenChar = false; break;
       case '"': isTokenChar = false; break;
-      case 'ã': isTokenChar = false; break;
-      case 'Ò': isTokenChar = false; break;
-      case 'Ç': isTokenChar = false; break;
-      case 'È': isTokenChar = false; break;
+      case '„': isTokenChar = false; break;
+      case '“': isTokenChar = false; break;
+      case '«': isTokenChar = false; break;
+      case '»': isTokenChar = false; break;
       case '\'': isTokenChar = false; break;
       case '\t': isTokenChar = false; break; // do not break words which have tabs in it
       case '\n': isTokenChar = false; break; // do not break words which are on another line
@@ -121,13 +124,14 @@
   }
 
   /*
-   * Code is copied from Lucene 3.4. CharTokenizer.incrementToken()
+   * Code is copied from Lucene 3.6. CharTokenizer.incrementToken() with bug correction for supplementary chars
    * @see org.apache.lucene.analysis.TokenStream#incrementToken()
    */
   public boolean incrementToken() throws IOException {
     clearAttributes();
     int length = 0;
     int start = -1; // this variable is always initialized
+    int end = -1;
     char[] buffer = termAtt.buffer();
     while (true) {
       if (bufferIndex >= dataLen) {
@@ -146,13 +150,16 @@
       }
       // use CharacterUtils here to support < 3.1 UTF-16 code unit behavior if the char based methods are gone
       int c = charUtils.codePointAt(ioBuffer.getBuffer(), bufferIndex);
-      bufferIndex += Character.charCount(c);
+      int charCount = Character.charCount(c);
+      bufferIndex += charCount;
       if (isTokenChar(c)) { // if it's a token char
         if (length == 0) { // start of token
-          start = offset + bufferIndex - 1;
+          start = offset + bufferIndex - charCount; // supplementary chars could have length 1 or 2
+          end = start;
         } else if (length >= buffer.length-1) { // check if a supplementary could run out of bounds
           buffer = termAtt.resizeBuffer(2 + length); // make sure a supplementary fits in the buffer
         }
+        end += charCount;
         length += Character.toChars(normalize(c), buffer, length); // buffer it, normalized
         if (length >= MAX_WORD_LEN) // buffer overflow! make sure to check for >= surrogate pair could break == test
           break;
@@ -160,12 +167,12 @@
         break; // return 'em
     }
     termAtt.setLength(length);
-    offsetAtt.setOffset(correctOffset(start), finalOffset = correctOffset(start + length));
+    offsetAtt.setOffset(correctOffset(start), finalOffset = correctOffset(end));
     return true;
   }
 
   /*
-   * Code is copied from Lucene 3.4. CharTokenizer.end()
+   * Code is copied from Lucene 3.6. CharTokenizer.end()
    * @see org.apache.lucene.analysis.TokenStream#end()
    */
   @Override
@@ -175,7 +182,7 @@
   }
 
   /*
-   * Code is copied from Lucene 3.4. CharTokenizer.reset()
+   * Code is copied from Lucene 3.6. CharTokenizer.reset()
    * @see org.apache.lucene.analysis.Tokenizer#reset(java.io.Reader)
   */
   @Override
@@ -190,7 +197,7 @@
   }
 
   private ArrayList<Token> getTokensByChineseTokenizer(Reader input, String[] normFunctions) throws ApplicationException {
-    StandardTokenizer chineseTokenizer = new StandardTokenizer(Version.LUCENE_34, input); // is recommended instead of ChineseTokenizer which is deprecated
+    StandardTokenizer chineseTokenizer = new StandardTokenizer(Version.LUCENE_35, input); // is recommended instead of ChineseTokenizer which is deprecated
     ArrayList<Token> tokens = new ArrayList<Token>();
     try {
       reset(input);
@@ -199,20 +206,26 @@
       OffsetAttribute offsetAttribute = chineseTokenizer.getAttribute(OffsetAttribute.class);
       while (chineseTokenizer.incrementToken()) {
         String term = charTermAttribute.toString();
-        String normedTerm = normalizer.normalize(term);
         int start = offsetAttribute.startOffset();
         int end = offsetAttribute.endOffset();
-        Token token = new Token(start, end, normedTerm);
+        Token token = new Token(start, end, term);
+        String normedTerm = normalizer.normalize(term);
+        token.setContentNorm(normedTerm);
         tokens.add(token);
       }
-      chineseTokenizer.end(); // TODO needed ?
-      chineseTokenizer.close(); // TODO needed ?
-      end(); // TODO needed ?
-      close(); // TODO needed ?
+      chineseTokenizer.end();
+      chineseTokenizer.close();
+      end();
+      close();
     } catch (IOException e) {
       throw new ApplicationException(e);
    }
     return tokens;
   }
+
+  private String removeElementMarksAndSpecialSymbols(String inputStr) {
+    String retStr = inputStr.replaceAll("\u2424|\u2425| |\n|\t|-|\u00AD", "");
+    return retStr;
+  }
+
 }
\ No newline at end of file
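The patch changes both tokenizer paths to construct the Token from the surface form and to attach the normalized form afterwards via setContentNorm(), rather than building the Token from the already-normalized string and losing the original word form. The following is a hypothetical minimal reconstruction of the Token accessors the diff relies on; the real class is not shown in this changeset, so the field names and the place where lower-casing happens are assumptions.

```java
// Hypothetical minimal Token: carries the original surface form and the
// normalized form separately, matching the accessors used in the diff.
public class Token {
  private int start;
  private int end;
  private String contentOrig; // surface form as tokenized
  private String contentNorm; // set later via setContentNorm(...)

  public Token(int start, int end, String content) {
    this.start = start;
    this.end = end;
    this.contentOrig = content;
  }

  // The diff's comment ("word form (lower case)") suggests the original
  // content is exposed lower-cased; where exactly that happens is assumed.
  public String getContentOrig() { return contentOrig.toLowerCase(); }

  public void setContentNorm(String contentNorm) { this.contentNorm = contentNorm; }
  public String getContentNorm() { return contentNorm; }
}
```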
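The four replaced case labels in isTokenChar() appear to be an encoding repair: 'ã', 'Ò', 'Ç' and 'È' are what the Mac Roman bytes for „, “, « and » look like when misread as Latin-1. A quick standalone check of that hypothesis (assuming the optional x-MacRoman charset is present in the JDK):

```java
import java.nio.charset.Charset;

// Decodes the old, garbled case labels back through Mac Roman to show they
// were the typographic quotation marks the patch now writes literally.
public class QuoteMojibakeCheck {
  public static void main(String[] args) throws Exception {
    String misread = "\u00E3\u00D2\u00C7\u00C8";  // "ãÒÇÈ" as displayed in Latin-1
    byte[] raw = misread.getBytes("ISO-8859-1");  // recover the original bytes
    String intended = new String(raw, Charset.forName("x-MacRoman"));
    System.out.println(intended);                 // prints „“«»
  }
}
```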
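The core bug fix in incrementToken() is the new end counter: start and end now advance by Character.charCount(c), so supplementary code points, which occupy two UTF-16 code units, no longer skew the reported offsets the way "start + length" could once the buffered, normalized length diverged from the source span. A minimal sketch of that arithmetic using only java.lang (the sample string is made up):

```java
// Walks a string code point by code point, advancing offsets the way the
// patched tokenizer does. U+1D49C is supplementary and spans two chars.
public class SupplementaryOffsets {
  public static void main(String[] args) {
    String text = "a\uD835\uDC9Cb"; // 'a', U+1D49C, 'b'; text.length() == 4
    int offset = 0;
    while (offset < text.length()) {
      int c = text.codePointAt(offset);        // surrogate-aware code point
      int charCount = Character.charCount(c);  // 1 for BMP, 2 for supplementary
      System.out.printf("offset %d: U+%04X spans %d code unit(s)%n", offset, c, charCount);
      offset += charCount; // mirrors "bufferIndex += charCount" / "end += charCount"
    }
  }
}
```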
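The new removeElementMarksAndSpecialSymbols() helper deletes element marks (U+2424, U+2425), spaces, newlines, tabs, hyphens and soft hyphens (U+00AD) from a term before normalization, so that words hyphenated across line breaks rejoin into one normalizable form. A short usage sketch with the same regex (the input term is a made-up example):

```java
// Applies the helper's regex to a term broken by a hyphen and a newline.
public class StripMarksDemo {
  public static void main(String[] args) {
    String term = "philo-\nsophia"; // hypothetical hyphenated, line-broken term
    String cleaned = term.replaceAll("\u2424|\u2425| |\n|\t|-|\u00AD", "");
    System.out.println(cleaned);    // prints "philosophia"
  }
}
```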