comparison software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlFilter.java @ 0:408254cf2f1d

Erstellung
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Wed, 24 Nov 2010 17:24:23 +0100
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:408254cf2f1d
1 package de.mpg.mpiwg.berlin.mpdl.lt.analyzer;
2
3 import java.io.IOException;
4
5 import org.apache.lucene.analysis.TokenFilter;
6 import org.apache.lucene.analysis.Token;
7 import org.apache.lucene.analysis.TokenStream;
8
9 public class MpdlFilter extends TokenFilter {
10
11 public MpdlFilter(TokenStream in) {
12 super(in);
13 }
14
15 public Token next() throws IOException {
16 return getNext(null);
17 }
18
19 public Token next(Token reusableToken) throws IOException {
20 return getNext(reusableToken);
21 }
22
23 private Token getNext(Token reusableToken) throws IOException {
24 Token nextToken = null;
25 if (reusableToken == null)
26 nextToken = input.next();
27 else
28 nextToken = input.next(reusableToken);
29 if (nextToken == null)
30 return null;
31 char[] termBuffer = nextToken.termBuffer();
32 int termBufferLength = nextToken.termLength();
33 int newTermBufferLength = 0;
34 // if a hyphen or a newlineChar or tabChar is in the token it is removed
35 for(int i=0;i<termBufferLength;i++) {
36 char c = termBuffer[i];
37 if (c != '-' && c != '\n' && c != '\t')
38 termBuffer[newTermBufferLength++] = c;
39 }
40 // remove the apostrophe with "s" but not always (e.g. not l'ogni but author's)
41 String tokenText = new String(termBuffer, 0, termBufferLength);
42 if (tokenText.endsWith("'s") || tokenText.endsWith("'S")) {
43 newTermBufferLength = newTermBufferLength - 2;
44 }
45 nextToken.setTermLength(newTermBufferLength);
46 return nextToken;
47 }
48 }