Mercurial > hg > mpdl-group
comparison software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlFilter.java @ 0:408254cf2f1d
Erstellung
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Wed, 24 Nov 2010 17:24:23 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:408254cf2f1d |
---|---|
1 package de.mpg.mpiwg.berlin.mpdl.lt.analyzer; | |
2 | |
3 import java.io.IOException; | |
4 | |
5 import org.apache.lucene.analysis.TokenFilter; | |
6 import org.apache.lucene.analysis.Token; | |
7 import org.apache.lucene.analysis.TokenStream; | |
8 | |
9 public class MpdlFilter extends TokenFilter { | |
10 | |
11 public MpdlFilter(TokenStream in) { | |
12 super(in); | |
13 } | |
14 | |
15 public Token next() throws IOException { | |
16 return getNext(null); | |
17 } | |
18 | |
19 public Token next(Token reusableToken) throws IOException { | |
20 return getNext(reusableToken); | |
21 } | |
22 | |
23 private Token getNext(Token reusableToken) throws IOException { | |
24 Token nextToken = null; | |
25 if (reusableToken == null) | |
26 nextToken = input.next(); | |
27 else | |
28 nextToken = input.next(reusableToken); | |
29 if (nextToken == null) | |
30 return null; | |
31 char[] termBuffer = nextToken.termBuffer(); | |
32 int termBufferLength = nextToken.termLength(); | |
33 int newTermBufferLength = 0; | |
34 // if a hyphen or a newlineChar or tabChar is in the token it is removed | |
35 for(int i=0;i<termBufferLength;i++) { | |
36 char c = termBuffer[i]; | |
37 if (c != '-' && c != '\n' && c != '\t') | |
38 termBuffer[newTermBufferLength++] = c; | |
39 } | |
40 // remove the apostrophe with "s" but not always (e.g. not l'ogni but author's) | |
41 String tokenText = new String(termBuffer, 0, termBufferLength); | |
42 if (tokenText.endsWith("'s") || tokenText.endsWith("'S")) { | |
43 newTermBufferLength = newTermBufferLength - 2; | |
44 } | |
45 nextToken.setTermLength(newTermBufferLength); | |
46 return nextToken; | |
47 } | |
48 } |