comparison software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizer.java @ 16:257f67be5c00

diverse Fehlerbehebungen
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 27 Sep 2011 16:40:57 +0200
parents 5df60f24e997
children
comparison
equal deleted inserted replaced
15:e99964f390e4 16:257f67be5c00
10 10
11 public class MpdlTokenizer extends Tokenizer { 11 public class MpdlTokenizer extends Tokenizer {
12 private static final int MAX_WORD_LEN = 255; 12 private static final int MAX_WORD_LEN = 255;
13 private static final int IO_BUFFER_SIZE = 1024; 13 private static final int IO_BUFFER_SIZE = 1024;
14 private static String SPECIAL_NOT_WORD_DELIM_SYMBOL = new Character('\u2424').toString(); 14 private static String SPECIAL_NOT_WORD_DELIM_SYMBOL = new Character('\u2424').toString();
15 private boolean regWithoutSemicolon = false; // hack: in some cases there are words with a semicolon, then the normalization should be without semicolon
16 private boolean isInNotWordDelimMode = false; 15 private boolean isInNotWordDelimMode = false;
17 private int offset = 0, bufferIndex = 0, dataLen = 0; 16 private int offset = 0, bufferIndex = 0, dataLen = 0;
18 private char[] buffer = new char[MAX_WORD_LEN]; 17 private char[] buffer = new char[MAX_WORD_LEN];
19 private char[] ioBuffer = new char[IO_BUFFER_SIZE]; 18 private char[] ioBuffer = new char[IO_BUFFER_SIZE];
20 private MpdlNormalizer normalizer; 19 private MpdlNormalizer normalizer;
29 super(input); 28 super(input);
30 this.language = language; 29 this.language = language;
31 this.normalizer = normalizer; 30 this.normalizer = normalizer;
32 } 31 }
33 32
34 public void setRegWithoutSemicolon(boolean regWithoutSemicolon) {
35 this.regWithoutSemicolon = regWithoutSemicolon;
36 }
37
38 public boolean isRegWithoutSemicolon() {
39 return regWithoutSemicolon;
40 }
41
42 /** Returns true iff a character should be included in a token. This 33 /** Returns true iff a character should be included in a token. This
43 * tokenizer generates as tokens adjacent sequences of characters which 34 * tokenizer generates as tokens adjacent sequences of characters which
44 * satisfy this predicate. Characters for which this is false are used to 35 * satisfy this predicate. Characters for which this is false are used to
45 * define token boundaries and are not included in tokens. */ 36 * define token boundaries and are not included in tokens. */
46 protected boolean isTokenChar(char c) { 37 protected boolean isTokenChar(char c) {
47 boolean isTokenChar = true; 38 boolean isTokenChar = true;
48 if (isRegWithoutSemicolon() && c == ';') // hack: special case for regularization and normalization; feel free to remove it later
49 return true;
50 switch (c) { 39 switch (c) {
51 case ' ': isTokenChar = false; break; 40 case ' ': isTokenChar = false; break;
52 case '.': isTokenChar = false; break; 41 case '.': isTokenChar = false; break;
53 case ',': isTokenChar = false; break; 42 case ',': isTokenChar = false; break;
54 case '!': isTokenChar = false; break; 43 case '!': isTokenChar = false; break;