Mercurial > hg > mpdl-group
comparison software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlTokenizer.java @ 16:257f67be5c00
diverse Fehlerbehebungen
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 27 Sep 2011 16:40:57 +0200 |
parents | 5df60f24e997 |
children |
comparison (legend: equal | deleted | inserted | replaced)
15:e99964f390e4 | 16:257f67be5c00 |
---|---|
10 | 10 |
11 public class MpdlTokenizer extends Tokenizer { | 11 public class MpdlTokenizer extends Tokenizer { |
12 private static final int MAX_WORD_LEN = 255; | 12 private static final int MAX_WORD_LEN = 255; |
13 private static final int IO_BUFFER_SIZE = 1024; | 13 private static final int IO_BUFFER_SIZE = 1024; |
14 private static String SPECIAL_NOT_WORD_DELIM_SYMBOL = new Character('\u2424').toString(); | 14 private static String SPECIAL_NOT_WORD_DELIM_SYMBOL = new Character('\u2424').toString(); |
15 private boolean regWithoutSemicolon = false; // hack: in some cases there are words with a semicolon, then the normalization should be without semicolon | |
16 private boolean isInNotWordDelimMode = false; | 15 private boolean isInNotWordDelimMode = false; |
17 private int offset = 0, bufferIndex = 0, dataLen = 0; | 16 private int offset = 0, bufferIndex = 0, dataLen = 0; |
18 private char[] buffer = new char[MAX_WORD_LEN]; | 17 private char[] buffer = new char[MAX_WORD_LEN]; |
19 private char[] ioBuffer = new char[IO_BUFFER_SIZE]; | 18 private char[] ioBuffer = new char[IO_BUFFER_SIZE]; |
20 private MpdlNormalizer normalizer; | 19 private MpdlNormalizer normalizer; |
29 super(input); | 28 super(input); |
30 this.language = language; | 29 this.language = language; |
31 this.normalizer = normalizer; | 30 this.normalizer = normalizer; |
32 } | 31 } |
33 | 32 |
34 public void setRegWithoutSemicolon(boolean regWithoutSemicolon) { | |
35 this.regWithoutSemicolon = regWithoutSemicolon; | |
36 } | |
37 | |
38 public boolean isRegWithoutSemicolon() { | |
39 return regWithoutSemicolon; | |
40 } | |
41 | |
42 /** Returns true iff a character should be included in a token. This | 33 /** Returns true iff a character should be included in a token. This |
43 * tokenizer generates as tokens adjacent sequences of characters which | 34 * tokenizer generates as tokens adjacent sequences of characters which |
44 * satisfy this predicate. Characters for which this is false are used to | 35 * satisfy this predicate. Characters for which this is false are used to |
45 * define token boundaries and are not included in tokens. */ | 36 * define token boundaries and are not included in tokens. */ |
46 protected boolean isTokenChar(char c) { | 37 protected boolean isTokenChar(char c) { |
47 boolean isTokenChar = true; | 38 boolean isTokenChar = true; |
48 if (isRegWithoutSemicolon() && c == ';') // hack: special case for regularization and normalization; feel free to remove it later | |
49 return true; | |
50 switch (c) { | 39 switch (c) { |
51 case ' ': isTokenChar = false; break; | 40 case ' ': isTokenChar = false; break; |
52 case '.': isTokenChar = false; break; | 41 case '.': isTokenChar = false; break; |
53 case ',': isTokenChar = false; break; | 42 case ',': isTokenChar = false; break; |
54 case '!': isTokenChar = false; break; | 43 case '!': isTokenChar = false; break; |