Mercurial > hg > mpdl-group
comparison software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexDE.java @ 23:e845310098ba
diverse Korrekturen
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 27 Nov 2012 12:35:19 +0100 |
parents | 4a3641ae14d2 |
children |
comparison
equal
deleted
inserted
replaced
22:6a45a982c333 | 23:e845310098ba |
---|---|
1 /* The following code was generated by JFlex 1.4.3 on 05.09.11 10:34 */ | 1 /* The following code was generated by JFlex 1.4.3 on 28.03.12 18:57 */ |
2 | 2 |
3 /* | 3 /* |
4 * Normalization rules for German text | 4 * Normalization rules for German text |
5 * [this is a JFlex specification] | 5 * [this is a JFlex specification] |
6 * | 6 * |
7 * Wolfgang Schmidle | 7 * Wolfgang Schmidle |
8 * version 2011-08-10 | 8 * version 2011-07-12 |
9 * | 9 * |
10 */ | 10 */ |
11 | 11 |
12 package de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang; | 12 package de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang; |
13 | 13 |
14 | 14 |
15 /** | 15 /** |
16 * This class is a scanner generated by | 16 * This class is a scanner generated by |
17 * <a href="http://www.jflex.de/">JFlex</a> 1.4.3 | 17 * <a href="http://www.jflex.de/">JFlex</a> 1.4.3 |
18 * on 05.09.11 10:34 from the specification file | 18 * on 28.03.12 18:57 from the specification file |
19 * <tt>MpdlNormalizerLexDE.lex</tt> | 19 * <tt>/Users/jwillenborg/test/jflexNew/MpdlNormalizerLexDE.lex</tt> |
20 */ | 20 */ |
21 public class MpdlNormalizerLexDE { | 21 public class MpdlNormalizerLexDE { |
22 | 22 |
23 /** This character denotes the end of file */ | 23 /** This character denotes the end of file */ |
24 public static final int YYEOF = -1; | 24 public static final int YYEOF = -1; |
25 | 25 |
26 /** initial size of the lookahead buffer */ | 26 /** initial size of the lookahead buffer */ |
27 private static final int ZZ_BUFFERSIZE = 16384; | 27 private static final int ZZ_BUFFERSIZE = 16384; |
28 | 28 |
29 /** lexical states */ | 29 /** lexical states */ |
30 public static final int SEARCH = 10; | 30 public static final int SEARCH = 6; |
31 public static final int DICT_ASCII = 6; | |
32 public static final int SEARCH_ASCII = 12; | |
33 public static final int DICT = 4; | 31 public static final int DICT = 4; |
34 public static final int YYINITIAL = 0; | 32 public static final int YYINITIAL = 0; |
33 public static final int CELEX = 8; | |
35 public static final int DISP = 2; | 34 public static final int DISP = 2; |
36 public static final int GRIMM = 8; | 35 public static final int GRIMM = 10; |
37 | 36 |
38 /** | 37 /** |
39 * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l | 38 * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l |
40 * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l | 39 * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l |
41 * at the beginning of a line | 40 * at the beginning of a line |
42 * l is of the form l = 2*k, k a non negative integer | 41 * l is of the form l = 2*k, k a non negative integer |
43 */ | 42 */ |
44 private static final int ZZ_LEXSTATE[] = { | 43 private static final int ZZ_LEXSTATE[] = { |
45 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6 | 44 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5 |
46 }; | 45 }; |
47 | 46 |
48 /** | 47 /** |
49 * Translates characters to character classes | 48 * Translates characters to character classes |
50 */ | 49 */ |
51 private static final String ZZ_CMAP_PACKED = | 50 private static final String ZZ_CMAP_PACKED = |
52 "\12\0\1\3\25\0\1\2\14\0\1\1\2\0\1\1\17\0\1\20"+ | 51 "\12\0\1\3\25\0\1\2\14\0\1\11\1\0\1\10\1\1\13\0"+ |
53 "\32\4\6\0\1\11\2\4\1\5\12\4\1\13\5\4\1\7\5\4"+ | 52 "\1\6\1\0\1\7\1\0\1\24\32\4\6\0\1\15\2\4\1\5"+ |
54 "\1\1\1\0\1\1\106\0\1\14\21\0\1\15\5\0\1\16\2\0"+ | 53 "\12\4\1\17\5\4\1\13\5\4\1\1\1\0\1\1\106\0\1\20"+ |
55 "\1\17\4\0\1\14\21\0\1\15\5\0\1\16\202\0\1\6\u01e4\0"+ | 54 "\21\0\1\21\5\0\1\22\2\0\1\23\4\0\1\20\21\0\1\21"+ |
56 "\1\12\1\0\1\10\ufc99\0"; | 55 "\5\0\1\22\202\0\1\12\u01e4\0\1\16\1\0\1\14\ufc99\0"; |
57 | 56 |
58 /** | 57 /** |
59 * Translates characters to character classes | 58 * Translates characters to character classes |
60 */ | 59 */ |
61 private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED); | 60 private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED); |
64 * Translates DFA states to action switch labels. | 63 * Translates DFA states to action switch labels. |
65 */ | 64 */ |
66 private static final int [] ZZ_ACTION = zzUnpackAction(); | 65 private static final int [] ZZ_ACTION = zzUnpackAction(); |
67 | 66 |
68 private static final String ZZ_ACTION_PACKED_0 = | 67 private static final String ZZ_ACTION_PACKED_0 = |
69 "\7\0\2\1\1\2\1\3\1\4\3\1\1\5\1\3"+ | 68 "\6\0\4\1\1\2\1\3\1\4\3\1\1\5\1\6"+ |
70 "\3\1\1\6\1\7\1\10\1\11\1\12\1\13\1\14"+ | 69 "\3\3\3\1\1\7\1\10\1\11\1\12\1\13\2\0"+ |
71 "\1\15\1\16\1\17"; | 70 "\1\14\1\15\1\16\1\17\3\0\1\1\2\0"; |
72 | 71 |
73 private static int [] zzUnpackAction() { | 72 private static int [] zzUnpackAction() { |
74 int [] result = new int[30]; | 73 int [] result = new int[41]; |
75 int offset = 0; | 74 int offset = 0; |
76 offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result); | 75 offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result); |
77 return result; | 76 return result; |
78 } | 77 } |
79 | 78 |
94 * Translates a state to a row index in the transition table | 93 * Translates a state to a row index in the transition table |
95 */ | 94 */ |
96 private static final int [] ZZ_ROWMAP = zzUnpackRowMap(); | 95 private static final int [] ZZ_ROWMAP = zzUnpackRowMap(); |
97 | 96 |
98 private static final String ZZ_ROWMAP_PACKED_0 = | 97 private static final String ZZ_ROWMAP_PACKED_0 = |
99 "\0\0\0\21\0\42\0\63\0\104\0\125\0\146\0\167"+ | 98 "\0\0\0\25\0\52\0\77\0\124\0\151\0\176\0\223"+ |
100 "\0\210\0\167\0\167\0\167\0\231\0\252\0\273\0\167"+ | 99 "\0\250\0\275\0\176\0\176\0\176\0\322\0\347\0\374"+ |
101 "\0\210\0\314\0\335\0\356\0\167\0\167\0\167\0\167"+ | 100 "\0\176\0\176\0\223\0\250\0\275\0\u0111\0\u0126\0\u013b"+ |
102 "\0\167\0\167\0\167\0\167\0\167\0\167"; | 101 "\0\176\0\176\0\176\0\176\0\176\0\u0150\0\250\0\176"+ |
102 "\0\176\0\176\0\176\0\u0165\0\u017a\0\u018f\0\u0165\0\u01a4"+ | |
103 "\0\u01b9"; | |
103 | 104 |
104 private static int [] zzUnpackRowMap() { | 105 private static int [] zzUnpackRowMap() { |
105 int [] result = new int[30]; | 106 int [] result = new int[41]; |
106 int offset = 0; | 107 int offset = 0; |
107 offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result); | 108 offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result); |
108 return result; | 109 return result; |
109 } | 110 } |
110 | 111 |
123 * The transition table of the DFA | 124 * The transition table of the DFA |
124 */ | 125 */ |
125 private static final int [] ZZ_TRANS = zzUnpackTrans(); | 126 private static final int [] ZZ_TRANS = zzUnpackTrans(); |
126 | 127 |
127 private static final String ZZ_TRANS_PACKED_0 = | 128 private static final String ZZ_TRANS_PACKED_0 = |
128 "\1\10\1\11\1\10\1\0\1\10\1\11\1\12\1\11"+ | 129 "\1\7\1\10\1\7\1\0\1\7\1\10\1\11\2\7"+ |
129 "\1\10\1\11\6\10\1\13\1\10\1\11\1\10\1\14"+ | 130 "\1\12\1\13\1\10\1\7\1\10\6\7\1\14\1\7"+ |
130 "\1\10\1\11\1\12\1\15\1\10\1\16\1\10\1\17"+ | 131 "\1\10\1\7\1\15\1\7\1\10\1\11\2\7\1\12"+ |
131 "\4\10\1\13\1\10\1\11\1\10\1\20\1\10\1\11"+ | 132 "\1\13\1\16\1\7\1\17\1\7\1\20\4\7\1\14"+ |
132 "\1\12\1\15\1\10\1\16\1\10\1\17\4\10\2\13"+ | 133 "\1\7\1\10\1\7\1\21\1\7\1\10\1\11\2\7"+ |
133 "\1\21\1\13\1\20\1\10\1\11\1\12\1\22\1\13"+ | 134 "\1\12\1\13\1\16\1\7\1\17\1\7\1\20\4\7"+ |
134 "\1\23\1\13\1\24\1\25\1\26\1\27\1\30\1\13"+ | 135 "\1\14\1\7\1\10\1\7\1\22\1\7\1\10\1\11"+ |
135 "\1\10\1\11\1\10\1\20\1\10\1\11\1\12\1\15"+ | 136 "\2\7\1\12\1\13\1\16\1\7\1\17\1\7\1\20"+ |
136 "\1\10\1\16\1\10\1\17\3\10\1\31\1\13\1\10"+ | 137 "\4\7\2\14\1\23\1\14\1\21\1\7\1\10\1\24"+ |
137 "\1\11\1\10\1\32\1\10\1\11\1\12\1\15\1\10"+ | 138 "\2\14\1\25\1\13\1\26\1\14\1\27\1\14\1\30"+ |
138 "\1\16\1\10\1\17\4\10\2\13\1\21\1\13\1\32"+ | 139 "\1\31\1\32\1\33\1\34\1\14\1\7\1\10\1\7"+ |
139 "\1\10\1\11\1\12\1\22\1\13\1\23\1\13\1\24"+ | 140 "\1\21\1\7\1\10\1\11\2\7\1\12\1\13\1\16"+ |
140 "\1\25\1\26\1\27\1\30\1\13\23\0\1\10\20\0"+ | 141 "\1\7\1\17\1\7\1\20\3\7\1\35\1\14\27\0"+ |
141 "\1\10\5\0\1\33\1\0\1\34\10\0\1\10\7\0"+ | 142 "\1\7\22\0\6\36\2\0\15\36\2\0\1\7\3\0"+ |
142 "\1\35\20\0\1\36\10\0\1\10\5\0\1\33\1\0"+ | 143 "\1\37\20\0\1\7\11\0\1\40\1\0\1\41\10\0"+ |
143 "\1\27\10\0\1\10\7\0\1\25\20\0\1\26\6\0"; | 144 "\1\7\13\0\1\42\24\0\1\43\10\0\1\7\11\0"+ |
145 "\1\40\1\0\1\33\10\0\1\7\13\0\1\31\24\0"+ | |
146 "\1\32\6\0\6\36\1\0\1\44\1\45\14\36\6\0"+ | |
147 "\1\46\16\0\6\36\1\0\1\47\1\45\14\36\10\0"+ | |
148 "\1\50\14\0\6\51\2\0\23\51\1\0\1\7\15\51"; | |
144 | 149 |
145 private static int [] zzUnpackTrans() { | 150 private static int [] zzUnpackTrans() { |
146 int [] result = new int[255]; | 151 int [] result = new int[462]; |
147 int offset = 0; | 152 int offset = 0; |
148 offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result); | 153 offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result); |
149 return result; | 154 return result; |
150 } | 155 } |
151 | 156 |
179 * ZZ_ATTRIBUTE[aState] contains the attributes of state <code>aState</code> | 184 * ZZ_ATTRIBUTE[aState] contains the attributes of state <code>aState</code> |
180 */ | 185 */ |
181 private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute(); | 186 private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute(); |
182 | 187 |
183 private static final String ZZ_ATTRIBUTE_PACKED_0 = | 188 private static final String ZZ_ATTRIBUTE_PACKED_0 = |
184 "\7\0\1\11\1\1\3\11\3\1\1\11\4\1\12\11"; | 189 "\6\0\1\11\3\1\3\11\3\1\2\11\6\1\5\11"+ |
190 "\2\0\4\11\3\0\1\1\2\0"; | |
185 | 191 |
186 private static int [] zzUnpackAttribute() { | 192 private static int [] zzUnpackAttribute() { |
187 int [] result = new int[30]; | 193 int [] result = new int[41]; |
188 int offset = 0; | 194 int offset = 0; |
189 offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result); | 195 offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result); |
190 return result; | 196 return result; |
191 } | 197 } |
192 | 198 |
250 | 256 |
251 /** denotes if the user-EOF-code has already been executed */ | 257 /** denotes if the user-EOF-code has already been executed */ |
252 private boolean zzEOFDone; | 258 private boolean zzEOFDone; |
253 | 259 |
254 /* user code: */ | 260 /* user code: */ |
255 public static final int CELEX = DICT_ASCII; | |
256 | |
257 private String original = ""; | 261 private String original = ""; |
258 private String normalized = ""; | 262 private String normalized = ""; |
259 private int problem = 0; | 263 private int problem = 0; |
260 | 264 |
261 private void add (String norm) { | 265 private void add (String norm) { |
294 */ | 298 */ |
295 private static char [] zzUnpackCMap(String packed) { | 299 private static char [] zzUnpackCMap(String packed) { |
296 char [] map = new char[0x10000]; | 300 char [] map = new char[0x10000]; |
297 int i = 0; /* index in packed string */ | 301 int i = 0; /* index in packed string */ |
298 int j = 0; /* index in unpacked array */ | 302 int j = 0; /* index in unpacked array */ |
299 while (i < 88) { | 303 while (i < 98) { |
300 int count = packed.charAt(i++); | 304 int count = packed.charAt(i++); |
301 char value = packed.charAt(i++); | 305 char value = packed.charAt(i++); |
302 do map[j++] = value; while (--count > 0); | 306 do map[j++] = value; while (--count > 0); |
303 } | 307 } |
304 return map; | 308 return map; |
561 | 565 |
562 // store back cached position | 566 // store back cached position |
563 zzMarkedPos = zzMarkedPosL; | 567 zzMarkedPos = zzMarkedPosL; |
564 | 568 |
565 switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) { | 569 switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) { |
566 case 10: | 570 case 11: |
567 { add("sz"); | 571 { add("sz"); |
568 } | 572 } |
569 case 16: break; | 573 case 16: break; |
570 case 3: | 574 case 3: |
571 { problem = 1; add(yytext()); | 575 { problem = 1; add(yytext()); |
572 } | 576 } |
573 case 17: break; | 577 case 17: break; |
574 case 6: | 578 case 7: |
575 { add("ae"); | 579 { add("ae"); |
576 } | 580 } |
577 case 18: break; | 581 case 18: break; |
578 case 2: | 582 case 2: |
579 { add("s"); | 583 { add("s"); |
588 case 20: break; | 592 case 20: break; |
589 case 13: | 593 case 13: |
590 { add("ü"); | 594 { add("ü"); |
591 } | 595 } |
592 case 21: break; | 596 case 21: break; |
593 case 8: | 597 case 9: |
594 { add("ue"); | 598 { add("ue"); |
595 } | 599 } |
596 case 22: break; | 600 case 22: break; |
597 case 11: | 601 case 6: |
598 { switch (problem) { | 602 { switch (problem) { |
599 case 1: return original; | 603 case 1: return original; |
600 default: return normalized.replaceAll(LB, "").toLowerCase(); | 604 default: return normalized.replaceAll(LB, "").toLowerCase(); |
601 } | 605 } |
602 } | 606 } |
611 case 25: break; | 615 case 25: break; |
612 case 1: | 616 case 1: |
613 { add(yytext()); | 617 { add(yytext()); |
614 } | 618 } |
615 case 26: break; | 619 case 26: break; |
616 case 9: | 620 case 10: |
617 { add("ss"); | 621 { add("ss"); |
618 } | 622 } |
619 case 27: break; | 623 case 27: break; |
620 case 7: | 624 case 8: |
621 { add("oe"); | 625 { add("oe"); |
622 } | 626 } |
623 case 28: break; | 627 case 28: break; |
624 case 15: | 628 case 15: |
625 { add("ö"); | 629 { add("ö"); |