comparison software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexNL.java @ 19:4a3641ae14d2

Erstellung
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Wed, 09 Nov 2011 15:32:05 +0100
parents
children e845310098ba
comparison
equal deleted inserted replaced
18:dc5e9fcb3fdc 19:4a3641ae14d2
1 /* The following code was generated by JFlex 1.4.3 on 21.07.11 11:22 */
2
3 /*
4 * Normalization rules for Dutch text
5 * [this is a JFlex specification]
6 *
7 * Wolfgang Schmidle
8 * version 2011-07-12
9 *
10 */
11
12 package de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang;
13
14
15 /**
16 * This class is a scanner generated by
17 * <a href="http://www.jflex.de/">JFlex</a> 1.4.3
18 * on 21.07.11 11:22 from the specification file
19 * <tt>MpdlNormalizerLexNL.lex</tt>
20 */
21 public class MpdlNormalizerLexNL {
22
23 /** This character denotes the end of file */
24 public static final int YYEOF = -1;
25
26 /** initial size of the lookahead buffer */
27 private static final int ZZ_BUFFERSIZE = 16384;
28
29 /** lexical states */
30 public static final int SEARCH = 6;
31 public static final int DICT = 4;
32 public static final int YYINITIAL = 0;
33 public static final int DISP = 2;
34
35 /**
36 * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l
37 * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l
38 * at the beginning of a line
39 * l is of the form l = 2*k, k a non negative integer
40 */
41 private static final int ZZ_LEXSTATE[] = {
42 0, 0, 1, 1, 2, 2, 3, 3
43 };
44
45 /**
46 * Translates characters to character classes
47 */
48 private static final String ZZ_CMAP_PACKED =
49 "\12\0\1\3\25\0\1\2\14\0\1\1\2\0\1\1\17\0\1\5"+
50 "\40\0\1\1\2\0\1\1\20\0\1\1\5\0\1\1\1\0\1\1"+
51 "\u0101\0\1\4\ufe80\0";
52
53 /**
54 * Translates characters to character classes
55 */
56 private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED);
57
58 /**
59 * Translates DFA states to action switch labels.
60 */
61 private static final int [] ZZ_ACTION = zzUnpackAction();
62
63 private static final String ZZ_ACTION_PACKED_0 =
64 "\4\0\2\1\1\2\1\3\1\4\1\5\1\6";
65
66 private static int [] zzUnpackAction() {
67 int [] result = new int[11];
68 int offset = 0;
69 offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
70 return result;
71 }
72
73 private static int zzUnpackAction(String packed, int offset, int [] result) {
74 int i = 0; /* index in packed string */
75 int j = offset; /* index in unpacked array */
76 int l = packed.length();
77 while (i < l) {
78 int count = packed.charAt(i++);
79 int value = packed.charAt(i++);
80 do result[j++] = value; while (--count > 0);
81 }
82 return j;
83 }
84
85
86 /**
87 * Translates a state to a row index in the transition table
88 */
89 private static final int [] ZZ_ROWMAP = zzUnpackRowMap();
90
91 private static final String ZZ_ROWMAP_PACKED_0 =
92 "\0\0\0\6\0\14\0\22\0\30\0\36\0\30\0\30"+
93 "\0\30\0\30\0\30";
94
95 private static int [] zzUnpackRowMap() {
96 int [] result = new int[11];
97 int offset = 0;
98 offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
99 return result;
100 }
101
102 private static int zzUnpackRowMap(String packed, int offset, int [] result) {
103 int i = 0; /* index in packed string */
104 int j = offset; /* index in unpacked array */
105 int l = packed.length();
106 while (i < l) {
107 int high = packed.charAt(i++) << 16;
108 result[j++] = high | packed.charAt(i++);
109 }
110 return j;
111 }
112
113 /**
114 * The transition table of the DFA
115 */
116 private static final int [] ZZ_TRANS = zzUnpackTrans();
117
118 private static final String ZZ_TRANS_PACKED_0 =
119 "\1\5\1\6\1\5\1\0\1\5\1\7\1\5\1\6"+
120 "\1\5\1\10\1\11\1\7\1\5\1\6\1\5\1\12"+
121 "\1\11\1\7\1\5\1\6\1\5\1\13\1\11\1\7"+
122 "\10\0\1\5\3\0";
123
124 private static int [] zzUnpackTrans() {
125 int [] result = new int[36];
126 int offset = 0;
127 offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
128 return result;
129 }
130
131 private static int zzUnpackTrans(String packed, int offset, int [] result) {
132 int i = 0; /* index in packed string */
133 int j = offset; /* index in unpacked array */
134 int l = packed.length();
135 while (i < l) {
136 int count = packed.charAt(i++);
137 int value = packed.charAt(i++);
138 value--;
139 do result[j++] = value; while (--count > 0);
140 }
141 return j;
142 }
143
144
145 /* error codes */
146 private static final int ZZ_UNKNOWN_ERROR = 0;
147 private static final int ZZ_NO_MATCH = 1;
148 private static final int ZZ_PUSHBACK_2BIG = 2;
149
150 /* error messages for the codes above */
151 private static final String ZZ_ERROR_MSG[] = {
152 "Unkown internal scanner error",
153 "Error: could not match input",
154 "Error: pushback value was too large"
155 };
156
157 /**
158 * ZZ_ATTRIBUTE[aState] contains the attributes of state <code>aState</code>
159 */
160 private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
161
162 private static final String ZZ_ATTRIBUTE_PACKED_0 =
163 "\4\0\1\11\1\1\5\11";
164
165 private static int [] zzUnpackAttribute() {
166 int [] result = new int[11];
167 int offset = 0;
168 offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
169 return result;
170 }
171
172 private static int zzUnpackAttribute(String packed, int offset, int [] result) {
173 int i = 0; /* index in packed string */
174 int j = offset; /* index in unpacked array */
175 int l = packed.length();
176 while (i < l) {
177 int count = packed.charAt(i++);
178 int value = packed.charAt(i++);
179 do result[j++] = value; while (--count > 0);
180 }
181 return j;
182 }
183
184 /** the input device */
185 private java.io.Reader zzReader;
186
187 /** the current state of the DFA */
188 private int zzState;
189
190 /** the current lexical state */
191 private int zzLexicalState = YYINITIAL;
192
193 /** this buffer contains the current text to be matched and is
194 the source of the yytext() string */
195 private char zzBuffer[] = new char[ZZ_BUFFERSIZE];
196
197 /** the textposition at the last accepting state */
198 private int zzMarkedPos;
199
200 /** the current text position in the buffer */
201 private int zzCurrentPos;
202
203 /** startRead marks the beginning of the yytext() string in the buffer */
204 private int zzStartRead;
205
206 /** endRead marks the last character in the buffer, that has been read
207 from input */
208 private int zzEndRead;
209
210 /** number of newlines encountered up to the start of the matched text */
211 private int yyline;
212
213 /** the number of characters up to the start of the matched text */
214 private int yychar;
215
216 /**
217 * the number of characters from the last newline up to the start of the
218 * matched text
219 */
220 private int yycolumn;
221
222 /**
223 * zzAtBOL == true <=> the scanner is currently at the beginning of a line
224 */
225 private boolean zzAtBOL = true;
226
227 /** zzAtEOF == true <=> the scanner is at the EOF */
228 private boolean zzAtEOF;
229
230 /** denotes if the user-EOF-code has already been executed */
231 private boolean zzEOFDone;
232
233 /* user code: */
234 private String original = "";
235 private String normalized = "";
236 private int problem = 0;
237
238 private void add (String norm) {
239 original += yytext();
240 normalized += norm;
241 }
242
243 private static final String LB = "[\u002d\u00ad] ";
244
245
246 /**
247 * Creates a new scanner
248 * There is also a java.io.InputStream version of this constructor.
249 *
250 * @param in the java.io.Reader to read input from.
251 */
252 public MpdlNormalizerLexNL(java.io.Reader in) {
253 this.zzReader = in;
254 }
255
256 /**
257 * Creates a new scanner.
258 * There is also java.io.Reader version of this constructor.
259 *
260 * @param in the java.io.Inputstream to read input from.
261 */
262 public MpdlNormalizerLexNL(java.io.InputStream in) {
263 this(new java.io.InputStreamReader(in));
264 }
265
266 /**
267 * Unpacks the compressed character translation table.
268 *
269 * @param packed the packed character translation table
270 * @return the unpacked character translation table
271 */
272 private static char [] zzUnpackCMap(String packed) {
273 char [] map = new char[0x10000];
274 int i = 0; /* index in packed string */
275 int j = 0; /* index in unpacked array */
276 while (i < 46) {
277 int count = packed.charAt(i++);
278 char value = packed.charAt(i++);
279 do map[j++] = value; while (--count > 0);
280 }
281 return map;
282 }
283
284
285 /**
286 * Refills the input buffer.
287 *
288 * @return <code>false</code>, iff there was new input.
289 *
290 * @exception java.io.IOException if any I/O-Error occurs
291 */
292 private boolean zzRefill() throws java.io.IOException {
293
294 /* first: make room (if you can) */
295 if (zzStartRead > 0) {
296 System.arraycopy(zzBuffer, zzStartRead,
297 zzBuffer, 0,
298 zzEndRead-zzStartRead);
299
300 /* translate stored positions */
301 zzEndRead-= zzStartRead;
302 zzCurrentPos-= zzStartRead;
303 zzMarkedPos-= zzStartRead;
304 zzStartRead = 0;
305 }
306
307 /* is the buffer big enough? */
308 if (zzCurrentPos >= zzBuffer.length) {
309 /* if not: blow it up */
310 char newBuffer[] = new char[zzCurrentPos*2];
311 System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length);
312 zzBuffer = newBuffer;
313 }
314
315 /* finally: fill the buffer with new input */
316 int numRead = zzReader.read(zzBuffer, zzEndRead,
317 zzBuffer.length-zzEndRead);
318
319 if (numRead > 0) {
320 zzEndRead+= numRead;
321 return false;
322 }
323 // unlikely but not impossible: read 0 characters, but not at end of stream
324 if (numRead == 0) {
325 int c = zzReader.read();
326 if (c == -1) {
327 return true;
328 } else {
329 zzBuffer[zzEndRead++] = (char) c;
330 return false;
331 }
332 }
333
334 // numRead < 0
335 return true;
336 }
337
338
339 /**
340 * Closes the input stream.
341 */
342 public final void yyclose() throws java.io.IOException {
343 zzAtEOF = true; /* indicate end of file */
344 zzEndRead = zzStartRead; /* invalidate buffer */
345
346 if (zzReader != null)
347 zzReader.close();
348 }
349
350
351 /**
352 * Resets the scanner to read from a new input stream.
353 * Does not close the old reader.
354 *
355 * All internal variables are reset, the old input stream
356 * <b>cannot</b> be reused (internal buffer is discarded and lost).
357 * Lexical state is set to <tt>ZZ_INITIAL</tt>.
358 *
359 * @param reader the new input stream
360 */
361 public final void yyreset(java.io.Reader reader) {
362 zzReader = reader;
363 zzAtBOL = true;
364 zzAtEOF = false;
365 zzEOFDone = false;
366 zzEndRead = zzStartRead = 0;
367 zzCurrentPos = zzMarkedPos = 0;
368 yyline = yychar = yycolumn = 0;
369 zzLexicalState = YYINITIAL;
370 }
371
372
373 /**
374 * Returns the current lexical state.
375 */
376 public final int yystate() {
377 return zzLexicalState;
378 }
379
380
381 /**
382 * Enters a new lexical state
383 *
384 * @param newState the new lexical state
385 */
386 public final void yybegin(int newState) {
387 zzLexicalState = newState;
388 }
389
390
391 /**
392 * Returns the text matched by the current regular expression.
393 */
394 public final String yytext() {
395 return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead );
396 }
397
398
399 /**
400 * Returns the character at position <tt>pos</tt> from the
401 * matched text.
402 *
403 * It is equivalent to yytext().charAt(pos), but faster
404 *
405 * @param pos the position of the character to fetch.
406 * A value from 0 to yylength()-1.
407 *
408 * @return the character at position pos
409 */
410 public final char yycharat(int pos) {
411 return zzBuffer[zzStartRead+pos];
412 }
413
414
415 /**
416 * Returns the length of the matched text region.
417 */
418 public final int yylength() {
419 return zzMarkedPos-zzStartRead;
420 }
421
422
423 /**
424 * Reports an error that occured while scanning.
425 *
426 * In a wellformed scanner (no or only correct usage of
427 * yypushback(int) and a match-all fallback rule) this method
428 * will only be called with things that "Can't Possibly Happen".
429 * If this method is called, something is seriously wrong
430 * (e.g. a JFlex bug producing a faulty scanner etc.).
431 *
432 * Usual syntax/scanner level error handling should be done
433 * in error fallback rules.
434 *
435 * @param errorCode the code of the errormessage to display
436 */
437 private void zzScanError(int errorCode) {
438 String message;
439 try {
440 message = ZZ_ERROR_MSG[errorCode];
441 }
442 catch (ArrayIndexOutOfBoundsException e) {
443 message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR];
444 }
445
446 throw new Error(message);
447 }
448
449
450 /**
451 * Pushes the specified amount of characters back into the input stream.
452 *
453 * They will be read again by then next call of the scanning method
454 *
455 * @param number the number of characters to be read again.
456 * This number must not be greater than yylength()!
457 */
458 public void yypushback(int number) {
459 if ( number > yylength() )
460 zzScanError(ZZ_PUSHBACK_2BIG);
461
462 zzMarkedPos -= number;
463 }
464
465
466 /**
467 * Resumes scanning until the next regular expression is matched,
468 * the end of input is encountered or an I/O-Error occurs.
469 *
470 * @return the next token
471 * @exception java.io.IOException if any I/O-Error occurs
472 */
473 public java.lang.String yylex() throws java.io.IOException {
474 int zzInput;
475 int zzAction;
476
477 // cached fields:
478 int zzCurrentPosL;
479 int zzMarkedPosL;
480 int zzEndReadL = zzEndRead;
481 char [] zzBufferL = zzBuffer;
482 char [] zzCMapL = ZZ_CMAP;
483
484 int [] zzTransL = ZZ_TRANS;
485 int [] zzRowMapL = ZZ_ROWMAP;
486 int [] zzAttrL = ZZ_ATTRIBUTE;
487
488 while (true) {
489 zzMarkedPosL = zzMarkedPos;
490
491 zzAction = -1;
492
493 zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL;
494
495 zzState = ZZ_LEXSTATE[zzLexicalState];
496
497
498 zzForAction: {
499 while (true) {
500
501 if (zzCurrentPosL < zzEndReadL)
502 zzInput = zzBufferL[zzCurrentPosL++];
503 else if (zzAtEOF) {
504 zzInput = YYEOF;
505 break zzForAction;
506 }
507 else {
508 // store back cached positions
509 zzCurrentPos = zzCurrentPosL;
510 zzMarkedPos = zzMarkedPosL;
511 boolean eof = zzRefill();
512 // get translated positions and possibly new buffer
513 zzCurrentPosL = zzCurrentPos;
514 zzMarkedPosL = zzMarkedPos;
515 zzBufferL = zzBuffer;
516 zzEndReadL = zzEndRead;
517 if (eof) {
518 zzInput = YYEOF;
519 break zzForAction;
520 }
521 else {
522 zzInput = zzBufferL[zzCurrentPosL++];
523 }
524 }
525 int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ];
526 if (zzNext == -1) break zzForAction;
527 zzState = zzNext;
528
529 int zzAttributes = zzAttrL[zzState];
530 if ( (zzAttributes & 1) == 1 ) {
531 zzAction = zzState;
532 zzMarkedPosL = zzCurrentPosL;
533 if ( (zzAttributes & 8) == 8 ) break zzForAction;
534 }
535
536 }
537 }
538
539 // store back cached position
540 zzMarkedPos = zzMarkedPosL;
541
542 switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
543 case 5:
544 { switch (problem) {
545 case 1: return "";
546 default: return normalized.replaceAll(LB, "");
547 }
548 }
549 case 7: break;
550 case 2:
551 { problem = 1; add(yytext());
552 }
553 case 8: break;
554 case 4:
555 { add("s");
556 }
557 case 9: break;
558 case 3:
559 { switch (problem) {
560 case 1: return original;
561 default: return normalized;
562 }
563 }
564 case 10: break;
565 case 6:
566 { switch (problem) {
567 case 1: return original;
568 default: return normalized.replaceAll(LB, "").toLowerCase();
569 }
570 }
571 case 11: break;
572 case 1:
573 { add(yytext());
574 }
575 case 12: break;
576 default:
577 if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
578 zzAtEOF = true;
579 return null;
580 }
581 else {
582 zzScanError(ZZ_NO_MATCH);
583 }
584 }
585 }
586 }
587
588
589 }