comparison software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexEL.java @ 19:4a3641ae14d2

Erstellung
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Wed, 09 Nov 2011 15:32:05 +0100
parents
children e845310098ba
comparison
equal deleted inserted replaced
18:dc5e9fcb3fdc 19:4a3641ae14d2
1 /* The following code was generated by JFlex 1.4.3 on 05.09.11 10:35 */
2
3 /*
4 * Normalization rules for Greek text
5 * [this is a JFlex specification]
6 *
7 * Wolfgang Schmidle
8 * version 2011-08-03
9 *
10 */
11
12 package de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang;
13
14
/**
 * Scanner generated by <a href="http://www.jflex.de/">JFlex</a> 1.4.3
 * on 05.09.11 10:35 from the specification file
 * <tt>MpdlNormalizerLexEL.lex</tt> (normalization rules for Greek text).
 *
 * <p>Usage, as far as the generated tables show: select one of the lexical
 * states {@link #DISP}, {@link #DICT}, {@link #SEARCH} or {@link #SIGMA} with
 * {@link #yybegin(int)}, feed a word terminated by a line feed, and call
 * {@link #yylex()}; the call returns when the line feed is matched.</p>
 *
 * <ul>
 *   <li>{@code DISP}: returns the normalized text, or the unmodified input
 *       if a "problem" character (Latin letter or {@code @}) was seen.</li>
 *   <li>{@code DICT}: returns the normalized text with line-break hyphens
 *       (hyphen or soft hyphen followed by a space) removed, or the empty
 *       string if a problem character was seen.</li>
 *   <li>{@code SEARCH}: like {@code DICT} but lower-cased; returns the
 *       unmodified input if a problem character was seen.</li>
 *   <li>{@code SIGMA}: ends like {@code DICT} and additionally rewrites the
 *       final sigma (U+03C2) to the medial sigma.</li>
 *   <li>{@code YYINITIAL}: has no rule for the line feed, so scanning a line
 *       feed in this state raises the "could not match input" error.</li>
 * </ul>
 *
 * <p>The text accumulated so far is <em>not</em> cleared when a line is
 * returned; it is cleared only by {@link #yyreset(java.io.Reader)}.
 * Instances are not thread-safe.</p>
 */
public class MpdlNormalizerLexEL {

  /** This character denotes the end of file */
  public static final int YYEOF = -1;

  /** initial size of the lookahead buffer */
  private static final int ZZ_BUFFERSIZE = 16384;

  /** lexical states */
  public static final int SEARCH = 6;
  public static final int DICT = 4;
  public static final int YYINITIAL = 0;
  public static final int SIGMA = 8;
  public static final int DISP = 2;

  /**
   * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l
   * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l
   *                  at the beginning of a line
   * l is of the form l = 2*k, k a non negative integer
   */
  private static final int[] ZZ_LEXSTATE = {
     0,  0,  1,  1,  2,  2,  3,  3,  4, 4
  };

  /**
   * Run-length encoded character map: pairs of (count, character class)
   * that together cover all 0x10000 UTF-16 code units.
   */
  private static final String ZZ_CMAP_PACKED =
    "\12\0\1\3\25\0\1\2\14\0\1\1\2\0\1\1\17\0\1\5"+
    "\32\5\6\0\1\6\2\5\1\6\20\5\1\6\5\5\1\1\1\0"+
    "\1\1\u032e\0\1\7\1\10\1\11\1\12\15\0\1\4\3\0\1\4"+
    "\1\30\11\0\1\13\1\14\1\15\u1ba1\0\1\16\1\0\1\20\1\0"+
    "\1\21\1\0\1\23\1\0\1\24\1\0\1\25\1\0\1\26\65\0"+
    "\1\17\17\0\1\22\57\0\1\27\ue00d\0";

  /**
   * Translates characters to character classes
   */
  private static final char[] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED);

  /**
   * Translates DFA states to action switch labels.
   */
  private static final int[] ZZ_ACTION = zzUnpackAction();

  private static final String ZZ_ACTION_PACKED_0 =
    "\5\0\2\1\2\2\1\3\1\4\1\5\1\6\1\7"+
    "\1\10\1\11\1\12\1\13\12\1\1\14\1\15\1\16"+
    "\1\0\1\17\1\0\1\20\1\0\1\21\1\0\1\22"+
    "\1\0\1\23\1\0\1\24\1\0\1\25\1\0\1\26"+
    "\1\0\1\27\1\0";

  /** Unpacks the action table (one entry per DFA state, 50 states). */
  private static int[] zzUnpackAction() {
    int[] result = new int[50];
    zzUnpackAction(ZZ_ACTION_PACKED_0, 0, result);
    return result;
  }

  /**
   * Expands (count, value) pairs from {@code packed} into {@code result},
   * starting at {@code offset}.
   *
   * @return the index one past the last element written
   */
  private static int zzUnpackAction(String packed, int offset, int[] result) {
    int in = 0;        /* index in packed string  */
    int out = offset;  /* index in unpacked array */
    final int length = packed.length();
    while (in < length) {
      int count = packed.charAt(in++);
      int value = packed.charAt(in++);
      do {
        result[out++] = value;
      } while (--count > 0);
    }
    return out;
  }


  /**
   * Translates a state to a row index in the transition table
   */
  private static final int[] ZZ_ROWMAP = zzUnpackRowMap();

  private static final String ZZ_ROWMAP_PACKED_0 =
    "\0\0\0\31\0\62\0\113\0\144\0\175\0\226\0\175"+
    "\0\226\0\175\0\175\0\175\0\175\0\175\0\175\0\175"+
    "\0\175\0\175\0\257\0\310\0\341\0\372\0\u0113\0\u012c"+
    "\0\u0145\0\u015e\0\u0177\0\u0190\0\175\0\175\0\175\0\u01a9"+
    "\0\175\0\u01c2\0\175\0\u01db\0\175\0\u01f4\0\175\0\u020d"+
    "\0\175\0\u0226\0\175\0\u023f\0\175\0\u0258\0\175\0\u0271"+
    "\0\175\0\u028a";

  /** Unpacks the row map (one row offset per DFA state, 50 states). */
  private static int[] zzUnpackRowMap() {
    int[] result = new int[50];
    zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, 0, result);
    return result;
  }

  /**
   * Combines each pair of chars in {@code packed} into one int
   * (high 16 bits first) and stores it in {@code result}.
   *
   * @return the index one past the last element written
   */
  private static int zzUnpackRowMap(String packed, int offset, int[] result) {
    int in = 0;        /* index in packed string  */
    int out = offset;  /* index in unpacked array */
    final int length = packed.length();
    while (in < length) {
      int high = packed.charAt(in++) << 16;
      result[out++] = high | packed.charAt(in++);
    }
    return out;
  }

  /**
   * The transition table of the DFA
   */
  private static final int[] ZZ_TRANS = zzUnpackTrans();

  private static final String ZZ_TRANS_PACKED_0 =
    "\1\6\1\7\1\6\1\0\1\6\1\10\1\11\1\12"+
    "\1\13\1\14\1\15\1\16\1\17\1\20\14\6\1\7"+
    "\1\6\1\21\1\6\1\10\1\11\1\12\1\13\1\14"+
    "\1\15\1\16\1\17\1\20\14\6\1\7\1\6\1\22"+
    "\1\6\1\10\1\11\1\12\1\13\1\14\1\15\1\16"+
    "\1\17\1\20\1\23\1\24\1\25\1\26\1\27\1\30"+
    "\1\31\1\32\1\33\1\34\2\6\1\7\1\6\1\35"+
    "\1\6\1\10\1\11\1\12\1\13\1\14\1\15\1\16"+
    "\1\17\1\20\1\23\1\24\1\25\1\26\1\27\1\30"+
    "\1\31\1\32\1\33\1\34\2\6\1\7\1\6\1\22"+
    "\1\6\1\10\1\11\1\12\1\13\1\14\1\15\1\16"+
    "\1\17\1\20\1\23\1\24\1\25\1\26\1\27\1\30"+
    "\1\31\1\32\1\33\1\34\1\36\33\0\1\6\31\0"+
    "\1\37\1\40\23\0\1\40\3\0\1\41\1\42\23\0"+
    "\1\42\3\0\1\43\1\44\23\0\1\44\3\0\1\45"+
    "\1\46\23\0\1\46\3\0\1\47\1\50\23\0\1\50"+
    "\3\0\1\51\1\52\23\0\1\52\3\0\1\53\1\54"+
    "\23\0\1\54\3\0\1\55\1\56\23\0\1\56\3\0"+
    "\1\57\1\60\23\0\1\60\3\0\1\61\1\62\23\0"+
    "\1\62\3\0\1\37\30\0\1\41\30\0\1\43\30\0"+
    "\1\45\30\0\1\47\30\0\1\51\30\0\1\53\30\0"+
    "\1\55\30\0\1\57\30\0\1\61\25\0";

  /** Unpacks the transition table (27 rows of 25 character classes). */
  private static int[] zzUnpackTrans() {
    int[] result = new int[675];
    zzUnpackTrans(ZZ_TRANS_PACKED_0, 0, result);
    return result;
  }

  /**
   * Expands (count, value) pairs like {@link #zzUnpackAction(String, int, int[])},
   * but stores {@code value - 1} so that a packed 0 becomes -1 ("no transition").
   *
   * @return the index one past the last element written
   */
  private static int zzUnpackTrans(String packed, int offset, int[] result) {
    int in = 0;        /* index in packed string  */
    int out = offset;  /* index in unpacked array */
    final int length = packed.length();
    while (in < length) {
      int count = packed.charAt(in++);
      int value = packed.charAt(in++) - 1;
      do {
        result[out++] = value;
      } while (--count > 0);
    }
    return out;
  }


  /* error codes */
  private static final int ZZ_UNKNOWN_ERROR = 0;
  private static final int ZZ_NO_MATCH = 1;
  private static final int ZZ_PUSHBACK_2BIG = 2;

  /* error messages for the codes above */
  private static final String[] ZZ_ERROR_MSG = {
    "Unkown internal scanner error",
    "Error: could not match input",
    "Error: pushback value was too large"
  };

  /**
   * ZZ_ATTRIBUTE[aState] contains the attributes of state <code>aState</code>
   * (bit 0: accepting state, bit 3: no further transitions need to be tried).
   */
  private static final int[] ZZ_ATTRIBUTE = zzUnpackAttribute();

  private static final String ZZ_ATTRIBUTE_PACKED_0 =
    "\5\0\1\11\1\1\1\11\1\1\11\11\12\1\3\11"+
    "\1\0\1\11\1\0\1\11\1\0\1\11\1\0\1\11"+
    "\1\0\1\11\1\0\1\11\1\0\1\11\1\0\1\11"+
    "\1\0\1\11\1\0";

  /** Unpacks the attribute table (one entry per DFA state, 50 states). */
  private static int[] zzUnpackAttribute() {
    int[] result = new int[50];
    zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, 0, result);
    return result;
  }

  /**
   * Expands (count, value) pairs from {@code packed} into {@code result},
   * starting at {@code offset}.
   *
   * @return the index one past the last element written
   */
  private static int zzUnpackAttribute(String packed, int offset, int[] result) {
    int in = 0;        /* index in packed string  */
    int out = offset;  /* index in unpacked array */
    final int length = packed.length();
    while (in < length) {
      int count = packed.charAt(in++);
      int value = packed.charAt(in++);
      do {
        result[out++] = value;
      } while (--count > 0);
    }
    return out;
  }

  /** the input device */
  private java.io.Reader zzReader;

  /** the current state of the DFA */
  private int zzState;

  /** the current lexical state */
  private int zzLexicalState = YYINITIAL;

  /** this buffer contains the current text to be matched and is
      the source of the yytext() string */
  private char[] zzBuffer = new char[ZZ_BUFFERSIZE];

  /** the textposition at the last accepting state */
  private int zzMarkedPos;

  /** the current text position in the buffer */
  private int zzCurrentPos;

  /** startRead marks the beginning of the yytext() string in the buffer */
  private int zzStartRead;

  /** endRead marks the last character in the buffer, that has been read
      from input */
  private int zzEndRead;

  /** number of newlines encountered up to the start of the matched text */
  private int yyline;

  /** the number of characters up to the start of the matched text */
  private int yychar;

  /**
   * the number of characters from the last newline up to the start of the
   * matched text
   */
  private int yycolumn;

  /**
   * zzAtBOL == true <=> the scanner is currently at the beginning of a line
   */
  private boolean zzAtBOL = true;

  /** zzAtEOF == true <=> the scanner is at the EOF */
  private boolean zzAtEOF;

  /** denotes if the user-EOF-code has already been executed */
  private boolean zzEOFDone;

  /* user code: */

  /** Every matched token exactly as it appeared in the input. */
  private final StringBuilder original = new StringBuilder();

  /** The normalized counterpart of {@link #original}. */
  private final StringBuilder normalized = new StringBuilder();

  /** Set once a character is matched by the "problem" rule (action 2). */
  private boolean problem = false;

  /** Regex for a line-break hyphen: hyphen-minus or soft hyphen, then a space. */
  private static final String LB = "[\u002d\u00ad] ";

  /** {@link #LB} compiled once instead of on every returned word. */
  private static final java.util.regex.Pattern LB_PATTERN =
      java.util.regex.Pattern.compile(LB);

  /**
   * Records the current match: the matched text goes to {@link #original},
   * its replacement {@code norm} to {@link #normalized}.
   */
  private void add(String norm) {
    original.append(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead);
    normalized.append(norm);
  }

  /**
   * Action for the lookahead rules with a fixed base length of one: the match
   * is cut back to its first character (the lookahead is left in the input)
   * and that character is recorded with the replacement {@code norm}.
   */
  private void addFirstCharAs(String norm) {
    zzMarkedPos = zzStartRead + 1;
    add(norm);
  }

  /** Returns the normalized text with all line-break hyphens removed. */
  private String normalizedWithoutLineBreaks() {
    return LB_PATTERN.matcher(normalized).replaceAll("");
  }


  /**
   * Creates a new scanner
   * There is also a java.io.InputStream version of this constructor.
   *
   * @param   in  the java.io.Reader to read input from.
   */
  public MpdlNormalizerLexEL(java.io.Reader in) {
    this.zzReader = in;
  }

  /**
   * Creates a new scanner.
   * There is also java.io.Reader version of this constructor.
   *
   * <p>The stream is decoded with the platform default charset. For Greek
   * text prefer {@link #MpdlNormalizerLexEL(java.io.Reader)} with a reader
   * that names its charset explicitly.</p>
   *
   * @param   in  the java.io.Inputstream to read input from.
   */
  public MpdlNormalizerLexEL(java.io.InputStream in) {
    this(new java.io.InputStreamReader(in));
  }

  /**
   * Unpacks the compressed character translation table.
   *
   * @param packed   the packed character translation table
   * @return         the unpacked character translation table
   */
  private static char[] zzUnpackCMap(String packed) {
    char[] map = new char[0x10000];
    int in = 0;   /* index in packed string  */
    int out = 0;  /* index in unpacked array */
    final int length = packed.length();
    while (in < length) {
      int count = packed.charAt(in++);
      char value = packed.charAt(in++);
      do {
        map[out++] = value;
      } while (--count > 0);
    }
    return map;
  }


  /**
   * Refills the input buffer.
   *
   * @return      <code>false</code>, iff there was new input.
   *
   * @exception   java.io.IOException  if any I/O-Error occurs
   */
  private boolean zzRefill() throws java.io.IOException {

    /* first: make room by moving the unconsumed text to the buffer start */
    if (zzStartRead > 0) {
      System.arraycopy(zzBuffer, zzStartRead,
                       zzBuffer, 0,
                       zzEndRead - zzStartRead);

      /* translate stored positions */
      zzEndRead -= zzStartRead;
      zzCurrentPos -= zzStartRead;
      zzMarkedPos -= zzStartRead;
      zzStartRead = 0;
    }

    /* the current token fills the whole buffer: double its size */
    if (zzCurrentPos >= zzBuffer.length) {
      char[] grown = new char[zzCurrentPos * 2];
      System.arraycopy(zzBuffer, 0, grown, 0, zzBuffer.length);
      zzBuffer = grown;
    }

    /* finally: fill the buffer with new input */
    int numRead = zzReader.read(zzBuffer, zzEndRead,
                                zzBuffer.length - zzEndRead);

    if (numRead > 0) {
      zzEndRead += numRead;
      return false;
    }

    if (numRead < 0) {
      return true;
    }

    // unlikely but not impossible: read 0 characters, but not at end of stream
    int c = zzReader.read();
    if (c == -1) {
      return true;
    }
    zzBuffer[zzEndRead++] = (char) c;
    return false;
  }


  /**
   * Closes the input stream.
   */
  public final void yyclose() throws java.io.IOException {
    zzAtEOF = true;            /* indicate end of file */
    zzEndRead = zzStartRead;   /* invalidate buffer    */

    if (zzReader != null) {
      zzReader.close();
    }
  }


  /**
   * Resets the scanner to read from a new input stream.
   * Does not close the old reader.
   *
   * All internal variables are reset, the old input stream
   * <b>cannot</b> be reused (internal buffer is discarded and lost).
   * Lexical state is set to <tt>ZZ_INITIAL</tt>.
   *
   * <p>The text accumulated from the previous input and the problem flag
   * are cleared as well, so a reused scanner starts from a clean slate.</p>
   *
   * @param reader   the new input stream
   */
  public final void yyreset(java.io.Reader reader) {
    zzReader = reader;
    zzAtBOL  = true;
    zzAtEOF  = false;
    zzEOFDone = false;
    zzEndRead = zzStartRead = 0;
    zzCurrentPos = zzMarkedPos = 0;
    yyline = yychar = yycolumn = 0;
    zzLexicalState = YYINITIAL;

    /* user state: without this a reused scanner would prepend the
       previous word to the next result */
    original.setLength(0);
    normalized.setLength(0);
    problem = false;
  }


  /**
   * Returns the current lexical state.
   */
  public final int yystate() {
    return zzLexicalState;
  }


  /**
   * Enters a new lexical state
   *
   * @param newState the new lexical state
   */
  public final void yybegin(int newState) {
    zzLexicalState = newState;
  }


  /**
   * Returns the text matched by the current regular expression.
   */
  public final String yytext() {
    return new String(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead);
  }


  /**
   * Returns the character at position <tt>pos</tt> from the
   * matched text.
   *
   * It is equivalent to yytext().charAt(pos), but faster
   *
   * @param pos the position of the character to fetch.
   *            A value from 0 to yylength()-1.
   *
   * @return the character at position pos
   */
  public final char yycharat(int pos) {
    return zzBuffer[zzStartRead + pos];
  }


  /**
   * Returns the length of the matched text region.
   */
  public final int yylength() {
    return zzMarkedPos - zzStartRead;
  }


  /**
   * Reports an error that occured while scanning.
   *
   * In a wellformed scanner (no or only correct usage of
   * yypushback(int) and a match-all fallback rule) this method
   * will only be called with things that "Can't Possibly Happen".
   * If this method is called, something is seriously wrong
   * (e.g. a JFlex bug producing a faulty scanner etc.).
   *
   * Usual syntax/scanner level error handling should be done
   * in error fallback rules.
   *
   * @param   errorCode  the code of the errormessage to display;
   *                     unknown codes map to the generic message
   * @throws  Error      always, carrying the selected message
   */
  private void zzScanError(int errorCode) {
    boolean known = errorCode >= 0 && errorCode < ZZ_ERROR_MSG.length;
    String message = ZZ_ERROR_MSG[known ? errorCode : ZZ_UNKNOWN_ERROR];
    throw new Error(message);
  }


  /**
   * Pushes the specified amount of characters back into the input stream.
   *
   * They will be read again by then next call of the scanning method
   *
   * @param number  the number of characters to be read again.
   *                This number must not be greater than yylength()!
   */
  public void yypushback(int number) {
    if (number > yylength()) {
      zzScanError(ZZ_PUSHBACK_2BIG);
    }

    zzMarkedPos -= number;
  }


  /**
   * Resumes scanning until the next regular expression is matched,
   * the end of input is encountered or an I/O-Error occurs.
   *
   * <p>Tokens are accumulated internally; the method returns only when a
   * line feed is matched in one of the non-initial lexical states (see the
   * class comment for what each state returns) or at end of input.</p>
   *
   * @return      the text for the line just completed, or <code>null</code>
   *              at end of input
   * @exception   java.io.IOException  if any I/O-Error occurs
   * @throws      Error  if the input cannot be matched in the current state
   */
  public java.lang.String yylex() throws java.io.IOException {
    int zzInput;
    int zzAction;

    // cached fields:
    int zzCurrentPosL;
    int zzMarkedPosL;
    int zzEndReadL = zzEndRead;
    char[] zzBufferL = zzBuffer;
    final char[] zzCMapL = ZZ_CMAP;

    final int[] zzTransL = ZZ_TRANS;
    final int[] zzRowMapL = ZZ_ROWMAP;
    final int[] zzAttrL = ZZ_ATTRIBUTE;

    while (true) {
      zzMarkedPosL = zzMarkedPos;

      zzAction = -1;

      zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL;

      zzState = ZZ_LEXSTATE[zzLexicalState];


      zzForAction: {
        while (true) {

          if (zzCurrentPosL < zzEndReadL) {
            zzInput = zzBufferL[zzCurrentPosL++];
          } else if (zzAtEOF) {
            zzInput = YYEOF;
            break zzForAction;
          } else {
            // store back cached positions
            zzCurrentPos  = zzCurrentPosL;
            zzMarkedPos   = zzMarkedPosL;
            boolean eof = zzRefill();
            // get translated positions and possibly new buffer
            zzCurrentPosL  = zzCurrentPos;
            zzMarkedPosL   = zzMarkedPos;
            zzBufferL      = zzBuffer;
            zzEndReadL     = zzEndRead;
            if (eof) {
              zzInput = YYEOF;
              break zzForAction;
            }
            zzInput = zzBufferL[zzCurrentPosL++];
          }

          int zzNext = zzTransL[zzRowMapL[zzState] + zzCMapL[zzInput]];
          if (zzNext == -1) {
            break zzForAction;
          }
          zzState = zzNext;

          int zzAttributes = zzAttrL[zzState];
          if ((zzAttributes & 1) == 1) {
            // accepting state: remember the longest match so far
            zzAction = zzState;
            zzMarkedPosL = zzCurrentPosL;
            if ((zzAttributes & 8) == 8) {
              break zzForAction;
            }
          }

        }
      }

      // store back cached position
      zzMarkedPos = zzMarkedPosL;

      switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {

        /* --- pass-through and problem characters --- */
        case 1:
          add(yytext());
          break;
        case 2:
          problem = true;
          add(yytext());
          break;

        /* --- single-character replacements --- */
        case 3:
          add("ά");
          break;
        case 4:
          add("έ");
          break;
        case 5:
          add("ή");
          break;
        case 6:
          add("ί");
          break;
        case 7:
          add("ό");
          break;
        case 8:
          add("ύ");
          break;
        case 9:
          add("ώ");
          break;
        case 13:
          add("σ");
          break;

        /* --- end of line: hand the accumulated text to the caller --- */
        case 10:
          return problem ? original.toString() : normalized.toString();
        case 11:
          return problem ? "" : normalizedWithoutLineBreaks();
        case 12:
          // Locale.ROOT keeps the result independent of the JVM default locale
          return problem
              ? original.toString()
              : normalizedWithoutLineBreaks().toLowerCase(java.util.Locale.ROOT);

        /* --- lookahead expressions with fixed base length 1 --- */
        case 14:
          addFirstCharAs("ά");
          break;
        case 15:
          addFirstCharAs("ᾴ");
          break;
        case 16:
          addFirstCharAs("έ");
          break;
        case 17:
          addFirstCharAs("ή");
          break;
        case 18:
          addFirstCharAs("ῄ");
          break;
        case 19:
          addFirstCharAs("ί");
          break;
        case 20:
          addFirstCharAs("ό");
          break;
        case 21:
          addFirstCharAs("ύ");
          break;
        case 22:
          addFirstCharAs("ώ");
          break;
        case 23:
          addFirstCharAs("ῴ");
          break;

        default:
          if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
            zzAtEOF = true;
            return null;
          }
          else {
            zzScanError(ZZ_NO_MATCH);
          }
      }
    }
  }


}