%%

%{
  /*
   * Beta Code to Unicode conversion, v. 1.6, Malcolm D. Hyman, 2004-08-27
   * [this is a jflex specification]
   *
   * Supported subset of Beta Code:
   *   -- Greek alphabet and diacritics
   *   -- escapes not supported at all: $, &, @, <, >, {, }
   *   -- punctuation (%): common items (%-%9 supported)
   *   -- quotation marks ("): mostly supported
   *   -- brackets ([]): [/]-[8/]8 supported
   *   -- text symbols (#): support default, #2, #3, #5
   *
   * Extensions/modifications to Beta Code standard:
   *   -- lowercase alphabetic characters treated as if uppercase
   *   -- "_" (macron over long Greek alpha, iota, upsilon)
   *   -- "^" (caron over short Greek alpha, iota, upsilon)
   */

  /* Curly braces emitted by the "[3" and "]3" bracket rules. */
  static final char BRACEL = '{', BRACER = '}';

  /*
   * Breathing codes stored in "pneuma".
   * NOTE(review): the names are swapped relative to the glyphs they select.
   * ")" is stored as DASIA (1) and diacritic() maps 1 to offset 0, which is
   * the PSILI slot of each Greek Extended group (e.g. U+1F00); "(" is stored
   * as PSILI (2) and selects the DASIA slot (e.g. U+1F01).  The two swaps
   * cancel, so the output is right; the values are left untouched because
   * the constants are visible to the rest of the package.
   */
  static final int DASIA = 1, PSILI = 2;

  /* Accent codes stored in "accent"; 0 means no accent. */
  static final int ACUTE = 1, GRAVE = 2, CIRC = 3;

  /*
   * Beta Code letters in the order of the code points starting at U+03B1;
   * the index of a letter is added to 0x3b1 by the {LETTER} rule.
   */
  static final String ALPHA_TABLE = "ABGDEZHQIKLMNCOPRJSTUFXYW";
  static final String INTERNAL_ERROR = "Internal lexer error";

  /* Set by "*"; consumed (and cleared) by the next call to compose(). */
  boolean uppercase = false;

  /* Breathing and accent of the token currently being converted. */
  int pneuma = 0, accent = 0;

  /**
   * Returns the offset (0-7) selected by the current pneuma/accent pair
   * inside an eight-code-point group whose base comes from vowel1() or
   * vowel4().
   *
   * @throws LexException if pneuma/accent hold a combination not listed here
   */
  public int diacritic() throws LexException {
    int val = (pneuma * 10) + accent;
    switch (val) {
      case 10: return 0;
      case 11: return 4;
      case 12: return 2;
      case 13: return 6;
      case 20: return 1;
      case 21: return 5;
      case 22: return 3;
      case 23: return 7;
      default: throw new LexException(INTERNAL_ERROR);
    }
  }

  /**
   * Base code point of the breathing group for an (uppercased) vowel;
   * diacritic() supplies the offset within the group.
   *
   * @throws LexException if c is not one of A E H I O U W
   */
  public int vowel1(char c) throws LexException {
    switch (c) {
      case 'A': return 0x1f00;
      case 'E': return 0x1f10;
      case 'H': return 0x1f20;
      case 'I': return 0x1f30;
      case 'O': return 0x1f40;
      case 'U': return 0x1f50;
      case 'W': return 0x1f60;
      default: throw new LexException(INTERNAL_ERROR);
    }
  }

  /**
   * Code point used for a vowel with a grave accent and no breathing; the
   * {VOWEL}{ACCENT} rule adds 1 to obtain the acute form.
   *
   * @throws LexException if c is not one of A E H I O U W
   */
  public int vowel2(char c) throws LexException {
    switch (c) {
      case 'A': return 0x1f70;
      case 'E': return 0x1f72;
      case 'H': return 0x1f74;
      case 'I': return 0x1f76;
      case 'O': return 0x1f78;
      case 'U': return 0x1f7a;
      case 'W': return 0x1f7c;
      default: throw new LexException(INTERNAL_ERROR);
    }
  }

  /**
   * Code point used for a vowel with a circumflex and no breathing.
   *
   * @throws LexException if c is not one of A H I U W
   */
  public int vowel3(char c) throws LexException {
    switch (c) {
      case 'A': return 0x1fb6;
      case 'H': return 0x1fc6;
      case 'I': return 0x1fd6;
      case 'U': return 0x1fe6;
      case 'W': return 0x1ff6;
      default:
        throw new LexException("Can't place circumflex on \"" + c
                               + "\" at character " + yychar);
    }
  }

  /**
   * Base code point of the breathing-plus-iota-subscript group for a vowel;
   * diacritic() supplies the offset within the group.
   *
   * @throws LexException if c is not one of A H W
   */
  public int vowel4(char c) throws LexException {
    switch (c) {
      case 'A': return 0x1f80;
      case 'H': return 0x1f90;
      case 'W': return 0x1fa0;
      default:
        throw new LexException("Can't place iota subscript on \"" + c
                               + "\" at character " + yychar);
    }
  }

  /**
   * Base code point for a vowel with iota subscript and no breathing.  The
   * rules add 0 (grave), 1 (no accent), 2 (acute) or 5 (circumflex).
   *
   * @throws LexException if c is not one of A H W
   */
  public int vowel5(char c) throws LexException {
    switch (c) {
      case 'A': return 0x1fb2;
      case 'H': return 0x1fc2;
      case 'W': return 0x1ff2;
      default:
        throw new LexException("Can't place iota subscript on \"" + c
                               + "\" at character " + yychar);
    }
  }

  /**
   * Base code point for a vowel with dieresis and an accent.  The rule adds
   * 0 (grave), 1 (acute) or 5 (circumflex).
   *
   * @throws LexException if c is not I or U
   */
  public int vowel6(char c) throws LexException {
    switch (c) {
      case 'I': return 0x1fd2;
      case 'U': return 0x1fe2;
      default:
        throw new LexException("Can't place dieresis on \"" + c
                               + "\" at character " + yychar);
    }
  }

  /**
   * Wraps a code point as the token value, uppercasing it when a preceding
   * "*" set the uppercase flag.  The flag is cleared once it has been used.
   */
  public Character compose(int c) {
    if (uppercase) {
      uppercase = false;
      return new Character(Character.toUpperCase((char)c));
    }
    else
      return new Character((char)c);
  }

  /* Decodes a {PNEUMA} character into the value stored in "pneuma". */
  private int pneumaOf(char mark) throws LexException {
    switch (mark) {
      case '(': return PSILI;
      case ')': return DASIA;
      default: throw new LexException(INTERNAL_ERROR);
    }
  }

  /* Decodes an {ACCENT} character into the value stored in "accent". */
  private int accentOf(char mark) throws LexException {
    switch (mark) {
      case '/': return ACUTE;
      case '\\': return GRAVE;
      case '=': return CIRC;
      default: throw new LexException(INTERNAL_ERROR);
    }
  }

  /*
   * Builds the token for a vowel carrying the breathing/accent currently
   * held in pneuma/accent, with or without iota subscript.
   */
  private Character breathed(char vowel, boolean subscript)
      throws LexException {
    char c = Character.toUpperCase(vowel);
    // The breathing groups of E and O have no circumflex slots (base + 6/7
    // is unassigned), so reject the combination the same way vowel3() does
    // instead of emitting an unassigned code point.
    if (accent == CIRC && (c == 'E' || c == 'O')) {
      uppercase = false;   // do not let a pending "*" leak past the error
      throw new LexException("Can't place circumflex on \"" + c
                             + "\" at character " + yychar);
    }
    int base = subscript ? vowel4(c) : vowel1(c);
    return compose(base + diacritic());
  }

  /*
   * Code point for a vowel followed by "^" (offset 0) or "_" (offset 1).
   * "vowel" is the character as written; it is echoed in the error message.
   */
  private int quantityOf(char vowel, char mark) throws LexException {
    int add;
    switch (mark) {
      case '^': add = 0; break;
      case '_': add = 1; break;
      default: throw new LexException(INTERNAL_ERROR);
    }
    switch (Character.toUpperCase(vowel)) {
      case 'A': return 0x1fb0 + add;
      case 'I': return 0x1fd0 + add;
      case 'U': return 0x1fe0 + add;
      default:
        throw new LexException("Can't place caron/macron on \"" + vowel
                               + "\" at character " + yychar);
    }
  }
%}

%class GreekLex
%public
%implements Lex
%yylexthrow LexException
%type java.lang.Character
%unicode
%char

VOWEL=[AEHIOUWaehiouw]
LETTER=[A-UW-Za-uw-z]
NONLETTER=[^A-Za-z#*?\u00ad-]
PNEUMA=[()]
ACCENT=[/\\=]
SUBSCRIPT="|"
DIERESIS="+"
CARON="^"
MACRON="_"

%%

/*
 * No lexical states are declared in this specification, so the rules are
 * written without a state list.  Rule order is significant for matches of
 * equal length and must be preserved.
 */

/* vowel + breathing + accent, optionally + iota subscript (3 or 4 chars) */
{VOWEL}{PNEUMA}{ACCENT}{SUBSCRIPT}? {
    pneuma = pneumaOf(yycharat(1));
    accent = accentOf(yycharat(2));
    return breathed(yycharat(0), yylength() == 4);
}

/* capital: diacritics precede the vowel */
"*"{PNEUMA}{ACCENT}{VOWEL} {
    pneuma = pneumaOf(yycharat(1));
    accent = accentOf(yycharat(2));
    uppercase = true;
    return breathed(yycharat(3), false);
}

/* vowel + breathing, optionally + iota subscript (2 or 3 chars) */
{VOWEL}{PNEUMA}{SUBSCRIPT}? {
    pneuma = pneumaOf(yycharat(1));
    accent = 0;
    // The subscript form of this pattern is 3 characters long; comparing
    // against 4 (as the accented rule does) would never select it.
    return breathed(yycharat(0), yylength() == 3);
}

"*"{PNEUMA}{VOWEL} {
    pneuma = pneumaOf(yycharat(1));
    accent = 0;
    uppercase = true;
    return breathed(yycharat(2), false);
}

/* vowel + accent + iota subscript, no breathing */
{VOWEL}{ACCENT}{SUBSCRIPT} {
    char c = Character.toUpperCase(yycharat(0));
    switch (yycharat(1)) {
      case '/': return compose(vowel5(c) + 2);
      case '\\': return compose(vowel5(c));
      case '=': return compose(vowel5(c) + 5);
      default: throw new LexException(INTERNAL_ERROR);
    }
}

{VOWEL}{SUBSCRIPT} {
    char c = Character.toUpperCase(yycharat(0));
    return compose(vowel5(c) + 1);
}

/* vowel + accent, no breathing */
{VOWEL}{ACCENT} {
    char c = Character.toUpperCase(yycharat(0));
    switch (yycharat(1)) {
      case '/': return compose(vowel2(c) + 1);
      case '\\': return compose(vowel2(c));
      case '=': return compose(vowel3(c));
      default: throw new LexException(INTERNAL_ERROR);
    }
}

/* vowel + accent + dieresis */
{VOWEL}{ACCENT}{DIERESIS} {
    char c = Character.toUpperCase(yycharat(0));
    switch (yycharat(1)) {
      case '/': return compose(vowel6(c) + 1);
      case '\\': return compose(vowel6(c));
      case '=': return compose(vowel6(c) + 5);
      default: throw new LexException(INTERNAL_ERROR);
    }
}

{VOWEL}{DIERESIS} {
    char c = Character.toUpperCase(yycharat(0));
    switch (c) {
      case 'I': return compose(0x03ca);
      case 'U': return compose(0x03cb);
      default:
        throw new LexException("Can't place dieresis on \"" + yycharat(0)
                               + "\" at character " + yychar);
    }
}

/* extension: "^" / "_" quantity marks on alpha, iota, upsilon */
{VOWEL}({CARON}|{MACRON}) {
    return compose(quantityOf(yycharat(0), yycharat(1)));
}

"*"{VOWEL}({CARON}|{MACRON}) {
    uppercase = true;
    // Position 0 is the "*"; the vowel (also the character to report in an
    // error message) is at position 1 and the mark at position 2.
    return compose(quantityOf(yycharat(1), yycharat(2)));
}

/* rho with breathing */
[Rr]{PNEUMA} {
    switch (yycharat(1)) {
      case '(': return compose(0x1fe5);
      case ')': return compose(0x1fe4);
      default: throw new LexException(INTERNAL_ERROR);
    }
}

/* explicitly numbered sigma forms */
[Ss][123] {
    switch (yycharat(1)) {
      case '1': return compose(0x03c3);
      case '2': return compose(0x03c2);
      case '3': return compose(0x03f2);
      default: throw new LexException(INTERNAL_ERROR);
    }
}

/* sigma followed by a non-letter (optionally after "?"): final form */
[Ss]/"?"?{NONLETTER} {
    return compose(0x3c2);
}

{LETTER} {
    char c = Character.toUpperCase(yycharat(0));
    return compose(ALPHA_TABLE.indexOf(c) + 0x3b1);
}

[Vv] {
    return compose(0x03dc);
}

"#22" {
    return compose(0x0375);
}

"#533" {
    return compose(0x03da);   // obsolete in Beta; uncial stigma
}

"#"[2345]? {
    if (yylength() == 1)
      return compose(0x02b9);
    switch (yycharat(1)) {
      case '2': return compose(0x03da);
      case '3': return compose(0x03de);
      case '4': return compose(0x03db);  // obsolete in Beta; alt. koppa/stigma
                                         // GREEK SMALL LETTER STIGMA
      case '5': return compose(0x03e0);
      default: throw new LexException(INTERNAL_ERROR);
    }
}

/* capital marker: no token is returned, the flag affects the next one */
"*" {
    uppercase = true;
}

/* a diacritic that did not attach to anything above */
{PNEUMA}|{ACCENT}|{SUBSCRIPT}|{DIERESIS}|{CARON}|{MACRON} {
    uppercase = false;
    throw new LexException("Invalid \"" + yytext() + "\" at character "
                           + yychar);
}

"?" {
    return compose(0x0323);
}

/* opening brackets */
"["[1-8]? {
    uppercase = false;
    if (yylength() == 1)
      return new Character('[');
    switch (yycharat(1)) {
      case '1': return new Character('(');
      case '2': return new Character('\u2039');  // SINGLE LEFT-POINTING ANGLE QUOTATION MARK
      case '3': return new Character(BRACEL);
      case '4': return new Character('\u301a');
      case '5': return new Character('\u230a');
      case '6': return new Character('\u2308');
      case '7': return new Character('\u2308');
      case '8': return new Character('\u230a');
      default: throw new LexException(INTERNAL_ERROR);
    }
}

/* closing brackets */
"]"[1-8]? {
    uppercase = false;
    if (yylength() == 1)
      return new Character(']');
    switch (yycharat(1)) {
      case '1': return new Character(')');
      case '2': return new Character('\u203a');  // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
      case '3': return new Character(BRACER);
      case '4': return new Character('\u301b');
      case '5': return new Character('\u230b');
      case '6': return new Character('\u2309');
      case '7': return new Character('\u230b');
      case '8': return new Character('\u2309');
      default: throw new LexException(INTERNAL_ERROR);
    }
}

/* quotation marks */
"\""[1-7]? {
    uppercase = false;
    if (yylength() == 1)
      return new Character('"');
    switch (yycharat(1)) {
      case '1': return new Character('\u201e');
      case '2': return new Character('\u201d');
      case '3': return new Character('\'');
      case '4': return new Character('\u201a');
      case '5': return new Character('\u2019');
      case '6': return new Character('\u00ab');
      case '7': return new Character('\u00bb');
      default: throw new LexException(INTERNAL_ERROR);
    }
}

/* punctuation */
"%"[1-9]? {
    uppercase = false;
    if (yylength() == 1)
      return new Character('\u2020');
    switch (yycharat(1)) {
      case '1': return new Character('?');
      case '2': return new Character('*');
      case '3': return new Character('/');
      case '4': return new Character('!');
      case '5': return new Character('|');
      case '6': return new Character('=');
      case '7': return new Character('+');
      case '8': return new Character('%');
      case '9': return new Character('&');
      default: throw new LexException(INTERNAL_ERROR);
    }
}

":" {
    return compose(0x00b7);
}

/* anything else is passed through unchanged */
.|\n {
    uppercase = false;
    return new Character(yycharat(0));
}