%%

%{
  /*
   * Arabic Romanization to Buckwalter+ conversion
   * v. 1.2, Malcolm D. Hyman, 2001-11-29
   * [this is a jflex specification]
   *
   * See the file "arabic-roman.lex" for descriptions of these characters.
   *
   * This converter isn't perfect. It doesn't need to be, since its only
   * real use is to convert term names typed in Romanized Arabic into
   * Buckwalter+ notation.
   */

  /* The most recent string handed to convert(); null until the first call. */
  String last;

  /**
   * Common exit path for almost every rule action.
   *
   * Switches the scanner to the NONINITIAL (word-internal) state and then
   * returns either the given output string or, when it is identical to the
   * string passed on the previous call, the single character "~"
   * (presumably the Buckwalter gemination/shadda mark -- confirm).
   *
   * NOTE(review): the " " and "-" rules return without touching `last`, so
   * a letter repeated across a word boundary is also collapsed to "~".
   * Confirm whether that is intended.
   *
   * @param s the Buckwalter+ output for the text just matched
   * @return "~" if s equals the previously converted string, otherwise s
   */
  private String convert(String s) {
    yybegin(NONINITIAL);
    if (s.equals(last)) return "~";
    last = s;
    return s;
  }
%}

%class BuckwalterLex
%public
%implements LexS
%type java.lang.String
%unicode

HAMZA=\u02be
NONVOWEL=[^aui\u0101\u012b\u016b]

%state NONINITIAL

%%

/*
 * Romanized letters that map to the same output in every state.
 */
\u0101  { return convert("A"); }
\u1e6f  { return convert("v"); }
\u01e7  { return convert("j"); }
\u1e25  { return convert("H"); }
\u1e2b  { return convert("x"); }
\u1e0f  { return convert("*"); }
\u0161  { return convert("$"); }
\u1e63  { return convert("S"); }
\u1e0d  { return convert("D"); }
\u1e6d  { return convert("T"); }
\u1e93  { return convert("Z"); }
\u02bf  { return convert("E"); }
\u0121  { return convert("g"); }
\u016b  { return convert("w"); }

/*
 * Word separators: pass through unchanged and return to the word-initial
 * state. These deliberately bypass convert().
 */
" "     { yybegin(YYINITIAL); return " "; }
"-"     { yybegin(YYINITIAL); return "-"; }

/*
 * Carriers of hamza (cf. Awde & Samano 85--7).
 */
"u"{HAMZA}/{NONVOWEL}   { return convert("u&"); }
{HAMZA}/"ay"            { return convert("}"); }
{HAMZA}/"a"             { return convert(">"); }
"a"{HAMZA}/{NONVOWEL}   { return convert("a>"); }
{HAMZA}"u"              { return convert("&"); }
{HAMZA}"i"              { return convert("}"); }
"i"{HAMZA}              { return convert("i}"); }
\u012b{HAMZA}           { return convert("y}"); }
{HAMZA}\u0101           { return convert("|"); }

/*
 * YYINITIAL is used as word-initial state. The method convert() always
 * puts the scanner in NONINITIAL state.
 */
<YYINITIAL> {
  "a"     { return convert(">a"); }
  "u"     { return convert(">u"); }
  "i"     { return convert("<"); }
  /* The article ends in a hyphen, so the next letter is word-initial again. */
  "al-"   { yybegin(YYINITIAL); return "Aal-"; }
  \u012b  { return convert("Ay"); }

  /*
   * Capital letters. These are only handled word-initially. They are used
   * for proper names. Each capital emits "^" followed by the output of its
   * lower-case counterpart in the table above.
   */
  \u0100  { return convert("^A"); }
  \u1e6e  { return convert("^v"); }
  \u01e6  { return convert("^j"); }
  \u1e24  { return convert("^H"); }
  \u1e2a  { return convert("^x"); }
  \u1e0e  { return convert("^*"); }
  \u0160  { return convert("^$"); }
  \u1e62  { return convert("^S"); }
  /* U+1E0C is capital D-with-dot-below; its lower case U+1E0D maps to "D". */
  \u1e0c  { return convert("^D"); }
  /* U+1E6C is capital T-with-dot-below; its lower case U+1E6D maps to "T". */
  \u1e6c  { return convert("^T"); }
  \u1e92  { return convert("^Z"); }
  \u0120  { return convert("^g"); }
  \u012a  { return convert("^Ay"); }
  [BTJDRZSFQKLMNHWAUI] {
            return convert("^" + Character.toLowerCase(yycharat(0)));
          }
}

/*
 * Expressions that get matched only in NONINITIAL state.
 */
<NONINITIAL> {
  \u012b  { return convert("y"); }
}

/* Any hamza not consumed by one of the carrier rules above. */
\u02be  {
          // hamza-on-the-line
          return convert("'");
        }

/* Fallback: every other character is passed through as itself. */
.|\n    { return convert(yycharat(0) + ""); }