annotate software/mpdl-services/mpiwg-mpdl-lt/bin/de/mpg/mpiwg/berlin/mpdl/lt/text/transcode/Unicode2Buckwalter.lex @ 23:e845310098ba

diverse Korrekturen
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 27 Nov 2012 12:35:19 +0100
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
23
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
1 package de.mpg.mpiwg.berlin.mpdl.lt.general;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
2
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
3 %%
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
4 %{
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
5 /*
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
6 * Betacode to Unicode conversion
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
7 */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
8
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
9 %}
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
10
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
11 %class Unicode2BuckwalterLex
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
12 %public
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
13 %type java.lang.String
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
14 %unicode
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
15 %%
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
16
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
17
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
18 "<"[^>]+">" { return yytext(); }
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
19
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
20 "\u0621" { return "'"; } /* Hamza */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
21 "\u0622" { return "|"; } /* ALEF WITH MADDA ABOVE from AraMorph */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
22 "\u0623" { return ">"; } /* Hamza */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
23 "\u0624" { return "&"; } /* Hamza */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
24 "\u0625" { return "<"; } /* Alif + HamzaBelow */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
25 "\u0626" { return "}"; } /* Ya + HamzaAbove */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
26 "\u0627" { return "A"; } /* Alif */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
27 "\u0628" { return "b"; } /* Ba */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
28 "\u0629" { return "p"; } /* TaMarbuta */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
29 "\u062A" { return "t"; } /* Ta */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
30 "\u062B" { return "v"; } /* Tha */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
31 "\u062C" { return "j"; } /* Jeem */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
32 "\u062D" { return "H"; } /* HHa */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
33 "\u062E" { return "x"; } /* Kha */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
34 "\u062F" { return "d"; } /* Dal */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
35 "\u0630" { return "*"; } /* Thal */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
36 "\u0631" { return "r"; } /* Ra */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
37 "\u0632" { return "z"; } /* Zain */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
38 "\u0633" { return "s"; } /* Seen */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
39 "\u0634" { return "$"; } /* Sheen */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
40 "\u0635" { return "S"; } /* Sad */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
41 "\u0636" { return "D"; } /* DDad */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
42 "\u0637" { return "T"; } /* TTa */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
43 "\u0638" { return "Z"; } /* DTha */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
44 "\u0639" { return "E"; } /* Ain */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
45 "\u063A" { return "g"; } /* Ghain */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
46
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
47 "\u0640" { return "_"; } /* Tatweel */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
48 "\u0641" { return "f"; } /* Fa */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
49 "\u0642" { return "q"; } /* Qaf */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
50 "\u0643" { return "k"; } /* Kaf */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
51 "\u0644" { return "l"; } /* Lam */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
52 "\u0645" { return "m"; } /* Meem */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
53 "\u0646" { return "n"; } /* Noon */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
54 "\u0647" { return "h"; } /* Ha */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
55 "\u0648" { return "w"; } /* Waw */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
56 "\u0649" { return "Y"; } /* AlifMaksura */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
57 "\u064A" { return "y"; } /* Ya */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
58 "\u064B" { return "F"; } /* Fathatan */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
59 "\u064C" { return "N"; } /* Dammatan */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
60 "\u064D" { return "K"; } /* Kasratan */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
61 "\u064E" { return "a"; } /* Fatha */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
62 "\u064F" { return "u"; } /* Damma */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
63 "\u0650" { return "i"; } /* Kasra */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
64 "\u0651" { return "~"; } /* Shadda */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
65 "\u0652" { return "o"; } /* Sukun */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
66 "\u0653" { return "^"; } /* Maddah */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
67 "\u0654" { return "#"; } /* HamzaAbove */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
68
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
69 "\u0670" { return "`"; } /* AlifKhanjareeya */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
70 "\u0671" { return "{"; } /* Alif + HamzatWasl */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
71
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
72 "\u067E" { return "P"; } /* PEH from AraMorph */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
73 "\u0686" { return "J"; } /* TCHEH from AraMorph */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
74 "\u06A4" { return "V"; } /* VEH from AraMorph */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
75 "\u06AF" { return "G"; } /* GAF from AraMorph */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
76 "\u0698" { return "R"; } /* JEH from AraMorph */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
77 "\u061F" { return "?"; } /* QUESTION MARK from AraMorph */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
78
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
79 "\u06DC" { return ":"; } /* SmallHighSeen */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
80 "\u06DF" { return "@"; } /* SmallHighRoundedZero */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
81
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
82 "\u06E2" { return "["; } /* SmallHighMeemIsolatedForm */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
83 "\u06E3" { return ";"; } /* SmallLowSeen */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
84 "\u06E5" { return ","; } /* SmallWaw */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
85 "\u06E6" { return "."; } /* SmallYa */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
86 "\u06E8" { return "!"; } /* SmallHighNoon */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
87 "\u06EA" { return "-"; } /* EmptyCentreLowStop */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
88 "\u06EB" { return "+"; } /* EmptyCentreHighStop */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
89 "\u06EC" { return "%"; } /* RoundedHighStopWithFilledCentre */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
90 "\u06ED" { return "]"; } /* SmallLowMeem */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
91
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
92 [\&_]"vert;" { return "|"; }
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
93 [\&_]"lpar;" { return "("; }
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
94 [\&_]"rpar;" { return ")"; }
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
95 [\_\&]"lt;" { return "&lt;"; }
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
96 [\_\&]"gt;" { return "&gt;"; }
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
97 "&#039;" { return "'"; }
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
98
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
99 "&"[a-zA-Z]+";" { return yytext(); }
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
100
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
101 . { return yytext(); }
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
102 \n { return yytext(); }
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
103
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
104 /* make problemes */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
105 /* "\u06E0" { return "\\""; } SmallHighUprightRectangularZero */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
106
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
107
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
108 /* double entries */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
109 /* "\u060C" { return ","; } COMMA from AraMorph */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
110 /* "\u061B" { return ";"; } SEMICOLON from AraMorph */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
111
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
112 /* not in buckwalter contained */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
113 /* \u0679 : ARABIC LETTER TTEH */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
114 /* \u0688 : ARABIC LETTER DDAL */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
115 /* \u06A9 : ARABIC LETTER KEHEH */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
116 /* \u0691 : ARABIC LETTER RREH */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
117 /* \u06BA : ARABIC LETTER NOON GHUNNA */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
118 /* \u06BE : ARABIC LETTER HEH DOACHASHMEE */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
119 /* \u06C1 : ARABIC LETTER HEH GOAL */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
120 /* \u06D2 : ARABIC LETTER YEH BARREE */
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
121