comparison software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Unicode2Buckwalter.lex @ 6:2396a569e446

new functions: externalObjects, normalizer, Unicode2Betacode
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 08 Feb 2011 14:54:09 +0100
parents
children
comparison
equal deleted inserted replaced
5:94305c504178 6:2396a569e446
1 package de.mpg.mpiwg.berlin.mpdl.lt.general;
2
3 %%
4 %{
5 /*
6 * Unicode (Arabic) to Buckwalter transliteration
7 */
8
9 %}
10
11 %class Unicode2BuckwalterLex
12 %public
13 %type java.lang.String
14 %unicode
15 %%
16
17
18 "<"[^>]+">" { return yytext(); } /* text from "<" up to the next ">" is returned unchanged, not transliterated */
19 /* Arabic letters U+0621..U+063A */
20 "\u0621" { return "'"; } /* Hamza */
21 "\u0622" { return "|"; } /* ALEF WITH MADDA ABOVE from AraMorph */
22 "\u0623" { return ">"; } /* Alif + HamzaAbove */
23 "\u0624" { return "&"; } /* Waw + HamzaAbove */
24 "\u0625" { return "<"; } /* Alif + HamzaBelow */
25 "\u0626" { return "}"; } /* Ya + HamzaAbove */
26 "\u0627" { return "A"; } /* Alif */
27 "\u0628" { return "b"; } /* Ba */
28 "\u0629" { return "p"; } /* TaMarbuta */
29 "\u062A" { return "t"; } /* Ta */
30 "\u062B" { return "v"; } /* Tha */
31 "\u062C" { return "j"; } /* Jeem */
32 "\u062D" { return "H"; } /* HHa */
33 "\u062E" { return "x"; } /* Kha */
34 "\u062F" { return "d"; } /* Dal */
35 "\u0630" { return "*"; } /* Thal */
36 "\u0631" { return "r"; } /* Ra */
37 "\u0632" { return "z"; } /* Zain */
38 "\u0633" { return "s"; } /* Seen */
39 "\u0634" { return "$"; } /* Sheen */
40 "\u0635" { return "S"; } /* Sad */
41 "\u0636" { return "D"; } /* DDad */
42 "\u0637" { return "T"; } /* TTa */
43 "\u0638" { return "Z"; } /* DTha */
44 "\u0639" { return "E"; } /* Ain */
45 "\u063A" { return "g"; } /* Ghain */
46 /* Tatweel, letters U+0641..U+064A, and combining marks U+064B..U+0654 */
47 "\u0640" { return "_"; } /* Tatweel */
48 "\u0641" { return "f"; } /* Fa */
49 "\u0642" { return "q"; } /* Qaf */
50 "\u0643" { return "k"; } /* Kaf */
51 "\u0644" { return "l"; } /* Lam */
52 "\u0645" { return "m"; } /* Meem */
53 "\u0646" { return "n"; } /* Noon */
54 "\u0647" { return "h"; } /* Ha */
55 "\u0648" { return "w"; } /* Waw */
56 "\u0649" { return "Y"; } /* AlifMaksura */
57 "\u064A" { return "y"; } /* Ya */
58 "\u064B" { return "F"; } /* Fathatan */
59 "\u064C" { return "N"; } /* Dammatan */
60 "\u064D" { return "K"; } /* Kasratan */
61 "\u064E" { return "a"; } /* Fatha */
62 "\u064F" { return "u"; } /* Damma */
63 "\u0650" { return "i"; } /* Kasra */
64 "\u0651" { return "~"; } /* Shadda */
65 "\u0652" { return "o"; } /* Sukun */
66 "\u0653" { return "^"; } /* Maddah */
67 "\u0654" { return "#"; } /* HamzaAbove */
68
69 "\u0670" { return "`"; } /* AlifKhanjareeya */
70 "\u0671" { return "{"; } /* Alif + HamzatWasl */
71
72 "\u067E" { return "P"; } /* PEH from AraMorph */
73 "\u0686" { return "J"; } /* TCHEH from AraMorph */
74 "\u06A4" { return "V"; } /* VEH from AraMorph */
75 "\u06AF" { return "G"; } /* GAF from AraMorph */
76 "\u0698" { return "R"; } /* JEH from AraMorph */
77 "\u061F" { return "?"; } /* QUESTION MARK from AraMorph */
78
79 "\u06DC" { return ":"; } /* SmallHighSeen */
80 "\u06DF" { return "@"; } /* SmallHighRoundedZero */
81
82 "\u06E2" { return "["; } /* SmallHighMeemIsolatedForm */
83 "\u06E3" { return ";"; } /* SmallLowSeen */
84 "\u06E5" { return ","; } /* SmallWaw */
85 "\u06E6" { return "."; } /* SmallYa */
86 "\u06E8" { return "!"; } /* SmallHighNoon */
87 "\u06EA" { return "-"; } /* EmptyCentreLowStop */
88 "\u06EB" { return "+"; } /* EmptyCentreHighStop */
89 "\u06EC" { return "%"; } /* RoundedHighStopWithFilledCentre */
90 "\u06ED" { return "]"; } /* SmallLowMeem */
91
92 [\&_]"vert;" { return "|"; } /* "&vert;" or "_vert;" -> | */
93 [\&_]"lpar;" { return "("; } /* "&lpar;" or "_lpar;" -> ( */
94 [\&_]"rpar;" { return ")"; } /* "&rpar;" or "_rpar;" -> ) */
95 [\_\&]"lt;" { return "&lt;"; } /* "&lt;" or "_lt;" -> emitted as the entity "&lt;" */
96 [\_\&]"gt;" { return "&gt;"; } /* "&gt;" or "_gt;" -> emitted as the entity "&gt;" */
97 "&#039;" { return "'"; } /* numeric entity 039 -> ' */
98
99 "&"[a-zA-Z]+";" { return yytext(); } /* any other named entity is returned unchanged; the specific entity rules above match the same length and win because they are listed first */
100
101 [^] { return yytext(); } /* fallback: return any other single character unchanged; [^] also covers \n, \r and the other line terminators that "." excludes in newer JFlex versions, so CRLF input no longer ends in "could not match input" */
102
103
104 /* causes problems */
105 /* "\u06E0" { return "\\""; } SmallHighUprightRectangularZero */
106
107
108 /* duplicate entries: "," and ";" are already used as Buckwalter symbols above */
109 /* "\u060C" { return ","; } COMMA from AraMorph */
110 /* "\u061B" { return ";"; } SEMICOLON from AraMorph */
111
112 /* not contained in Buckwalter */
113 /* \u0679 : ARABIC LETTER TTEH */
114 /* \u0688 : ARABIC LETTER DDAL */
115 /* \u06A9 : ARABIC LETTER KEHEH */
116 /* \u0691 : ARABIC LETTER RREH */
117 /* \u06BA : ARABIC LETTER NOON GHUNNA */
118 /* \u06BE : ARABIC LETTER HEH DOACHASHMEE */
119 /* \u06C1 : ARABIC LETTER HEH GOAL */
120 /* \u06D2 : ARABIC LETTER YEH BARREE */
121