comparison software/mpdl-services/mpiwg-mpdl-lt/bin/de/mpg/mpiwg/berlin/mpdl/lt/text/transcode/Buckwalter2Unicode.lex @ 23:e845310098ba

diverse Korrekturen
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 27 Nov 2012 12:35:19 +0100
parents
children
comparison
equal deleted inserted replaced
22:6a45a982c333 23:e845310098ba
1 package de.mpg.mpiwg.berlin.mpdl.lt.general; /* NOTE(review): this file is stored under .../lt/text/transcode/ - confirm the package name is intended */
2
3 %%
4 %{
5 /*
6 * Buckwalter transliteration to Unicode (Arabic) conversion: each call to the scanner returns the converted text for the next token
7 */
8
9 %}
10
11 %class Buckwalter2UnicodeLex
12 %public
13 %type java.lang.String
14 %unicode
15 %%
16
17
18 "<"[^>]+">" { return yytext(); } /* "<", one or more non-">" characters, ">": returned unchanged (takes precedence over the single "<" and ">" rules by longest match) */
19
20 "'" { return "\u0621"; } /* Hamza */
21 "|" { return "\u0622"; } /* ALEF WITH MADDA ABOVE from AraMorph */
22 ">" { return "\u0623"; } /* Alif + HamzaAbove */
23 "&" { return "\u0624"; } /* Waw + HamzaAbove */
24 "<" { return "\u0625"; } /* Alif + HamzaBelow */
25 "}" { return "\u0626"; } /* Ya + HamzaAbove */
26 "A" { return "\u0627"; } /* Alif */
27 "b" { return "\u0628"; } /* Ba */
28 "p" { return "\u0629"; } /* TaMarbuta */
29 "t" { return "\u062A"; } /* Ta */
30 "v" { return "\u062B"; } /* Tha */
31 "j" { return "\u062C"; } /* Jeem */
32 "H" { return "\u062D"; } /* HHa */
33 "x" { return "\u062E"; } /* Kha */
34 "d" { return "\u062F"; } /* Dal */
35 "*" { return "\u0630"; } /* Thal */
36 "r" { return "\u0631"; } /* Ra */
37 "z" { return "\u0632"; } /* Zain */
38 "s" { return "\u0633"; } /* Seen */
39 "$" { return "\u0634"; } /* Sheen */
40 "S" { return "\u0635"; } /* Sad */
41 "D" { return "\u0636"; } /* DDad */
42 "T" { return "\u0637"; } /* TTa */
43 "Z" { return "\u0638"; } /* DTha */
44 "E" { return "\u0639"; } /* Ain */
45 "g" { return "\u063A"; } /* Ghain */
46
47 "_" { return "\u0640"; } /* Tatweel */
48 "f" { return "\u0641"; } /* Fa */
49 "q" { return "\u0642"; } /* Qaf */
50 "k" { return "\u0643"; } /* Kaf */
51 "l" { return "\u0644"; } /* Lam */
52 "m" { return "\u0645"; } /* Meem */
53 "n" { return "\u0646"; } /* Noon */
54 "h" { return "\u0647"; } /* Ha */
55 "w" { return "\u0648"; } /* Waw */
56 "Y" { return "\u0649"; } /* AlifMaksura */
57 "y" { return "\u064A"; } /* Ya */
58 "F" { return "\u064B"; } /* Fathatan */
59 "N" { return "\u064C"; } /* Dammatan */
60 "K" { return "\u064D"; } /* Kasratan */
61 "a" { return "\u064E"; } /* Fatha */
62 "u" { return "\u064F"; } /* Damma */
63 "i" { return "\u0650"; } /* Kasra */
64 "~" { return "\u0651"; } /* Shadda */
65 "o" { return "\u0652"; } /* Sukun */
66 "^" { return "\u0653"; } /* Maddah */
67 "#" { return "\u0654"; } /* HamzaAbove */
68
69 "`" { return "\u0670"; } /* AlifKhanjareeya */
70 "{" { return "\u0671"; } /* Alif + HamzatWasl */
71
72 "P" { return "\u067E"; } /* PEH from AraMorph */
73 "J" { return "\u0686"; } /* TCHEH from AraMorph */
74 "V" { return "\u06A4"; } /* VEH from AraMorph */
75 "G" { return "\u06AF"; } /* GAF from AraMorph */
76 "R" { return "\u0698"; } /* JEH from AraMorph */
77 "?" { return "\u061F"; } /* QUESTION MARK from AraMorph */
78
79 ":" { return "\u06DC"; } /* SmallHighSeen */
80 "@" { return "\u06DF"; } /* SmallHighRoundedZero */
81
82 "[" { return "\u06E2"; } /* SmallHighMeemIsolatedForm */
83 ";" { return "\u06E3"; } /* SmallLowSeen */
84 "," { return "\u06E5"; } /* SmallWaw */
85 "." { return "\u06E6"; } /* SmallYa */
86 "!" { return "\u06E8"; } /* SmallHighNoon */
87 "-" { return "\u06EA"; } /* EmptyCentreLowStop */
88 "+" { return "\u06EB"; } /* EmptyCentreHighStop */
89 "%" { return "\u06EC"; } /* RoundedHighStopWithFilledCentre */
90 "]" { return "\u06ED"; } /* SmallLowMeem */
91
92 [\&_]"vert;" { return "|"; } /* "&vert;" or "_vert;" -> literal "|" (longer match than the single "&" / "_" rules) */
93 [\&_]"lpar;" { return "("; } /* "&lpar;" or "_lpar;" -> literal "(" */
94 [\&_]"rpar;" { return ")"; } /* "&rpar;" or "_rpar;" -> literal ")" */
95 [\_\&]"lt;" { return "&lt;"; } /* "&lt;" or "_lt;" -> always emitted as the entity "&lt;" */
96 [\_\&]"gt;" { return "&gt;"; } /* "&gt;" or "_gt;" -> always emitted as the entity "&gt;" */
97 "&#039;" { return "'"; } /* numeric apostrophe entity -> literal "'" */
98
99 "&"[a-zA-Z]+";" { return yytext(); } /* any other alphabetic named entity is returned unchanged; listed after the specific entity rules so those win on equal-length matches */
100
101 . { return yytext(); } /* fallback: any other single character is returned unchanged. NOTE(review): in newer JFlex releases "." excludes more line terminators than LF - confirm a bare CR cannot reach this scanner unmatched */
102 \n { return yytext(); } /* line feed is returned unchanged */
103
104 /* disabled because it causes problems: */
105 /* "\\"" { return "\u06E0"; } SmallHighUprightRectangularZero */
106
107
108 /* duplicate entries: these keys are already mapped by rules above, so the alternatives are disabled */
109 /* "," { return "\u060C"; } COMMA from AraMorph */
110 /* ";" { return "\u061B"; } SEMICOLON from AraMorph */
111
112 /* characters not covered by the Buckwalter scheme (no rule in this file) */
113 /* \u0679 : ARABIC LETTER TTEH */
114 /* \u0688 : ARABIC LETTER DDAL */
115 /* \u06A9 : ARABIC LETTER KEHEH */
116 /* \u0691 : ARABIC LETTER RREH */
117 /* \u06BA : ARABIC LETTER NOON GHUNNA */
118 /* \u06BE : ARABIC LETTER HEH DOACHASHMEE */
119 /* \u06C1 : ARABIC LETTER HEH GOAL */
120 /* \u06D2 : ARABIC LETTER YEH BARREE */
121