view software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/transcode/Buckwalter2Unicode.lex @ 19:4a3641ae14d2

Erstellung
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Wed, 09 Nov 2011 15:32:05 +0100
parents
children
line wrap: on
line source

package de.mpg.mpiwg.berlin.mpdl.lt.general;
// NOTE(review): the repository path of this spec is .../lt/text/transcode/ while
// the declared package is ...lt.general - confirm which one callers expect.

%%
%{
    /*
     * Buckwalter transliteration to Unicode (Arabic script) conversion.
     *
     * Every call to yylex() consumes one token of the input and returns the
     * text that replaces it: the Arabic code point for a Buckwalter key, or
     * the matched text itself for markup, entities and unmapped characters.
     */

%}

%class Buckwalter2UnicodeLex
%public
%type java.lang.String
%unicode
%%

/* Markup tags are copied through unchanged so that element names and
   attributes are not transliterated. Being longer than one character,
   this match takes precedence over the single key rules below. */
"<"[^>]+">" { return yytext(); }

/* ---- Buckwalter keys: hamza forms and letters ---- */
"'"   { return "\u0621"; }  /* Hamza  */
"|"   { return "\u0622"; }  /* Alif + MaddaAbove  (from AraMorph) */
">"   { return "\u0623"; }  /* Alif + HamzaAbove  */
"&"   { return "\u0624"; }  /* Waw + HamzaAbove  */
"<"   { return "\u0625"; }  /* Alif + HamzaBelow  */
"}"   { return "\u0626"; }  /* Ya + HamzaAbove  */
"A"   { return "\u0627"; }  /* Alif  */
"b"   { return "\u0628"; }  /* Ba  */
"p"   { return "\u0629"; }  /* TaMarbuta  */
"t"   { return "\u062A"; }  /* Ta  */
"v"   { return "\u062B"; }  /* Tha  */
"j"   { return "\u062C"; }  /* Jeem  */
"H"   { return "\u062D"; }  /* HHa  */
"x"   { return "\u062E"; }  /* Kha  */
"d"   { return "\u062F"; }  /* Dal  */
"*"   { return "\u0630"; }  /* Thal  */
"r"   { return "\u0631"; }  /* Ra  */
"z"   { return "\u0632"; }  /* Zain  */
"s"   { return "\u0633"; }  /* Seen  */
"$"   { return "\u0634"; }  /* Sheen  */
"S"   { return "\u0635"; }  /* Sad  */
"D"   { return "\u0636"; }  /* DDad  */
"T"   { return "\u0637"; }  /* TTa  */
"Z"   { return "\u0638"; }  /* DTha  */
"E"   { return "\u0639"; }  /* Ain  */
"g"   { return "\u063A"; }  /* Ghain  */

"_"   { return "\u0640"; }  /* Tatweel  */
"f"   { return "\u0641"; }  /* Fa  */
"q"   { return "\u0642"; }  /* Qaf  */
"k"   { return "\u0643"; }  /* Kaf  */
"l"   { return "\u0644"; }  /* Lam  */
"m"   { return "\u0645"; }  /* Meem  */
"n"   { return "\u0646"; }  /* Noon  */
"h"   { return "\u0647"; }  /* Ha  */
"w"   { return "\u0648"; }  /* Waw  */
"Y"   { return "\u0649"; }  /* AlifMaksura  */
"y"   { return "\u064A"; }  /* Ya  */

/* ---- Buckwalter keys: vowel marks and other diacritics ---- */
"F"   { return "\u064B"; }  /* Fathatan  */
"N"   { return "\u064C"; }  /* Dammatan  */
"K"   { return "\u064D"; }  /* Kasratan  */
"a"   { return "\u064E"; }  /* Fatha  */
"u"   { return "\u064F"; }  /* Damma  */
"i"   { return "\u0650"; }  /* Kasra  */
"~"   { return "\u0651"; }  /* Shadda  */
"o"   { return "\u0652"; }  /* Sukun  */
"^"   { return "\u0653"; }  /* Maddah  */
"#"   { return "\u0654"; }  /* HamzaAbove  */

"`"   { return "\u0670"; }  /* AlifKhanjareeya  */
"{"   { return "\u0671"; }  /* Alif + HamzatWasl  */

/* ---- Additional keys taken from AraMorph ---- */
"P"   { return "\u067E"; }  /* PEH  */
"J"   { return "\u0686"; }  /* TCHEH  */
"V"   { return "\u06A4"; }  /* VEH  */
"G"   { return "\u06AF"; }  /* GAF  */
"R"   { return "\u0698"; }  /* JEH  */
"?"   { return "\u061F"; }  /* QUESTION MARK  */

/* ---- Small annotation signs ---- */
":"   { return "\u06DC"; }  /* SmallHighSeen  */
"@"   { return "\u06DF"; }  /* SmallHighRoundedZero  */

"["   { return "\u06E2"; }  /* SmallHighMeemIsolatedForm  */
";"   { return "\u06E3"; }  /* SmallLowSeen  */
","   { return "\u06E5"; }  /* SmallWaw  */
"."   { return "\u06E6"; }  /* SmallYa  */
"!"   { return "\u06E8"; }  /* SmallHighNoon  */
"-"   { return "\u06EA"; }  /* EmptyCentreLowStop  */
"+"   { return "\u06EB"; }  /* EmptyCentreHighStop  */
"%"   { return "\u06EC"; }  /* RoundedHighStopWithFilledCentre  */
"]"   { return "\u06ED"; }  /* SmallLowMeem  */

/* ---- Entities ----
   The first three are decoded to the literal character; lt and gt are
   always emitted in their escaped ampersand form. Each of these five also
   accepts an underscore in place of the leading ampersand. They must stay
   ahead of the generic entity rule: both match the same length, and the
   earlier rule wins. */
[\&_]"vert;"   { return "|"; }
[\&_]"lpar;"   { return "("; }
[\&_]"rpar;"   { return ")"; }
[\_\&]"lt;"    { return "&lt;"; }
[\_\&]"gt;"    { return "&gt;"; }
"&#039;"       { return "'"; }

/* Any other named entity is copied through unchanged. */
"&"[a-zA-Z]+";"  { return yytext(); }

/* ---- Fallback: copy everything else through unchanged ----
   [^\n] is used instead of the dot because newer JFlex releases exclude
   all line terminators (carriage return, U+0085, U+2028, ...) from the
   dot; such characters would otherwise match no rule and abort the scan.
   These rules must remain last so the key rules above take precedence. */
[^\n]   { return yytext(); }
\n      { return yytext(); }

/* disabled because it causes problems   */
/* "\\""   { return "\u06E0"; }  SmallHighUprightRectangularZero  */ 


/* duplicate keys (already mapped to other code points above)   */
/*  ","   { return "\u060C"; }  COMMA  from AraMorph */
/*  ";"   { return "\u061B"; }  SEMICOLON  from AraMorph */

/* not contained in Buckwalter   */
/* \u0679 : ARABIC LETTER TTEH   */
/* \u0688 : ARABIC LETTER DDAL   */
/* \u06A9 : ARABIC LETTER KEHEH  */
/* \u0691 : ARABIC LETTER RREH   */
/* \u06BA : ARABIC LETTER NOON GHUNNA  */
/* \u06BE : ARABIC LETTER HEH DOACHASHMEE  */
/* \u06C1 : ARABIC LETTER HEH GOAL  */
/* \u06D2 : ARABIC LETTER YEH BARREE  */