view software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/general/Unicode2Buckwalter.lex @ 6:2396a569e446

new functions: externalObjects, normalizer, Unicode2Betacode
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 08 Feb 2011 14:54:09 +0100
parents
children
line wrap: on
line source

package de.mpg.mpiwg.berlin.mpdl.lt.general;

%%
%{
    /*
     * Unicode (Arabic script) to Buckwalter transliteration.
     *
     * Every call to the generated scanning method returns one String:
     *   - the Buckwalter symbol for the next mapped Arabic character,
     *   - the replacement for one of the recognized character entities, or
     *   - the matched input text itself, for markup tags and for every
     *     character that has no mapping.
     */

%}

%class Unicode2BuckwalterLex
%public
%type java.lang.String
%unicode
%%


/* A complete markup tag is returned as-is, so that nothing inside the tag is transliterated. */
"<"[^>]+">" { return yytext(); }

/* Hamza forms and basic letters */
"\u0621"   { return "'"; }  /* Hamza  */
"\u0622"   { return "|"; }  /* ALEF WITH MADDA ABOVE  from AraMorph */
"\u0623"   { return ">"; }  /* Alif + HamzaAbove  */
"\u0624"   { return "&"; }  /* Waw + HamzaAbove  */
"\u0625"   { return "<"; }  /* Alif + HamzaBelow  */
"\u0626"   { return "}"; }  /* Ya + HamzaAbove  */
"\u0627"   { return "A"; }  /* Alif  */
"\u0628"   { return "b"; }  /* Ba  */
"\u0629"   { return "p"; }  /* TaMarbuta  */
"\u062A"   { return "t"; }  /* Ta  */
"\u062B"   { return "v"; }  /* Tha  */
"\u062C"   { return "j"; }  /* Jeem  */
"\u062D"   { return "H"; }  /* HHa  */
"\u062E"   { return "x"; }  /* Kha  */
"\u062F"   { return "d"; }  /* Dal  */
"\u0630"   { return "*"; }  /* Thal  */
"\u0631"   { return "r"; }  /* Ra  */
"\u0632"   { return "z"; }  /* Zain  */
"\u0633"   { return "s"; }  /* Seen  */
"\u0634"   { return "$"; }  /* Sheen  */
"\u0635"   { return "S"; }  /* Sad  */
"\u0636"   { return "D"; }  /* DDad  */
"\u0637"   { return "T"; }  /* TTa  */
"\u0638"   { return "Z"; }  /* DTha  */
"\u0639"   { return "E"; }  /* Ain  */
"\u063A"   { return "g"; }  /* Ghain  */

/* Tatweel, remaining letters and the vowel / diacritic marks */
"\u0640"   { return "_"; }  /* Tatweel  */
"\u0641"   { return "f"; }  /* Fa  */
"\u0642"   { return "q"; }  /* Qaf  */
"\u0643"   { return "k"; }  /* Kaf  */
"\u0644"   { return "l"; }  /* Lam  */
"\u0645"   { return "m"; }  /* Meem  */
"\u0646"   { return "n"; }  /* Noon  */
"\u0647"   { return "h"; }  /* Ha  */
"\u0648"   { return "w"; }  /* Waw  */
"\u0649"   { return "Y"; }  /* AlifMaksura  */
"\u064A"   { return "y"; }  /* Ya  */
"\u064B"   { return "F"; }  /* Fathatan  */
"\u064C"   { return "N"; }  /* Dammatan  */
"\u064D"   { return "K"; }  /* Kasratan  */
"\u064E"   { return "a"; }  /* Fatha  */
"\u064F"   { return "u"; }  /* Damma  */
"\u0650"   { return "i"; }  /* Kasra  */
"\u0651"   { return "~"; }  /* Shadda  */
"\u0652"   { return "o"; }  /* Sukun  */
"\u0653"   { return "^"; }  /* Maddah  */
"\u0654"   { return "#"; }  /* HamzaAbove  */

"\u0670"   { return "`"; }  /* AlifKhanjareeya  */
"\u0671"   { return "{"; }  /* Alif + HamzatWasl  */

/* Additional letters and punctuation taken over from AraMorph */
"\u067E"   { return "P"; }  /* PEH  from AraMorph   */
"\u0686"   { return "J"; }  /* TCHEH  from AraMorph */
"\u06A4"   { return "V"; }  /* VEH  from AraMorph */
"\u06AF"   { return "G"; }  /* GAF  from AraMorph */
"\u0698"   { return "R"; }  /* JEH  from AraMorph */
"\u061F"   { return "?"; }  /* QUESTION MARK  from AraMorph */

/* Small high / low annotation marks */
"\u06DC"   { return ":"; }  /* SmallHighSeen  */
"\u06DF"   { return "@"; }  /* SmallHighRoundedZero  */

"\u06E2"   { return "["; }  /* SmallHighMeemIsolatedForm  */
"\u06E3"   { return ";"; }  /* SmallLowSeen  */
"\u06E5"   { return ","; }  /* SmallWaw  */
"\u06E6"   { return "."; }  /* SmallYa  */
"\u06E8"   { return "!"; }  /* SmallHighNoon  */
"\u06EA"   { return "-"; }  /* EmptyCentreLowStop  */
"\u06EB"   { return "+"; }  /* EmptyCentreHighStop  */
"\u06EC"   { return "%"; }  /* RoundedHighStopWithFilledCentre  */
"\u06ED"   { return "]"; }  /* SmallLowMeem  */

/*
 * Character entities. The named ones are accepted with either an ampersand
 * or an underscore as their first character. lt and gt are always emitted in
 * the ampersand form; the others are replaced by the plain character.
 * These rules must stay in front of the generic entity rule below: for
 * matches of equal length the first rule in the file wins.
 */
[\&_]"vert;"   { return "|"; }
[\&_]"lpar;"   { return "("; }
[\&_]"rpar;"   { return ")"; }
[\&_]"lt;"     { return "&lt;"; }
[\&_]"gt;"     { return "&gt;"; }
"&#039;"       { return "'"; }

/* Any other purely alphabetic named entity is passed through unchanged. */
"&"[a-zA-Z]+";"  { return yytext(); }

/*
 * Catch-all: every remaining character is returned unchanged, one per call.
 * The first rule is deliberately written as an explicit negated class and
 * not as the dot metacharacter: newer JFlex releases exclude further line
 * terminators (carriage return, U+000B, U+000C, U+0085, U+2028, U+2029) from
 * the dot, which would leave e.g. the carriage return of a CRLF line ending
 * without a matching rule and abort scanning with a no-match error.
 */
[^\n]   { return yytext(); }
\n      { return yytext(); }

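/* disabled because it causes problems (note: the quoted string in the action below is not a valid Java literal as written) */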
/* "\u06E0"   { return "\\""; }  SmallHighUprightRectangularZero  */ 


/* duplicate entries: their outputs , and ; are already produced for SmallWaw and SmallLowSeen above */
/*  "\u060C"   { return ","; }  COMMA  from AraMorph */
/*  "\u061B"   { return ";"; }  SEMICOLON  from AraMorph */

/* Arabic letters not contained in the Buckwalter scheme; they have no rule of their own and are returned unchanged */
/* \u0679 : ARABIC LETTER TTEH   */
/* \u0688 : ARABIC LETTER DDAL   */
/* \u06A9 : ARABIC LETTER KEHEH  */
/* \u0691 : ARABIC LETTER RREH   */
/* \u06BA : ARABIC LETTER NOON GHUNNA  */
/* \u06BE : ARABIC LETTER HEH DOACHASHMEE  */
/* \u06C1 : ARABIC LETTER HEH GOAL  */
/* \u06D2 : ARABIC LETTER YEH BARREE  */