diff software/mpdl-services/mpiwg-mpdl-lt/bin/de/mpg/mpiwg/berlin/mpdl/lt/text/transcode/Buckwalter2Unicode.lex @ 23:e845310098ba

diverse Korrekturen
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 27 Nov 2012 12:35:19 +0100
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/software/mpdl-services/mpiwg-mpdl-lt/bin/de/mpg/mpiwg/berlin/mpdl/lt/text/transcode/Buckwalter2Unicode.lex	Tue Nov 27 12:35:19 2012 +0100
@@ -0,0 +1,121 @@
+package de.mpg.mpiwg.berlin.mpdl.lt.general;
+// NOTE(review): this package differs from the file's directory (lt/text/transcode) - confirm it is intended
+%%
+%{
+    /*
+     * Buckwalter transliteration to Unicode (Arabic script) conversion.
+     * Each rule of the generated scanner returns the converted text of one token as a String.
+     */
+
+%}
+
+%class Buckwalter2UnicodeLex
+%public
+%type java.lang.String
+%unicode
+%%
+
+
+"<"[^>]+">" { return yytext(); }  /* markup tag: returned unchanged; being longer, it wins over the single "<" rule below */
+/* Single-character Buckwalter codes, each returned as one Arabic code point */
+"'"   { return "\u0621"; }  /* Hamza  */
+"|"   { return "\u0622"; }  /* ALEF WITH MADDA ABOVE  from AraMorph */
+">"   { return "\u0623"; }  /* Alif + HamzaAbove  */
+"&"   { return "\u0624"; }  /* Waw + HamzaAbove  */
+"<"   { return "\u0625"; }  /* Alif + HamzaBelow  */
+"}"   { return "\u0626"; }  /* Ya + HamzaAbove  */
+"A"   { return "\u0627"; }  /* Alif  */
+"b"   { return "\u0628"; }  /* Ba  */
+"p"   { return "\u0629"; }  /* TaMarbuta  */
+"t"   { return "\u062A"; }  /* Ta  */
+"v"   { return "\u062B"; }  /* Tha  */
+"j"   { return "\u062C"; }  /* Jeem  */
+"H"   { return "\u062D"; }  /* HHa  */
+"x"   { return "\u062E"; }  /* Kha  */
+"d"   { return "\u062F"; }  /* Dal  */
+"*"   { return "\u0630"; }  /* Thal  */
+"r"   { return "\u0631"; }  /* Ra  */
+"z"   { return "\u0632"; }  /* Zain  */
+"s"   { return "\u0633"; }  /* Seen  */
+"$"   { return "\u0634"; }  /* Sheen  */
+"S"   { return "\u0635"; }  /* Sad  */
+"D"   { return "\u0636"; }  /* DDad  */
+"T"   { return "\u0637"; }  /* TTa  */
+"Z"   { return "\u0638"; }  /* DTha  */
+"E"   { return "\u0639"; }  /* Ain  */
+"g"   { return "\u063A"; }  /* Ghain  */
+
+"_"   { return "\u0640"; }  /* Tatweel  */
+"f"   { return "\u0641"; }  /* Fa  */
+"q"   { return "\u0642"; }  /* Qaf  */
+"k"   { return "\u0643"; }  /* Kaf  */
+"l"   { return "\u0644"; }  /* Lam  */
+"m"   { return "\u0645"; }  /* Meem  */
+"n"   { return "\u0646"; }  /* Noon  */
+"h"   { return "\u0647"; }  /* Ha  */
+"w"   { return "\u0648"; }  /* Waw  */
+"Y"   { return "\u0649"; }  /* AlifMaksura  */
+"y"   { return "\u064A"; }  /* Ya  */
+"F"   { return "\u064B"; }  /* Fathatan  */
+"N"   { return "\u064C"; }  /* Dammatan  */
+"K"   { return "\u064D"; }  /* Kasratan  */
+"a"   { return "\u064E"; }  /* Fatha  */
+"u"   { return "\u064F"; }  /* Damma  */
+"i"   { return "\u0650"; }  /* Kasra  */
+"~"   { return "\u0651"; }  /* Shadda  */
+"o"   { return "\u0652"; }  /* Sukun  */
+"^"   { return "\u0653"; }  /* Maddah  */
+"#"   { return "\u0654"; }  /* HamzaAbove  */
+
+"`"   { return "\u0670"; }  /* AlifKhanjareeya  */
+"{"   { return "\u0671"; }  /* Alif + HamzatWasl  */
+
+"P"   { return "\u067E"; }  /* PEH  from AraMorph   */
+"J"   { return "\u0686"; }  /* TCHEH  from AraMorph */
+"V"   { return "\u06A4"; }  /* VEH  from AraMorph */
+"G"   { return "\u06AF"; }  /* GAF  from AraMorph */
+"R"   { return "\u0698"; }  /* JEH  from AraMorph */
+"?"   { return "\u061F"; }  /* QUESTION MARK  from AraMorph */
+
+":"   { return "\u06DC"; }  /* SmallHighSeen  */
+"@"   { return "\u06DF"; }  /* SmallHighRoundedZero  */
+
+"["   { return "\u06E2"; }  /* SmallHighMeemIsolatedForm  */
+";"   { return "\u06E3"; }  /* SmallLowSeen  */
+","   { return "\u06E5"; }  /* SmallWaw  */
+"."   { return "\u06E6"; }  /* SmallYa  */
+"!"   { return "\u06E8"; }  /* SmallHighNoon  */
+"-"   { return "\u06EA"; }  /* EmptyCentreLowStop  */
+"+"   { return "\u06EB"; }  /* EmptyCentreHighStop  */
+"%"   { return "\u06EC"; }  /* RoundedHighStopWithFilledCentre  */
+"]"   { return "\u06ED"; }  /* SmallLowMeem  */
+/* Entities introduced by "&" or "_": vert, lpar and rpar are decoded; lt and gt are returned in escaped "&" form */
+[\&_]"vert;"   { return "|"; }
+[\&_]"lpar;"   { return "("; }
+[\&_]"rpar;"   { return ")"; }
+[\_\&]"lt;"    { return "&lt;"; }
+[\_\&]"gt;"    { return "&gt;"; }
+"&#039;"       { return "'"; } 
+/* Any other alphabetic entity is returned unchanged (the entity rules above come first and win ties) */
+"&"[a-zA-Z]+";"  { return yytext(); }
+
+/* Fallback: [^] matches every remaining character, including \r and the other line terminators that "." does not match in newer JFlex versions; the text is returned unchanged. */
+[^]     { return yytext(); }
+
+/* causes problems (disabled): the quoting of the pattern below looks unbalanced - verify before re-enabling   */
+/* "\\""   { return "\u06E0"; }  SmallHighUprightRectangularZero  */ 
+
+
+/* duplicate entries: these input characters are already mapped by the rules above    */
+/*  ","   { return "\u060C"; }  COMMA  from AraMorph */
+/*  ";"   { return "\u061B"; }  SEMICOLON  from AraMorph */
+
+/* characters not covered by the Buckwalter transliteration   */
+/* \u0679 : ARABIC LETTER TTEH   */
+/* \u0688 : ARABIC LETTER DDAL   */
+/* \u06A9 : ARABIC LETTER KEHEH  */
+/* \u0691 : ARABIC LETTER RREH   */
+/* \u06BA : ARABIC LETTER NOON GHUNNA  */
+/* \u06BE : ARABIC LETTER HEH DOACHASHMEE  */
+/* \u06C1 : ARABIC LETTER HEH GOAL  */
+/* \u06D2 : ARABIC LETTER YEH BARREE  */
+