Mercurial > hg > mpdl-group
diff software/mpdl-services/mpiwg-mpdl-lt/bin/de/mpg/mpiwg/berlin/mpdl/lt/text/transcode/Buckwalter2Unicode.lex @ 23:e845310098ba
diverse Korrekturen
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 27 Nov 2012 12:35:19 +0100 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/software/mpdl-services/mpiwg-mpdl-lt/bin/de/mpg/mpiwg/berlin/mpdl/lt/text/transcode/Buckwalter2Unicode.lex Tue Nov 27 12:35:19 2012 +0100
@@ -0,0 +1,135 @@
+package de.mpg.mpiwg.berlin.mpdl.lt.general;
+
+%%
+%{
+  /*
+   * Buckwalter transliteration to Unicode (Arabic script) conversion.
+   *
+   * Every rule action returns the replacement string for the text it matched:
+   *  - a span of the form <...> is returned unchanged,
+   *  - a single Buckwalter character is returned as its Arabic code point,
+   *  - the escapes vert; lpar; rpar; lt; gt; (introduced by & or _) and &apos;
+   *    are returned as the literal character they name,
+   *  - any other &name; entity and any other character is returned unchanged.
+   */
+
+%}
+
+%class Buckwalter2UnicodeLex
+%public
+%type java.lang.String
+%unicode
+%%
+
+
+/* spans of the form <...> are passed through untouched */
+"<"[^>]+">" { return yytext(); }
+
+/* Buckwalter character -> Arabic code point */
+"'" { return "\u0621"; } /* Hamza */
+"|" { return "\u0622"; } /* ALEF WITH MADDA ABOVE from AraMorph */
+">" { return "\u0623"; } /* Alif + HamzaAbove */
+"&" { return "\u0624"; } /* Waw + HamzaAbove */
+"<" { return "\u0625"; } /* Alif + HamzaBelow */
+"}" { return "\u0626"; } /* Ya + HamzaAbove */
+"A" { return "\u0627"; } /* Alif */
+"b" { return "\u0628"; } /* Ba */
+"p" { return "\u0629"; } /* TaMarbuta */
+"t" { return "\u062A"; } /* Ta */
+"v" { return "\u062B"; } /* Tha */
+"j" { return "\u062C"; } /* Jeem */
+"H" { return "\u062D"; } /* HHa */
+"x" { return "\u062E"; } /* Kha */
+"d" { return "\u062F"; } /* Dal */
+"*" { return "\u0630"; } /* Thal */
+"r" { return "\u0631"; } /* Ra */
+"z" { return "\u0632"; } /* Zain */
+"s" { return "\u0633"; } /* Seen */
+"$" { return "\u0634"; } /* Sheen */
+"S" { return "\u0635"; } /* Sad */
+"D" { return "\u0636"; } /* DDad */
+"T" { return "\u0637"; } /* TTa */
+"Z" { return "\u0638"; } /* DTha */
+"E" { return "\u0639"; } /* Ain */
+"g" { return "\u063A"; } /* Ghain */
+
+"_" { return "\u0640"; } /* Tatweel */
+"f" { return "\u0641"; } /* Fa */
+"q" { return "\u0642"; } /* Qaf */
+"k" { return "\u0643"; } /* Kaf */
+"l" { return "\u0644"; } /* Lam */
+"m" { return "\u0645"; } /* Meem */
+"n" { return "\u0646"; } /* Noon */
+"h" { return "\u0647"; } /* Ha */
+"w" { return "\u0648"; } /* Waw */
+"Y" { return "\u0649"; } /* AlifMaksura */
+"y" { return "\u064A"; } /* Ya */
+"F" { return "\u064B"; } /* Fathatan */
+"N" { return "\u064C"; } /* Dammatan */
+"K" { return "\u064D"; } /* Kasratan */
+"a" { return "\u064E"; } /* Fatha */
+"u" { return "\u064F"; } /* Damma */
+"i" { return "\u0650"; } /* Kasra */
+"~" { return "\u0651"; } /* Shadda */
+"o" { return "\u0652"; } /* Sukun */
+"^" { return "\u0653"; } /* Maddah */
+"#" { return "\u0654"; } /* HamzaAbove */
+
+"`" { return "\u0670"; } /* AlifKhanjareeya */
+"{" { return "\u0671"; } /* Alif + HamzatWasl */
+
+"P" { return "\u067E"; } /* PEH from AraMorph */
+"J" { return "\u0686"; } /* TCHEH from AraMorph */
+"V" { return "\u06A4"; } /* VEH from AraMorph */
+"G" { return "\u06AF"; } /* GAF from AraMorph */
+"R" { return "\u0698"; } /* JEH from AraMorph */
+"?" { return "\u061F"; } /* QUESTION MARK from AraMorph */
+
+":" { return "\u06DC"; } /* SmallHighSeen */
+"@" { return "\u06DF"; } /* SmallHighRoundedZero */
+
+"[" { return "\u06E2"; } /* SmallHighMeemIsolatedForm */
+";" { return "\u06E3"; } /* SmallLowSeen */
+"," { return "\u06E5"; } /* SmallWaw */
+"." { return "\u06E6"; } /* SmallYa */
+"!" { return "\u06E8"; } /* SmallHighNoon */
+"-" { return "\u06EA"; } /* EmptyCentreLowStop */
+"+" { return "\u06EB"; } /* EmptyCentreHighStop */
+"%" { return "\u06EC"; } /* RoundedHighStopWithFilledCentre */
+"]" { return "\u06ED"; } /* SmallLowMeem */
+
+/* escaped literals: & or _ followed by the entity name yields the plain character */
+[\&_]"vert;" { return "|"; }
+[\&_]"lpar;" { return "("; }
+[\&_]"rpar;" { return ")"; }
+[\&_]"lt;" { return "<"; }
+[\&_]"gt;" { return ">"; }
+/* a bare apostrophe is already claimed by the Hamza rule above, so the literal
+   apostrophe is matched by its entity; NOTE(review): entity spelling assumed - confirm */
+"&apos;" { return "'"; }
+
+/* any other named entity is left as it is */
+"&"[a-zA-Z]+";" { return yytext(); }
+
+/* fallback: every character without a mapping is copied through */
+. { return yytext(); }
+\n { return yytext(); }
+
+/* causes problems */
+/* "\\"" { return "\u06E0"; } SmallHighUprightRectangularZero */
+
+
+/* duplicate entries (comma and semicolon are already mapped above) */
+/* "," { return "\u060C"; } COMMA from AraMorph */
+/* ";" { return "\u061B"; } SEMICOLON from AraMorph */
+
+/* not contained in Buckwalter */
+/* \u0679 : ARABIC LETTER TTEH */
+/* \u0688 : ARABIC LETTER DDAL */
+/* \u06A9 : ARABIC LETTER KEHEH */
+/* \u0691 : ARABIC LETTER RREH */
+/* \u06BA : ARABIC LETTER NOON GHUNNA */
+/* \u06BE : ARABIC LETTER HEH DOACHASHMEE */
+/* \u06C1 : ARABIC LETTER HEH GOAL */
+/* \u06D2 : ARABIC LETTER YEH BARREE */
+