Mercurial > hg > mpdl-group
diff software/mpdl-services/mpiwg-mpdl-lt/bin/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexAll.lex @ 23:e845310098ba
diverse Korrekturen
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 27 Nov 2012 12:35:19 +0100 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/software/mpdl-services/mpiwg-mpdl-lt/bin/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexAll.lex Tue Nov 27 12:35:19 2012 +0100
@@ -0,0 +1,143 @@
+/*
+ * Normalization rules for all languages
+ * [this is a JFlex specification]
+ * Every rule returns the replacement text for the input it matched (see %type below).
+ * Wolfgang Schmidle
+ * 2011-01-25
+ * Language-specific rules live in the lexical states LA and ZH (presumably Latin and Chinese; TODO confirm).
+ */
+// NOTE(review): the package below is ...lt.analyzer.lang, but this file is stored under .../lt/text/norm/lang; confirm which one is intended.
+package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang;
+
+%%
+
+%public
+%class MpdlNormalizerLexAll
+%type java.lang.String
+%unicode
+// %debug
+
+%states LA, ZH
+// cv is written by most <LA> rules and read by the u/v rules; the <LA> newline and catch-all rules reset it to 0.
+%{
+  int cv = 0; // consonant = 1, vowel = 2, everything else = 0
+%}
+// Macros: VOWEL and CONS drive the cv bookkeeping and the u/v lookahead; LR is l or r in either case; QUE is an optional "que"; END is a newline.
+VOWEL=[AEIOUaeiouÆæęàèòùœ]
+CONS=[BCDFGHKLMNPQRSTVWXZbcdfghklmnpqrstvwxzſß]
+LR=[lLrR]
+QUE=(que)?
+END=\n
+
+%%
+
+<LA> {
+
+// 1. simple replacements
+
+// 1.1 single characters
+ſ { cv = 1; return "s"; }
+ß { cv = 1; return "ss"; }
+[æę] { cv = 2; return "ae"; }
+Æ { cv = 2; return "AE"; }
+œ { cv = 2; return "oe"; }
+// 1.2 character combinations
+ij { cv = 2; return "ii"; }
+
+// 2. diacritics
+
+// 2.1 superfluous diacritics in single words
+^ hîc {END} { return "hic"; } // only when "hîc" starts a line and the newline follows directly; the newline is part of the match
+
+// 2.2 superfluous diacritics at the end of a word
+// 2.2.1 common cases
+à / {QUE} {END} { return "a"; } // "/" is trailing context: the optional "que" and the newline must follow but are not consumed
+àm / {QUE} {END} { return "am"; }
+às / {QUE} {END} { return "as"; } // (-àsque will likely never occur)
+// à / [ms]? {QUE} {END} { return "a"; }
+è / {QUE} {END} { return "e"; }
+ò / {QUE} {END} { return "o"; }
+òd / {QUE} {END} { return "od"; }
+ùm / {QUE} {END} { return "um"; }
+ùs / {QUE} {END} { return "us"; }
+
+// 2.3 superfluous diacritics within a word
+// 2.3.1 common cases
+aë { cv = 2; return "ae"; }
+oë { cv = 2; return "oe"; }
+// 2.3.2 rare cases
+oï { cv = 2; return "oi"; }
+uï { cv = 2; return "ui"; }
+// 2.3.3 extremely rare cases
+uü { cv = 2; return "uu"; }
+
+// 3. rules for u and v
+
+// 3.1 rules for u
+// u before a vowel: emit v when cv == 2 (vowel), otherwise emit u and set cv = 2.
+u/{VOWEL} {
+    switch(cv) {
+      case 2: return "v"; // NOTE(review): cv stays 2 although the emitted v is a consonant (the qv rule sets cv = 1); confirm this is intended
+      default: cv = 2; return "u";
+    }
+  }
+U/{VOWEL} {
+    switch(cv) {
+      case 2: return "V"; // NOTE(review): same as the lower-case rule, cv is left at 2; confirm
+      default: cv = 2; return "U";
+    }
+  }
+
+// 3.2 rules for v
+
+qv { cv = 1; return "qu"; } // the replaced v still counts as consonant
+Qv { cv = 1; return "Qu"; }
+QV { cv = 1; return "QU"; }
+// l or r followed by v: the v becomes u only when cv == 1 (consonant) on entry; otherwise the text is returned unchanged and cv is set to 1.
+{LR}v {
+    switch(cv) {
+      case 1: return yytext().replace("v", "u");
+      default: cv = 1; return yytext();
+    }
+  }
+{LR}V {
+    switch(cv) {
+      case 1: return yytext().replace("V", "U");
+      default: cv = 1; return yytext();
+    }
+  }
+// v before a consonant is rewritten as u; cv is set to 1.
+v/{CONS} { cv = 1; return "u"; }
+V/{CONS} { cv = 1; return "U"; }
+
+// NOTE(review): the stateless default rules at the end of the file match the same input; these earlier <LA> rules are expected to win the tie (confirm against JFlex rule priority).
+// default
+
+{VOWEL} { cv = 2; return yytext(); }
+{CONS} { cv = 1; return yytext(); }
+\n { cv = 0; return ""; } // a newline yields the empty string and resets cv
+. { cv = 0; return yytext(); }
+
+}
+
+<ZH> {
+
+// Codepoint < FFFF
+
+竒 { return "奇"; } // 7AD2 --> 5947
+旹 { return "時"; } // 65F9 --> 6642
+歴 { return "歷"; } // 6B74 --> 6B77
+精 { return "精"; } // FA1D --> 7CBE (FA1D is a compatibility ideograph)
+// NOTE(review): in both compatibility-ideograph rules the pattern and the replacement look identical; if Unicode normalization folded the pattern into its target the rule is a no-op. Confirm the code points; an explicit Unicode escape for the pattern would be safer.
+// Codepoint > FFFF
+
+庶 { return "庶"; } // 2F88D --> 5EB6 (2F88D is a compatibility ideograph)
+
+
+}
+
+
+// default (can be overridden by individual languages)
+
+\n { return ""; }
+. { return yytext(); }