Mercurial > hg > mpdl-group
view software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexAll.lex @ 19:4a3641ae14d2
Erstellung
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Wed, 09 Nov 2011 15:32:05 +0100 |
parents | |
children |
line wrap: on
line source
/*
 * Normalization rules for all languages
 * [this is a JFlex specification]
 *
 * Wolfgang Schmidle
 * 2011-01-25
 *
 * Overview:
 *   - The generated scanner returns, per call of yylex(), the normalized
 *     replacement string for the text it just matched.
 *   - Each language has its own lexical state (LA, ZH); the caller selects
 *     the state via yybegin(...) before scanning.
 *   - Rules that look at the end of a word anchor on a trailing "\n" (END),
 *     so the input is presumably fed as one word per line -- confirm
 *     against the caller.
 *
 * NOTE(review): the repository path of this file is .../lt/text/norm/lang/
 * while the package below is ...lt.analyzer.lang -- confirm which one is
 * intended. The package is left untouched here because callers depend on it.
 */

package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang;

%%

%public
%class MpdlNormalizerLexAll
%type java.lang.String
%unicode
// %debug

%states LA, ZH

%{
  // Class of the previously processed character (used by the Latin u/v rules):
  // consonant = 1, vowel = 2, everything else = 0
  int cv = 0;
%}

// Character classes used by the Latin rules.
VOWEL=[AEIOUaeiouÆæęàèòùœ]
CONS=[BCDFGHKLMNPQRSTVWXZbcdfghklmnpqrstvwxzſß]
LR=[lLrR]
// Optional enclitic "-que" in front of the end of the word.
QUE=(que)?
// End-of-word marker.
END=\n

%%

<LA> {

// 1. simple replacements

// 1.1 single characters

ſ      { cv = 1; return "s"; }
ß      { cv = 1; return "ss"; }
[æę]   { cv = 2; return "ae"; }
Æ      { cv = 2; return "AE"; }
œ      { cv = 2; return "oe"; }

// 1.2 character combinations

ij     { cv = 2; return "ii"; }

// 2. diacritics

// 2.1 superfluous diacritics in single words
// (matches the whole word including its terminating newline)

^ hîc {END}        { return "hic"; }

// 2.2 superfluous diacritics at the end of a word
// (trailing context: only the part before "/" is consumed)

// 2.2.1 common cases

à  / {QUE} {END}   { return "a"; }
àm / {QUE} {END}   { return "am"; }
às / {QUE} {END}   { return "as"; }   // (-àsque will likely never occur)
// à / [ms]? {QUE} {END} { return "a"; }
è  / {QUE} {END}   { return "e"; }
ò  / {QUE} {END}   { return "o"; }
òd / {QUE} {END}   { return "od"; }
ùm / {QUE} {END}   { return "um"; }
ùs / {QUE} {END}   { return "us"; }

// 2.3 superfluous diacritics within a word

// 2.3.1 common cases

aë     { cv = 2; return "ae"; }
oë     { cv = 2; return "oe"; }

// 2.3.2 rare cases

oï     { cv = 2; return "oi"; }
uï     { cv = 2; return "ui"; }

// 2.3.3 extremely rare cases

uü     { cv = 2; return "uu"; }

// 3. rules for u and v

// 3.1 rules for u
// A "u" between two vowels is written as "v"; otherwise it stays a vowel.

u/{VOWEL} {
    switch(cv) {
        case 2: return "v";
        default: cv = 2; return "u";
    }
}
U/{VOWEL} {
    switch(cv) {
        case 2: return "V";
        default: cv = 2; return "U";
    }
}

// 3.2 rules for v

qv     { cv = 1; return "qu"; }   // the replaced v still counts as consonant
Qv     { cv = 1; return "Qu"; }
QV     { cv = 1; return "QU"; }

// l/r + v after a consonant: the v is written as u; otherwise the text is kept.
{LR}v {
    switch(cv) {
        case 1: return yytext().replace("v", "u");
        default: cv = 1; return yytext();
    }
}
{LR}V {
    switch(cv) {
        case 1: return yytext().replace("V", "U");
        default: cv = 1; return yytext();
    }
}

// v directly in front of a consonant is written as u.
v/{CONS}   { cv = 1; return "u"; }
V/{CONS}   { cv = 1; return "U"; }

// default: pass the character through and remember its class

{VOWEL}    { cv = 2; return yytext(); }
{CONS}     { cv = 1; return yytext(); }
\n         { cv = 0; return ""; }
.          { cv = 0; return yytext(); }

}

<ZH> {

// Codepoint < FFFF

竒         { return "奇"; }   // 7AD2 --> 5947
旹         { return "時"; }   // 65F9 --> 6642
歴         { return "歷"; }   // 6B74 --> 6B77

// FA1D --> 7CBE (FA1D is a compatibility ideograph)
// The pattern is written as an escape on purpose: U+FA1D canonically
// decomposes to U+7CBE, so a literal character here is silently turned into
// its own replacement by any tool that Unicode-normalizes the file, which
// makes the rule a no-op.
\uFA1D     { return "精"; }

// Codepoint > FFFF

// 2F88D --> 5EB6 (2F88D is a compatibility ideograph)
// Escaped for the same reason as above. "\U" + 6 hex digits requires a
// code-point-aware JFlex (1.5 or later); with JFlex 1.4.x write the
// surrogate pair \uD87E\uDC8D instead.
\U02F88D   { return "庶"; }

}

// default (can be overridden by individual languages)

\n         { return ""; }
.          { return yytext(); }