view software/mpdl-services/mpiwg-mpdl-lt/bin/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexAll.lex @ 23:e845310098ba

diverse Korrekturen
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 27 Nov 2012 12:35:19 +0100
parents
children
line wrap: on
line source

/*
 * Normalization rules for all languages
 * [this is a JFlex specification]
 *
 * Wolfgang Schmidle 
 * 2011-01-25
 *
 */

// NOTE(review): this copy of the specification is stored under
// .../mpdl/lt/text/norm/lang/ while the package declared below is
// ...mpdl.lt.analyzer.lang -- confirm that the generated class ends up in the
// package its callers import.
package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang;

%%

// Generated scanner: a public class named MpdlNormalizerLexAll whose scanning
// method returns a java.lang.String for every matched rule.
%public
%class MpdlNormalizerLexAll
%type java.lang.String
// Scan Unicode input.
%unicode
// %debug

// Lexical states; the rules section contains one rule block per state
// (<LA> and <ZH>). Nothing in this file switches state, so the caller
// presumably selects one before scanning -- TODO confirm against the caller.
%states LA, ZH

%{
	// Class of the character handled last. The <LA> rules write it on most
	// matches and the u/v rules read it to decide on a replacement.
	int cv = 0;  // consonant = 1, vowel = 2, everything else = 0
%}

// Macros used by the <LA> rules:
//   VOWEL / CONS  letters that set cv to 2 / 1 in the default rules and that
//                 serve as lookahead in the u/v rules
//   LR            the letters l and r in either case (used in front of v/V)
//   QUE           an optional "que" suffix
//   END           a single newline character
VOWEL=[AEIOUaeiouÆæęàèòùœ]
CONS=[BCDFGHKLMNPQRSTVWXZbcdfghklmnpqrstvwxzſß]
LR=[lLrR]
QUE=(que)?
END=\n

%%

<LA> {

// ---------------------------------------------------------------------------
// 1. Plain substitutions
// ---------------------------------------------------------------------------

// 1.1 Single letters; each rule also records the letter class in cv
//     (1 = consonant, 2 = vowel).
ſ		{ cv = 1; return "s"; }
ß		{ cv = 1; return "ss"; }
[æę]		{ cv = 2; return "ae"; }
Æ		{ cv = 2; return "AE"; }
œ		{ cv = 2; return "oe"; }

// 1.2 Letter pairs
ij		{ cv = 2; return "ii"; }

// ---------------------------------------------------------------------------
// 2. Diacritics
// ---------------------------------------------------------------------------

// 2.1 Whole words that carry a superfluous diacritic.
//     The terminating newline is part of the match and is consumed with it.
^ hîc {END}		{ return "hic"; }

// 2.2 Superfluous diacritics at the end of a word.
//     The trailing context (optional "que", then newline) is only looked at,
//     not consumed; none of these rules touches cv.
// 2.2.1 Frequent endings
à / {QUE} {END}		{ return "a"; }
àm / {QUE} {END}	{ return "am"; }
às / {QUE} {END}	{ return "as"; }  // (-àsque will likely never occur)
// Alternative that was left disabled in favour of the three rules above:
// à / [ms]? {QUE} {END}		{ return "a"; }
è / {QUE} {END}		{ return "e"; }
ò / {QUE} {END}		{ return "o"; }
òd / {QUE} {END}	{ return "od"; }
ùm / {QUE} {END}	{ return "um"; }
ùs / {QUE} {END}	{ return "us"; }

// 2.3 Superfluous diacritics inside a word
// 2.3.1 Frequent
aë		{ cv = 2; return "ae"; }
oë		{ cv = 2; return "oe"; }
// 2.3.2 Rare
oï		{ cv = 2; return "oi"; }
uï		{ cv = 2; return "ui"; }
// 2.3.3 Extremely rare
uü		{ cv = 2; return "uu"; }

// ---------------------------------------------------------------------------
// 3. u and v
// ---------------------------------------------------------------------------

// 3.1 u in front of a vowel: when the previous character was a vowel as well
//     (cv == 2) it is written as v and cv is left as it is; otherwise the u
//     is kept and recorded as a vowel.
u/{VOWEL}	{
			if (cv == 2) {
				return "v";
			}
			cv = 2;
			return "u";
		}
U/{VOWEL}	{
			if (cv == 2) {
				return "V";
			}
			cv = 2;
			return "U";
		}

// 3.2 v
// After q the v is written as u, but it still counts as a consonant.
qv		{ cv = 1; return "qu"; }
Qv		{ cv = 1; return "Qu"; }
QV		{ cv = 1; return "QU"; }

// l or r followed by v: when a consonant precedes (cv == 1) the v is written
// as u and cv stays 1; otherwise the pair is passed through and recorded as
// a consonant.
{LR}v		{
			if (cv == 1) {
				return yytext().replace("v", "u");
			}
			cv = 1;
			return yytext();
		}
{LR}V		{
			if (cv == 1) {
				return yytext().replace("V", "U");
			}
			cv = 1;
			return yytext();
		}

// v in front of a consonant is written as u (and recorded as a consonant).
v/{CONS}	{ cv = 1; return "u"; }
V/{CONS}	{ cv = 1; return "U"; }


// ---------------------------------------------------------------------------
// Fallbacks: pass the character through and keep cv up to date
// ---------------------------------------------------------------------------

{VOWEL}		{ cv = 2; return yytext(); }
{CONS}		{ cv = 1; return yytext(); }
\n		{ cv = 0; return ""; }
.		{ cv = 0; return yytext(); }

}

<ZH> {

// Replaces variant and compatibility ideographs by the form given in the
// action. Every rule matches exactly one character and returns the
// replacement as a string.
//
// The two compatibility ideographs (U+FA1D, U+2F88D) are written as escapes
// rather than as literal characters: Unicode normalization (NFC) replaces
// them by the very characters they are mapped to here, so a literal pattern
// silently degrades into a no-op rule as soon as an editor or tool
// normalizes this file.

// Code points <= FFFF

竒	{ return "奇"; }  // 7AD2 --> 5947
旹	{ return "時"; }  // 65F9 --> 6642
歴	{ return "歷"; }  // 6B74 --> 6B77
\uFA1D	{ return "精"; }  // FA1D --> 7CBE (FA1D is a compatibility ideograph)

// Code points > FFFF

// U+2F88D is spelled as its UTF-16 surrogate pair (D87E DC8D), which is how a
// scanner working on 16-bit chars sees the literal character.
// TODO confirm the JFlex version used by the build: a code-point based JFlex
// (1.5 and later) offers \U02F88D as the direct spelling.
\uD87E\uDC8D	{ return "庶"; }  // 2F88D --> 5EB6  (2F88D is a compatibility ideograph)


}


// Fallback rules outside any state block (individual languages can override
// them with their own rules): a newline produces the empty string, any other
// character is passed through unchanged.

\n	{
		return "";
	}
.	{
		return yytext();
	}