mpdl-group: software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexAll.lex comparison

comparison software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexAll.lex @ 6:2396a569e446

new functions: externalObjects, normalizer, Unicode2Betacode

author	Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date	Tue, 08 Feb 2011 14:54:09 +0100
parents
children

comparison

equal deleted inserted replaced

-:94305c504178
+:2396a569e446
+/*
+* Normalization rules for all languages
+* [this is a JFlex specification]
+*
+* Wolfgang Schmidle
+* 2011-01-25
+*
+*/
+// User-code section: everything before the first %% is copied to the top of the generated Java class file.
+package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang;
+%%
+// Options and declarations for the generated scanner class.
+%public
+%class MpdlNormalizerLexAll
+// The scanning method returns a String: every rule below returns the (possibly rewritten) text for what it matched.
+%type java.lang.String
+%unicode
+// %debug
+// One lexical state per language-specific rule group; the <LA> and <ZH> groups follow in the rules section.
+// NOTE(review): LA / ZH presumably stand for Latin / Chinese -- confirm with the code that selects the state.
+%states LA, ZH
+%{
+	// Class of the most recently handled character. It is written by the LA rules and
+	// read by the LA rules for u and v to decide between the vowel and consonant spelling.
+	int cv = 0;  // consonant = 1, vowel = 2, everything else = 0
+%}
+// Character classes used both as lookahead context and as the default single-character rules in state LA.
+// NOTE(review): J/j and Y/y belong to neither class, so in state LA they fall through to the
+// catch-all rule, which resets cv to 0 -- confirm this is intended.
+VOWEL=[AEIOUaeiouÆæęàèòùœ]
+CONS=[BCDFGHKLMNPQRSTVWXZbcdfghklmnpqrstvwxzſß]
+// l or r in either case; used by the rules that rewrite a v/V following l or r.
+LR=[lLrR]
+// Optional "que" suffix that may stand between a word-final diacritic and the end of the word.
+QUE=(que)?
+// A newline ends the current word: the LA rules use it as end-of-word lookahead.
+// NOTE(review): this suggests the input is fed one word per line -- verify against the caller.
+END=\n
+%%
+// Rules that are only active in lexical state LA.
+// Every action returns the replacement text for the matched input. Most actions also store in cv
+// the class of the character just handled (1 = consonant, 2 = vowel, 0 = other); the rules for
+// u and v read that value to look one character back.
+// Rule order matters: for matches of equal length the rule listed first wins, so the specific
+// rules must stay above the single-character defaults at the end of this group.
+<LA> {
+// 1. simple replacements
+// 1.1 single characters
+ſ				{ cv = 1; return "s"; }
+ß				{ cv = 1; return "ss"; }
+[æę]			{ cv = 2; return "ae"; }
+Æ				{ cv = 2; return "AE"; }
+œ				{ cv = 2; return "oe"; }
+// 1.2 character combinations
+ij				{ cv = 2; return "ii"; }
+// 2. diacritics
+// 2.1 superfluous diacritics in single words
+// Matches only when "hîc" fills the whole line (^ ... END). The newline is part of the match
+// and is dropped from the output; cv is left untouched.
+^ hîc {END} 			{ return "hic"; }
+// 2.2 superfluous diacritics at the end of a word
+// 2.2.1 common cases
+// The part after "/" is lookahead only: the accent is removed when the ending is followed by an
+// optional "que" and the end of the word, and that following text is not consumed here.
+// These actions do not update cv; the text that follows (a "q" or the newline) is handled by
+// rules further down that set it.
+à / {QUE} {END}			{ return "a"; }
+àm / {QUE} {END}	{ return "am"; }
+às / {QUE} {END}		{ return "as"; }  // (-àsque will likely never occur)
+// à / [ms]? {QUE} {END}		{ return "a"; }
+è / {QUE} {END}			{ return "e"; }
+ò / {QUE} {END}			{ return "o"; }
+òd / {QUE} {END}		{ return "od"; }
+ùm / {QUE} {END}		{ return "um"; }
+ùs / {QUE} {END}		{ return "us"; }
+// 2.3 superfluous diacritics within a word
+// 2.3.1 common cases
+aë				{ cv = 2; return "ae"; }
+oë				{ cv = 2; return "oe"; }
+// 2.3.2 rare cases
+oï				{ cv = 2; return "oi"; }
+uï				{ cv = 2; return "ui"; }
+// 2.3.3 extremely rare cases
+uü			{ cv = 2; return "uu"; }
+// 3. rules for u and v
+// 3.1 rules for u
+// u directly before a vowel: written as "v" when the preceding character was a vowel (cv == 2),
+// otherwise kept as "u". In both branches cv is 2 afterwards, i.e. the character keeps counting
+// as a vowel even when it is emitted as "v".
+u/{VOWEL} 		{
+								switch(cv) {
+									case 2: return "v";
+									default: cv = 2; return "u";
+								}
+							}
+// Upper-case counterpart of the rule above.
+U/{VOWEL}		{
+								switch(cv) {
+									case 2: return "V";
+									default: cv = 2; return "U";
+								}
+							}
+// 3.2 rules for v
+qv			{ cv = 1; return "qu"; }  // the replaced v still counts as consonant
+Qv		{ cv = 1; return "Qu"; }
+QV		{ cv = 1; return "QU"; }
+// l or r followed by v: the v becomes "u" when the character before the l/r was a consonant
+// (cv == 1); otherwise the pair is returned unchanged. cv is 1 after either branch.
+{LR}v					{
+								switch(cv) {
+									case 1: return yytext().replace("v", "u");
+									default: cv = 1; return yytext();
+								}
+							}
+// Upper-case V counterpart of the rule above.
+{LR}V					{
+								switch(cv) {
+									case 1: return yytext().replace("V", "U");
+									default: cv = 1; return yytext();
+								}
+							}
+// v directly before a consonant is written as "u" but recorded as a consonant (cv = 1).
+v/{CONS}			{ cv = 1; return "u"; }
+V/{CONS}			{ cv = 1; return "U"; }
+// default
+// Single characters not handled above are passed through; only cv is updated.
+// The newline is dropped from the output and resets cv for the next word.
+{VOWEL}		{ cv = 2; return yytext(); }
+{CONS}			{ cv = 1; return yytext(); }
+\n					{ cv = 0; return ""; }
+.					{ cv = 0; return yytext(); }
+}
+// Rules that are only active in lexical state ZH: each one replaces a single variant
+// character by the form given in the comment at the end of the line. Characters not listed
+// here are handled by the state-less default rules that follow this group.
+<ZH> {
+// Codepoint < FFFF
+竒	{ return "奇"; }  // 7AD2 --> 5947
+旹	{ return "時"; }  // 65F9 --> 6642
+歴	{ return "歷"; }  // 6B74 --> 6B77
+// The pattern is written as a Unicode escape on purpose: U+FA1D is a CJK compatibility
+// ideograph, and every Unicode normalization form replaces it by U+7CBE. A literal character
+// in this file can therefore be turned into U+7CBE by an editor or tool that normalizes text,
+// which would make the rule map U+7CBE onto itself.
+\uFA1D	{ return "精"; }  // FA1D --> 7CBE (FA1D is a compatibility ideograph)
+// Codepoint > FFFF
+// NOTE(review): the literal pattern below is exposed to the same normalization problem
+// (U+2F88D normalizes to U+5EB6). How a supplementary code point must be escaped depends on
+// the JFlex version in use, so the literal is kept -- verify that the character left of the
+// action really is U+2F88D in the checked-in file.
+庶	{ return "庶"; }  // 2F88D --> 5EB6  (2F88D is a compatibility ideograph)
+}
+// default (can be overridden by individual languages)
+// These two rules carry no state prefix. A newline is dropped from the output, and any other
+// single character is returned unchanged.
+// A language group "overrides" them by listing its own \n and . rules earlier in the file,
+// as the <LA> group does; the <ZH> group has none, so its unlisted characters end up here.
+\n					{ return ""; }
+.					{ return yytext(); }

Mercurial > hg > mpdl-group

comparison software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexAll.lex @ 6:2396a569e446