diff software/mpdl-services/mpiwg-mpdl-lt/bin/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexAll.lex @ 23:e845310098ba

diverse Korrekturen
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 27 Nov 2012 12:35:19 +0100
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/software/mpdl-services/mpiwg-mpdl-lt/bin/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexAll.lex	Tue Nov 27 12:35:19 2012 +0100
@@ -0,0 +1,143 @@
+/*
+ * Normalization rules for all languages
+ * [this is a JFlex specification]
+ *
+ * Wolfgang Schmidle 
+ * 2011-01-25
+ *
+ */
+
+package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang;
+
+%%
+
+%public
+%class MpdlNormalizerLexAll
+%type java.lang.String
+%unicode
+// %debug   (disabled; uncomment to get JFlex's debugging output)
+
+%states LA, ZH
+
+%{
+	int cv = 0;  // set by the LA rule actions: consonant = 1, vowel = 2, everything else = 0; the u/v rules branch on it
+%}
+// Macros for the LA rules: letter classes, the optional "que" suffix, and END (a newline; the rules treat it as the end of a word)
+VOWEL=[AEIOUaeiouÆæęàèòùœ]
+CONS=[BCDFGHKLMNPQRSTVWXZbcdfghklmnpqrstvwxzſß]
+LR=[lLrR]
+QUE=(que)?
+END=\n
+
+%%
+
+<LA> {
+
+// 1. plain substitutions; every action also records the letter class in cv
+
+// 1.1 one input character
+ſ				{ cv = 1; return "s"; }
+ß				{ cv = 1; return "ss"; }
+[æę]			{ cv = 2; return "ae"; }
+Æ				{ cv = 2; return "AE"; }
+œ				{ cv = 2; return "oe"; }
+// 1.2 two-character sequences
+ij				{ cv = 2; return "ii"; }
+
+// 2. removal of diacritics
+
+// 2.1 a complete line consisting of one word with a superfluous diacritic
+^ hîc {END} 			{ return "hic"; }
+
+// 2.2 superfluous diacritic on a word ending, optionally followed by "que"
+// 2.2.1 frequent endings
+à / {QUE} {END}			{ return "a"; }
+àm / {QUE} {END}	{ return "am"; }
+às / {QUE} {END}		{ return "as"; }  // ("-àsque" is unlikely to occur)
+// à / [ms]? {QUE} {END}		{ return "a"; }
+è / {QUE} {END}			{ return "e"; }
+ò / {QUE} {END}			{ return "o"; }
+òd / {QUE} {END}		{ return "od"; }
+ùm / {QUE} {END}		{ return "um"; }
+ùs / {QUE} {END}		{ return "us"; }
+
+// 2.3 superfluous diacritic inside a word
+// 2.3.1 frequent
+aë				{ cv = 2; return "ae"; }
+oë				{ cv = 2; return "oe"; }
+// 2.3.2 rare
+oï				{ cv = 2; return "oi"; }
+uï				{ cv = 2; return "ui"; }
+// 2.3.3 extremely rare
+uü			{ cv = 2; return "uu"; }
+
+// 3. u and v
+
+// 3.1 u
+
+u/{VOWEL} 		{ 
+								// previous token was a vowel and a vowel follows: emit v; cv is left as it was
+								if (cv == 2) return "v";
+								cv = 2;
+								return "u";
+							}
+U/{VOWEL}		{ 
+								// upper-case counterpart of the rule above
+								if (cv == 2) return "V";
+								cv = 2;
+								return "U";
+							}
+
+// 3.2 v
+
+qv			{ cv = 1; return "qu"; }  // cv stays "consonant" although a u is emitted
+Qv		{ cv = 1; return "Qu"; }
+QV		{ cv = 1; return "QU"; }
+
+{LR}v					{ 
+								// l/r preceded by a consonant (cv == 1): the v is written as u
+								if (cv == 1) return yytext().replace("v", "u");
+								cv = 1;
+								return yytext();
+							}
+{LR}V					{ 
+								// upper-case V after l/r, same condition as above
+								if (cv == 1) return yytext().replace("V", "U");
+								cv = 1;
+								return yytext();
+							}
+
+v/{CONS}			{ cv = 1; return "u"; }
+V/{CONS}			{ cv = 1; return "U"; }
+
+
+// fallback: copy the character through and record its class; a newline is dropped and resets cv
+
+{VOWEL}		{ cv = 2; return yytext(); }
+{CONS}			{ cv = 1; return yytext(); }
+\n					{ cv = 0; return ""; }
+.					{ cv = 0; return yytext(); }
+
+}
+
+<ZH> {
+
+// Codepoint < FFFF (compatibility ideographs are written as Unicode escapes so that normalizing this file cannot turn the rule into an identity mapping)
+
+竒	{ return "奇"; }  // 7AD2 --> 5947
+旹	{ return "時"; }  // 65F9 --> 6642
+歴	{ return "歷"; }  // 6B74 --> 6B77
+\uFA1D	{ return "\u7CBE"; }  // FA1D --> 7CBE (FA1D is a compatibility ideograph)
+
+// Codepoint > FFFF (NOTE(review): pattern left as a literal; an escape for a supplementary code point depends on the JFlex version in use -- confirm and convert)
+
+庶	{ return "\u5EB6"; }  // 2F88D --> 5EB6  (2F88D is a compatibility ideograph)
+
+
+}
+
+
+// default rules (can be overridden by individual languages; LA and ZH are declared with %states, i.e. inclusive, so these stay active there): a newline is dropped, any other character is copied unchanged
+
+\n					{ return ""; }
+.					{ return yytext(); }