diff software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexLA.lex @ 19:4a3641ae14d2

Erstellung
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Wed, 09 Nov 2011 15:32:05 +0100
parents
children e845310098ba
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexLA.lex	Wed Nov 09 15:32:05 2011 +0100
@@ -0,0 +1,228 @@
+/*
+ * Normalization rules for Latin text
+ * [this is a JFlex specification]
+ *
+ * Wolfgang Schmidle 
+ * version 2011-07-12
+ *
+ */
+
+package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang;
+
+%%
+
+%public
+%class MpdlNormalizerLexLA
+%type java.lang.String
+%unicode
+// The generated scanner is the public class MpdlNormalizerLexLA; its scanning method returns a java.lang.String (the value returned by the END rules).
+// Latin: la, lat
+// Lexical states: one per output mode (DISP, DICT, SEARCH), each also as a RENAISSANCE_ variant; the rules in this file treat both variants of a mode identically.
+%states DISP, DICT, SEARCH
+%states RENAISSANCE_DISP, RENAISSANCE_DICT, RENAISSANCE_SEARCH
+
+%{
+	// Codes stored in cv to remember the class of the most recently processed character.
+	private static final int CONS = 1;
+	private static final int VOWEL = 2;
+
+	// Java regular expression for a line break inside a word (hyphen or soft hyphen
+	// followed by a space). The DICT and SEARCH end-of-word actions remove every
+	// match of it from the normalized word.
+	private static final String LB = "[\u002d\u00ad] ";
+
+	// Class of the last character seen: CONS, VOWEL, or 0 for everything else.
+	private int cv = 0;
+
+	// Set to 1 by the rules for "@" and for any otherwise unmatched character;
+	// the end-of-word actions then do not return the normalized form.
+	private int problem = 0;
+
+	// The word as read so far (matched text, unchanged) and its normalized counterpart.
+	private String original = "";
+	private String normalized = "";
+
+	/**
+	 * Appends the text of the current match to the original word and the given
+	 * replacement to the normalized word.
+	 *
+	 * @param replacement normalized form of the current match
+	 */
+	private void add (String replacement) {
+		original = original + yytext();
+		normalized = normalized + replacement;
+	}
+%}
+
+Vowel = [AEIOUaeiouÆæęœ] // vowel letters incl. the ligatures Æ æ ę œ; without àèòù etc.
+Cons = [BCDFGHKLMNPQRSTVWXZbcdfghklmnpqrstvwxzſß] // consonant letters incl. long s (ſ) and ß; J and j are not part of this class
+// y counts neither as Vowel nor as Cons, see the default rule below: [yY] { cv = 0; add(yytext()); }
+
+LR = [lLrR] // l or r in either case; used by the "{LR} [vV]" rule (febrvarius --> februarius)
+
+hyphen = [\u002d\u00ad]  // hyphen and soft hyphen
+LB = {hyphen} \u0020 // line break inside a word: hyphen or soft hyphen followed by a space
+lb = ({hyphen} \u0020)? // the same line break, but optional
+
+END = \n // end of the word; the input is expected to be one word terminated by a newline
+
+que = (que)?  // optional -que
+enclitic = (que | ve | ne) // enclitics required after the accented vowel in the acute-accent rules
+prefixCons = (in{lb}ter | per | ſu{lb}per | ſer) // word-initial prefixes ending in a consonant after which u is read as v; "ſer" for forms of ſervare
+
+%%
+
+
+// TEST, see Benedetti page 444: this symbol is written out as "X". The rule has no state list and leaves cv untouched.
+𐆑 { add("X"); } // (U+10191; D800+DD91)
+
+
+<DISP, DICT, SEARCH, 
+RENAISSANCE_DISP, RENAISSANCE_DICT, RENAISSANCE_SEARCH> {
+
+// Rules shared by all six states. Every action appends the matched text to
+// "original" and its normalized form to "normalized" through add(). Most
+// actions also update cv (class of the last character: CONS, VOWEL or 0),
+// which the u/v rules in section 3 consult to decide what came before.
+// JFlex takes the longest match and, among matches of equal length, the rule
+// listed first, so the order of the rules below is significant.
+// The trailing tags "W", "G" and "??" are annotations kept from the original author.
+
+// 1. simple replacements
+
+// 1.1 single characters
+ſ { cv = CONS; add("s"); }
+ß { cv = CONS; add("ss"); }
+[æę] { cv = VOWEL; add("ae"); }
+Æ { cv = VOWEL; add("AE"); }
+œ { cv = VOWEL; add("oe"); }
+// upper-case counterpart of œ, treated like Æ; without this rule Œ would reach
+// the catch-all "." rule and mark the whole word as a problem
+Œ { cv = VOWEL; add("OE"); }
+
+// 1.2 character combinations
+ij { cv = VOWEL; add("ii"); }
+
+// 2. superfluous diacritics
+// (these rules drop the diacritic only in the listed word-final contexts and
+// do not change cv)
+
+// 2.1 acute accent
+q́ue / {END} { add("que"); }  // G
+á / [mrst]? {enclitic} {END} { add("a"); }  // G
+é / [mrst]? {enclitic} {END} { add("e"); }  // G
+í / [mrst]? {enclitic} {END} { add("i"); }  // G
+ó / [mrst]? {enclitic} {END} { add("o"); }  // G
+ú / [mrst]? {enclitic} {END} { add("u"); }  // G
+
+úe / {END} { add("ve"); }  // W ??
+
+// 2.2 grave accent
+à / {que} {END} { add("a"); }  // W G
+àm / {que} {END} { add("am"); }  // W (G)
+às / {que} {END} { add("as"); }  // W (G) (-àsque will likely never occur)
+è / {que} {END} { add("e"); }  // W G
+ò / {que} {END} { add("o"); }  // W G
+òd / {que} {END} { add("od"); }  // W (G)
+ùm / {que} {END} { add("um"); }  // W (G)
+ùs / {que} {END} { add("us"); }  // W G
+
+ès / {que} {END} { add("es"); }  // (G)
+^ quì / {END} { add("qui"); }  // W ??
+^ Quì / {END} { add("Qui"); }  // W ??
+àc / {END} { add("ac"); }  // W ??
+èr / {END} { add("er"); }  // W ??
+èt / {END} { add("et"); }  // W ??
+ù / {END} { add("u"); }  // W ??
+ùl / {END} { add("ul"); }  // W ??
+
+// 2.3 circumflex accent
+^ hîc / {END} { add("hic"); }  // W G
+^ Hîc / {END} { add("Hic"); }  // W G
+^ ô / {END} { add("o"); }  // G
+â / {que} {END} { add("a"); }  // W G
+ûs / {END} { add("us"); }  // W G
+âr { add("ar"); }  // W (G) --> this is only a rough approximation!
+
+// 2.4 trema
+// 2.4.1 common cases
+aë { cv = VOWEL; add("ae"); }
+oë { cv = VOWEL; add("oe"); }
+// 2.4.2 rare cases
+oï { cv = VOWEL; add("oi"); }
+uï { cv = VOWEL; add("ui"); }
+// 2.4.3 extremely rare cases
+uü { cv = VOWEL; add("uu"); }
+
+
+// 3. rules for u and v
+
+// 3.1 rules for u --> v
+
+// peruenias --> pervenias, interuallum --> intervallum
+// cv is deliberately left at VOWEL although the prefix ends in a consonant:
+// the general u rule below turns u into v only when cv is VOWEL.
+^ {prefixCons} / {lb} { cv = VOWEL; add(yytext().replace("ſ", "s")); }  // not cv = CONS !
+
+// uellet --> vellet
+// (literal replace, consistent with the general u rule below; no regex needed)
+^ [uU] / {Vowel} { cv = VOWEL; add(yytext().replace("u", "v").replace("U", "V")); }
+
+// diuidatur --> dividatur
+// ut, volui: unchanged
+// no rule for veruina because we cannot distinguish it from volui
+// u before a vowel becomes v only if the preceding character was a vowel;
+// otherwise it is kept and counted as a vowel itself.
+[uU] / {Vowel} { 
+		switch(cv) {
+			case VOWEL: add(yytext().replace("u", "v").replace("U", "V")); break;
+			default: cv = VOWEL; add(yytext()); break;
+		}
+	}
+
+// 3.2 rules for v --> u
+
+// qvam --> quam
+qv { cv = CONS; add("qu"); }  // the replaced v still counts as consonant
+Qv { cv = CONS; add("Qu"); }
+QV { cv = CONS; add("QU"); }
+
+// febrvarius --> februarius
+// curva: unchanged
+// l/r + v becomes l/r + u only if the character before the l/r was a consonant.
+{LR} [vV] { 
+		switch(cv) {
+			case CONS: add(yytext().replace("v", "u").replace("V", "U")); break;
+			default: cv = CONS; add(yytext()); break;
+		}
+	}
+
+// februarivs --> februarius
+// v directly before a consonant (an in-word line break may intervene) becomes u
+v / {lb} {Cons} { cv = CONS; add("u"); }
+V / {lb} {Cons} { cv = CONS; add("U"); }
+
+// 3.3 override default rule for .
+
+{Vowel} { cv = VOWEL; add(yytext()); }
+{Cons} { cv = CONS; add(yytext()); }
+[yY] { cv = 0; add(yytext()); }
+
+// "@" and every character not covered above mark the word as a problem;
+// the end-of-word actions then do not return the normalized form.
+@ { problem = 1; cv = 0; add(yytext()); }
+{LB} { add(yytext()); }
+. { problem = 1; cv = 0; add(yytext()); }  // in particular from Arboreal: "〈" (2329), "〉" (232A), Ç, ç
+
+}
+
+
+<DISP, RENAISSANCE_DISP> {
+
+// End of word, display mode: return the unchanged input if a problem was
+// flagged, otherwise the normalized word (line-break markers are kept).
+{END} {
+		return (problem == 1) ? original : normalized;
+	}
+}
+
+<DICT,  RENAISSANCE_DICT> {
+
+// End of word, dictionary mode: a flagged problem yields the empty string;
+// otherwise the normalized word is returned with in-word line breaks removed.
+{END} {
+		if (problem == 1) {
+			return "";
+		}
+		return normalized.replaceAll(LB, "");
+	}
+}
+
+<SEARCH, RENAISSANCE_SEARCH> {
+
+// End of word, search mode: a flagged problem returns the unchanged input;
+// otherwise the normalized word is returned with in-word line breaks removed
+// and converted to lower case.
+{END} {
+		switch (problem) {
+			case 1: return original;
+			// Locale.ROOT keeps the result independent of the JVM default locale
+			// (a Turkish default locale would otherwise map "I" to dotless "ı").
+			default: return normalized.replaceAll(LB, "").toLowerCase(java.util.Locale.ROOT);
+		}
+	}
+}
+
+
+/*
+
+Assumptions:
+- the routine is called word by word, with a \n at the end of the string
+- line breaks inside a word are marked as hyphen/soft hyphen plus space
+
+
+TO DO:
+
+LA: Reconsider whether Ææęàèòùœ can be left out of the vowel class. They do no harm, though. (Or do they!?) Distinguish vowel classes before and after the u?
+LA: Go through the diacritics once more with Paul
+LA: The disambiguations by means of the diacritics are still missing.
+LA: Is J really a problem case?
+LA: Are there words like super-rv... or super-lv... in lower-case or upper-case letters?
+
+*/