comparison software/mpdl-services/mpiwg-mpdl-lt/bin/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexLA.lex @ 23:e845310098ba

diverse Korrekturen
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 27 Nov 2012 12:35:19 +0100
parents
children
comparison
equal deleted inserted replaced
22:6a45a982c333 23:e845310098ba
1 /*
2 * Normalization rules for Latin text
3 * [this is a JFlex specification]
4 *
5 * Wolfgang Schmidle
6 * version 2011-07-12
7 *
8 */
9
10 package de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang;
11
12 %%
13
// Generator options: a public scanner class named MpdlNormalizerLexLA whose
// scanning method returns java.lang.String; the input is treated as Unicode text.
14 %public
15 %class MpdlNormalizerLexLA
16 %type java.lang.String
17 %unicode
18
19 // Latin: la, lat
20
// Lexical states. In this file every RENAISSANCE_* state is always listed together
// with its plain counterpart, so both currently share the same rules. The three
// modes (DISP, DICT, SEARCH) differ only in what their END rule returns.
21 %states DISP, DICT, SEARCH
22 %states RENAISSANCE_DISP, RENAISSANCE_DICT, RENAISSANCE_SEARCH
23
24 %{
// Character-category codes stored in cv.
25 private static final int CONS = 1;
26 private static final int VOWEL = 2;
// Category recorded by the most recent rule that set it; the u/v rules consult it
// to decide whether a following u or v has to be rewritten.
27 private int cv = 0; // consonant = 1, vowel = 2, everything else = 0
28
// Accumulators filled by add(): the text exactly as matched, and its normalized form.
29 private String original = "";
30 private String normalized = "";
// Set to 1 by the rules for '@' and for the catch-all '.'; the END rules then
// return a fallback value instead of the normalized text.
31 private int problem = 0;
32
// Appends the currently matched text (yytext()) to 'original' and the given
// replacement 'norm' to 'normalized'. Rule actions call it once per match.
33 private void add (String norm) {
34 original += yytext();
35 normalized += norm;
36 }
37
// Java regular expression for a line-break marker: hyphen (U+002D) or soft hyphen
// (U+00AD) followed by one space. Used with String.replaceAll in the DICT and SEARCH
// END rules. Not to be confused with the lexer macro of the same name.
38 private static final String LB = "[\u002d\u00ad] ";
39 %}
40
// Letter classes used by the u/v rules. J/j and Y/y belong to neither class.
41 Vowel = [AEIOUaeiouÆæęœ] // without àèòù etc.
42 Cons = [BCDFGHKLMNPQRSTVWXZbcdfghklmnpqrstvwxzſß]
43 // y counts neither as Vowel nor as Cons, see the default rule below: [yY] { cv = 0; add(yytext()); }
44
// l or r in either case; used by the "{LR} [vV]" rule.
45 LR = [lLrR]
46
47 hyphen = [\u002d\u00ad] // hyphen and soft hyphen
// LB: a line-break marker (hyphen followed by one space); lb: the same marker, optional.
48 LB = {hyphen} \u0020
49 lb = ({hyphen} \u0020)?
50
// End of input for one call: a single newline (see the assumptions listed at the end of the file).
51 END = \n
52
53 que = (que)? // optional -que
// Exactly one of the enclitics que, ve, ne (not optional).
54 enclitic = (que | ve | ne)
// Prefixes ending in a consonant, some of them optionally split by a line-break marker.
// Only lower-case forms are listed, and super/ser only in their long-s spelling.
55 prefixCons = (in{lb}ter | per | ſu{lb}per | ſer) // "ſer" for forms of ſervare
56
57 %%
58
59
60 // jump over empty xml elements
// Self-closing elements (<x/>) and open/close pairs with nothing in between (<x></x>),
// each optionally preceded by a hyphen, are copied through unchanged; cv is reset to 0.
// These rules are written without a state list.
61 "<"[^><]+"/>" { cv = 0; add(yytext()); }
62 "-<"[^><]+"/>" { cv = 0; add(yytext()); }
63 "<"[^><]+"></"[^><]+">" { cv = 0; add(yytext()); }
64 "-<"[^><]+"></"[^><]+">" { cv = 0; add(yytext()); }
65
66 // TEST, see Benedetti page 444
// NOTE(review): unlike the rules above, this action leaves cv untouched - confirm that this is intended.
67 𐆑 { add("X"); } // (U+10191; D800+DD91)
68
69
70 <DISP, DICT, SEARCH,
71 RENAISSANCE_DISP, RENAISSANCE_DICT, RENAISSANCE_SEARCH> {
72
73 // 1. simple replacements
// Each rule rewrites a historical letter form to its plain spelling and records
// the category of the result in cv.
74
75 // 1.1 single characters
76 ſ { cv = CONS; add("s"); } // long s
77 ß { cv = CONS; add("ss"); }
78 [æę] { cv = VOWEL; add("ae"); } // ae ligature and e with ogonek both become "ae"
79 Æ { cv = VOWEL; add("AE"); }
80 œ { cv = VOWEL; add("oe"); }
81
82 // 1.2 character combinations
83 ij { cv = VOWEL; add("ii"); }
84
85 // 2. superfluous diacritics
// Accents are stripped only in the contexts listed below. The part after '/' is
// lookahead: it must follow the match but is not consumed by the rule.
// The accent rules in 2.1-2.3 do not change cv; the trema rules in 2.4 set it to VOWEL.
// NOTE(review): the W / G / ?? tags at the line ends are not explained in this file.
86
87 // 2.1 acute accent
// Accented vowel followed by an optional m, r, s or t and then an enclitic at the end of the word.
88 q́ue / {END} { add("que"); } // G
89 á / [mrst]? {enclitic} {END} { add("a"); } // G
90 é / [mrst]? {enclitic} {END} { add("e"); } // G
91 í / [mrst]? {enclitic} {END} { add("i"); } // G
92 ó / [mrst]? {enclitic} {END} { add("o"); } // G
93 ú / [mrst]? {enclitic} {END} { add("u"); } // G
94
95 úe / {END} { add("ve"); } // W ??
96
97 // 2.2 grave accent
// Word-final endings, optionally followed by -que.
98 à / {que} {END} { add("a"); } // W G
99 àm / {que} {END} { add("am"); } // W (G)
100 às / {que} {END} { add("as"); } // W (G) (-àsque will likely never occur)
101 è / {que} {END} { add("e"); } // W G
102 ò / {que} {END} { add("o"); } // W G
103 òd / {que} {END} { add("od"); } // W (G)
104 ùm / {que} {END} { add("um"); } // W (G)
105 ùs / {que} {END} { add("us"); } // W G
106
107 ès / {que} {END} { add("es"); } // (G)
// Rules starting with '^' match only at the beginning of the input line, i.e. the whole word.
108 ^ quì / {END} { add("qui"); } // W ??
109 ^ Quì / {END} { add("Qui"); } // W ??
110 àc / {END} { add("ac"); } // W ??
111 èr / {END} { add("er"); } // W ??
112 èt / {END} { add("et"); } // W ??
113 ù / {END} { add("u"); } // W ??
114 ùl / {END} { add("ul"); } // W ??
115
116 // 2.3 circumflex accent
117 ^ hîc / {END} { add("hic"); } // W G
118 ^ Hîc / {END} { add("Hic"); } // W G
119 ^ ô / {END} { add("o"); } // G
120 â / {que} {END} { add("a"); } // W G
121 ûs / {END} { add("us"); } // W G
// No lookahead here: this rule applies anywhere in the word.
122 âr { add("ar"); } // W (G) --> this is only a rough approximation!
123
124 // 2.4 trema
// The trema is dropped from the second vowel of the pair; cv becomes VOWEL.
125 // 2.4.1 common cases
126 aë { cv = VOWEL; add("ae"); }
127 oë { cv = VOWEL; add("oe"); }
128 // 2.4.2 rare cases
129 oï { cv = VOWEL; add("oi"); }
130 uï { cv = VOWEL; add("ui"); }
131 // 2.4.3 extremely rare cases
132 uü { cv = VOWEL; add("uu"); }
133
134
135 // 3. rules for u and v
// These rules rely on cv, the category left behind by the previously matched rule.
136
137 // 3.1 rules for u --> v
138
139 // peruenias --> pervenias, interuallum --> intervallum
// A prefix from prefixCons at the start of the line is copied (long s rewritten to s)
// and cv is deliberately set to VOWEL, so that a u following the prefix and standing
// before a vowel is rewritten to v by the general "[uU] / {Vowel}" rule below.
140 ^ {prefixCons} / {lb} { cv = VOWEL; add(yytext().replace("ſ", "s")); } // not cv = CONS !
141
142 // uellet --> vellet
// u or U at the start of the line and directly before a vowel becomes v or V.
// Literal String.replace is used (as in the rule below); no regular expression is needed.
143 ^ [uU] / {Vowel} { cv = VOWEL; add(yytext().replace("u", "v").replace("U", "V")); }
144
145 // diuidatur --> dividatur
146 // ut, volui: unchanged
147 // no rule for veruina because we cannot distinguish it from volui
// u or U before a vowel: rewritten to v/V only if the preceding rule left cv == VOWEL;
// otherwise the letter is kept and cv becomes VOWEL.
148 [uU] / {Vowel} {
149 switch(cv) {
150 case VOWEL: add(yytext().replace("u", "v").replace("U", "V")); break;
151 default: cv = VOWEL; add(yytext()); break;
152 }
153 }
154
155 // 3.2 rules for v --> u
156
157 // qvam --> quam
158 qv { cv = CONS; add("qu"); } // the replaced v still counts as consonant
159 Qv { cv = CONS; add("Qu"); }
160 QV { cv = CONS; add("QU"); }
161
162 // febrvarius --> februarius
163 // curva: unchanged
// l or r followed by v: the v is rewritten to u only if the preceding rule left
// cv == CONS; otherwise the text is kept as it is and cv becomes CONS.
164 {LR} [vV] {
165 switch(cv) {
166 case CONS: add(yytext().replace("v", "u").replace("V", "U")); break;
167 default: cv = CONS; add(yytext()); break;
168 }
169 }
170
171 // februarivs --> februarius
// v or V directly before a consonant (an optional line-break marker may intervene) becomes u or U.
172 v / {lb} {Cons} { cv = CONS; add("u"); }
173 V / {lb} {Cons} { cv = CONS; add("U"); }
174
175 // 3.3 override default rule for .
// Fallbacks for single characters: copy the character unchanged and record its category in cv.
176
177 {Vowel} { cv = VOWEL; add(yytext()); }
178 {Cons} { cv = CONS; add(yytext()); }
179 [yY] { cv = 0; add(yytext()); }
180
// '@' and every other character not matched by an earlier rule set the problem flag.
181 @ { problem = 1; cv = 0; add(yytext()); }
// A line-break marker is copied through; cv and problem stay as they were.
182 {LB} { add(yytext()); }
183 . { problem = 1; cv = 0; add(yytext()); } // in particular from Arboreal: "〈" (2329), "〉" (232A), Ç, ç
184
185 }
186
187
188 <DISP, RENAISSANCE_DISP> {
// Display mode: at the terminating newline return the accumulated result.
// The newline itself is not passed to add() and is therefore not part of the result.
189
190 {END} {
191 switch (problem) {
192 case 1: return original; // a problem character was seen: return the text as matched
193 default: return normalized; // line-break markers are kept
194 }
195 }
196 }
197
198 <DICT, RENAISSANCE_DICT> {
// Dictionary mode: at the terminating newline return the normalized form with all
// line-break markers removed, or the empty string if a problem character was seen.
199
200 {END} {
201 switch (problem) {
202 case 1: return "";
203 default: return normalized.replaceAll(LB, ""); // LB here is the Java regex constant from the user code section
204 }
205 }
206 }
207
208 <SEARCH, RENAISSANCE_SEARCH> {
// Search mode: at the terminating newline return the normalized form with all
// line-break markers removed and folded to lower case.
// Lower-casing uses Locale.ROOT so the result does not depend on the JVM default
// locale (under a Turkish locale "I" would otherwise become a dotless i).
// NOTE(review): in the problem case the text is returned as matched, without
// lower-casing or removal of line-break markers - confirm that callers expect this.
209
210 {END} {
211 switch (problem) {
212 case 1: return original;
213 default: return normalized.replaceAll(LB, "").toLowerCase(java.util.Locale.ROOT); // LB: Java regex constant
214 }
215 }
216 }
217
218
219 /*
220
221 Assumptions:
222 - the routine is called word by word, with a \n at the end of the string
223 - line breaks within a word are marked as hyphen/soft hyphen plus space
224
225
226 TO DO:
227
228 LA: Reconsider whether Ææęàèòùœ can be left out of the vowel class. They do no harm, though. (Or do they!?) Distinguish vowel classes before and after the u?
229 LA: go through the diacritics with Paul once more
230 LA: the disambiguations provided by the diacritics are still missing.
231 LA: is J really a problem case?
232 LA: are there words like super-rv... or super-lv... in lower case or upper case?
233
234 */