comparison software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexLA.lex @ 9:1ec29fdd0db8

neue .lex Dateien für Normalisierung / externe Objekte update
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 22 Feb 2011 16:03:45 +0100
parents
children 5df60f24e997
comparison
equal deleted inserted replaced
8:d2a1c14fde31 9:1ec29fdd0db8
1 /*
2 * Normalization rules for Latin text
3 * [this is a JFlex specification]
4 *
5 * Wolfgang Schmidle
6 * version 0.96
7 * 2011-02-21
8 *
9 */
10
11 package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang;
12
13 %%
14
15 %public
16 %class MpdlNormalizerLexLA
17 %type java.lang.String
18 %unicode
19
20 // Latin: la, lat
21
22 %states DISP, DICT, SEARCH
23 %states RENAISSANCE_DISP, RENAISSANCE_DICT, RENAISSANCE_SEARCH
24
25 %{
26 private static final int CONS = 1; // cv value assigned by rules that consume a consonant
27 private static final int VOWEL = 2; // cv value assigned by rules that consume a vowel
28 private int cv = 0; // consonant = 1, vowel = 2, everything else = 0; written by the rule actions, read by the u/v rules
29 // accumulators filled by add(); the {END} rules return one of them
30 private String original = ""; // concatenation of the matched input text, unchanged
31 private String normalized = ""; // concatenation of the replacement text chosen by each rule
32 private int problem = 0; // set to 1 by the catch-all "." rule and checked by the {END} rules
33
34 private void add (String norm) { // records one matched token in both accumulators
35 normalized = normalized + norm; // replacement text supplied by the calling rule
36 original = original + yytext(); // text exactly as it was matched
37 }
38 %}
39
40 Vowel = [AEIOUaeiou] // without Ææęàèòùœ
41 Cons = [BCDFGHKLMNPQRSTVWXZbcdfghklmnpqrstvwxzſß] // J/j and Y/y are in neither class, so on their own they reach the catch-all "." rule
42 LR = [lLrR] // l or r in either case; used by the {LR} [vV] rule
43
44 hyphen = [\u002d\u00ad] // hyphen and soft hyphen
45 X = {hyphen}? // optional hyphen
46
47 END = \n // end marker: the input is expected to be one word followed by a newline
48
49 que = (que)? // optional -que
50 enclitic = (que | ve | ne) // mandatory enclitic, used in the lookahead of the acute-accent rules
51 prefixCons = (in{X}ter | per | ſu{X}per | ſer) // "ſer" for forms of ſervare
52
53 %%
54
55
56 // TEST, see Benedetti page 444
57 𐆑 { add("X"); } // (U+10191; D800+DD91) rule has no state list; cv is left unchanged
58
59
60 <DISP, DICT, SEARCH,
61 RENAISSANCE_DISP, RENAISSANCE_DICT, RENAISSANCE_SEARCH> {
62
63 // 1. simple replacements
64 // each rule stores the class of the consumed character(s) in cv and hands the replacement to add()
65 // 1.1 single characters
66 ſ { cv = CONS; add("s"); }
67 ß { cv = CONS; add("ss"); }
68 [æę] { cv = VOWEL; add("ae"); }
69 Æ { cv = VOWEL; add("AE"); }
70 œ { cv = VOWEL; add("oe"); }
71
72 // 1.2 character combinations
73 ij { cv = VOWEL; add("ii"); }
74
75 // 2. superfluous diacritics
76 // "/" introduces a lookahead: most rules below fire only when the accented ending is followed by {END}, optionally after -que or an enclitic
77 // 2.1 acute accent
78 q́ue / {END} { add("que"); } // G
79 á / [mrst]? {enclitic} {END} { add("a"); } // G
80 é / [mrst]? {enclitic} {END} { add("e"); } // G
81 í / [mrst]? {enclitic} {END} { add("i"); } // G
82 ó / [mrst]? {enclitic} {END} { add("o"); } // G
83 ú / [mrst]? {enclitic} {END} { add("u"); } // G
84
85 úe / {END} { add("ve"); } // W ??
86 // NOTE(review): the W / G / ?? tags on these rules are not explained in this file - confirm their meaning with the author
87 // 2.2 grave accent
88 à / {que} {END} { add("a"); } // W G
89 àm / {que} {END} { add("am"); } // W (G)
90 às / {que} {END} { add("as"); } // W (G) (-àsque will likely never occur)
91 è / {que} {END} { add("e"); } // W G
92 ò / {que} {END} { add("o"); } // W G
93 òd / {que} {END} { add("od"); } // W (G)
94 ùm / {que} {END} { add("um"); } // W (G)
95 ùs / {que} {END} { add("us"); } // W G
96
97 ès / {que} {END} { add("es"); } // (G)
98 ^ quì / {END} { add("qui"); } // W ?? ("^" anchors the rule at the start of the line, so it covers the whole word)
99 ^ Quì / {END} { add("Qui"); } // W ??
100 àc / {END} { add("ac"); } // W ??
101 èr / {END} { add("er"); } // W ??
102 èt / {END} { add("et"); } // W ??
103 ù / {END} { add("u"); } // W ??
104 ùl / {END} { add("ul"); } // W ??
105
106 // 2.3 circumflex accent
107 ^ hîc / {END} { add("hic"); } // W G
108 ^ Hîc / {END} { add("Hic"); } // W G
109 ^ ô / {END} { add("o"); } // G
110 â / {que} {END} { add("a"); } // W G
111 ûs / {END} { add("us"); } // W G
112 âr { add("ar"); } // W (G) --> this is only a rough approximation!
113 // NOTE(review): none of the rules in 2.1-2.3 updates cv; "âr" has no {END} lookahead, so it can match inside a word and leave cv stale - confirm this is intended
114 // 2.4 trema
115 // 2.4.1 common cases
116 aë { cv = VOWEL; add("ae"); }
117 oë { cv = VOWEL; add("oe"); }
118 // 2.4.2 rare cases
119 oï { cv = VOWEL; add("oi"); }
120 uï { cv = VOWEL; add("ui"); }
121 // 2.4.3 extremely rare cases
122 uü { cv = VOWEL; add("uu"); }
123
124
125 // 3. rules for u and v
126 // these rules read cv, as left behind by the previously matched rule, to choose between u and v
127 // 3.1 rules for u --> v
128 // the prefix is consumed as one token; cv is forced to VOWEL so that a u directly after it (before a vowel) is rewritten to v by the general [uU] rule
129 // peruenias --> pervenias, interuallum --> intervallum
130 ^ {prefixCons} / {X} { cv = VOWEL; add(yytext().replace("ſ", "s")); } // not cv = CONS !
131
132 // uellet --> vellet: a u at the start of the line that is followed by a vowel becomes v (literal String.replace; no regex is needed for a single letter)
133 ^ [uU] / {Vowel} { cv = VOWEL; add(yytext().replace("u", "v").replace("U", "V")); }
134
135 // u before a vowel becomes v when cv is VOWEL: diuidatur --> dividatur
136 // otherwise the text is kept as it is: ut, volui
137 // veruina is not handled because it cannot be told apart from volui
138 [uU] / {Vowel} {
139 String matched = yytext();
140 boolean afterVowel = (cv == VOWEL);
141 if (!afterVowel) { cv = VOWEL; }
142 add(afterVowel ? matched.replace("u", "v").replace("U", "V") : matched);
143 }
144
145 // 3.2 rules for v --> u
146 // v directly after q is always rewritten to u; cv is set to CONS afterwards
147 // qvam --> quam
148 qv { cv = CONS; add("qu"); } // the replaced v still counts as consonant
149 Qv { cv = CONS; add("Qu"); }
150 QV { cv = CONS; add("QU"); } // NOTE(review): the mixed-case form qV has no rule here - confirm it cannot occur
151
152 // l/r followed by v, when cv is CONS: the v becomes u (febrvarius --> februarius)
153 // otherwise the pair is kept and cv becomes CONS (curva)
154 {LR} [vV] {
155 String pair = yytext();
156 boolean afterCons = (cv == CONS);
157 if (!afterCons) { cv = CONS; }
158 add(afterCons ? pair.replace("v", "u").replace("V", "U") : pair);
159 }
160
161 // februarivs --> februarius
162 v / {X} {Cons} { cv = CONS; add("u"); } // v before a consonant (optional hyphen in between); cv is set to CONS although u is emitted
163 V / {X} {Cons} { cv = CONS; add("U"); } // upper-case variant of the rule above
164
165 // 3.3 override default rule for .
166 // plain vowels, consonants and hyphens are copied through unchanged; a hyphen leaves cv as it was
167 {Vowel} { cv = VOWEL; add(yytext()); }
168 {Cons} { cv = CONS; add(yytext()); }
169 {hyphen} { add(yytext()); }
170 // any other character flags the word as problematic; the {END} rules check this flag
171 . { problem = 1; cv = 0; add(yytext()); } // in particular "@", and from Arboreal: "〈" (2329), "〉" (232A), Ç, ç
172
173 }
174
175
176 <DISP, SEARCH,
177 RENAISSANCE_DISP, RENAISSANCE_SEARCH> {
178 // end of word in the DISP/SEARCH states: a flagged word is returned as it was read, otherwise the normalized form
179 {END} {
180 if (problem == 1) {
181 return original;
182 }
183 return normalized;
184 }
185 }
186
187 <DICT,
188 RENAISSANCE_DICT> {
189 // end of word in the DICT states: a flagged word yields the empty string, otherwise the normalized form
190 {END} {
191 if (problem == 1) {
192 return "";
193 }
194 return normalized;
195 }
196 }
197
198
199 /*
200
201 Assumptions:
202 - the routine is called word by word, with a \n at the end of the string
203 - words that were split by line breaks have already been joined again
204
205
206 TO DO:
207
208 LA: Reconsider whether Ææęàèòùœ can be left out of the vowel class. They do no harm either. (Or do they !?) Distinguish vowel classes before and after the u ?
209 LA: Go through the diacritics with Paul once more
210 LA: The disambiguations by means of the diacritics are still missing.
211 LA: is J really a problem case?
212 LA: are there words like super-rv... or super-lv... in lower or upper case?
213
214 */