Mercurial > hg > mpdl-group
comparison software/mpdl-services/mpiwg-mpdl-lt/bin/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexLA.lex @ 23:e845310098ba
diverse Korrekturen
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 27 Nov 2012 12:35:19 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
22:6a45a982c333 | 23:e845310098ba |
---|---|
1 /* | |
2 * Normalization rules for Latin text | |
3 * [this is a JFlex specification] | |
4 * | |
5 * Wolfgang Schmidle | |
6 * version 2011-07-12 | |
7 * | |
8 */ | |
9 | |
10 package de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang; | |
11 | |
12 %% | |
13 | |
14 %public | |
15 %class MpdlNormalizerLexLA | |
16 %type java.lang.String | |
17 %unicode | |
18 | |
19 // Latin: la, lat | |
20 | |
21 %states DISP, DICT, SEARCH | |
22 %states RENAISSANCE_DISP, RENAISSANCE_DICT, RENAISSANCE_SEARCH | |
23 | |
24 %{ | |
25 private static final int CONS = 1; | |
26 private static final int VOWEL = 2; | |
27 private int cv = 0; // consonant = 1, vowel = 2, everything else = 0 | |
28 // text accumulated by add(): the matched input (original) and its replacement (normalized) | |
29 private String original = ""; | |
30 private String normalized = ""; | |
31 private int problem = 0; // set to 1 by the rules for "@" and for any otherwise unmatched character; checked in the {END} rules | |
32 // Appends the current match (yytext()) to original and the given replacement text to normalized. | |
33 private void add (String norm) { | |
34 original += yytext(); | |
35 normalized += norm; | |
36 } | |
37 // Java regex passed to String.replaceAll in the {END} rules: hyphen or soft hyphen followed by a space. Separate from the JFlex macro LB. | |
38 private static final String LB = "[\u002d\u00ad] "; | |
39 %} | |
40 | |
41 Vowel = [AEIOUaeiouÆæęœ] // without àèòù etc. | |
42 Cons = [BCDFGHKLMNPQRSTVWXZbcdfghklmnpqrstvwxzſß] | |
43 // y counts neither as Vowel nor as Cons, see the default rule below: [yY] { cv = 0; add(yytext()); } | |
44 | |
45 LR = [lLrR] // l or r, either case; left context of the "{LR} [vV]" rule | |
46 | |
47 hyphen = [\u002d\u00ad] // hyphen and soft hyphen | |
48 LB = {hyphen} \u0020 // line break inside a word: hyphen or soft hyphen followed by a space | |
49 lb = ({hyphen} \u0020)? // optional line break | |
50 | |
51 END = \n // newline expected at the end of the input word | |
52 | |
53 que = (que)? // optional -que | |
54 enclitic = (que | ve | ne) // mandatory enclitic, used by the acute-accent rules | |
55 prefixCons = (in{lb}ter | per | ſu{lb}per | ſer) // "ſer" for forms of ſervare | |
56 | |
57 %% | |
58 | |
59 | |
60 // jump over empty xml elements (copied unchanged to both original and normalized, cv reset to 0) | |
61 "<"[^><]+"/>" { cv = 0; add(yytext()); } // self-closing tag | |
62 "-<"[^><]+"/>" { cv = 0; add(yytext()); } // self-closing tag preceded by a hyphen | |
63 "<"[^><]+"></"[^><]+">" { cv = 0; add(yytext()); } // start tag directly followed by an end tag (tag names are not compared) | |
64 "-<"[^><]+"></"[^><]+">" { cv = 0; add(yytext()); } // the same, preceded by a hyphen | |
65 | |
66 // TEST, see Benedetti page 444 | |
67 𐆑 { add("X"); } // (U+10191; D800+DD91); cv is left unchanged | |
68 | |
69 | |
70 <DISP, DICT, SEARCH, | |
71 RENAISSANCE_DISP, RENAISSANCE_DICT, RENAISSANCE_SEARCH> { | |
72 | |
73 // 1. simple replacements | |
74 | |
75 // 1.1 single characters | |
76 ſ { cv = CONS; add("s"); } | |
77 ß { cv = CONS; add("ss"); } | |
78 [æę] { cv = VOWEL; add("ae"); } | |
79 Æ { cv = VOWEL; add("AE"); } | |
80 œ { cv = VOWEL; add("oe"); } | |
81 | |
82 // 1.2 character combinations | |
83 ij { cv = VOWEL; add("ii"); } | |
84 | |
85 // 2. superfluous diacritics | |
86 | |
87 // 2.1 acute accent | |
88 q́ue / {END} { add("que"); } // G | |
89 á / [mrst]? {enclitic} {END} { add("a"); } // G | |
90 é / [mrst]? {enclitic} {END} { add("e"); } // G | |
91 í / [mrst]? {enclitic} {END} { add("i"); } // G | |
92 ó / [mrst]? {enclitic} {END} { add("o"); } // G | |
93 ú / [mrst]? {enclitic} {END} { add("u"); } // G | |
94 | |
95 úe / {END} { add("ve"); } // W ?? | |
96 | |
97 // 2.2 grave accent | |
98 à / {que} {END} { add("a"); } // W G | |
99 àm / {que} {END} { add("am"); } // W (G) | |
100 às / {que} {END} { add("as"); } // W (G) (-àsque will likely never occur) | |
101 è / {que} {END} { add("e"); } // W G | |
102 ò / {que} {END} { add("o"); } // W G | |
103 òd / {que} {END} { add("od"); } // W (G) | |
104 ùm / {que} {END} { add("um"); } // W (G) | |
105 ùs / {que} {END} { add("us"); } // W G | |
106 | |
107 ès / {que} {END} { add("es"); } // (G) | |
108 ^ quì / {END} { add("qui"); } // W ?? | |
109 ^ Quì / {END} { add("Qui"); } // W ?? | |
110 àc / {END} { add("ac"); } // W ?? | |
111 èr / {END} { add("er"); } // W ?? | |
112 èt / {END} { add("et"); } // W ?? | |
113 ù / {END} { add("u"); } // W ?? | |
114 ùl / {END} { add("ul"); } // W ?? | |
115 | |
116 // 2.3 circumflex accent | |
117 ^ hîc / {END} { add("hic"); } // W G | |
118 ^ Hîc / {END} { add("Hic"); } // W G | |
119 ^ ô / {END} { add("o"); } // G | |
120 â / {que} {END} { add("a"); } // W G | |
121 ûs / {END} { add("us"); } // W G | |
122 âr { add("ar"); } // W (G) --> this is only a rough approximation! | |
123 | |
124 // 2.4 trema | |
125 // 2.4.1 common cases | |
126 aë { cv = VOWEL; add("ae"); } | |
127 oë { cv = VOWEL; add("oe"); } | |
128 // 2.4.2 rare cases | |
129 oï { cv = VOWEL; add("oi"); } | |
130 uï { cv = VOWEL; add("ui"); } | |
131 // 2.4.3 extremely rare cases | |
132 uü { cv = VOWEL; add("uu"); } | |
133 | |
134 | |
135 // 3. rules for u and v | |
136 | |
137 // 3.1 rules for u --> v | |
138 | |
139 // peruenias --> pervenias, interuallum --> intervallum: cv is set to VOWEL after the prefix so that the general [uU] rule below turns a following u (before a vowel) into v | |
140 ^ {prefixCons} / {lb} { cv = VOWEL; add(yytext().replace("ſ", "s")); } // not cv = CONS ! | |
141 | |
142 // uellet --> vellet: u/U at the start of the line, before a vowel, becomes v/V (literal replace, no regex needed) | |
143 ^ [uU] / {Vowel} { cv = VOWEL; add(yytext().replace("u", "v").replace("U", "V")); } | |
144 | |
145 // diuidatur --> dividatur: u/U before a vowel becomes v/V when cv is VOWEL | |
146 // ut, volui: unchanged | |
147 // no rule for veruina because we cannot distinguish it from volui | |
148 [uU] / {Vowel} { | |
149 final boolean afterVowel = (cv == VOWEL); | |
150 final String matched = yytext(); | |
151 cv = VOWEL; // already VOWEL when afterVowel is true | |
152 add(afterVowel ? matched.replace("u", "v").replace("U", "V") : matched); | |
153 } | |
154 | |
155 // 3.2 rules for v --> u | |
156 | |
157 // qvam --> quam | |
158 qv { cv = CONS; add("qu"); } // the replaced v still counts as consonant | |
159 Qv { cv = CONS; add("Qu"); } | |
160 QV { cv = CONS; add("QU"); } | |
161 | |
162 // febrvarius --> februarius: v/V after l or r becomes u/U when cv is CONS | |
163 // curva: unchanged | |
164 {LR} [vV] { | |
165 final boolean afterConsonant = (cv == CONS); | |
166 final String matched = yytext(); | |
167 cv = CONS; // already CONS when afterConsonant is true | |
168 add(afterConsonant ? matched.replace("v", "u").replace("V", "U") : matched); | |
169 } | |
170 | |
171 // februarivs --> februarius: v/V before a consonant (optionally across a line break) becomes u/U | |
172 v / {lb} {Cons} { cv = CONS; add("u"); } // cv is set to CONS although a u is emitted | |
173 V / {lb} {Cons} { cv = CONS; add("U"); } | |
174 | |
175 // 3.3 override default rule for . | |
176 // letters not handled by an earlier rule are copied unchanged and only update cv | |
177 {Vowel} { cv = VOWEL; add(yytext()); } | |
178 {Cons} { cv = CONS; add(yytext()); } | |
179 [yY] { cv = 0; add(yytext()); } | |
180 // "@" and any other remaining character set problem = 1; a line break is copied and leaves cv untouched | |
181 @ { problem = 1; cv = 0; add(yytext()); } | |
182 {LB} { add(yytext()); } | |
183 . { problem = 1; cv = 0; add(yytext()); } // in particular from Arboreal: "〈" (2329), "〉" (232A), Ç, ç | |
184 | |
185 } | |
186 | |
187 | |
188 <DISP, RENAISSANCE_DISP> { | |
189 // end of word, display mode: the unmodified input if a problem was flagged, otherwise the normalized text | |
190 {END} { | |
191 if (problem == 1) { | |
192 return original; | |
193 } | |
194 return normalized; | |
195 } | |
196 } | |
197 | |
198 <DICT, RENAISSANCE_DICT> { | |
199 // end of word, dictionary mode: empty string if a problem was flagged, otherwise the normalized text without line-break markers | |
200 {END} { | |
201 if (problem == 1) { | |
202 return ""; | |
203 } | |
204 return normalized.replaceAll(LB, ""); | |
205 } | |
206 } | |
207 | |
208 <SEARCH, RENAISSANCE_SEARCH> { | |
209 // end of word, search mode: the unmodified input if a problem was flagged; otherwise the normalized text without line-break markers, lower-cased with Locale.ROOT so the result does not depend on the JVM default locale | |
210 {END} { | |
211 switch (problem) { | |
212 case 1: return original; | |
213 default: return normalized.replaceAll(LB, "").toLowerCase(java.util.Locale.ROOT); | |
214 } | |
215 } | |
216 } | |
217 | |
218 | |
219 /* | |
220 | |
221 Assumptions: | |
222 - the routine is called word by word, with a \n at the end of the string | |
223 - line breaks within the word are marked as hyphen/soft hyphen plus space | |
224 | |
225 | |
226 TO DO: | |
227 | |
228 LA: Reconsider whether Ææęàèòùœ can be left out of the vowel class. They do no harm, though. (Or do they !?) Distinguish vowel classes before and after the u ? | |
229 LA: Go through the diacritics once more with Paul | |
230 LA: The disambiguations by means of the diacritics are still missing. | |
231 LA: is J really a problem case? | |
232 LA: are there words like super-rv... or super-lv... in lower or upper case? | |
233 | |
234 */ |