Mercurial > hg > mpdl-group
comparison software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexLA.lex @ 9:1ec29fdd0db8
neue .lex Dateien für Normalisierung / externe Objekte update
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 22 Feb 2011 16:03:45 +0100 |
parents | |
children | 5df60f24e997 |
comparison
equal
deleted
inserted
replaced
8:d2a1c14fde31 | 9:1ec29fdd0db8 |
---|---|
1 /* | |
2 * Normalization rules for Latin text | |
3 * [this is a JFlex specification] | |
4 * | |
5 * Wolfgang Schmidle | |
6 * version 0.96 | |
7 * 2011-02-21 | |
8 * | |
9 */ | |
10 | |
11 package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang; | |
12 | |
13 %% | |
14 | |
// Scanner options: generate the public class MpdlNormalizerLexLA, make the | |
// scanning method return java.lang.String, and scan Unicode input. | |
15 %public | |
16 %class MpdlNormalizerLexLA | |
17 %type java.lang.String | |
18 %unicode | |
19 | |
20 // Latin: la, lat | |
21 | |
// Lexical states (presumably display / dictionary / search modes - confirm with the caller). | |
// In this file the RENAISSANCE_* states always appear next to their plain | |
// counterparts and therefore share every rule with them. | |
22 %states DISP, DICT, SEARCH | |
23 %states RENAISSANCE_DISP, RENAISSANCE_DICT, RENAISSANCE_SEARCH | |
24 | |
25 %{ | |
// Values stored in cv by the rule actions. | |
26 private static final int CONS = 1; | |
27 private static final int VOWEL = 2; | |
28 private int cv = 0; // class recorded by the last rule that set it: consonant = 1, vowel = 2, everything else = 0 | |
29 | |
// Text matched so far, exactly as it was read. | |
30 private String original = ""; | |
// Normalized replacement for the text matched so far. | |
31 private String normalized = ""; | |
// Set to 1 by the catch-all rule when a character is matched by no other rule. | |
32 private int problem = 0; | |
33 | |
// Appends the current match (yytext()) to original and the given replacement to normalized. | |
// NOTE(review): nothing visible in this file resets original, normalized, problem or cv | |
// after a result has been returned - confirm that callers use a fresh scanner per word. | |
34 private void add (String norm) { | |
35 original += yytext(); | |
36 normalized += norm; | |
37 } | |
38 %} | |
39 | |
// Character classes used by the rules that track cv. J/j and Y/y belong to neither | |
// class, so a J or Y that no more specific rule (e.g. ij) consumes reaches the catch-all rule. | |
40 Vowel = [AEIOUaeiou] // without Ææęàèòùœ | |
41 Cons = [BCDFGHKLMNPQRSTVWXZbcdfghklmnpqrstvwxzſß] | |
42 LR = [lLrR] | |
43 | |
44 hyphen = [\u002d\u00ad] // hyphen and soft hyphen | |
// X: an optional hyphen or soft hyphen. | |
45 X = {hyphen}? | |
46 | |
// A newline marks the end of the word; the END rules return the result there. | |
47 END = \n | |
48 | |
49 que = (que)? // optional -que | |
50 enclitic = (que | ve | ne) | |
51 prefixCons = (in{X}ter | per | ſu{X}per | ſer) // "ſer" for forms of ſervare | |
52 | |
53 %% | |
54 | |
55 | |
56 // TEST, see Benedetti page 444 | |
// This rule is declared outside the state groups below (per the JFlex manual, a rule | |
// without a state list is active in all inclusive states). | |
57 𐆑 { add("X"); } // (U+10191; D800+DD91) | |
58 | |
59 | |
// Rules shared by all six declared states. Every action passes the normalized text for | |
// the current match to add(); many also record the class of that match in cv. | |
60 <DISP, DICT, SEARCH, | |
61 RENAISSANCE_DISP, RENAISSANCE_DICT, RENAISSANCE_SEARCH> { | |
62 | |
63 // 1. simple replacements | |
64 | |
65 // 1.1 single characters | |
66 ſ { cv = CONS; add("s"); } | |
67 ß { cv = CONS; add("ss"); } | |
68 [æę] { cv = VOWEL; add("ae"); } | |
69 Æ { cv = VOWEL; add("AE"); } | |
70 œ { cv = VOWEL; add("oe"); } | |
71 | |
72 // 1.2 character combinations | |
73 ij { cv = VOWEL; add("ii"); } | |
74 | |
75 // 2. superfluous diacritics | |
// In "x / y" the part after "/" is trailing context: x is matched only when y follows, | |
// and y itself is not consumed. The accent rules below do not change cv. | |
76 | |
77 // 2.1 acute accent | |
78 q́ue / {END} { add("que"); } // G | |
79 á / [mrst]? {enclitic} {END} { add("a"); } // G | |
80 é / [mrst]? {enclitic} {END} { add("e"); } // G | |
81 í / [mrst]? {enclitic} {END} { add("i"); } // G | |
82 ó / [mrst]? {enclitic} {END} { add("o"); } // G | |
83 ú / [mrst]? {enclitic} {END} { add("u"); } // G | |
84 | |
85 úe / {END} { add("ve"); } // W ?? | |
86 | |
87 // 2.2 grave accent | |
88 à / {que} {END} { add("a"); } // W G | |
89 àm / {que} {END} { add("am"); } // W (G) | |
90 às / {que} {END} { add("as"); } // W (G) (-àsque will likely never occur) | |
91 è / {que} {END} { add("e"); } // W G | |
92 ò / {que} {END} { add("o"); } // W G | |
93 òd / {que} {END} { add("od"); } // W (G) | |
94 ùm / {que} {END} { add("um"); } // W (G) | |
95 ùs / {que} {END} { add("us"); } // W G | |
96 | |
97 ès / {que} {END} { add("es"); } // (G) | |
98 ^ quì / {END} { add("qui"); } // W ?? | |
99 ^ Quì / {END} { add("Qui"); } // W ?? | |
100 àc / {END} { add("ac"); } // W ?? | |
101 èr / {END} { add("er"); } // W ?? | |
102 èt / {END} { add("et"); } // W ?? | |
103 ù / {END} { add("u"); } // W ?? | |
104 ùl / {END} { add("ul"); } // W ?? | |
105 | |
106 // 2.3 circumflex accent | |
107 ^ hîc / {END} { add("hic"); } // W G | |
108 ^ Hîc / {END} { add("Hic"); } // W G | |
109 ^ ô / {END} { add("o"); } // G | |
110 â / {que} {END} { add("a"); } // W G | |
111 ûs / {END} { add("us"); } // W G | |
112 âr { add("ar"); } // W (G) --> this is only a rough approximation! | |
113 | |
114 // 2.4 trema | |
115 // 2.4.1 common cases | |
116 aë { cv = VOWEL; add("ae"); } | |
117 oë { cv = VOWEL; add("oe"); } | |
118 // 2.4.2 rare cases | |
119 oï { cv = VOWEL; add("oi"); } | |
120 uï { cv = VOWEL; add("ui"); } | |
121 // 2.4.3 extremely rare cases | |
122 uü { cv = VOWEL; add("uu"); } | |
123 | |
124 | |
125 // 3. rules for u and v | |
126 | |
127 // 3.1 rules for u --> v | |
128 | |
129 // peruenias --> pervenias, interuallum --> intervallum | |
130 ^ {prefixCons} / {X} { cv = VOWEL; add(yytext().replace("ſ", "s")); } // not cv = CONS ! | |
131 | |
132 // uellet --> vellet | |
133 ^ [uU] / {Vowel} { cv = VOWEL; add(yytext().replaceAll("u", "v").replaceAll("U", "V")); } | |
134 | |
135 // diuidatur --> dividatur | |
136 // ut, volui: unchanged | |
137 // no rule for veruina because we cannot distinguish it from volui | |
// A u/U before a vowel becomes v/V only when cv is VOWEL (cv then stays VOWEL); | |
// otherwise it is copied unchanged and recorded as a vowel. | |
138 [uU] / {Vowel} { | |
139 switch(cv) { | |
140 case VOWEL: add(yytext().replace("u", "v").replace("U", "V")); break; | |
141 default: cv = VOWEL; add(yytext()); break; | |
142 } | |
143 } | |
144 | |
145 // 3.2 rules for v --> u | |
146 | |
147 // qvam --> quam | |
148 qv { cv = CONS; add("qu"); } // the replaced v still counts as consonant | |
149 Qv { cv = CONS; add("Qu"); } | |
150 QV { cv = CONS; add("QU"); } | |
151 | |
152 // febrvarius --> februarius | |
153 // curva: unchanged | |
// The v/V after l/r becomes u/U only when cv is CONS (cv then stays CONS); | |
// otherwise the pair is copied unchanged and recorded as a consonant. | |
154 {LR} [vV] { | |
155 switch(cv) { | |
156 case CONS: add(yytext().replace("v", "u").replace("V", "U")); break; | |
157 default: cv = CONS; add(yytext()); break; | |
158 } | |
159 } | |
160 | |
161 // februarivs --> februarius | |
162 v / {X} {Cons} { cv = CONS; add("u"); } | |
163 V / {X} {Cons} { cv = CONS; add("U"); } | |
164 | |
165 // 3.3 override default rule for . | |
166 | |
167 {Vowel} { cv = VOWEL; add(yytext()); } | |
168 {Cons} { cv = CONS; add(yytext()); } | |
169 {hyphen} { add(yytext()); } | |
170 | |
// Catch-all: the character is copied unchanged, cv is cleared and the word is flagged via problem = 1. | |
171 . { problem = 1; cv = 0; add(yytext()); } // in particular "@", and from Arboreal: "〈" (2329), "〉" (232A), Ç, ç | |
172 | |
173 } | |
174 | |
175 | |
176 <DISP, SEARCH, | |
177 RENAISSANCE_DISP, RENAISSANCE_SEARCH> { | |
178 // End of word: return the unmodified input if a character was flagged as a problem, otherwise the normalized form. | |
179 {END} { | |
180 if (problem == 1) { | |
181 return original; | |
182 } | |
183 return normalized; | |
184 } | |
185 } | |
186 | |
187 <DICT, | |
188 RENAISSANCE_DICT> { | |
189 // End of word: return the empty string if a character was flagged as a problem, otherwise the normalized form. | |
190 {END} { | |
191 if (problem == 1) { | |
192 return ""; | |
193 } | |
194 return normalized; | |
195 } | |
196 } | |
197 | |
198 | |
199 /* | |
200 | |
201 Assumptions: | |
202 - the routine is called word by word, with a \n at the end of the string | |
203 - words that were split across line breaks have already been rejoined | |
204 | |
205 | |
206 TO DO: | |
207 | |
208 LA: Reconsider whether Ææęàèòùœ can be left out of the vowel class. They do no harm, though. (Or do they!?) Distinguish between vowel classes before and after the u? | |
209 LA: Go through the diacritics with Paul once more | |
210 LA: The disambiguations provided by the diacritics are still missing. | |
211 LA: is J really a problem case? | |
212 LA: are there words like super-rv... or super-lv... in lower-case or upper-case letters? | |
213 | |
214 */