Mercurial > hg > mpdl-group
comparison software/mpdl-services/mpiwg-mpdl-lt/bin/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexIT.lex @ 23:e845310098ba
diverse Korrekturen
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 27 Nov 2012 12:35:19 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
22:6a45a982c333 | 23:e845310098ba |
---|---|
1 /* | |
2 * Normalization rules for Italian text | |
3 * [this is a JFlex specification] | |
4 * | |
5 * Wolfgang Schmidle | |
6 * version 2011-07-12 | |
7 * | |
8 */ | |
9 | |
10 package de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang; | |
11 | |
12 %% | |
13 | |
14 %public | |
15 %class MpdlNormalizerLexIT | |
16 %type java.lang.String | |
17 %unicode | |
18 | |
19 // Italian: it, ita | |
20 | |
21 %states DISP, DICT, SEARCH | |
22 | |
23 %{ | |
24 private static final int CONS = 1; /* value stored in cv after a consonant has been consumed */ | |
25 private static final int VOWEL = 2; /* value stored in cv after a vowel has been consumed */ | |
26 private int cv = 0; // consonant = 1, vowel = 2, everything else = 0 | |
27 /* original collects the text exactly as matched, normalized collects the rewritten text; both are only extended through add(). */ | |
28 private String original = ""; | |
29 private String normalized = ""; | |
30 private int problem = 0; /* set to 1 by the '@' rule; the END rules then do not return the normalized text */ | |
31 /* Appends the current match (yytext()) to original and the given replacement text to normalized. */ | |
32 private void add (String norm) { | |
33 original += yytext(); | |
34 normalized += norm; | |
35 } | |
36 /* Java regex (hyphen or soft hyphen followed by one space) passed to replaceAll in the DICT and SEARCH END rules; a separate name from the JFlex macro LB defined below. */ | |
37 private static final String LB = "[\u002d\u00ad] "; | |
38 %} | |
39 | |
40 Vowel = [AEIOUaeiouÆæęàèòùœ] /* characters the u/v rules treat as vowels. NOTE(review): à, è, ò, ù are listed but ì is not - confirm whether that is intended */ | |
41 Cons = [BCDFGHKLMNPQRSTVWXZbcdfghklmnpqrstvwxzſß] /* characters treated as consonants, including long s and sharp s */ | |
42 LR = [lLrR] /* l or r in either case; used by the rule that handles l/r followed by v */ | |
43 | |
44 | |
45 hyphen = [\u002d\u00ad] // hyphen and soft hyphen | |
46 LB = {hyphen} \u0020 /* line break inside a word: hyphen or soft hyphen followed by one space */ | |
47 lb = ({hyphen} \u0020)? /* the same line break, but optional */ | |
48 | |
49 END = \n /* a newline ends the input; the END rules return the result there */ | |
50 | |
51 prefixCons = (in{lb}ter | per | ſu{lb}per | ſer) /* prefixes inter, per, ſuper, ſer (a line break is allowed inside inter and ſuper); used by the first u/v rule */ | |
52 | |
53 %% | |
54 | |
55 // skip over empty XML elements (self-closing, or an opening tag immediately followed by a closing tag; optionally preceded by "-"): they are copied unchanged to both original and normalized | |
56 "<"[^><]+"/>" { add(yytext()); } | |
57 "-<"[^><]+"/>" { add(yytext()); } | |
58 "<"[^><]+"></"[^><]+">" { add(yytext()); } | |
59 "-<"[^><]+"></"[^><]+">" { add(yytext()); } | |
60 | |
61 <DICT, SEARCH> { | |
62 /* Only in DICT and SEARCH: a grave accent on A, E, I, O, U (upper or lower case) is replaced by an acute accent. These rules do not change cv. */ | |
63 À { add("Á"); } | |
64 È { add("É"); } | |
65 Ì { add("Í"); } | |
66 Ò { add("Ó"); } | |
67 Ù { add("Ú"); } | |
68 à { add("á"); } | |
69 è { add("é"); } | |
70 ì { add("í"); } | |
71 ò { add("ó"); } | |
72 ù { add("ú"); } | |
73 | |
74 } | |
75 | |
76 <DISP, DICT, SEARCH> { | |
77 /* Rules shared by all three states. cv records whether the previously consumed character was a consonant or a vowel; the u/v rules below branch on it. First: long s, sharp s and ligatures are written out. */ | |
78 ſ { cv = CONS; add("s"); } | |
79 ß { cv = CONS; add("ss"); } | |
80 æ { cv = VOWEL; add("ae"); } | |
81 Æ { cv = VOWEL; add("AE"); } | |
82 œ { cv = VOWEL; add("oe"); } | |
83 Œ { cv = VOWEL; add("OE"); } | |
84 | |
85 ij { cv = VOWEL; add("ii"); } | |
86 | |
87 tio { cv = VOWEL; add("zio"); } | |
88 TIO { cv = VOWEL; add("ZIO"); } | |
89 | |
90 // h rules taken from Arboreal: the complete words ha, hai, hanno and ho are kept as they are; any other h at the beginning of the line is dropped from the normalized text (it is still recorded in original) | |
91 ^ ha / {END} { add(yytext()); } | |
92 ^ hai / {END} { add(yytext()); } | |
93 ^ han{lb}no / {END} { add(yytext()); } | |
94 ^ ho / {END} { add(yytext()); } | |
95 ^ h { add(""); } | |
96 | |
97 | |
98 // u/v rules are taken from MpdlNormalizerLexLA.lex | |
99 | |
100 // 1. rules for u --> v | |
101 /* A prefix from prefixCons at the beginning of the line: long s is replaced by s, and cv is set to VOWEL, so a u that directly follows the prefix and precedes a vowel is rewritten to v by the [uU] rule below. */ | |
102 ^ {prefixCons} / {lb} { cv = VOWEL; add(yytext().replace("ſ", "s")); } | |
103 /* u or U at the beginning of the line, followed by a vowel, becomes v or V. */ | |
104 ^ [uU] / {Vowel} { cv = VOWEL; add(yytext().replaceAll("u", "v").replaceAll("U", "V")); } | |
105 | |
106 /* Any other u or U followed by a vowel: it becomes v or V only if the previous character was a vowel (cv == VOWEL, cv is left as it is); otherwise it is kept and cv becomes VOWEL. */ | |
107 [uU] / {Vowel} { | |
108 switch(cv) { | |
109 case VOWEL: add(yytext().replace("u", "v").replace("U", "V")); break; | |
110 default: cv = VOWEL; add(yytext()); break; | |
111 } | |
112 } | |
113 | |
114 // 2. rules for v --> u | |
115 | |
116 qv { cv = CONS; add("qu"); } // the replaced v still counts as consonant | |
117 Qv { cv = CONS; add("Qu"); } | |
118 QV { cv = CONS; add("QU"); } | |
119 /* l or r followed by v or V: the v becomes u only if the character before the l/r was a consonant (cv == CONS); otherwise the match is kept unchanged and cv becomes CONS. */ | |
120 {LR} [vV] { | |
121 switch(cv) { | |
122 case CONS: add(yytext().replace("v", "u").replace("V", "U")); break; | |
123 default: cv = CONS; add(yytext()); break; | |
124 } | |
125 } | |
126 /* v or V in front of a consonant (an in-word line break may come in between) becomes u or U; cv is set to CONS. */ | |
127 v / {lb} {Cons} { cv = CONS; add("u"); } | |
128 V / {lb} {Cons} { cv = CONS; add("U"); } | |
129 | |
130 // 3. override default rule for . | |
131 | |
132 {Vowel} { cv = VOWEL; add(yytext()); } | |
133 {Cons} { cv = CONS; add(yytext()); } | |
134 @ { problem = 1; cv = 0; add(yytext()); } /* flags a problem; the END rules then do not return the normalized text */ | |
135 {LB} { add(yytext()); } /* an in-word line break is copied and leaves cv unchanged */ | |
136 . { cv = 0; add(yytext()); } | |
137 | |
138 } | |
139 | |
140 | |
141 <DISP> { | |
142 /* End of input in DISP: return the original text if a problem was flagged (problem == 1), otherwise the normalized text. */ | |
143 {END} { | |
144 switch (problem) { | |
145 case 1: return original; | |
146 default: return normalized; | |
147 } | |
148 } | |
149 } | |
150 | |
151 <DICT> { | |
152 /* End of input in DICT: return the empty string if a problem was flagged (problem == 1), otherwise the normalized text with every in-word line break (hyphen or soft hyphen plus space) removed. */ | |
153 {END} { | |
154 switch (problem) { | |
155 case 1: return ""; | |
156 default: return normalized.replaceAll(LB, ""); | |
157 } | |
158 } | |
159 } | |
160 | |
161 <SEARCH> { | |
162 /* End of input in SEARCH: return the original text if a problem was flagged (problem == 1); otherwise return the normalized text with in-word line breaks removed, lower-cased with Locale.ROOT so the result does not depend on the JVM default locale (e.g. a Turkish default would map I to dotless i). */ | |
163 {END} { | |
164 switch (problem) { | |
165 case 1: return original; | |
166 default: return normalized.replaceAll(LB, "").toLowerCase(java.util.Locale.ROOT); | |
167 } | |
168 } | |
169 } | |
170 | |
171 | |
172 /* | |
173 | |
174 Annahmen: | |
175 - die Routine wird wortweise aufgerufen, mit einem \n am Ende des Strings | |
176 - Zeilenumbrüche innerhalb des Wortes sind als hyphen/soft hyphen plus space markiert | |
177 | |
178 TO DO: | |
179 | |
180 IT: all these rules are taken from Arboreal; do we need them all? | |
181 IT: richtig? vollständig? | |
182 IT: Sind die u/v-Regeln wirklich genau wie in LA ? insbesondere: gleiche Vokal-Klasse? | |
183 IT: Änderungen in den lateinischen u/v-Regeln übernehmen? | |
184 IT: italienische Beispielwörter für die u/v-Regeln angeben | |
185 IT: Brauchen wir die Gravis-Regeln aus Arboreal in DICT wirklich? | |
186 IT: wenn ja: gehört À --> Á etc. in die Wörterbuch-Schicht? Und einschränken auf letzte Silbe? | |
187 IT: ist prefixCons = (inter | per | ſuper | ſer) auch für Italienisch gültig? | |
188 | |
189 */ |