Mercurial > hg > mpdl-group
comparison software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexIT.lex @ 9:1ec29fdd0db8
neue .lex Dateien für Normalisierung / externe Objekte update
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 22 Feb 2011 16:03:45 +0100 |
parents | |
children | 5df60f24e997 |
comparison
equal
deleted
inserted
replaced
8:d2a1c14fde31 | 9:1ec29fdd0db8 |
---|---|
1 /* | |
2 * Normalization rules for Italian text | |
3 * [this is a JFlex specification] | |
4 * | |
5 * Wolfgang Schmidle | |
6 * version 0.96 | |
7 * 2011-02-21 | |
8 * | |
9 */ | |
10 | |
11 package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang; | |
12 | |
13 %% | |
14 | |
15 %public | |
16 %class MpdlNormalizerLexIT | |
17 %type java.lang.String | |
18 %unicode | |
19 | |
20 // Italian: it, ita | |
21 | |
22 %states DISP, DICT, SEARCH | |
23 | |
24 %{ | |
// Values assigned to cv by the rules to record the class of the last classified character. | |
25 private static final int CONS = 1; | |
26 private static final int VOWEL = 2; | |
27 private int cv = 0; // consonant = 1, vowel = 2, everything else = 0 | |
28 | |
// Accumulators filled by add(): the matched input text and its normalized counterpart. | |
29 private String original = ""; | |
30 private String normalized = ""; | |
// Set to 1 when an '@' is matched; the {END} rules switch on it to choose the return value. | |
31 private int problem = 0; | |
32 | |
// Records one match: the matched text (yytext()) is appended to 'original', | |
// the replacement passed as 'norm' is appended to 'normalized'. | |
33 private void add (String norm) { | |
34 original = original + yytext(); | |
35 normalized = normalized + norm; | |
36 } | |
37 %} | |
38 | |
// Characters counted as vowels (lookahead of the u --> v rules and cv tracking). | |
// NOTE(review): uppercase Œ is not listed here although the Œ rule sets cv = VOWEL -- confirm whether that is intended. | |
39 Vowel = [AEIOUaeiouÆæęàèòùœ] | |
// Characters counted as consonants, including long s and ß; j and y appear in neither class. | |
40 Cons = [BCDFGHKLMNPQRSTVWXZbcdfghklmnpqrstvwxzſß] | |
// l or r in either case; used by the {LR} [vV] rule. | |
41 LR = [lLrR] | |
42 | |
43 | |
44 hyphen = [\u002d\u00ad] // hyphen and soft hyphen | |
// Optional hyphen, so that rules also match hyphenated spellings. | |
45 X = {hyphen}? | |
46 | |
// The input word is terminated by a newline. | |
47 END = \n | |
48 | |
// Prefixes ending in a consonant (inter, per, ſuper, ſer), optionally hyphenated inside. | |
49 prefixCons = (in{X}ter | per | ſu{X}per | ſer) | |
50 | |
51 %% | |
52 | |
// In the DICT and SEARCH states a vowel with grave accent is rewritten with an acute accent. | |
// These actions only call add(), so cv keeps its previous value. | |
53 <DICT, SEARCH> { | |
54 | |
55 À { add("Á"); } | |
56 È { add("É"); } | |
57 Ì { add("Í"); } | |
58 Ò { add("Ó"); } | |
59 Ù { add("Ú"); } | |
60 à { add("á"); } | |
61 è { add("é"); } | |
62 ì { add("í"); } | |
63 ò { add("ó"); } | |
64 ù { add("ú"); } | |
65 | |
66 } | |
67 | |
68 <DISP, DICT, SEARCH> { | |
69 | |
// Long s, ß and the ae/oe ligatures are expanded; cv records whether the character counts as consonant or vowel. | |
70 ſ { cv = CONS; add("s"); } | |
71 ß { cv = CONS; add("ss"); } | |
72 æ { cv = VOWEL; add("ae"); } | |
73 Æ { cv = VOWEL; add("AE"); } | |
74 œ { cv = VOWEL; add("oe"); } | |
75 Œ { cv = VOWEL; add("OE"); } | |
76 | |
// ij is written as ii. | |
77 ij { cv = VOWEL; add("ii"); } | |
78 | |
// tio / TIO is rewritten as zio / ZIO; only these two spellings are listed. | |
79 tio { cv = VOWEL; add("zio"); } | |
80 TIO { cv = VOWEL; add("ZIO"); } | |
81 | |
82 // h rules taken from Arboreal: | |
// The complete words ha, hai, hanno (optionally hyphenated) and ho are kept unchanged; | |
// any other lowercase h at the start of the input is recorded in 'original' but dropped from 'normalized'. | |
83 ^ ha / {END} { add(yytext()); } | |
84 ^ hai / {END} { add(yytext()); } | |
85 ^ han{X}no / {END} { add(yytext()); } | |
86 ^ ho / {END} { add(yytext()); } | |
87 ^ h { add(""); } | |
88 | |
89 | |
90 // u/v rules are taken from MpdlNormalizerLexLA.lex | |
91 | |
92 // 1. rules for u --> v | |
93 | |
// A consonant-final prefix at the start of the input sets cv to VOWEL; with cv == VOWEL | |
// a following u before a vowel is turned into v by the general rule below. Long s becomes s. | |
94 ^ {prefixCons} / {X} { cv = VOWEL; add(yytext().replace("ſ", "s")); } | |
95 | |
// u/U at the start of the input and before a vowel becomes v/V. | |
// Literal String.replace is used, as in the general rule below; no regex matching is needed. | |
96 ^ [uU] / {Vowel} { cv = VOWEL; add(yytext().replace("u", "v").replace("U", "V")); } | |
97 | |
98 | |
// Elsewhere, u/U before a vowel becomes v/V only when cv is VOWEL; | |
// otherwise the character is kept and cv is set to VOWEL. | |
99 [uU] / {Vowel} { | |
100 switch(cv) { | |
101 case VOWEL: add(yytext().replace("u", "v").replace("U", "V")); break; | |
102 default: cv = VOWEL; add(yytext()); break; | |
103 } | |
104 } | |
105 | |
106 // 2. rules for v --> u | |
107 | |
108 qv { cv = CONS; add("qu"); } // the replaced v still counts as consonant | |
109 Qv { cv = CONS; add("Qu"); } | |
110 QV { cv = CONS; add("QU"); } | |
111 | |
// l/r followed by v/V: the v becomes u only when cv is CONS before the match; | |
// otherwise the text is kept and cv is set to CONS. | |
112 {LR} [vV] { | |
113 switch(cv) { | |
114 case CONS: add(yytext().replace("v", "u").replace("V", "U")); break; | |
115 default: cv = CONS; add(yytext()); break; | |
116 } | |
117 } | |
118 | |
// v/V before a consonant (optionally with a hyphen in between) becomes u/U. | |
119 v / {X} {Cons} { cv = CONS; add("u"); } | |
120 V / {X} {Cons} { cv = CONS; add("U"); } | |
121 | |
122 // 3. override default rule for . | |
123 | |
// Remaining characters are copied unchanged while cv is updated; a hyphen leaves cv as it is. | |
124 {Vowel} { cv = VOWEL; add(yytext()); } | |
125 {Cons} { cv = CONS; add(yytext()); } | |
126 {hyphen} { add(yytext()); } | |
// An '@' sets problem = 1, which the {END} rules use to choose what to return. | |
127 @ { problem = 1; add(yytext()); } | |
128 . { cv = 0; add(yytext()); } | |
129 | |
130 } | |
131 | |
132 | |
// End of input in the DISP and SEARCH states: return the accumulated original text | |
// if problem is 1, otherwise the normalized text. | |
133 <DISP, SEARCH> { | |
134 | |
135 {END} { | |
136 if (problem == 1) { | |
137 return original; | |
138 } | |
139 return normalized; | |
140 } | |
141 } | |
142 | |
// End of input in the DICT state: return an empty string if problem is 1, | |
// otherwise the normalized text. | |
143 <DICT> { | |
144 | |
145 {END} { | |
146 if (problem == 1) { | |
147 return ""; | |
148 } | |
149 return normalized; | |
150 } | |
151 } | |
152 | |
153 | |
154 /* | |
155 | |
156 Annahmen: | |
157 - die Routine wird wortweise aufgerufen, mit einem \n am Ende des Strings | |
158 - Wörter mit Zeilenumbrüchen wurden bereits wieder zusammengesetzt | |
159 | |
160 TO DO: | |
161 | |
162 IT: all these rules are taken from Arboreal; do we need them all? | |
163 IT: richtig? vollständig? | |
164 IT: Sind die u/v-Regeln wirklich genau wie in LA ? insbesondere: gleiche Vokal-Klasse? | |
165 IT: Änderungen in den lateinischen u/v-Regeln übernehmen? | |
166 IT: italienische Beispielwörter für die u/v-Regeln angeben | |
167 IT: Brauchen wir die Gravis-Regeln aus Arboreal in DICT wirklich? | |
168 IT: wenn ja: gehört À --> Á etc. in die Wörterbuch-Schicht? Und einschränken auf letzte Silbe? | |
169 IT: ist prefixCons = (inter | per | ſuper | ſer) auch für Italienisch gültig? | |
170 | |
171 */ |