comparison software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexAll.lex @ 6:2396a569e446

new functions: externalObjects, normalizer, Unicode2Betacode
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 08 Feb 2011 14:54:09 +0100
parents
children
comparison
equal deleted inserted replaced
5:94305c504178 6:2396a569e446
1 /*
2 * Normalization rules for all languages
3 * [this is a JFlex specification]
4 *
5 * Wolfgang Schmidle
6 * 2011-01-25
7 *
8 */
9
10 package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang;
11
12 %%
13
14 %public
15 %class MpdlNormalizerLexAll
16 %type java.lang.String
17 %unicode
18 // %debug
19 // Lexical states; each one has its own rule group (<LA> { ... } and <ZH> { ... }) in the rules section.
20 %states LA, ZH
21 // Code between %{ and %} is copied into the generated scanner class, so cv is a field that the rule actions read and write.
22 %{
23 int cv = 0; // class of the most recently matched character, as set by the <LA> actions: consonant = 1, vowel = 2, everything else = 0
24 %}
25 // Macros used by the <LA> rules: VOWEL and CONS are character classes, LR is l or r in either case, QUE is an optional "que", END is the newline that terminates the input line.
26 VOWEL=[AEIOUaeiouÆæęàèòùœ]
27 CONS=[BCDFGHKLMNPQRSTVWXZbcdfghklmnpqrstvwxzſß]
28 LR=[lLrR]
29 QUE=(que)?
30 END=\n
31
32 %%
33
34 <LA> {
35 // Rules active in state LA. Each action returns the replacement text for what it matched; most also record in cv whether that text ended in a consonant (1), a vowel (2) or something else (0).
36 // 1. simple replacements
37
38 // 1.1 single characters
39 ſ { cv = 1; return "s"; }
40 ß { cv = 1; return "ss"; }
41 [æę] { cv = 2; return "ae"; }
42 Æ { cv = 2; return "AE"; }
43 œ { cv = 2; return "oe"; }
44 // 1.2 character combinations
45 ij { cv = 2; return "ii"; }
46
47 // 2. diacritics
48
49 // 2.1 superfluous diacritics in single words
50 ^ hîc {END} { return "hic"; } // matches only when "hîc" is the whole line; the terminating newline is consumed as part of the match
51
52 // 2.2 superfluous diacritics at the end of a word
53 // 2.2.1 common cases (the part after "/" is trailing context: an optional "que" and the newline must follow but are not consumed here; these rules leave cv unchanged)
54 à / {QUE} {END} { return "a"; }
55 àm / {QUE} {END} { return "am"; }
56 às / {QUE} {END} { return "as"; } // (-àsque will likely never occur)
57 // disabled alternative that would replace only the à and leave a following m or s to later rules: à / [ms]? {QUE} {END} { return "a"; }
58 è / {QUE} {END} { return "e"; }
59 ò / {QUE} {END} { return "o"; }
60 òd / {QUE} {END} { return "od"; }
61 ùm / {QUE} {END} { return "um"; }
62 ùs / {QUE} {END} { return "us"; }
63
64 // 2.3 superfluous diacritics within a word
65 // 2.3.1 common cases
66 aë { cv = 2; return "ae"; }
67 oë { cv = 2; return "oe"; }
68 // 2.3.2 rare cases
69 oï { cv = 2; return "oi"; }
70 uï { cv = 2; return "ui"; }
71 // 2.3.3 extremely rare cases
72 uü { cv = 2; return "uu"; }
73
74 // 3. rules for u and v
75
76 // 3.1 rules for u
77 // u/U followed by a vowel: emitted as v/V when the preceding character was a vowel (cv == 2, left unchanged in that case); otherwise emitted unchanged and recorded as a vowel
78 u/{VOWEL} {
79 switch(cv) {
80 case 2: return "v";
81 default: cv = 2; return "u";
82 }
83 }
84 U/{VOWEL} {
85 switch(cv) {
86 case 2: return "V";
87 default: cv = 2; return "U";
88 }
89 }
90
91 // 3.2 rules for v
92 // v directly after q is always written as u
93 qv { cv = 1; return "qu"; } // the replaced v still counts as consonant
94 Qv { cv = 1; return "Qu"; }
95 QV { cv = 1; return "QU"; }
96 // l or r followed by v/V: the v becomes u only when the character before the l/r was a consonant (cv == 1); otherwise the pair is emitted unchanged and cv is set to 1
97 {LR}v {
98 switch(cv) {
99 case 1: return yytext().replace("v", "u");
100 default: cv = 1; return yytext();
101 }
102 }
103 {LR}V {
104 switch(cv) {
105 case 1: return yytext().replace("V", "U");
106 default: cv = 1; return yytext();
107 }
108 }
109 // v/V followed by a consonant is written as u/U; cv is still set to 1
110 v/{CONS} { cv = 1; return "u"; }
111 V/{CONS} { cv = 1; return "U"; }
112
113
114 // default
115 // characters not handled above pass through unchanged while cv records their class; a newline emits the empty string and resets cv to 0
116 {VOWEL} { cv = 2; return yytext(); }
117 {CONS} { cv = 1; return yytext(); }
118 \n { cv = 0; return ""; }
119 . { cv = 0; return yytext(); }
120
121 }
122
123 <ZH> {
124 // Rules active in state ZH: each rule replaces one variant ideograph by the form given in its trailing comment (source code point --> result code point).
125 // Codepoint < FFFF
126
127 竒 { return "奇"; } // 7AD2 --> 5947
128 旹 { return "時"; } // 65F9 --> 6642
129 歴 { return "歷"; } // 6B74 --> 6B77
130 \uFA1D { return "精"; } // FA1D --> 7CBE (FA1D is a compatibility ideograph; the pattern is written as a Unicode escape because canonical normalization of this file would turn a literal FA1D into 7CBE and make the rule a no-op)
131
132 // Codepoint > FFFF
133
134 庶 { return "庶"; } // 2F88D --> 5EB6 (2F88D is a compatibility ideograph)
135 // NOTE(review): the 2F88D pattern above is still a literal character and is exposed to the same normalization problem; confirm the file really contains 2F88D there, and escape it in whatever form the JFlex version in use supports for code points above FFFF.
136
137 }
138
139
140 // default (can be overridden by individual languages): rules without a state prefix drop a newline (empty string) and return any other single character unchanged; a state group that defines its own rules for the same input earlier in the file takes precedence
141
142 \n { return ""; }
143 . { return yytext(); }