Mercurial > hg > mpdl-group
comparison software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexAll.lex @ 6:2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 08 Feb 2011 14:54:09 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
5:94305c504178 | 6:2396a569e446 |
---|---|
1 /* | |
2 * Normalization rules for all languages | |
3 * [this is a JFlex specification] | |
4 * | |
5 * Wolfgang Schmidle | |
6 * 2011-01-25 | |
7 * | |
8 */ | |
9 | |
10 package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang; | |
11 | |
12 %% | |
13 | |
14 %public | |
15 %class MpdlNormalizerLexAll | |
16 %type java.lang.String | |
17 %unicode | |
18 // %debug | |
19 | |
20 %states LA, ZH | |
21 | |
22 %{ | |
23 int cv = 0; // class of the character handled by the most recent rule that assigned it: consonant = 1, vowel = 2, everything else = 0; the u/v rules of the LA state branch on this value | |
24 %} | |
25 | |
26 VOWEL=[AEIOUaeiouÆæęàèòùœ] | |
27 CONS=[BCDFGHKLMNPQRSTVWXZbcdfghklmnpqrstvwxzſß] | |
28 LR=[lLrR] | |
29 QUE=(que)? | |
30 END=\n | |
31 | |
32 %% | |
33 | |
34 <LA> { | |
35 | |
36 // 1. simple replacements: context-free rewrites; each rule also sets cv to 1 (consonant) or 2 (vowel) for the character it matched | |
37 | |
38 // 1.1 single characters: ſ, ß, æ/ę, Æ and œ are spelled out in plain letters | |
39 ſ { cv = 1; return "s"; } | |
40 ß { cv = 1; return "ss"; } | |
41 [æę] { cv = 2; return "ae"; } | |
42 Æ { cv = 2; return "AE"; } | |
43 œ { cv = 2; return "oe"; } | |
44 // 1.2 character combinations: ij is rewritten as ii | |
45 ij { cv = 2; return "ii"; } | |
46 | |
47 // 2. diacritics (the rules in 2.1 and 2.2 return the unaccented text and leave cv untouched) | |
48 | |
49 // 2.1 superfluous diacritics in single words: matches only at the start of a line and consumes the terminating newline together with the word | |
50 ^ hîc {END} { return "hic"; } | |
51 | |
52 // 2.2 superfluous diacritics at the end of a word: the part after the slash (an optional "que" followed by the newline) is lookahead only and is not consumed | |
53 // 2.2.1 common cases | |
54 à / {QUE} {END} { return "a"; } | |
55 àm / {QUE} {END} { return "am"; } | |
56 às / {QUE} {END} { return "as"; } // (-àsque will likely never occur) | |
57 // disabled variant, presumably meant to replace the three à rules above (TODO confirm): à / [ms]? {QUE} {END} { return "a"; } | |
58 è / {QUE} {END} { return "e"; } | |
59 ò / {QUE} {END} { return "o"; } | |
60 òd / {QUE} {END} { return "od"; } | |
61 ùm / {QUE} {END} { return "um"; } | |
62 ùs / {QUE} {END} { return "us"; } | |
63 | |
64 // 2.3 superfluous diacritics within a word: the diaeresis on the second vowel is dropped and cv is set to 2 (vowel) | |
65 // 2.3.1 common cases | |
66 aë { cv = 2; return "ae"; } | |
67 oë { cv = 2; return "oe"; } | |
68 // 2.3.2 rare cases | |
69 oï { cv = 2; return "oi"; } | |
70 uï { cv = 2; return "ui"; } | |
71 // 2.3.3 extremely rare cases | |
72 uü { cv = 2; return "uu"; } | |
73 | |
74 // 3. rules for u and v (these branch on cv as left behind by the previously matched rule) | |
75 | |
76 // 3.1 rules for u: a u/U directly before a vowel (lookahead, not consumed) is returned as v/V when cv is 2, in which case cv stays 2; otherwise it is returned unchanged and cv becomes 2 | |
77 | |
78 u/{VOWEL} { | |
79 switch(cv) { | |
80 case 2: return "v"; | |
81 default: cv = 2; return "u"; | |
82 } | |
83 } | |
84 U/{VOWEL} { | |
85 switch(cv) { | |
86 case 2: return "V"; | |
87 default: cv = 2; return "U"; | |
88 } | |
89 } | |
90 | |
91 // 3.2 rules for v: v becomes u after q, before a consonant (lookahead, not consumed), and in l/r + v when cv is 1; in every branch cv ends up as 1 | |
92 | |
93 qv { cv = 1; return "qu"; } // the replaced v still counts as consonant | |
94 Qv { cv = 1; return "Qu"; } | |
95 QV { cv = 1; return "QU"; } | |
96 | |
97 {LR}v { | |
98 switch(cv) { | |
99 case 1: return yytext().replace("v", "u"); | |
100 default: cv = 1; return yytext(); | |
101 } | |
102 } | |
103 {LR}V { | |
104 switch(cv) { | |
105 case 1: return yytext().replace("V", "U"); | |
106 default: cv = 1; return yytext(); | |
107 } | |
108 } | |
109 | |
110 v/{CONS} { cv = 1; return "u"; } | |
111 V/{CONS} { cv = 1; return "U"; } | |
112 | |
113 | |
114 // default: any other character is returned unchanged while cv is updated (vowel = 2, consonant = 1, anything else = 0); a newline resets cv to 0 and returns the empty string | |
115 | |
116 {VOWEL} { cv = 2; return yytext(); } | |
117 {CONS} { cv = 1; return yytext(); } | |
118 \n { cv = 0; return ""; } | |
119 . { cv = 0; return yytext(); } | |
120 | |
121 } | |
122 | |
123 <ZH> { | |
124 | |
125 // Codepoint < FFFF: each rule returns a fixed replacement for the single character it matches; the trailing comment gives the code points as source --> replacement | |
126 | |
127 竒 { return "奇"; } // 7AD2 --> 5947 | |
128 旹 { return "時"; } // 65F9 --> 6642 | |
129 歴 { return "歷"; } // 6B74 --> 6B77 | |
130 精 { return "精"; } // FA1D --> 7CBE (FA1D is a compatibility ideograph) NOTE(review): Unicode normalization replaces compatibility ideographs by their unified form - confirm the pattern literal is still FA1D in the stored file, otherwise this rule maps the character to itself | |
131 | |
132 // Codepoint > FFFF: same kind of replacement for a character outside the Basic Multilingual Plane | |
133 | |
134 庶 { return "庶"; } // 2F88D --> 5EB6 (2F88D is a compatibility ideograph) NOTE(review): same normalization caveat as for FA1D - confirm the pattern literal is still 2F88D | |
135 | |
136 | |
137 } | |
138 | |
139 | |
140 // default (can be overridden by individual languages): rules without a state prefix; a newline returns the empty string and any other single character is returned unchanged | |
141 | |
142 \n { return ""; } | |
143 . { return yytext(); } | |