Mercurial > hg > mpdl-group
comparison software/mpdl-services/mpiwg-mpdl-lt/bin/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexEL.lex @ 23:e845310098ba
diverse Korrekturen
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 27 Nov 2012 12:35:19 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
22:6a45a982c333 | 23:e845310098ba |
---|---|
1 /* | |
2 * Normalization rules for Greek text | |
3 * [this is a JFlex specification] | |
4 * | |
5 * Wolfgang Schmidle | |
6 * version 2011-08-03 | |
7 * | |
8 */ | |
9 | |
10 package de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang; | |
11 | |
12 %% | |
13 | |
14 %public | |
15 %class MpdlNormalizerLexEL | |
16 %type java.lang.String | |
17 %unicode | |
18 | |
19 // Greek: el, grc | |
20 | |
21 %states DISP, DICT, SEARCH | |
22 %state SIGMA | |
23 | |
%{
  /** Everything matched so far, exactly as it appeared in the input. */
  private String original = "";

  /** The replacement text handed to add(String) for each match, concatenated. */
  private String normalized = "";

  /** Becomes 1 once a rule has flagged the word ('@' or a Latin letter); 0 otherwise. */
  private int problem = 0;

  /**
   * Records one match: the matched text is appended to the verbatim copy,
   * and the given replacement is appended to the normalized copy.
   *
   * @param norm normalized form of the text that was just matched
   */
  private void add (String norm) {
    final String matched = yytext();
    original = original + matched;
    normalized = normalized + norm;
  }

  /**
   * Java regular expression for a line break inside a word:
   * a hyphen (U+002D) or soft hyphen (U+00AD) followed by one space.
   */
  private static final String LB = "[\u002d\u00ad] ";
%}
36 | |
// hyphen and soft hyphen (U+00AD)
// NOTE(review): the brace-style escape is kept exactly as found; confirm that
// the JFlex version in use accepts it.
hyphen = [-\u{00ad}]

// line break inside a word: one of the hyphens followed by a single space (U+0020)
LB = {hyphen} \u0020
// lb = ({hyphen} \u0020)?

// the scanned word ends at a newline
END = \n

// optional final nu, rho or final sigma directly before the end of the word
wordend = [νρς]? {END}

// unaccented ASCII letters, both cases
Latin = [a-zA-Z]
46 | |
47 | |
48 %% | |
49 | |
// Jump over empty XML elements, copying them through unchanged.
// Accepted shapes, each optionally preceded by a hyphen:
//   <name .../>        self-closing element
//   <name ...></name>  start tag immediately followed by an end tag
"-"? "<" [^><]+ ( "/>" | "></" [^><]+ ">" )   { add(yytext()); }
55 | |
56 | |
// Always replace tonos by oxia
// (although this should really be corrected in the text rather than normalized).
//
// The tonos and oxia forms of a vowel look identical, and Unicode normalization
// maps the oxia code points onto the tonos ones. Written as literal characters,
// the two sides of each rule can silently become the same character. Both sides
// are therefore spelled as explicit code points:
//
//   vowel     tonos    oxia
//   alpha     U+03AC   U+1F71
//   epsilon   U+03AD   U+1F73
//   eta       U+03AE   U+1F75
//   iota      U+03AF   U+1F77
//   omicron   U+03CC   U+1F79
//   upsilon   U+03CD   U+1F7B
//   omega     U+03CE   U+1F7D
\u03AC { add("\u1F71"); }
\u03AD { add("\u1F73"); }
\u03AE { add("\u1F75"); }
\u03AF { add("\u1F77"); }
\u03CC { add("\u1F79"); }
\u03CD { add("\u1F7B"); }
\u03CE { add("\u1F7D"); }
66 | |
67 | |
<DICT, SEARCH, SIGMA> {

  // A vowel with grave (varia) becomes the same vowel with acute, but only
  // when it is the last vowel of the word: the trailing context {wordend}
  // requires an optional final nu/rho/sigma and then the end marker.
  //
  // The replacements are written as explicit code points. The plain acute
  // vowels use the oxia code points (U+1F71, U+1F73, ...), so the output agrees
  // with the tonos -> oxia rules of this file and cannot be altered by Unicode
  // normalization of the specification file.
  ὰ / {wordend} { add("\u1F71"); }   /* alpha with oxia */
  ᾲ / {wordend} { add("\u1FB4"); }   /* alpha with oxia and ypogegrammeni */
  ὲ / {wordend} { add("\u1F73"); }   /* epsilon with oxia */
  ὴ / {wordend} { add("\u1F75"); }   /* eta with oxia */
  ῂ / {wordend} { add("\u1FC4"); }   /* eta with oxia and ypogegrammeni */
  ὶ / {wordend} { add("\u1F77"); }   /* iota with oxia */
  ὸ / {wordend} { add("\u1F79"); }   /* omicron with oxia */
  ὺ / {wordend} { add("\u1F7B"); }   /* upsilon with oxia */
  ὼ / {wordend} { add("\u1F7D"); }   /* omega with oxia */
  ῲ / {wordend} { add("\u1FF4"); }   /* omega with oxia and ypogegrammeni */

  // other candidates: Ὰ Ὲ Ὴ Ὶ Ὺ Ὸ Ὼ

}
84 | |
<SIGMA> {

  // In this state a final sigma is written out as a regular sigma.
  "ς" { add("σ"); }

}
90 | |
// default

// An '@' or an ASCII letter flags the word as problematic;
// the character itself is still copied through unchanged.
"@" | {Latin}   { problem = 1; add(yytext()); }

// A line break inside the word is copied as a unit ...
{LB}            { add(yytext()); }
// ... and so is every other single character (the end marker is handled per state).
.               { add(yytext()); }
98 | |
99 | |
<DISP> {

  // End of word: a flagged word is returned verbatim,
  // any other word in its normalized form.
  {END} {
    return (problem == 1) ? original : normalized;
  }
}
109 | |
<DICT, SIGMA> {

  // End of word: a flagged word yields the empty string; otherwise the
  // normalized form is returned with the in-word line breaks removed.
  {END} {
    if (problem == 1) {
      return "";
    }
    return normalized.replaceAll(LB, "");
  }
}
119 | |
<SEARCH> {

  // End of word: a flagged word is returned verbatim; otherwise the normalized
  // form is returned with the in-word line breaks removed and lower-cased.
  {END} {
    switch (problem) {
      case 1: return original;
      // Locale.ROOT keeps the lower-casing independent of the JVM's default
      // locale, so the same word always produces the same search form.
      default: return normalized.replaceAll(LB, "").toLowerCase(java.util.Locale.ROOT);
    }
  }
}
129 | |
130 | |
/*

Assumptions:
- the routine is called word by word, with a \n at the end of the string
- line breaks within a word are marked as hyphen/soft hyphen plus space

TO DO:

EL: take tonos --> oxia out again, because it has to be changed in the text?
EL: are there further cases where a grave can legitimately occur?
EL: do capital letters with grave ever occur in our texts, and should they be normalized?
EL: new state BETACODE ?
EL: define the permitted characters rather than the wrong ones

*/