comparison software/mpdl-services/mpiwg-mpdl-lt/bin/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexDE.lex @ 23:e845310098ba

diverse Korrekturen
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 27 Nov 2012 12:35:19 +0100
parents
children
comparison
equal deleted inserted replaced
22:6a45a982c333 23:e845310098ba
1 /*
2 * Normalization rules for German text
3 * [this is a JFlex specification]
4 *
5 * Wolfgang Schmidle
6 * version 2011-07-12
7 *
8 */
9
10 package de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang;
11
12 %%
13
14 %public
15 %class MpdlNormalizerLexDE
16 %type java.lang.String
17 %unicode
18
19 // German: de, deu, ger
20
21 %states DISP, DICT, SEARCH
22 %state CELEX, GRIMM
23
24 %{
25 private String original = ""; // matched input text, accumulated unchanged; NOTE(review): no visible code resets these accumulators - confirm one scanner instance is used per word
26 private String normalized = ""; // normalized output, accumulated in step with original
27 private int problem = 0; // set to 1 by rules that flag the word; checked by the END rules
28 // Appends the text of the current match to original and the replacement text norm to normalized.
29 private void add (String norm) {
30 original += yytext();
31 normalized += norm;
32 }
33 // Regex for an in-word line break (hyphen or soft hyphen, then one space); passed to replaceAll in END rules.
34 private static final String LB = "[\u002d\u00ad] ";
35 %}
36
37 hyphen = [-\u{00ad}] // hyphen and soft hyphen
38 LB = {hyphen} \u0020 // line-break marker: a hyphen character followed by one space
39 // lb = ({hyphen} \u0020)?
40 // END: the newline that terminates the input; the END rules return the result when it is matched
41 END = \n
42 // Alphabet: the unaccented Latin letters a-z and A-Z
43 Alphabet = [abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ]
44
45 %%
46
47 // copy empty XML elements through unchanged: self-closing tags, or an opening tag immediately followed by a closing tag, each with or without a leading hyphen
48 "<"[^><]+"/>" { add(yytext()); }
49 "-<"[^><]+"/>" { add(yytext()); }
50 "<"[^><]+"></"[^><]+">" { add(yytext()); }
51 "-<"[^><]+"></"[^><]+">" { add(yytext()); }
52 // long s is replaced by round s; this rule carries no state list
53 ſ { add("s"); }
54
55 // Fraktur
56
57 <DISP, DICT, SEARCH,
58 GRIMM> {
59 // u with a small o above becomes plain u; a, o, u with a small e above become the umlauts ä, ö, ü
60 uͦ {add("u"); }
61 aͤ {add("ä"); }
62 oͤ {add("ö"); }
63 uͤ {add("ü"); }
64
65 }
66
67 <CELEX> {
68
69 // CELEX only: umlauts (lower case, upper case, or written with a small e above) become lower-case ae/oe/ue, and ß becomes ss
70
71 ä | Ä | aͤ { add("ae"); }
72 ö | Ö | oͤ { add("oe"); }
73 ü | Ü | uͤ { add("ue"); }
74 uͦ {add("u"); }
75 ß { add("ss"); }
76 // letters from the Alphabet macro are copied unchanged
77 {Alphabet} { add(yytext()); }
78 // any other character sets the problem flag; its text is still recorded
79 . { problem = 1; add(yytext()); }
80
81 }
82
83 <GRIMM> {
84 // GRIMM only: ß is written out as sz
85 ß { add("sz"); }
86
87 }
88
89
90 // default: rules without a state list
91 // '@' sets the problem flag; a line-break marker and any other single character are copied unchanged
92 @ { problem = 1; add(yytext()); }
93 {LB} { add(yytext()); }
94 . { add(yytext()); }
95
96
97 <DISP> {
98 // End of input in DISP mode: return the unmodified input if a problem was flagged, otherwise the normalized text (line-break markers are kept).
99 {END} {
100 switch (problem) {
101 case 1: return original;
102 default: return normalized;
103 }
104 }
105 }
106
107 <DICT, CELEX, GRIMM> {
108 // End of input in DICT, CELEX and GRIMM modes: return an empty string if a problem was flagged, otherwise the normalized text with line-break markers removed.
109 {END} {
110 switch (problem) {
111 case 1: return "";
112 default: return normalized.replaceAll(LB, "");
113 }
114 }
115 }
116
117 <SEARCH> {
118 // End of input in SEARCH mode: return the unmodified input if a problem was flagged, otherwise the normalized text with line-break markers removed and lower-cased.
119 {END} {
120 switch (problem) {
121 case 1: return original;
122 default: return normalized.replaceAll(LB, "").toLowerCase(java.util.Locale.ROOT); // fixed locale: search keys must not depend on the JVM default locale (e.g. Turkish dotless i)
123 }
124 }
125 }
126
127
128
129 /*
130
131 Annahmen:
132 - die Routine wird wortweise aufgerufen, mit einem \n am Ende des Strings
133 - Zeilenumbrüche innerhalb des Wortes sind als hyphen/soft hyphen plus space markiert
134
135 TO DO:
136
137 DE: Trennung von Deutsch und Fraktur?
138 DE: Celex: hyphens weg?
139
140 */