comparison software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexDE.lex @ 19:4a3641ae14d2

Erstellung
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Wed, 09 Nov 2011 15:32:05 +0100
parents
children e845310098ba
comparison
equal deleted inserted replaced
18:dc5e9fcb3fdc 19:4a3641ae14d2
1 /*
2 * Normalization rules for German text
3 * [this is a JFlex specification]
4 *
5 * Wolfgang Schmidle
6 * version 2011-07-12
7 *
8 */
9
10 package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang;
11
12 %%
13
14 %public
15 %class MpdlNormalizerLexDE
16 %type java.lang.String
17 %unicode
18
19 // German: de, deu, ger
20
21 %states DISP, DICT, SEARCH
22 %state CELEX, GRIMM
23
24 %{
/** Concatenation of the text of every token matched so far, exactly as read. */
private String original = "";
/** Concatenation of the replacement text supplied for every token matched so far. */
private String normalized = "";
/** Problem flag: rules set it to 1 when they meet input they cannot normalize; 0 otherwise. */
private int problem = 0;

/**
 * Records one matched token: the matched text is appended to {@code original}
 * and its replacement to {@code normalized}.
 *
 * @param norm the normalized spelling to store for the current match
 */
private void add (String norm) {
  final String matched = yytext();
  original = original + matched;
  normalized = normalized + norm;
}

/**
 * Java regular expression for a line break inside a word: a hyphen-minus or a
 * soft hyphen, followed by one space. Used by the end-of-word rules to remove
 * such breaks from the normalized form.
 */
private static final String LB = "[\u002d\u00ad] ";
35 %}
36
// Hyphen-minus and soft hyphen (U+00AD).
// The soft hyphen uses the plain four-digit escape, like the space in LB below:
// that form is accepted by every JFlex version, whereas the braced code point
// form is only understood by newer releases.
hyphen = [-\u00ad]

// Line break inside a word: a hyphen followed by exactly one space (U+0020).
LB = {hyphen} \u0020
// lb = ({hyphen} \u0020)?

// End-of-word marker: the caller terminates each word with a newline.
END = \n

// Unaccented Latin letters, lower and upper case.
Alphabet = [a-zA-Z]
44
45 %%
46
// Long s becomes a round s. The rule carries no state list, so it is not
// restricted to particular start states.
ſ     { add("s"); }

// Fraktur spellings: a base vowel followed by a combining small letter.
// u with small o above becomes plain u; a/o/u with small e above become
// the corresponding umlaut. CELEX has its own variants of these rules.

<DISP, DICT, SEARCH, GRIMM> {

  uͦ    { add("u"); }
  aͤ    { add("ä"); }
  oͤ    { add("ö"); }
  uͤ    { add("ü"); }

}
60
<CELEX> {

  // Celex only: umlauts (precomposed, either case, or Fraktur style with a
  // combining small e) are spelled out as lower-case ae/oe/ue, and ß as ss.

  [äÄ] | aͤ     { add("ae"); }
  [öÖ] | oͤ     { add("oe"); }
  [üÜ] | uͤ     { add("ue"); }
  uͦ            { add("u"); }
  ß            { add("ss"); }

  // Plain Latin letters pass through unchanged.
  {Alphabet}   { add(yytext()); }

  // Any other single character is copied through, but flags the word as a problem.
  .            { problem = 1; add(yytext()); }

}
76
// Grimm only: ß is spelled out as sz.
<GRIMM> ß   { add("sz"); }
82
83
// Fallback rules (no state list).
// An at sign is copied through but marks the word as a problem; an in-word
// line break and any other single character are copied through unchanged.

"@"    { problem = 1; add(yytext()); }
{LB}   { add(yytext()); }
.      { add(yytext()); }
89
90
<DISP> {

  // End of word in display mode: hand back the normalized spelling, or the
  // unchanged input when a rule flagged a problem. In-word line breaks are kept.
  {END} {
    if (problem == 1) {
      return original;
    }
    return normalized;
  }
}
100
<DICT, CELEX, GRIMM> {

  // End of word in the dictionary modes: a flagged problem yields the empty
  // string; otherwise the normalized spelling is returned with every in-word
  // line break (hyphen plus space) removed.
  {END} {
    return (problem == 1) ? "" : normalized.replaceAll(LB, "");
  }
}
110
<SEARCH> {

  // End of word in search mode: a flagged problem returns the unchanged input;
  // otherwise the normalized spelling is returned with in-word line breaks
  // (hyphen plus space) removed and folded to lower case.
  {END} {
    if (problem == 1) {
      return original;
    }
    // Lower-case with a fixed locale: the no-argument toLowerCase() follows the
    // JVM default locale, so the same word could produce a different search key
    // on a machine whose default locale has special casing rules (e.g. Turkish).
    return normalized.replaceAll(LB, "").toLowerCase(java.util.Locale.GERMAN);
  }
}
120
121
122
123 /*
124
125 Annahmen:
126 - die Routine wird wortweise aufgerufen, mit einem \n am Ende des Strings
127 - Zeilenumbrüche innerhalb des Wortes sind als hyphen/soft hyphen plus space markiert
128
129 TO DO:
130
131 DE: Trennung von Deutsch und Fraktur?
132 DE: Celex: hyphens weg?
133
134 */