Mercurial > hg > mpdl-group
comparison software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexDE.lex @ 19:4a3641ae14d2
Erstellung
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Wed, 09 Nov 2011 15:32:05 +0100 |
parents | |
children | e845310098ba |
comparison
equal
deleted
inserted
replaced
18:dc5e9fcb3fdc | 19:4a3641ae14d2 |
---|---|
1 /* | |
2 * Normalization rules for German text | |
3 * [this is a JFlex specification] | |
4 * | |
5 * Wolfgang Schmidle | |
6 * version 2011-07-12 | |
7 * | |
8 */ | |
9 | |
10 package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang; | |
11 /* NOTE(review): the repository path shown for this file ends in .../mpdl/lt/text/norm/lang/, but the package above is ...lt.analyzer.lang - confirm which package the generated class should use. */ | |
12 %% | |
13 | |
14 %public | |
15 %class MpdlNormalizerLexDE | |
16 %type java.lang.String | |
17 %unicode | |
18 /* %type above: the scanning method returns a java.lang.String; the values are produced by the {END} actions. */ | |
19 // German: de, deu, ger | |
20 /* Lexical states: each state has its own {END} action that decides what is returned. NOTE(review): presumably the caller selects the state before scanning - not visible here. */ | |
21 %states DISP, DICT, SEARCH | |
22 %state CELEX, GRIMM | |
23 | |
24 %{ | |
25 private String original = ""; /* input text matched so far, copied unchanged */ | |
26 private String normalized = ""; /* replacement text accumulated in parallel with original */ | |
27 private int problem = 0; /* set to 1 by the "@" rule and by the CELEX catch-all rule; checked in the {END} actions */ | |
28 /* Appends the current match (yytext()) to original and the replacement text norm to normalized. */ | |
29 private void add (String norm) { | |
30 original += yytext(); | |
31 normalized += norm; | |
32 } /* NOTE(review): original, normalized and problem are never reset in the code shown; if one scanner instance handled a second word they would carry over - confirm callers use a fresh instance per word. */ | |
33 /* Regex passed to replaceAll in the {END} actions: hyphen-minus (U+002D) or soft hyphen (U+00AD) followed by one space, i.e. the same pattern as the {LB} macro. */ | |
34 private static final String LB = "[\u002d\u00ad] "; | |
35 %} | |
36 | |
37 hyphen = [-\u{00ad}] // hyphen and soft hyphen | |
38 LB = {hyphen} \u0020 | |
39 // lb = ({hyphen} \u0020)? | |
40 /* LB above: a hyphen or soft hyphen followed by a space (U+0020); the notes at the end of this file say this marks a line break inside a word. NOTE(review): hyphen uses the \u{...} escape form while the Java constant LB uses \u00ad - confirm the JFlex version in use accepts \u{...}. */ | |
41 END = \n | |
42 /* END above: the newline that terminates the input word. Alphabet below: the 52 unaccented ASCII letters, used by the CELEX pass-through rule. */ | |
43 Alphabet = [abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ] | |
44 | |
45 %% | |
46 | |
47 ſ { add("s"); } | |
48 /* Rule above: long s is replaced by "s". It has no state list. NOTE(review): presumably this makes it apply in every declared state - confirm against JFlex inclusive-state semantics. */ | |
49 // Fraktur | |
50 | |
51 <DISP, DICT, SEARCH, | |
52 GRIMM> { | |
53 /* In DISP, DICT, SEARCH and GRIMM the letter-plus-combining-mark spellings on the left are replaced by u, ä, ö and ü respectively. */ | |
54 uͦ {add("u"); } | |
55 aͤ {add("ä"); } | |
56 oͤ {add("ö"); } | |
57 uͤ {add("ü"); } | |
58 | |
59 } | |
60 | |
61 <CELEX> { | |
62 /* CELEX: umlauts and ß are spelled out with plain ASCII letters. */ | |
63 // normalize ä ö ü ß only for Celex! | |
64 /* Upper- and lower-case umlauts, as well as the combining-mark spellings, all map to the lower-case digraph (e.g. Ä becomes "ae"). */ | |
65 ä | Ä | aͤ { add("ae"); } | |
66 ö | Ö | oͤ { add("oe"); } | |
67 ü | Ü | uͤ { add("ue"); } | |
68 uͦ {add("u"); } | |
69 ß { add("ss"); } | |
70 /* ASCII letters are copied unchanged. */ | |
71 {Alphabet} { add(yytext()); } | |
72 /* Catch-all: any other character matched here sets problem = 1, which makes the {END} action for CELEX return "". */ | |
73 . { problem = 1; add(yytext()); } | |
74 | |
75 } | |
76 | |
77 <GRIMM> { | |
78 /* GRIMM only: ß is spelled "sz". */ | |
79 ß { add("sz"); } | |
80 | |
81 } | |
82 | |
83 | |
84 // default | |
85 /* Rules without a state list: "@" sets problem = 1 and is copied as is; a {LB} line-break marker and any other character are copied unchanged. The {END} actions for DICT/CELEX/GRIMM and SEARCH later strip LB matches from the result with replaceAll. */ | |
86 @ { problem = 1; add(yytext()); } | |
87 {LB} { add(yytext()); } | |
88 . { add(yytext()); } | |
89 | |
90 | |
91 <DISP> { | |
92 /* DISP: at the terminating newline return the normalized text, or the unmodified original if a problem was flagged. LB markers are kept. */ | |
93 {END} { | |
94 switch (problem) { | |
95 case 1: return original; | |
96 default: return normalized; | |
97 } | |
98 } | |
99 } | |
100 | |
101 <DICT, CELEX, GRIMM> { | |
102 /* DICT, CELEX, GRIMM: at the terminating newline return "" if a problem was flagged; otherwise return the normalized text with every LB match (hyphen or soft hyphen plus space) removed. */ | |
103 {END} { | |
104 switch (problem) { | |
105 case 1: return ""; | |
106 default: return normalized.replaceAll(LB, ""); | |
107 } | |
108 } | |
109 } | |
110 | |
111 <SEARCH> { | |
112 /* SEARCH: at the terminating newline return the unmodified original if a problem was flagged; otherwise return the normalized text with every LB match removed, lower-cased with Locale.ROOT so the result does not depend on the JVM default locale (e.g. a Turkish default would map "I" to a dotless i). */ | |
113 {END} { | |
114 switch (problem) { | |
115 case 1: return original; | |
116 default: return normalized.replaceAll(LB, "").toLowerCase(java.util.Locale.ROOT); | |
117 } | |
118 } | |
119 } | |
120 | |
121 | |
122 | |
123 /* | |
124 | |
125 Assumptions: | |
126 - the routine is called word by word, with a \n at the end of the string | |
127 - line breaks within a word are marked as hyphen/soft hyphen plus space | |
128 | |
129 TO DO: | |
130 | |
131 DE: separate German and Fraktur? | |
132 DE: Celex: remove hyphens? | |
133 | |
134 */ |