Mercurial > hg > mpdl-group
comparison software/mpdl-services/mpiwg-mpdl-lt/bin/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexTemplate.lex @ 23:e845310098ba
diverse Korrekturen
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 27 Nov 2012 12:35:19 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
22:6a45a982c333 | 23:e845310098ba |
---|---|
1 /* | |
2 * Template for normalization rules | |
3 * [this is a JFlex specification] | |
4 * | |
5 * Wolfgang Schmidle | |
6 * version 2011-07-12 | |
7 * | |
8 */ | |
9 | |
10 package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang; | |
11 | |
12 %% | |
13 | |
14 %public | |
15 %class MpdlNormalizerLexTemplate | |
16 %type java.lang.String | |
17 %unicode | |
18 | |
19 // Language: list of ISO codes | |
20 | |
21 %states DISP, DICT, SEARCH | |
22 | |
23 %{ | |
// Scanner state shared by the rule actions below:
//   original   - the matched input text, concatenated unchanged by add()
//   normalized - the replacement text passed to add(), concatenated in the same order
//   problem    - set to 1 by the '@' rule; the END rules switch on it to choose what to return
// NOTE(review): nothing visible here resets these fields after an END rule returns;
// confirm that callers use a fresh scanner (or otherwise reset state) per input.
24 private String original = ""; | |
25 private String normalized = ""; | |
26 private int problem = 0; | |
27 | |
// Records one matched token: appends the current match (yytext()) to 'original'
// and the supplied replacement text 'norm' to 'normalized'.
// NOTE(review): each call rebuilds both strings via +=; presumably inputs are short, confirm.
28 private void add (String norm) { | |
29 original += yytext(); | |
30 normalized += norm; | |
31 } | |
32 | |
// Java regex counterpart of the {LB} macro: a hyphen-minus (U+002D) or a soft hyphen (U+00AD)
// followed by one space. The DICT and SEARCH END rules pass it to replaceAll() to strip
// such sequences from 'normalized'.
33 private static final String LB = "[\u002d\u00ad] "; | |
34 %} | |
35 | |
// Macro definitions used by the lexical rules below.
// hyphen: a hyphen-minus or a soft hyphen (U+00AD).
36 hyphen = [-\u{00ad}] // hyphen and soft hyphen | |
// LB: a hyphen (as defined above) followed by a space (U+0020).
37 LB = {hyphen} \u0020 | |
38 // lb = ({hyphen} \u0020)? | |
39 | |
// END: a newline; the per-state END rules return the scanner's result when it is matched.
40 END = \n | |
41 | |
42 %% | |
43 | |
// Rules active in all three declared states (DISP, DICT, SEARCH).
44 <DISP, DICT, SEARCH> { | |
45 | |
// Sample normalization: a long s is kept as-is in 'original' and recorded as "s" in 'normalized'.
46 ſ { add("s"); } // sample rule | |
47 | |
48 } | |
49 | |
50 | |
51 // default rules | |
52 | |
// '@' flags the input as problematic (problem = 1) and is copied through unchanged.
53 @ { problem = 1; add(yytext()); } | |
// A hyphen-plus-space sequence is copied through unchanged here;
// the DICT and SEARCH END rules strip it from the result later.
54 {LB} { add(yytext()); } | |
// Any other character matched by '.' is copied through unchanged.
55 . { add(yytext()); } | |
56 | |
57 | |
58 // at the end, determine which string to return | |
59 | |
// DISP state: on END, return the accumulated original text if a problem was flagged,
// otherwise the normalized text as accumulated (hyphen-plus-space sequences are kept).
60 <DISP> { | |
61 | |
62 {END} { | |
63 switch (problem) { | |
64 case 1: return original; | |
65 default: return normalized; | |
66 } | |
67 } | |
68 } | |
69 | |
// DICT state: on END, return the empty string if a problem was flagged,
// otherwise the normalized text with every match of the LB regex removed.
70 <DICT> { | |
71 | |
72 {END} { | |
73 switch (problem) { | |
74 case 1: return ""; | |
75 default: return normalized.replaceAll(LB, ""); | |
76 } | |
77 } | |
78 } | |
79 | |
// SEARCH state: on END, return the accumulated original text if a problem was flagged,
// otherwise the normalized text with every match of the LB regex removed, then lower-cased.
80 <SEARCH> { | |
81 | |
82 {END} { | |
83 switch (problem) { | |
84 case 1: return original; | |
// Locale.ROOT keeps lower-casing independent of the JVM's default locale
// (under a Turkish default locale, "I" would otherwise become dotless "ı").
85 default: return normalized.replaceAll(LB, "").toLowerCase(java.util.Locale.ROOT); | |
86 } | |
87 } | |
88 } | |
89 |