Mercurial > hg > mpdl-group
comparison software/mpdl-services/mpiwg-mpdl-lt/bin/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexZH.lex @ 23:e845310098ba
diverse Korrekturen
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 27 Nov 2012 12:35:19 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
22:6a45a982c333 | 23:e845310098ba |
---|---|
1 /* | |
2 * Normalization rules for Chinese text | |
3 * [this is a JFlex specification] | |
4 * | |
5 * Wolfgang Schmidle | |
6 * version 2011-02-28 | |
7 * | |
8 */ | |
9 | |
10 package de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang; | |
11 | |
12 %% | |
13 | |
14 %public | |
15 %class MpdlNormalizerLexZH | |
16 %type java.lang.String | |
17 %unicode | |
18 | |
19 // classical Chinese: zh, zho, zho-Hant | |
20 | |
21 %states DISP, DICT, SEARCH | |
22 | |
23 %{ | |
24 private String original = ""; | |
25 private String normalized = ""; | |
26 private int problem = 0; | |
27 | |
28 private void add (String norm) { | |
29 original += yytext(); | |
30 normalized += norm; | |
31 } | |
32 %} | |
33 | |
34 ZWS = [\u{200b}] | |
35 | |
36 END = \n | |
37 | |
38 %% | |
39 | |
40 // Normalization in Chinese means that character variants will be replaced by their standard characters | |
41 // if there is no doubt about what the standard character is. | |
42 | |
43 // The input is supposed to be a single Chinese character, but strings of characters are also handled correctly. | |
44 | |
45 <DISP, DICT, SEARCH> { | |
46 | |
47 // Codepoint < FFFF | |
48 | |
49 倂 { add("併"); } // 5002 --> 4F75 | |
50 傁 | 叜 { add("叟"); } // 5081, 53DC --> 53DF | |
51 竒 { add("奇"); } // 7AD2 --> 5947 | |
52 幷 { add("并"); } // 5E77 --> 5E76 | |
53 牀 { add("床"); } // 7240 --> 5E8A | |
54 旹 { add("時"); } // 65F9 --> 6642 | |
55 歴 { add("歷"); } // 6B74 --> 6B77 | |
56 爲 { add("為"); } // 7232 --> 70BA | |
57 隂 { add("陰"); } // 9682 --> 9670 | |
58 靣 { add("面"); } // 9763 --> 9762 | |
59 \uFA1D { add("精"); } // FA1D --> 7CBE (FA1D is a compatibility ideograph; the pattern is written as an escape because Unicode normalization of this file would silently turn a literal FA1D into 7CBE and make the rule a no-op) | |
60 | |
61 // Codepoint > FFFF | |
62 | |
63 // note that [ABC] is not equivalent to A | B | C for codepoints above FFFF due to their internal encoding: | |
64 // for example, 庶 (U+2F88D) is represented as a sequence of two UTF-16 code units (a surrogate pair): D87E DC8D | |
65 // i.e. never use [ABC] but A | B | C | |
66 | |
67 庶 { add("庶"); } // 2F88D --> 5EB6 (2F88D is a compatibility ideograph) | |
68 | |
69 } | |
70 | |
71 <DICT, SEARCH> { | |
72 | |
73 // remove Zero Width Space (if there is any in the input string) | |
74 | |
75 {ZWS} { add(""); } | |
76 | |
77 } | |
78 | |
79 // default | |
80 | |
81 @ { problem = 1; add(yytext()); } | |
82 . { add(yytext()); } | |
83 | |
84 | |
85 <DISP, SEARCH> { | |
86 | |
87 {END} { | |
88 switch (problem) { | |
89 case 1: return original; | |
90 default: return normalized; | |
91 } | |
92 } | |
93 } | |
94 | |
95 <DICT> { | |
96 | |
97 {END} { | |
98 switch (problem) { | |
99 case 1: return ""; | |
100 default: return normalized; | |
101 } | |
102 } | |
103 } | |
104 | |
105 | |
106 /* | |
107 | |
108 Annahmen: | |
109 - die Routine wird zeichenweise (oder mit mehr als einem Zeichen) aufgerufen, mit einem \n am Ende des Strings | |
110 - es gibt keine Zeilenumbrüche | |
111 | |
112 TO DO: | |
113 | |
114 ZH: Liste ergänzen | |
115 ZH: was ist, wenn man wirklich die Variante, die im Text steht, nachschlagen will? Dann muss man das Zeichen wohl selbst rauskopieren. | |
116 ZH: sollen lateinische Buchstaben bewirken, dass problem = 1 ist? | |
117 ZH: sollen Zeilenumbrüche rausgenommen werden, auch wenn sie in korrekt markiertem Text nicht vorkommen? | |
118 ZH: was ist, wenn beijing übergeben wird und einen Zeilenumbruch enthält? Verlässt sich der Wrapper darauf, dass die Zeichenzahl gleich bleibt, oder macht er ein hyphen rein? was macht <place> oder <reg>? | |
119 | |
120 */ |