Mercurial > hg > mpdl-group
comparison software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/lang/MpdlNormalizerLexAR.lex @ 14:5df60f24e997
diverse Fehlerbehebungen
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Mon, 29 Aug 2011 17:40:02 +0200 |
parents | 1ec29fdd0db8 |
children |
comparison
equal
deleted
inserted
replaced
13:469d927b9ca7 | 14:5df60f24e997 |
---|---|
1 /* | 1 /* |
2 * Normalization rules for Arabic text | 2 * Normalization rules for Arabic text |
3 * [this is a JFlex specification] | 3 * [this is a JFlex specification] |
4 * | 4 * |
5 * Wolfgang Schmidle | 5 * Wolfgang Schmidle |
6 * version 0.96 | 6 * version 2011-02-28 |
7 * 2011-02-21 | |
8 * | 7 * |
9 */ | 8 */ |
10 | 9 |
11 package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang; | 10 package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang; |
12 | 11 |
28 | 27 |
29 private void add (String norm) { | 28 private void add (String norm) { |
30 original += yytext(); | 29 original += yytext(); |
31 normalized += norm; | 30 normalized += norm; |
32 } | 31 } |
| | 32 |
| | 33 private static final String LB = "[\u002d\u00ad] "; |
33 %} | 34 %} |
| | 35 |
| | 36 hyphen = [-\u{00ad}] // hyphen and soft hyphen |
| | 37 LB = {hyphen} \u0020 |
| | 38 // lb = ({hyphen} \u0020)? |
34 | 39 |
35 END = \n | 40 END = \n |
36 | 41 |
37 %% | 42 %% |
38 | 43 |
39 @ { problem = 1; add(yytext()); } | 44 @ { problem = 1; add(yytext()); } |
| | 45 {LB} { add(yytext()); } |
40 . { add(yytext()); } | 46 . { add(yytext()); } |
41 | 47 |
42 | 48 |
43 <DISP, SEARCH> { | 49 <DISP> { |
44 | 50 |
45 {END} { | 51 {END} { |
46 switch (problem) { | 52 switch (problem) { |
47 case 1: return original; | 53 case 1: return original; |
48 default: return normalized; | 54 default: return normalized; |
53 <DICT> { | 59 <DICT> { |
54 | 60 |
55 {END} { | 61 {END} { |
56 switch (problem) { | 62 switch (problem) { |
57 case 1: return ""; | 63 case 1: return ""; |
58 default: return normalized; | 64 default: return normalized.replaceAll(LB, ""); |
| | 65 } |
| | 66 } |
| | 67 } |
| | 68 |
| | 69 <SEARCH> { |
| | 70 |
| | 71 {END} { |
| | 72 switch (problem) { |
| | 73 case 1: return original; |
| | 74 default: return normalized.replaceAll(LB, ""); |
59 } | 75 } |
60 } | 76 } |
61 } | 77 } |
62 | 78 |
63 | 79 |
64 /* | 80 /* |
65 | 81 |
66 Annahmen: | 82 Annahmen: |
67 - die Routine wird wortweise aufgerufen, mit einem \n am Ende des Strings | 83 - die Routine wird wortweise aufgerufen, mit einem \n am Ende des Strings |
68 - Wörter mit Zeilenumbrüchen wurden bereits wieder zusammengesetzt | 84 - Zeilenumbrüche innerhalb des Wortes sind als hyphen/soft hyphen plus space markiert |
69 | 85 |
70 TO DO: | 86 TO DO: |
71 | 87 |
72 AR: fehlt noch | 88 AR: fehlt noch |
73 | 89 |