Mercurial > hg > mpdl-group
comparison software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/Normalizer.java @ 23:e845310098ba
diverse Korrekturen
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 27 Nov 2012 12:35:19 +0100 |
parents | 7d6d969b10cf |
children |
comparison
equal
deleted
inserted
replaced
22:6a45a982c333 | 23:e845310098ba |
---|---|
1 package de.mpg.mpiwg.berlin.mpdl.lt.text.norm; | 1 package de.mpg.mpiwg.berlin.mpdl.lt.text.norm; |
2 | 2 |
3 import java.io.IOException; | 3 import java.io.IOException; |
4 import java.io.StringReader; | 4 import java.io.StringReader; |
5 import java.util.ArrayList; | |
6 | 5 |
7 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; | 6 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; |
8 import de.mpg.mpiwg.berlin.mpdl.lt.general.Language; | 7 import de.mpg.mpiwg.berlin.mpdl.lt.general.Language; |
9 import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang.MpdlNormalizerLexAR; | 8 import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang.MpdlNormalizerLexAR; |
10 import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang.MpdlNormalizerLexDE; | 9 import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang.MpdlNormalizerLexDE; |
13 import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang.MpdlNormalizerLexFR; | 12 import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang.MpdlNormalizerLexFR; |
14 import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang.MpdlNormalizerLexIT; | 13 import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang.MpdlNormalizerLexIT; |
15 import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang.MpdlNormalizerLexLA; | 14 import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang.MpdlNormalizerLexLA; |
16 import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang.MpdlNormalizerLexNL; | 15 import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang.MpdlNormalizerLexNL; |
17 import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang.MpdlNormalizerLexZH; | 16 import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang.MpdlNormalizerLexZH; |
18 import de.mpg.mpiwg.berlin.mpdl.lt.text.reg.Regularization; | |
19 import de.mpg.mpiwg.berlin.mpdl.lt.text.reg.RegularizationManager; | |
20 | 17 |
21 public class Normalizer { | 18 public class Normalizer { |
22 public static int NONE = -1; // no normalization | 19 public static int NONE = -1; // no normalization |
23 public static int DISPLAY = 1; // normalization in DISPLAY mode | 20 public static int DISPLAY = 1; // normalization in DISPLAY mode |
24 public static int DICTIONARY = 2; // normalization in DICTIONARY mode | 21 public static int DICTIONARY = 2; // normalization in DICTIONARY mode |
55 */ | 52 */ |
56 public String normalize(String s) throws ApplicationException { | 53 public String normalize(String s) throws ApplicationException { |
57 String normStr = s; | 54 String normStr = s; |
58 if (useSpecialNormFunction()) | 55 if (useSpecialNormFunction()) |
59 normStr = removeSpecialNWDMarks(normStr); | 56 normStr = removeSpecialNWDMarks(normStr); |
60 if (useRegFunction()) { | |
61 // try to regularize the string to the norm form over predefined regularizations | |
62 RegularizationManager regManager = RegularizationManager.getInstance(); | |
63 ArrayList<Regularization> regs = regManager.findRegsByOrig(language, s); | |
64 if (regs != null && regs.size() > 0) { | |
65 Regularization reg = regs.get(0); // only one: the first one | |
66 String regNormStr = reg.getNorm(); | |
67 normStr = regNormStr; | |
68 } | |
69 } | |
70 if (useNormFunction()) { | 57 if (useNormFunction()) { |
71 // normalize the string by string replacements | 58 // normalize the string by string replacements |
72 if (normMode == DICTIONARY) { | 59 if (normMode == DICTIONARY) { |
73 normStr = normalize(normStr, DICTIONARY); | 60 normStr = normalize(normStr, DICTIONARY); |
74 } else if (normMode == DISPLAY) { | 61 } else if (normMode == DISPLAY) { |
80 if (useSpecialNormFunction()) | 67 if (useSpecialNormFunction()) |
81 normStr = insertSpecialNWDMarks(normStr); | 68 normStr = insertSpecialNWDMarks(normStr); |
82 return normStr; | 69 return normStr; |
83 } | 70 } |
84 | 71 |
85 private boolean useRegFunction() { | |
86 boolean useReg = false; | |
87 for (int i=0; i< normFunctions.length; i++) { | |
88 String function = normFunctions[i]; | |
89 if (function.equals("reg")) | |
90 return true; | |
91 } | |
92 return useReg; | |
93 } | |
94 | |
95 private boolean useNormFunction() { | 72 private boolean useNormFunction() { |
96 boolean useNorm = false; | 73 boolean useNorm = false; |
97 for (int i=0; i< normFunctions.length; i++) { | 74 for (int i=0; i< normFunctions.length; i++) { |
98 String function = normFunctions[i]; | 75 String function = normFunctions[i]; |
99 if (function.equals("norm") || function.equals("specialNorm")) | 76 if (function.equals("norm") || function.equals("specialNorm")) |
186 } else if (Language.getInstance().isFrench(language)) { | 163 } else if (Language.getInstance().isFrench(language)) { |
187 MpdlNormalizerLexFR mpdlNormalizerLex = new MpdlNormalizerLexFR(strReader); | 164 MpdlNormalizerLexFR mpdlNormalizerLex = new MpdlNormalizerLexFR(strReader); |
188 if (mode == DISPLAY) | 165 if (mode == DISPLAY) |
189 mpdlNormalizerLex.yybegin(MpdlNormalizerLexFR.DISP); | 166 mpdlNormalizerLex.yybegin(MpdlNormalizerLexFR.DISP); |
190 else if (mode == DICTIONARY) | 167 else if (mode == DICTIONARY) |
191 mpdlNormalizerLex.yybegin(MpdlNormalizerLexFR.DICT_ASCII); | 168 mpdlNormalizerLex.yybegin(MpdlNormalizerLexFR.DICT); |
192 else if (mode == SEARCH) | 169 else if (mode == SEARCH) |
193 mpdlNormalizerLex.yybegin(MpdlNormalizerLexFR.SEARCH); | 170 mpdlNormalizerLex.yybegin(MpdlNormalizerLexFR.SEARCH); |
194 while (token != null) { | 171 while (token != null) { |
195 token = mpdlNormalizerLex.yylex(); | 172 token = mpdlNormalizerLex.yylex(); |
196 if (token != null) | 173 if (token != null) |