comparison software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/Normalizer.java @ 23:e845310098ba

diverse Korrekturen
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 27 Nov 2012 12:35:19 +0100
parents 7d6d969b10cf
children
comparison
equal deleted inserted replaced
22:6a45a982c333 23:e845310098ba
1 package de.mpg.mpiwg.berlin.mpdl.lt.text.norm; 1 package de.mpg.mpiwg.berlin.mpdl.lt.text.norm;
2 2
3 import java.io.IOException; 3 import java.io.IOException;
4 import java.io.StringReader; 4 import java.io.StringReader;
5 import java.util.ArrayList;
6 5
7 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; 6 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
8 import de.mpg.mpiwg.berlin.mpdl.lt.general.Language; 7 import de.mpg.mpiwg.berlin.mpdl.lt.general.Language;
9 import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang.MpdlNormalizerLexAR; 8 import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang.MpdlNormalizerLexAR;
10 import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang.MpdlNormalizerLexDE; 9 import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang.MpdlNormalizerLexDE;
13 import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang.MpdlNormalizerLexFR; 12 import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang.MpdlNormalizerLexFR;
14 import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang.MpdlNormalizerLexIT; 13 import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang.MpdlNormalizerLexIT;
15 import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang.MpdlNormalizerLexLA; 14 import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang.MpdlNormalizerLexLA;
16 import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang.MpdlNormalizerLexNL; 15 import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang.MpdlNormalizerLexNL;
17 import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang.MpdlNormalizerLexZH; 16 import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang.MpdlNormalizerLexZH;
18 import de.mpg.mpiwg.berlin.mpdl.lt.text.reg.Regularization;
19 import de.mpg.mpiwg.berlin.mpdl.lt.text.reg.RegularizationManager;
20 17
21 public class Normalizer { 18 public class Normalizer {
22 public static int NONE = -1; // no normalization 19 public static int NONE = -1; // no normalization
23 public static int DISPLAY = 1; // normalization in DISPLAY mode 20 public static int DISPLAY = 1; // normalization in DISPLAY mode
24 public static int DICTIONARY = 2; // normalization in DICTIONARY mode 21 public static int DICTIONARY = 2; // normalization in DICTIONARY mode
55 */ 52 */
56 public String normalize(String s) throws ApplicationException { 53 public String normalize(String s) throws ApplicationException {
57 String normStr = s; 54 String normStr = s;
58 if (useSpecialNormFunction()) 55 if (useSpecialNormFunction())
59 normStr = removeSpecialNWDMarks(normStr); 56 normStr = removeSpecialNWDMarks(normStr);
60 if (useRegFunction()) {
61 // try to regularize the string to the norm form over predefined regularizations
62 RegularizationManager regManager = RegularizationManager.getInstance();
63 ArrayList<Regularization> regs = regManager.findRegsByOrig(language, s);
64 if (regs != null && regs.size() > 0) {
65 Regularization reg = regs.get(0); // only one: the first one
66 String regNormStr = reg.getNorm();
67 normStr = regNormStr;
68 }
69 }
70 if (useNormFunction()) { 57 if (useNormFunction()) {
71 // normalize the string by string replacements 58 // normalize the string by string replacements
72 if (normMode == DICTIONARY) { 59 if (normMode == DICTIONARY) {
73 normStr = normalize(normStr, DICTIONARY); 60 normStr = normalize(normStr, DICTIONARY);
74 } else if (normMode == DISPLAY) { 61 } else if (normMode == DISPLAY) {
80 if (useSpecialNormFunction()) 67 if (useSpecialNormFunction())
81 normStr = insertSpecialNWDMarks(normStr); 68 normStr = insertSpecialNWDMarks(normStr);
82 return normStr; 69 return normStr;
83 } 70 }
84 71
85 private boolean useRegFunction() {
86 boolean useReg = false;
87 for (int i=0; i< normFunctions.length; i++) {
88 String function = normFunctions[i];
89 if (function.equals("reg"))
90 return true;
91 }
92 return useReg;
93 }
94
95 private boolean useNormFunction() { 72 private boolean useNormFunction() {
96 boolean useNorm = false; 73 boolean useNorm = false;
97 for (int i=0; i< normFunctions.length; i++) { 74 for (int i=0; i< normFunctions.length; i++) {
98 String function = normFunctions[i]; 75 String function = normFunctions[i];
99 if (function.equals("norm") || function.equals("specialNorm")) 76 if (function.equals("norm") || function.equals("specialNorm"))
186 } else if (Language.getInstance().isFrench(language)) { 163 } else if (Language.getInstance().isFrench(language)) {
187 MpdlNormalizerLexFR mpdlNormalizerLex = new MpdlNormalizerLexFR(strReader); 164 MpdlNormalizerLexFR mpdlNormalizerLex = new MpdlNormalizerLexFR(strReader);
188 if (mode == DISPLAY) 165 if (mode == DISPLAY)
189 mpdlNormalizerLex.yybegin(MpdlNormalizerLexFR.DISP); 166 mpdlNormalizerLex.yybegin(MpdlNormalizerLexFR.DISP);
190 else if (mode == DICTIONARY) 167 else if (mode == DICTIONARY)
191 mpdlNormalizerLex.yybegin(MpdlNormalizerLexFR.DICT_ASCII); 168 mpdlNormalizerLex.yybegin(MpdlNormalizerLexFR.DICT);
192 else if (mode == SEARCH) 169 else if (mode == SEARCH)
193 mpdlNormalizerLex.yybegin(MpdlNormalizerLexFR.SEARCH); 170 mpdlNormalizerLex.yybegin(MpdlNormalizerLexFR.SEARCH);
194 while (token != null) { 171 while (token != null) {
195 token = mpdlNormalizerLex.yylex(); 172 token = mpdlNormalizerLex.yylex();
196 if (token != null) 173 if (token != null)