Mercurial > hg > mpdl-group
comparison software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlNormalizer.java @ 9:1ec29fdd0db8
neue .lex Dateien für Normalisierung / externe Objekte update
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 22 Feb 2011 16:03:45 +0100 |
parents | 2396a569e446 |
children | 5df60f24e997 |
comparison
equal
deleted
inserted
replaced
8:d2a1c14fde31 | 9:1ec29fdd0db8 |
---|---|
3 import java.io.IOException; | 3 import java.io.IOException; |
4 import java.io.StringReader; | 4 import java.io.StringReader; |
5 import java.util.ArrayList; | 5 import java.util.ArrayList; |
6 | 6 |
7 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; | 7 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; |
8 import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang.MpdlNormalizerLexAll; | 8 import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang.MpdlNormalizerLexAR; |
9 import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang.MpdlNormalizerLexDE; | |
10 import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang.MpdlNormalizerLexEL; | |
11 import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang.MpdlNormalizerLexEN; | |
12 import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang.MpdlNormalizerLexFR; | |
13 import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang.MpdlNormalizerLexIT; | |
14 import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang.MpdlNormalizerLexLA; | |
15 import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang.MpdlNormalizerLexNL; | |
16 import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang.MpdlNormalizerLexZH; | |
9 import de.mpg.mpiwg.berlin.mpdl.lt.doc.regularization.Regularization; | 17 import de.mpg.mpiwg.berlin.mpdl.lt.doc.regularization.Regularization; |
10 import de.mpg.mpiwg.berlin.mpdl.lt.doc.regularization.RegularizationManager; | 18 import de.mpg.mpiwg.berlin.mpdl.lt.doc.regularization.RegularizationManager; |
11 import de.mpg.mpiwg.berlin.mpdl.lt.general.Language; | 19 import de.mpg.mpiwg.berlin.mpdl.lt.general.Language; |
12 | 20 |
13 public class MpdlNormalizer { | 21 public class MpdlNormalizer { |
1012 return s; | 1020 return s; |
1013 } | 1021 } |
1014 } | 1022 } |
1015 | 1023 |
1016 private String normalize4HumanReaders(String s) { | 1024 private String normalize4HumanReaders(String s) { |
1017 String normStr = s; | 1025 StringReader strReader = new StringReader(s + "\n"); |
1018 StringReader strReader = new StringReader(normStr + "\n"); | |
1019 MpdlNormalizerLexAll mpdlNormalizerLexAll = new MpdlNormalizerLexAll(strReader); | |
1020 if (Language.getInstance().isLatin(language)) { | |
1021 mpdlNormalizerLexAll.yybegin(MpdlNormalizerLexAll.LA); | |
1022 } else if (Language.getInstance().isChinese(language)) { | |
1023 mpdlNormalizerLexAll.yybegin(MpdlNormalizerLexAll.ZH); | |
1024 } else { | |
1025 // TODO normalization for all languages | |
1026 return normalize4Lexica(s, null); // old function | |
1027 } | |
1028 String retStr = ""; | 1026 String retStr = ""; |
1029 String token = ""; | 1027 String token = ""; |
1030 while (token != null) { | 1028 try { |
1031 try { | 1029 if (Language.getInstance().isLatin(language)) { |
1032 token = mpdlNormalizerLexAll.yylex(); | 1030 MpdlNormalizerLexLA mpdlNormalizerLex = new MpdlNormalizerLexLA(strReader); |
1033 if (token != null) | 1031 mpdlNormalizerLex.yybegin(MpdlNormalizerLexLA.DISP); |
1034 retStr += token; | 1032 while (token != null) { |
1035 } catch (IOException e ) { | 1033 token = mpdlNormalizerLex.yylex(); |
1036 // nothing cause IOException is not needed for a StringReader | 1034 if (token != null) |
1037 } | 1035 retStr += token; |
1036 } | |
1037 } else if (Language.getInstance().isArabic(language)) { | |
1038 MpdlNormalizerLexAR mpdlNormalizerLex = new MpdlNormalizerLexAR(strReader); | |
1039 mpdlNormalizerLex.yybegin(MpdlNormalizerLexAR.DISP); | |
1040 while (token != null) { | |
1041 token = mpdlNormalizerLex.yylex(); | |
1042 if (token != null) | |
1043 retStr += token; | |
1044 } | |
1045 } else if (Language.getInstance().isGerman(language)) { | |
1046 MpdlNormalizerLexDE mpdlNormalizerLex = new MpdlNormalizerLexDE(strReader); | |
1047 mpdlNormalizerLex.yybegin(MpdlNormalizerLexDE.DISP); | |
1048 while (token != null) { | |
1049 token = mpdlNormalizerLex.yylex(); | |
1050 if (token != null) | |
1051 retStr += token; | |
1052 } | |
1053 } else if (Language.getInstance().isGreek(language)) { | |
1054 MpdlNormalizerLexEL mpdlNormalizerLex = new MpdlNormalizerLexEL(strReader); | |
1055 mpdlNormalizerLex.yybegin(MpdlNormalizerLexEL.DISP); | |
1056 while (token != null) { | |
1057 token = mpdlNormalizerLex.yylex(); | |
1058 if (token != null) | |
1059 retStr += token; | |
1060 } | |
1061 } else if (Language.getInstance().isEnglish(language)) { | |
1062 MpdlNormalizerLexEN mpdlNormalizerLex = new MpdlNormalizerLexEN(strReader); | |
1063 mpdlNormalizerLex.yybegin(MpdlNormalizerLexEN.DISP); | |
1064 while (token != null) { | |
1065 token = mpdlNormalizerLex.yylex(); | |
1066 if (token != null) | |
1067 retStr += token; | |
1068 } | |
1069 } else if (Language.getInstance().isFrench(language)) { | |
1070 MpdlNormalizerLexFR mpdlNormalizerLex = new MpdlNormalizerLexFR(strReader); | |
1071 mpdlNormalizerLex.yybegin(MpdlNormalizerLexFR.DISP); | |
1072 while (token != null) { | |
1073 token = mpdlNormalizerLex.yylex(); | |
1074 if (token != null) | |
1075 retStr += token; | |
1076 } | |
1077 } else if (Language.getInstance().isItalian(language)) { | |
1078 MpdlNormalizerLexIT mpdlNormalizerLex = new MpdlNormalizerLexIT(strReader); | |
1079 mpdlNormalizerLex.yybegin(MpdlNormalizerLexIT.DISP); | |
1080 while (token != null) { | |
1081 token = mpdlNormalizerLex.yylex(); | |
1082 if (token != null) | |
1083 retStr += token; | |
1084 } | |
1085 } else if (Language.getInstance().isDutch(language)) { | |
1086 MpdlNormalizerLexNL mpdlNormalizerLex = new MpdlNormalizerLexNL(strReader); | |
1087 mpdlNormalizerLex.yybegin(MpdlNormalizerLexNL.DISP); | |
1088 while (token != null) { | |
1089 token = mpdlNormalizerLex.yylex(); | |
1090 if (token != null) | |
1091 retStr += token; | |
1092 } | |
1093 } else if (Language.getInstance().isChinese(language)) { | |
1094 MpdlNormalizerLexZH mpdlNormalizerLex = new MpdlNormalizerLexZH(strReader); | |
1095 mpdlNormalizerLex.yybegin(MpdlNormalizerLexZH.DISP); | |
1096 while (token != null) { | |
1097 token = mpdlNormalizerLex.yylex(); | |
1098 if (token != null) | |
1099 retStr += token; | |
1100 } | |
1101 } else { | |
1102 return normalize4Lexica(s, null); // old function | |
1103 } | |
1104 } catch (IOException e ) { | |
1105 // nothing cause IOException is not needed for a StringReader | |
1038 } | 1106 } |
1039 normStr = retStr; | 1107 return retStr; |
1040 return normStr; | |
1041 } | 1108 } |
1042 | 1109 |
1043 /* | 1110 /* |
1044 // explicit words | 1111 // explicit words |
1045 normStr = normStr.replaceAll("aliàs", "alias"); | 1112 normStr = normStr.replaceAll("aliàs", "alias"); |