comparison software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlNormalizer.java @ 9:1ec29fdd0db8

neue .lex Dateien für Normalisierung / externe Objekte update
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 22 Feb 2011 16:03:45 +0100
parents 2396a569e446
children 5df60f24e997
comparison
equal deleted inserted replaced
8:d2a1c14fde31 9:1ec29fdd0db8
3 import java.io.IOException; 3 import java.io.IOException;
4 import java.io.StringReader; 4 import java.io.StringReader;
5 import java.util.ArrayList; 5 import java.util.ArrayList;
6 6
7 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; 7 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
8 import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang.MpdlNormalizerLexAll; 8 import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang.MpdlNormalizerLexAR;
9 import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang.MpdlNormalizerLexDE;
10 import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang.MpdlNormalizerLexEL;
11 import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang.MpdlNormalizerLexEN;
12 import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang.MpdlNormalizerLexFR;
13 import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang.MpdlNormalizerLexIT;
14 import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang.MpdlNormalizerLexLA;
15 import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang.MpdlNormalizerLexNL;
16 import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang.MpdlNormalizerLexZH;
9 import de.mpg.mpiwg.berlin.mpdl.lt.doc.regularization.Regularization; 17 import de.mpg.mpiwg.berlin.mpdl.lt.doc.regularization.Regularization;
10 import de.mpg.mpiwg.berlin.mpdl.lt.doc.regularization.RegularizationManager; 18 import de.mpg.mpiwg.berlin.mpdl.lt.doc.regularization.RegularizationManager;
11 import de.mpg.mpiwg.berlin.mpdl.lt.general.Language; 19 import de.mpg.mpiwg.berlin.mpdl.lt.general.Language;
12 20
13 public class MpdlNormalizer { 21 public class MpdlNormalizer {
1012 return s; 1020 return s;
1013 } 1021 }
1014 } 1022 }
1015 1023
1016 private String normalize4HumanReaders(String s) { 1024 private String normalize4HumanReaders(String s) {
1017 String normStr = s; 1025 StringReader strReader = new StringReader(s + "\n");
1018 StringReader strReader = new StringReader(normStr + "\n");
1019 MpdlNormalizerLexAll mpdlNormalizerLexAll = new MpdlNormalizerLexAll(strReader);
1020 if (Language.getInstance().isLatin(language)) {
1021 mpdlNormalizerLexAll.yybegin(MpdlNormalizerLexAll.LA);
1022 } else if (Language.getInstance().isChinese(language)) {
1023 mpdlNormalizerLexAll.yybegin(MpdlNormalizerLexAll.ZH);
1024 } else {
1025 // TODO normalization for all languages
1026 return normalize4Lexica(s, null); // old function
1027 }
1028 String retStr = ""; 1026 String retStr = "";
1029 String token = ""; 1027 String token = "";
1030 while (token != null) { 1028 try {
1031 try { 1029 if (Language.getInstance().isLatin(language)) {
1032 token = mpdlNormalizerLexAll.yylex(); 1030 MpdlNormalizerLexLA mpdlNormalizerLex = new MpdlNormalizerLexLA(strReader);
1033 if (token != null) 1031 mpdlNormalizerLex.yybegin(MpdlNormalizerLexLA.DISP);
1034 retStr += token; 1032 while (token != null) {
1035 } catch (IOException e ) { 1033 token = mpdlNormalizerLex.yylex();
1036 // nothing cause IOException is not needed for a StringReader 1034 if (token != null)
1037 } 1035 retStr += token;
1036 }
1037 } else if (Language.getInstance().isArabic(language)) {
1038 MpdlNormalizerLexAR mpdlNormalizerLex = new MpdlNormalizerLexAR(strReader);
1039 mpdlNormalizerLex.yybegin(MpdlNormalizerLexAR.DISP);
1040 while (token != null) {
1041 token = mpdlNormalizerLex.yylex();
1042 if (token != null)
1043 retStr += token;
1044 }
1045 } else if (Language.getInstance().isGerman(language)) {
1046 MpdlNormalizerLexDE mpdlNormalizerLex = new MpdlNormalizerLexDE(strReader);
1047 mpdlNormalizerLex.yybegin(MpdlNormalizerLexDE.DISP);
1048 while (token != null) {
1049 token = mpdlNormalizerLex.yylex();
1050 if (token != null)
1051 retStr += token;
1052 }
1053 } else if (Language.getInstance().isGreek(language)) {
1054 MpdlNormalizerLexEL mpdlNormalizerLex = new MpdlNormalizerLexEL(strReader);
1055 mpdlNormalizerLex.yybegin(MpdlNormalizerLexEL.DISP);
1056 while (token != null) {
1057 token = mpdlNormalizerLex.yylex();
1058 if (token != null)
1059 retStr += token;
1060 }
1061 } else if (Language.getInstance().isEnglish(language)) {
1062 MpdlNormalizerLexEN mpdlNormalizerLex = new MpdlNormalizerLexEN(strReader);
1063 mpdlNormalizerLex.yybegin(MpdlNormalizerLexEN.DISP);
1064 while (token != null) {
1065 token = mpdlNormalizerLex.yylex();
1066 if (token != null)
1067 retStr += token;
1068 }
1069 } else if (Language.getInstance().isFrench(language)) {
1070 MpdlNormalizerLexFR mpdlNormalizerLex = new MpdlNormalizerLexFR(strReader);
1071 mpdlNormalizerLex.yybegin(MpdlNormalizerLexFR.DISP);
1072 while (token != null) {
1073 token = mpdlNormalizerLex.yylex();
1074 if (token != null)
1075 retStr += token;
1076 }
1077 } else if (Language.getInstance().isItalian(language)) {
1078 MpdlNormalizerLexIT mpdlNormalizerLex = new MpdlNormalizerLexIT(strReader);
1079 mpdlNormalizerLex.yybegin(MpdlNormalizerLexIT.DISP);
1080 while (token != null) {
1081 token = mpdlNormalizerLex.yylex();
1082 if (token != null)
1083 retStr += token;
1084 }
1085 } else if (Language.getInstance().isDutch(language)) {
1086 MpdlNormalizerLexNL mpdlNormalizerLex = new MpdlNormalizerLexNL(strReader);
1087 mpdlNormalizerLex.yybegin(MpdlNormalizerLexNL.DISP);
1088 while (token != null) {
1089 token = mpdlNormalizerLex.yylex();
1090 if (token != null)
1091 retStr += token;
1092 }
1093 } else if (Language.getInstance().isChinese(language)) {
1094 MpdlNormalizerLexZH mpdlNormalizerLex = new MpdlNormalizerLexZH(strReader);
1095 mpdlNormalizerLex.yybegin(MpdlNormalizerLexZH.DISP);
1096 while (token != null) {
1097 token = mpdlNormalizerLex.yylex();
1098 if (token != null)
1099 retStr += token;
1100 }
1101 } else {
1102 return normalize4Lexica(s, null); // old function
1103 }
1104 } catch (IOException e ) {
1105 // nothing cause IOException is not needed for a StringReader
1038 } 1106 }
1039 normStr = retStr; 1107 return retStr;
1040 return normStr;
1041 } 1108 }
1042 1109
1043 /* 1110 /*
1044 // explicit words 1111 // explicit words
1045 normStr = normStr.replaceAll("aliàs", "alias"); 1112 normStr = normStr.replaceAll("aliàs", "alias");