Mercurial > hg > mpdl-group
comparison software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlNormalizer.java @ 6:2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 08 Feb 2011 14:54:09 +0100 |
parents | 408254cf2f1d |
children | 1ec29fdd0db8 |
comparison
equal
deleted
inserted
replaced
5:94305c504178 | 6:2396a569e446 |
---|---|
1 package de.mpg.mpiwg.berlin.mpdl.lt.analyzer; | 1 package de.mpg.mpiwg.berlin.mpdl.lt.analyzer; |
2 | 2 |
3 import java.io.BufferedReader; | |
4 import java.io.IOException; | 3 import java.io.IOException; |
5 import java.io.InputStreamReader; | 4 import java.io.StringReader; |
6 import java.io.UnsupportedEncodingException; | |
7 import java.util.ArrayList; | 5 import java.util.ArrayList; |
8 | 6 |
9 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; | 7 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; |
8 import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang.MpdlNormalizerLexAll; | |
10 import de.mpg.mpiwg.berlin.mpdl.lt.doc.regularization.Regularization; | 9 import de.mpg.mpiwg.berlin.mpdl.lt.doc.regularization.Regularization; |
11 import de.mpg.mpiwg.berlin.mpdl.lt.doc.regularization.RegularizationManager; | 10 import de.mpg.mpiwg.berlin.mpdl.lt.doc.regularization.RegularizationManager; |
12 import de.mpg.mpiwg.berlin.mpdl.lt.general.Language; | 11 import de.mpg.mpiwg.berlin.mpdl.lt.general.Language; |
13 | 12 |
14 public class MpdlNormalizer { | 13 public class MpdlNormalizer { |
15 static final private String IT_VOWELS = "AEIOUaeiou" + | 14 public static int MODE_4LEXICA = 1; // normalization for lexica etc., which sometimes contain only ASCII |
16 "\u00c6\u00e6" + // AE ligatures | 15 public static int MODE_4HUMAN_READERS = 2; // normalization for human readers |
17 "\u0152\u0153"; // OE ligatures | 16 private int normMode = MODE_4LEXICA; // Default |
18 static final private String IT_CONS = "BCDFGHKLMNPQRSTVWXZ" + | |
19 "bcdfghklmnpqrstvwxz" + | |
20 "\u017f\u00df"; // long/sharp S | |
21 private String[] normFunctionsToUse = {"reg", "norm"}; // default is to use all of these normalization functions | 17 private String[] normFunctionsToUse = {"reg", "norm"}; // default is to use all of these normalization functions |
22 private String language; | 18 private String language; |
23 private int[] offsets; | 19 private int[] offsets; |
24 | 20 |
25 public MpdlNormalizer(String[] normFunctionsToUse, String lang) { | 21 public MpdlNormalizer(String[] normFunctionsToUse, String lang) { |
30 | 26 |
31 public MpdlNormalizer(String language) { | 27 public MpdlNormalizer(String language) { |
32 this.language = language; | 28 this.language = language; |
33 } | 29 } |
34 | 30 |
31 public void setNormMode(int normMode) { | |
32 this.normMode = normMode; | |
33 } | |
34 | |
35 /** | 35 /** |
36 * Applies the normalization rules in <code>language</code> to | 36 * Applies the normalization rules in <code>language</code> to |
37 * <code>s</code>, without offset tracking. | 37 * <code>s</code>, without offset tracking. |
38 * | 38 * |
39 * @param s source string | 39 * @param s source string |
50 String regNormStr = reg.getNorm(); | 50 String regNormStr = reg.getNorm(); |
51 normStr = regNormStr; | 51 normStr = regNormStr; |
52 } | 52 } |
53 } | 53 } |
54 if (useNormFunction()) { | 54 if (useNormFunction()) { |
55 // normalize the string by string replace | 55 // normalize the string by string replacements |
56 normStr = normalize(normStr, null); | 56 if (normMode == MODE_4LEXICA) |
57 normStr = normalize4Lexica(normStr, null); | |
58 else if (normMode == MODE_4HUMAN_READERS) | |
59 normStr = normalize4HumanReaders(normStr); | |
57 } | 60 } |
58 return normStr; | 61 return normStr; |
59 } | 62 } |
60 | 63 |
61 private boolean useRegFunction() { | 64 private boolean useRegFunction() { |
90 * | 93 * |
91 * @param s source string | 94 * @param s source string |
92 * @param offsets character offset table | 95 * @param offsets character offset table |
93 * @return normalized string | 96 * @return normalized string |
94 */ | 97 */ |
95 public String normalize(String s, int[] offsets) { | 98 private String normalize4Lexica(String s, int[] offsets) { |
96 this.offsets = offsets; | 99 this.offsets = offsets; |
97 if (language.equals("la") || language.equals("lat")) { | 100 if (language.equals("la") || language.equals("lat")) { |
98 StringBuffer buf = new StringBuffer(); | 101 StringBuffer buf = new StringBuffer(); |
99 int n = 0; | 102 int n = 0; |
100 for (int i = 0; i < s.length(); i++) { | 103 for (int i = 0; i < s.length(); i++) { |
477 case '\u00dc': replace = "Ue"; break; | 480 case '\u00dc': replace = "Ue"; break; |
478 case '\u00df': replace = "ss"; break; | 481 case '\u00df': replace = "ss"; break; |
479 case '\u00e4': replace = "ae"; break; | 482 case '\u00e4': replace = "ae"; break; |
480 case '\u00f6': replace = "oe"; break; | 483 case '\u00f6': replace = "oe"; break; |
481 case '\u00fc': replace = "ue"; break; | 484 case '\u00fc': replace = "ue"; break; |
485 case '\u00ad': break; // soft hyphen | |
482 case '\u00e9': replace = "e"; break; | 486 case '\u00e9': replace = "e"; break; |
483 case '\u00ad': break; // soft hyphen | 487 // new in MPDL project by J. Willenborg |
484 case '-': break; | 488 case '\u017f': replace = "s"; break; // LATIN SMALL LETTER LONG S |
489 // case '-': break; | |
485 default: replace += c; break; | 490 default: replace += c; break; |
486 } | 491 } |
487 buf.append(replace); | 492 buf.append(replace); |
488 // update offsets if replacement is a different length | 493 // update offsets if replacement is a different length |
489 if (offsets != null) { | 494 if (offsets != null) { |
1005 return buf.toString(); | 1010 return buf.toString(); |
1006 } else { // unknown or no language | 1011 } else { // unknown or no language |
1007 return s; | 1012 return s; |
1008 } | 1013 } |
1009 } | 1014 } |
1015 | |
1016 private String normalize4HumanReaders(String s) { | |
1017 String normStr = s; | |
1018 StringReader strReader = new StringReader(normStr + "\n"); | |
1019 MpdlNormalizerLexAll mpdlNormalizerLexAll = new MpdlNormalizerLexAll(strReader); | |
1020 if (Language.getInstance().isLatin(language)) { | |
1021 mpdlNormalizerLexAll.yybegin(MpdlNormalizerLexAll.LA); | |
1022 } else if (Language.getInstance().isChinese(language)) { | |
1023 mpdlNormalizerLexAll.yybegin(MpdlNormalizerLexAll.ZH); | |
1024 } else { | |
1025 // TODO normalization for all languages | |
1026 return normalize4Lexica(s, null); // old function | |
1027 } | |
1028 String retStr = ""; | |
1029 String token = ""; | |
1030 while (token != null) { | |
1031 try { | |
1032 token = mpdlNormalizerLexAll.yylex(); | |
1033 if (token != null) | |
1034 retStr += token; | |
1035 } catch (IOException e ) { | |
1036 // nothing to do: reading from an in-memory StringReader is not expected to throw an IOException | |
1037 } | |
1038 } | |
1039 normStr = retStr; | |
1040 return normStr; | |
1041 } | |
1042 | |
1043 /* | |
1044 // explicit words | |
1045 normStr = normStr.replaceAll("aliàs", "alias"); | |
1046 normStr = normStr.replaceAll("hîc", "hic"); | |
1047 normStr = normStr.replaceAll("quòd", "quod"); | |
1048 normStr = normStr.replaceAll("Quòd", "Quod"); | |
1049 normStr = normStr.replaceAll("QVòd", "Quod"); | |
1050 normStr = normStr.replaceAll("Cùmque", "Cumque"); | |
1051 normStr = normStr.replaceAll("aër", "aer"); | |
1052 // ij | |
1053 normStr = normStr.replaceAll("ij", "ii"); | |
1054 // qu/qv | |
1055 normStr = normStr.replaceAll("qv", "qu"); | |
1056 // normStr = normStr.replaceAll("qV", "qU"); | |
1057 normStr = normStr.replaceAll("Qv", "Qu"); | |
1058 normStr = normStr.replaceAll("QV", "QU"); | |
1059 // u/v | |
1060 String vowels = getVowels(); | |
1061 String consonants = getConsonants(); | |
1062 normStr = normStr.replaceAll("([" + vowels + "])([-]*)u([" + vowels +"])", "$1$2v$3"); // vowel + u + vowel --> vowel + v + vowel | |
1063 normStr = normStr.replaceAll("([" + vowels + "])([-]*)U([" + vowels +"])", "$1$2V$3"); // vowel + U + vowel --> vowel + V + vowel | |
1064 normStr = normStr.replaceAll("([" + consonants + "])([-]*)v([" + consonants +"])", "$1$2u$3"); // consonant + v + consonant --> consonant + u + consonant | |
1065 normStr = normStr.replaceAll("([" + consonants + "])([-]*)V([" + consonants +"])", "$1$2U$3"); // consonant + V + consonant --> consonant + U + consonant | |
1066 normStr = normStr.replaceAll("^v([" + consonants +"])", "u$1"); // v + consonant --> u + consonant | |
1067 normStr = normStr.replaceAll("^V([" + consonants +"])", "U$1"); // V + consonant --> U + consonant | |
1068 // end of word: diacritica | |
1069 normStr = normStr.replaceAll("à$", "a"); | |
1070 normStr = normStr.replaceAll("è$", "e"); | |
1071 normStr = normStr.replaceAll("ò$", "o"); | |
1072 normStr = normStr.replaceAll("àm$", "am"); | |
1073 normStr = normStr.replaceAll("ùm$", "um"); | |
1074 String normStrTmp = normStr; | |
1075 normStr = ""; | |
1076 for (int i = 0; i < normStrTmp.length(); i++) { | |
1077 char c = normStrTmp.charAt(i); | |
1078 String replace = ""; | |
1079 switch (c) { | |
1080 case 'ſ': replace = "s"; break; | |
1081 case 'ß': replace = "ss"; break; | |
1082 case 'æ': replace = "ae"; break; | |
1083 case 'Æ': replace = "AE"; break; | |
1084 case 'ę': replace = "ae"; break; | |
1085 case 'œ': replace = "oe"; break; | |
1086 default: replace += c; break; | |
1087 } | |
1088 normStr = normStr + replace; | |
1089 } | |
1090 | |
1091 | |
1092 private String getVowels() { | |
1093 String retStr = null; | |
1094 if (Language.getInstance().isItalian(language)) { | |
1095 retStr = "AEIOUaeiou" + | |
1096 "\u00c6\u00e6" + // AE ligatures | |
1097 "\u0152\u0153"; // OE ligatures | |
1098 } else if (Language.getInstance().isLatin(language)) { | |
1099 retStr = "AEIOUaeiouÆœęàèòù"; | |
1100 } | |
1101 // TODO all languages | |
1102 return retStr; | |
1103 } | |
1104 | |
1105 private String getConsonants() { | |
1106 String retStr = null; | |
1107 if (Language.getInstance().isItalian(language)) { | |
1108 retStr = "BCDFGHKLMNPQRSTVWXZ" + | |
1109 "bcdfghklmnpqrstvwxz" + | |
1110 "ſß"; // long/sharp S | |
1111 } else if (Language.getInstance().isLatin(language)) { | |
1112 retStr = "BCDFGHKLMNPQRSTVWXZ" + | |
1113 "bcdfghklmnpqrstvwxz" + | |
1114 "ſß"; // long/sharp S | |
1115 } | |
1116 // TODO all languages | |
1117 return retStr; | |
1118 } | |
1119 | |
1120 | |
1121 | |
1122 | |
1123 | |
1124 * | |
1125 * | |
1126 * | |
1127 * | |
1128 */ | |
1129 | |
1010 | 1130 |
1011 /** | 1131 |
1012 * Returns the offset table. | 1132 |
1013 * | 1133 |
1014 * @return offset table | 1134 |
1015 */ | |
1016 public int[] getOffsetTable() { | |
1017 return offsets; | |
1018 } | |
1019 | |
1020 /** | 1135 /** |
1021 * Returns a copy of an integer array with the element at | 1136 * Returns a copy of an integer array with the element at |
1022 * <code>index</code> removed ("killed"). | 1137 * <code>index</code> removed ("killed"). |
1023 * | 1138 * |
1024 * @param array integer array | 1139 * @param array integer array |
1025 * @param index index of element to remove | 1140 * @param index index of element to remove |
1026 */ | 1141 */ |
1027 static private int[] arrayKill(int[] array, int index) { | 1142 private int[] arrayKill(int[] array, int index) { |
1028 int[] newArray = new int[array.length - 1]; | 1143 int[] newArray = new int[array.length - 1]; |
1029 System.arraycopy(array, 0, newArray, 0, index); | 1144 System.arraycopy(array, 0, newArray, 0, index); |
1030 System.arraycopy(array, index + 1, newArray, index, array.length - index - 1); | 1145 System.arraycopy(array, index + 1, newArray, index, array.length - index - 1); |
1031 return newArray; | 1146 return newArray; |
1032 } | 1147 } |
1038 * @param array integer array | 1153 * @param array integer array |
1039 * @param index index to insert new elements | 1154 * @param index index to insert new elements |
1040 * @param value value to insert into new slots | 1155 * @param value value to insert into new slots |
1041 * @param count number of new slots to insert | 1156 * @param count number of new slots to insert |
1042 */ | 1157 */ |
1043 static private int[] arrayInsert(int[] array, int index, int value, int count) { | 1158 private int[] arrayInsert(int[] array, int index, int value, int count) { |
1044 int[] newArray = new int[array.length + count]; | 1159 int[] newArray = new int[array.length + count]; |
1045 System.arraycopy(array, 0, newArray, 0, index); | 1160 System.arraycopy(array, 0, newArray, 0, index); |
1046 for (int i = 0; i < count; i++) newArray[index + i] = value; | 1161 for (int i = 0; i < count; i++) newArray[index + i] = value; |
1047 System.arraycopy(array, index, newArray, index + count, array.length - index); | 1162 System.arraycopy(array, index, newArray, index + count, array.length - index); |
1048 return newArray; | 1163 return newArray; |
1049 } | 1164 } |
1050 | 1165 |
1051 /** | |
1052 * We provide <code>main()</code> so that our services will be available | |
1053 * outside Java (i.e., so we can run as a Un*x-style filter). | |
1054 */ | |
1055 static public void main(String[] argv) throws ApplicationException { | |
1056 if (argv.length != 1) { | |
1057 System.err.println("You must specify a language."); | |
1058 System.exit(1); | |
1059 } | |
1060 String rec; | |
1061 StringBuffer buf = new StringBuffer(); | |
1062 BufferedReader bin = null; | |
1063 try { | |
1064 bin = new BufferedReader(new InputStreamReader(System.in, "UTF-8")); | |
1065 while ((rec = bin.readLine()) != null) | |
1066 buf.append(rec + "\n"); | |
1067 } | |
1068 catch (UnsupportedEncodingException e) { | |
1069 System.err.println(e); | |
1070 System.exit(1); | |
1071 } catch (IOException e) { | |
1072 System.err.println(e); | |
1073 System.exit(1); | |
1074 } | |
1075 MpdlNormalizer orth = new MpdlNormalizer(argv[0]); | |
1076 System.out.print(orth.normalize(buf.toString())); | |
1077 } | |
1078 } | 1166 } |