comparison software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlNormalizer.java @ 6:2396a569e446

new functions: externalObjects, normalizer, Unicode2Betacode
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 08 Feb 2011 14:54:09 +0100
parents 408254cf2f1d
children 1ec29fdd0db8
comparison
equal deleted inserted replaced
5:94305c504178 6:2396a569e446
1 package de.mpg.mpiwg.berlin.mpdl.lt.analyzer; 1 package de.mpg.mpiwg.berlin.mpdl.lt.analyzer;
2 2
3 import java.io.BufferedReader;
4 import java.io.IOException; 3 import java.io.IOException;
5 import java.io.InputStreamReader; 4 import java.io.StringReader;
6 import java.io.UnsupportedEncodingException;
7 import java.util.ArrayList; 5 import java.util.ArrayList;
8 6
9 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; 7 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
8 import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang.MpdlNormalizerLexAll;
10 import de.mpg.mpiwg.berlin.mpdl.lt.doc.regularization.Regularization; 9 import de.mpg.mpiwg.berlin.mpdl.lt.doc.regularization.Regularization;
11 import de.mpg.mpiwg.berlin.mpdl.lt.doc.regularization.RegularizationManager; 10 import de.mpg.mpiwg.berlin.mpdl.lt.doc.regularization.RegularizationManager;
12 import de.mpg.mpiwg.berlin.mpdl.lt.general.Language; 11 import de.mpg.mpiwg.berlin.mpdl.lt.general.Language;
13 12
14 public class MpdlNormalizer { 13 public class MpdlNormalizer {
15 static final private String IT_VOWELS = "AEIOUaeiou" + 14 public static int MODE_4LEXICA = 1; // normalization for lexica etc. which have sometimes only ascii in it
16 "\u00c6\u00e6" + // AE ligatures 15 public static int MODE_4HUMAN_READERS = 2; // normalization for human readers
17 "\u0152\u0153"; // OE ligatures 16 private int normMode = MODE_4LEXICA; // Default
18 static final private String IT_CONS = "BCDFGHKLMNPQRSTVWXZ" +
19 "bcdfghklmnpqrstvwxz" +
20 "\u017f\u00df"; // long/sharp S
21 private String[] normFunctionsToUse = {"reg", "norm"}; // default is to use all of these normalization functions 17 private String[] normFunctionsToUse = {"reg", "norm"}; // default is to use all of these normalization functions
22 private String language; 18 private String language;
23 private int[] offsets; 19 private int[] offsets;
24 20
25 public MpdlNormalizer(String[] normFunctionsToUse, String lang) { 21 public MpdlNormalizer(String[] normFunctionsToUse, String lang) {
30 26
/**
 * Creates a normalizer for one language, keeping the default list of
 * normalization functions (<code>"reg"</code> and <code>"norm"</code>).
 *
 * @param language language code the normalization rules are selected by;
 *                 the Latin rules, for example, are chosen for
 *                 <code>"la"</code> or <code>"lat"</code>
 */
31 public MpdlNormalizer(String language) { 27 public MpdlNormalizer(String language) {
// only the language is stored; no validation of the code happens here
32 this.language = language; 28 this.language = language;
33 } 29 }
34 30
31 public void setNormMode(int normMode) {
32 this.normMode = normMode;
33 }
34
35 /** 35 /**
36 * Applies the normalization rules in <code>language</code> to 36 * Applies the normalization rules in <code>language</code> to
37 * <code>s</code>, without offset tracking. 37 * <code>s</code>, without offset tracking.
38 * 38 *
39 * @param s source string 39 * @param s source string
50 String regNormStr = reg.getNorm(); 50 String regNormStr = reg.getNorm();
51 normStr = regNormStr; 51 normStr = regNormStr;
52 } 52 }
53 } 53 }
54 if (useNormFunction()) { 54 if (useNormFunction()) {
55 // normalize the string by string replace 55 // normalize the string by string replacements
56 normStr = normalize(normStr, null); 56 if (normMode == MODE_4LEXICA)
57 normStr = normalize4Lexica(normStr, null);
58 else if (normMode == MODE_4HUMAN_READERS)
59 normStr = normalize4HumanReaders(normStr);
57 } 60 }
58 return normStr; 61 return normStr;
59 } 62 }
60 63
61 private boolean useRegFunction() { 64 private boolean useRegFunction() {
90 * 93 *
91 * @param s source string 94 * @param s source string
92 * @param offsets character offset table 95 * @param offsets character offset table
93 * @return normalized string 96 * @return normalized string
94 */ 97 */
95 public String normalize(String s, int[] offsets) { 98 private String normalize4Lexica(String s, int[] offsets) {
96 this.offsets = offsets; 99 this.offsets = offsets;
97 if (language.equals("la") || language.equals("lat")) { 100 if (language.equals("la") || language.equals("lat")) {
98 StringBuffer buf = new StringBuffer(); 101 StringBuffer buf = new StringBuffer();
99 int n = 0; 102 int n = 0;
100 for (int i = 0; i < s.length(); i++) { 103 for (int i = 0; i < s.length(); i++) {
477 case '\u00dc': replace = "Ue"; break; 480 case '\u00dc': replace = "Ue"; break;
478 case '\u00df': replace = "ss"; break; 481 case '\u00df': replace = "ss"; break;
479 case '\u00e4': replace = "ae"; break; 482 case '\u00e4': replace = "ae"; break;
480 case '\u00f6': replace = "oe"; break; 483 case '\u00f6': replace = "oe"; break;
481 case '\u00fc': replace = "ue"; break; 484 case '\u00fc': replace = "ue"; break;
485 case '\u00ad': break; // soft hyphen
482 case '\u00e9': replace = "e"; break; 486 case '\u00e9': replace = "e"; break;
483 case '\u00ad': break; // soft hyphen 487 // new in MPDL project by J. Willenborg
484 case '-': break; 488 case '\u017f': replace = "s"; break; // LATIN SMALL LETTER LONG S
489 // case '-': break;
485 default: replace += c; break; 490 default: replace += c; break;
486 } 491 }
487 buf.append(replace); 492 buf.append(replace);
488 // update offsets if replacement is a different length 493 // update offsets if replacement is a different length
489 if (offsets != null) { 494 if (offsets != null) {
1005 return buf.toString(); 1010 return buf.toString();
1006 } else { // unknown or no language 1011 } else { // unknown or no language
1007 return s; 1012 return s;
1008 } 1013 }
1009 } 1014 }
1015
1016 private String normalize4HumanReaders(String s) {
1017 String normStr = s;
1018 StringReader strReader = new StringReader(normStr + "\n");
1019 MpdlNormalizerLexAll mpdlNormalizerLexAll = new MpdlNormalizerLexAll(strReader);
1020 if (Language.getInstance().isLatin(language)) {
1021 mpdlNormalizerLexAll.yybegin(MpdlNormalizerLexAll.LA);
1022 } else if (Language.getInstance().isChinese(language)) {
1023 mpdlNormalizerLexAll.yybegin(MpdlNormalizerLexAll.ZH);
1024 } else {
1025 // TODO normalization for all languages
1026 return normalize4Lexica(s, null); // old function
1027 }
1028 String retStr = "";
1029 String token = "";
1030 while (token != null) {
1031 try {
1032 token = mpdlNormalizerLexAll.yylex();
1033 if (token != null)
1034 retStr += token;
1035 } catch (IOException e ) {
1036 // nothing cause IOException is not needed for a StringReader
1037 }
1038 }
1039 normStr = retStr;
1040 return normStr;
1041 }
1042
1043 /*
1044 // explicit words
1045 normStr = normStr.replaceAll("aliàs", "alias");
1046 normStr = normStr.replaceAll("hîc", "hic");
1047 normStr = normStr.replaceAll("quòd", "quod");
1048 normStr = normStr.replaceAll("Quòd", "Quod");
1049 normStr = normStr.replaceAll("QVòd", "Quod");
1050 normStr = normStr.replaceAll("Cùmque", "Cumque");
1051 normStr = normStr.replaceAll("aër", "aer");
1052 // ij
1053 normStr = normStr.replaceAll("ij", "ii");
1054 // qu/qv
1055 normStr = normStr.replaceAll("qv", "qu");
1056 // normStr = normStr.replaceAll("qV", "qU");
1057 normStr = normStr.replaceAll("Qv", "Qu");
1058 normStr = normStr.replaceAll("QV", "QU");
1059 // u/v
1060 String vowels = getVowels();
1061 String consonants = getConsonants();
1062 normStr = normStr.replaceAll("([" + vowels + "])([-]*)u([" + vowels +"])", "$1$2v$3"); // vowel + u + vowel --> vowel + v + vowel
1063 normStr = normStr.replaceAll("([" + vowels + "])([-]*)U([" + vowels +"])", "$1$2V$3"); // vowel + U + vowel --> vowel + V + vowel
1064 normStr = normStr.replaceAll("([" + consonants + "])([-]*)v([" + consonants +"])", "$1$2u$3"); // consonant + v + consonant --> consonant + u + consonant
1065 normStr = normStr.replaceAll("([" + consonants + "])([-]*)V([" + consonants +"])", "$1$2U$3"); // consonant + V + consonant --> consonant + U + consonant
1066 normStr = normStr.replaceAll("^v([" + consonants +"])", "u$1"); // v + consonant --> u + consonant
1067 normStr = normStr.replaceAll("^V([" + consonants +"])", "U$1"); // V + consonant --> U + consonant
1068 // end of word: diacritica
1069 normStr = normStr.replaceAll("à$", "a");
1070 normStr = normStr.replaceAll("è$", "e");
1071 normStr = normStr.replaceAll("ò$", "o");
1072 normStr = normStr.replaceAll("àm$", "am");
1073 normStr = normStr.replaceAll("ùm$", "um");
1074 String normStrTmp = normStr;
1075 normStr = "";
1076 for (int i = 0; i < normStrTmp.length(); i++) {
1077 char c = normStrTmp.charAt(i);
1078 String replace = "";
1079 switch (c) {
1080 case 'ſ': replace = "s"; break;
1081 case 'ß': replace = "ss"; break;
1082 case 'æ': replace = "ae"; break;
1083 case 'Æ': replace = "AE"; break;
1084 case 'ę': replace = "ae"; break;
1085 case 'œ': replace = "oe"; break;
1086 default: replace += c; break;
1087 }
1088 normStr = normStr + replace;
1089 }
1090
1091
1092 private String getVowels() {
1093 String retStr = null;
1094 if (Language.getInstance().isItalian(language)) {
1095 retStr = "AEIOUaeiou" +
1096 "\u00c6\u00e6" + // AE ligatures
1097 "\u0152\u0153"; // OE ligatures
1098 } else if (Language.getInstance().isLatin(language)) {
1099 retStr = "AEIOUaeiouÆœęàèòù";
1100 }
1101 // TODO all languages
1102 return retStr;
1103 }
1104
1105 private String getConsonants() {
1106 String retStr = null;
1107 if (Language.getInstance().isItalian(language)) {
1108 retStr = "BCDFGHKLMNPQRSTVWXZ" +
1109 "bcdfghklmnpqrstvwxz" +
1110 "ſß"; // long/sharp S
1111 } else if (Language.getInstance().isLatin(language)) {
1112 retStr = "BCDFGHKLMNPQRSTVWXZ" +
1113 "bcdfghklmnpqrstvwxz" +
1114 "ſß"; // long/sharp S
1115 }
1116 // TODO all languages
1117 return retStr;
1118 }
1119
1120
1121
1122
1123
1124 *
1125 *
1126 *
1127 *
1128 */
1129
1010 1130
1011 /** 1131
1012 * Returns the offset table. 1132
1013 * 1133
1014 * @return offset table 1134
1015 */
1016 public int[] getOffsetTable() {
// Returns the field itself, not a copy, so callers share the array with this
// object. The field is assigned from the offsets argument at the start of the
// offset-aware normalize method, so it is null when normalization ran without
// offset tracking.
// NOTE(review): confirm whether offset adjustments made during normalization
// are written back to this field.
1017 return offsets;
1018 }
1019
1020 /** 1135 /**
1021 * Returns a copy of an integer array with the element at 1136 * Returns a copy of an integer array with the element at
1022 * <code>index</code> removed ("killed"). 1137 * <code>index</code> removed ("killed").
1023 * 1138 *
1024 * @param array integer array 1139 * @param array integer array
1025 * @param index index of element to remove 1140 * @param index index of element to remove
1026 */ 1141 */
// No argument checks are made: an empty input array fails at the allocation
// below (negative size) and an out-of-range index fails inside
// System.arraycopy. The input array itself is never modified.
1027 static private int[] arrayKill(int[] array, int index) { 1142 private int[] arrayKill(int[] array, int index) {
// the result is exactly one slot shorter than the input
1028 int[] newArray = new int[array.length - 1]; 1143 int[] newArray = new int[array.length - 1];
// elements in front of index keep their positions
1029 System.arraycopy(array, 0, newArray, 0, index); 1144 System.arraycopy(array, 0, newArray, 0, index);
// elements behind index move down by one, closing the gap
1030 System.arraycopy(array, index + 1, newArray, index, array.length - index - 1); 1145 System.arraycopy(array, index + 1, newArray, index, array.length - index - 1);
1031 return newArray; 1146 return newArray;
1032 } 1147 }
1038 * @param array integer array 1153 * @param array integer array
1039 * @param index index to insert new elements 1154 * @param index index to insert new elements
1040 * @param value value to insert into new slots 1155 * @param value value to insert into new slots
1041 * @param count number of new slots to insert 1156 * @param count number of new slots to insert
1042 */ 1157 */
// Builds a new array; the input array is left untouched. With count == 0 the
// result is a plain copy of the input.
1043 static private int[] arrayInsert(int[] array, int index, int value, int count) { 1158 private int[] arrayInsert(int[] array, int index, int value, int count) {
// the result grows by the number of inserted slots
1044 int[] newArray = new int[array.length + count]; 1159 int[] newArray = new int[array.length + count];
// elements in front of index keep their positions
1045 System.arraycopy(array, 0, newArray, 0, index); 1160 System.arraycopy(array, 0, newArray, 0, index);
// fill the new slots [index, index + count) with the given value
1046 for (int i = 0; i < count; i++) newArray[index + i] = value; 1161 for (int i = 0; i < count; i++) newArray[index + i] = value;
// the former element at index and everything after it move up by count
1047 System.arraycopy(array, index, newArray, index + count, array.length - index); 1162 System.arraycopy(array, index, newArray, index + count, array.length - index);
1048 return newArray; 1163 return newArray;
1049 } 1164 }
1050 1165
1051 /**
1052 * We provide <code>main()</code> so that our services will be available
1053 * outside Java (i.e., so we can run as a Un*x-style filter).
1054 */
1055 static public void main(String[] argv) throws ApplicationException {
1056 if (argv.length != 1) {
1057 System.err.println("You must specify a language.");
1058 System.exit(1);
1059 }
1060 String rec;
1061 StringBuffer buf = new StringBuffer();
1062 BufferedReader bin = null;
1063 try {
1064 bin = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
1065 while ((rec = bin.readLine()) != null)
1066 buf.append(rec + "\n");
1067 }
1068 catch (UnsupportedEncodingException e) {
1069 System.err.println(e);
1070 System.exit(1);
1071 } catch (IOException e) {
1072 System.err.println(e);
1073 System.exit(1);
1074 }
1075 MpdlNormalizer orth = new MpdlNormalizer(argv[0]);
1076 System.out.print(orth.normalize(buf.toString()));
1077 }
1078 } 1166 }