Mercurial > hg > mpdl-group
diff software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlNormalizer.java @ 6:2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 08 Feb 2011 14:54:09 +0100 |
parents | 408254cf2f1d |
children | 1ec29fdd0db8 |
line wrap: on
line diff
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlNormalizer.java Tue Feb 08 14:36:38 2011 +0100 +++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/analyzer/MpdlNormalizer.java Tue Feb 08 14:54:09 2011 +0100 @@ -1,23 +1,19 @@ package de.mpg.mpiwg.berlin.mpdl.lt.analyzer; -import java.io.BufferedReader; import java.io.IOException; -import java.io.InputStreamReader; -import java.io.UnsupportedEncodingException; +import java.io.StringReader; import java.util.ArrayList; import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang.MpdlNormalizerLexAll; import de.mpg.mpiwg.berlin.mpdl.lt.doc.regularization.Regularization; import de.mpg.mpiwg.berlin.mpdl.lt.doc.regularization.RegularizationManager; import de.mpg.mpiwg.berlin.mpdl.lt.general.Language; public class MpdlNormalizer { - static final private String IT_VOWELS = "AEIOUaeiou" + - "\u00c6\u00e6" + // AE ligatures - "\u0152\u0153"; // OE ligatures - static final private String IT_CONS = "BCDFGHKLMNPQRSTVWXZ" + - "bcdfghklmnpqrstvwxz" + - "\u017f\u00df"; // long/sharp S + public static int MODE_4LEXICA = 1; // normalization for lexica etc. which have sometimes only ascii in it + public static int MODE_4HUMAN_READERS = 2; // normalization for human readers + private int normMode = MODE_4LEXICA; // Default private String[] normFunctionsToUse = {"reg", "norm"}; // default is to use all of these normalization functions private String language; private int[] offsets; @@ -32,6 +28,10 @@ this.language = language; } + public void setNormMode(int normMode) { + this.normMode = normMode; + } + /** * Applies the normalization rules in <code>language</code> to * <code>s</code>, without offset tracking. @@ -52,8 +52,11 @@ } } if (useNormFunction()) { - // normalize the string by string replace - normStr = normalize(normStr, null); + // normalize the string by string replacements + if (normMode == MODE_4LEXICA) + normStr = normalize4Lexica(normStr, null); + else if (normMode == MODE_4HUMAN_READERS) + normStr = normalize4HumanReaders(normStr); } return normStr; } @@ -92,7 +95,7 @@ * @param offsets character offset table * @return normalized string */ - public String normalize(String s, int[] offsets) { + private String normalize4Lexica(String s, int[] offsets) { this.offsets = offsets; if (language.equals("la") || language.equals("lat")) { StringBuffer buf = new StringBuffer(); @@ -479,9 +482,11 @@ case '\u00e4': replace = "ae"; break; case '\u00f6': replace = "oe"; break; case '\u00fc': replace = "ue"; break; + case '\u00ad': break; // soft hyphen case '\u00e9': replace = "e"; break; - case '\u00ad': break; // soft hyphen - case '-': break; + // new in MPDL project by J. Willenborg + case '\u017f': replace = "s"; break; // LATIN SMALL LETTER LONG S + // case '-': break; default: replace += c; break; } buf.append(replace); @@ -1007,16 +1012,126 @@ return s; } } - - /** - * Returns the offset table. - * - * @return offset table - */ - public int[] getOffsetTable() { - return offsets; + + private String normalize4HumanReaders(String s) { + String normStr = s; + StringReader strReader = new StringReader(normStr + "\n"); + MpdlNormalizerLexAll mpdlNormalizerLexAll = new MpdlNormalizerLexAll(strReader); + if (Language.getInstance().isLatin(language)) { + mpdlNormalizerLexAll.yybegin(MpdlNormalizerLexAll.LA); + } else if (Language.getInstance().isChinese(language)) { + mpdlNormalizerLexAll.yybegin(MpdlNormalizerLexAll.ZH); + } else { + // TODO normalization for all languages + return normalize4Lexica(s, null); // old function + } + String retStr = ""; + String token = ""; + while (token != null) { + try { + token = mpdlNormalizerLexAll.yylex(); + if (token != null) + retStr += token; + } catch (IOException e ) { + // nothing cause IOException is not needed for a StringReader + } + } + normStr = retStr; + return normStr; } + /* + // explicit words + normStr = normStr.replaceAll("aliàs", "alias"); + normStr = normStr.replaceAll("hîc", "hic"); + normStr = normStr.replaceAll("quòd", "quod"); + normStr = normStr.replaceAll("Quòd", "Quod"); + normStr = normStr.replaceAll("QVòd", "Quod"); + normStr = normStr.replaceAll("Cùmque", "Cumque"); + normStr = normStr.replaceAll("aër", "aer"); + // ij + normStr = normStr.replaceAll("ij", "ii"); + // qu/qv + normStr = normStr.replaceAll("qv", "qu"); + // normStr = normStr.replaceAll("qV", "qU"); + normStr = normStr.replaceAll("Qv", "Qu"); + normStr = normStr.replaceAll("QV", "QU"); + // u/v + String vowels = getVowels(); + String consonants = getConsonants(); + normStr = normStr.replaceAll("([" + vowels + "])([-]*)u([" + vowels +"])", "$1$2v$3"); // vowel + u + vowel --> vowel + v + vowel + normStr = normStr.replaceAll("([" + vowels + "])([-]*)U([" + vowels +"])", "$1$2V$3"); // vowel + U + vowel --> vowel + V + vowel + normStr = normStr.replaceAll("([" + consonants + "])([-]*)v([" + consonants +"])", "$1$2u$3"); // consonant + v + consonant --> consonant + u + consonant + normStr = normStr.replaceAll("([" + consonants + "])([-]*)V([" + consonants +"])", "$1$2U$3"); // consonant + V + consonant --> consonant + U + consonant + normStr = normStr.replaceAll("^v([" + consonants +"])", "u$1"); // v + consonant --> u + consonant + normStr = normStr.replaceAll("^V([" + consonants +"])", "U$1"); // V + consonant --> U + consonant + // end of word: diacritica + normStr = normStr.replaceAll("à$", "a"); + normStr = normStr.replaceAll("è$", "e"); + normStr = normStr.replaceAll("ò$", "o"); + normStr = normStr.replaceAll("àm$", "am"); + normStr = normStr.replaceAll("ùm$", "um"); + String normStrTmp = normStr; + normStr = ""; + for (int i = 0; i < normStrTmp.length(); i++) { + char c = normStrTmp.charAt(i); + String replace = ""; + switch (c) { + case 'ſ': replace = "s"; break; + case 'ß': replace = "ss"; break; + case 'æ': replace = "ae"; break; + case 'Æ': replace = "AE"; break; + case 'ę': replace = "ae"; break; + case 'œ': replace = "oe"; break; + default: replace += c; break; + } + normStr = normStr + replace; + } + + + private String getVowels() { + String retStr = null; + if (Language.getInstance().isItalian(language)) { + retStr = "AEIOUaeiou" + + "\u00c6\u00e6" + // AE ligatures + "\u0152\u0153"; // OE ligatures + } else if (Language.getInstance().isLatin(language)) { + retStr = "AEIOUaeiouÆœęàèòù"; + } + // TODO all languages + return retStr; + } + + private String getConsonants() { + String retStr = null; + if (Language.getInstance().isItalian(language)) { + retStr = "BCDFGHKLMNPQRSTVWXZ" + + "bcdfghklmnpqrstvwxz" + + "ſß"; // long/sharp S + } else if (Language.getInstance().isLatin(language)) { + retStr = "BCDFGHKLMNPQRSTVWXZ" + + "bcdfghklmnpqrstvwxz" + + "ſß"; // long/sharp S + } + // TODO all languages + return retStr; + } + + + + + + * + * + * + * + */ + + + + + + /** * Returns a copy of an integer array with the element at * <code>index</code> removed ("killed"). @@ -1024,7 +1139,7 @@ * @param array integer array * @param index index of element to remove */ - static private int[] arrayKill(int[] array, int index) { + private int[] arrayKill(int[] array, int index) { int[] newArray = new int[array.length - 1]; System.arraycopy(array, 0, newArray, 0, index); System.arraycopy(array, index + 1, newArray, index, array.length - index - 1); @@ -1040,7 +1155,7 @@ * @param value value to insert into new slots * @param count number of new slots to insert */ - static private int[] arrayInsert(int[] array, int index, int value, int count) { + private int[] arrayInsert(int[] array, int index, int value, int count) { int[] newArray = new int[array.length + count]; System.arraycopy(array, 0, newArray, 0, index); for (int i = 0; i < count; i++) newArray[index + i] = value; @@ -1048,31 +1163,4 @@ return newArray; } - /** - * We provide <code>main()</code> so that our services will be available - * outside Java (i.e., so we can run as a Un*x-style filter). - */ - static public void main(String[] argv) throws ApplicationException { - if (argv.length != 1) { - System.err.println("You must specify a language."); - System.exit(1); - } - String rec; - StringBuffer buf = new StringBuffer(); - BufferedReader bin = null; - try { - bin = new BufferedReader(new InputStreamReader(System.in, "UTF-8")); - while ((rec = bin.readLine()) != null) - buf.append(rec + "\n"); - } - catch (UnsupportedEncodingException e) { - System.err.println(e); - System.exit(1); - } catch (IOException e) { - System.err.println(e); - System.exit(1); - } - MpdlNormalizer orth = new MpdlNormalizer(argv[0]); - System.out.print(orth.normalize(buf.toString())); - } } \ No newline at end of file