language to
+ * s
, with offset tracking.
+ *
+ * WARNING:
+ * Arboreal will not work properly if a normalization substitution
+ * replaces a source character with more than two target characters!
+ * The offset update in this method only handles replacements of
+ * length 0, 1, or 2. This is a known BUG and should be fixed.
+ * Fortunately, such a replacement is rarely needed.
+ *
+ * @param s source string
+ * @param offsets character offset table
+ * @return normalized string
+ */
+ private String normalize4Lexica(String s, int[] offsets) {
+ this.offsets = offsets;
+ if (language.equals("la") || language.equals("lat")) {
+ StringBuffer buf = new StringBuffer();
+ int n = 0;
+ for (int i = 0; i < s.length(); i++) {
+ char c = s.charAt(i);
+ String replace = new String();
+ switch (c) {
+ case 'j': replace = "i"; break;
+ case 'v': replace = "u"; break;
+ /*
+ * Linguistic note: /u/ and /v/ are rarely phonemic
+ * in Latin, as in alui 's/he nourished' vs.
+ * alvi 'of a belly', volui 's/he wished' or 'it rolled'
+ * vs. volvi 'to be rolled', (in)seruit 's/he joined
+ * together' vs. (in)servit 's/he serves'.
+ */
+ case 'q':
+ if ((i < s.length() - 1) && (s.charAt(i + 1) == ';'))
+ replace = "qu";
+ else
+ replace = "q";
+ break;
+ case ';':
+ if ((i > 0) && (s.charAt(i - 1) == 'q'))
+ replace = "e";
+ else if ((i == 0) || ! Character.isLetter(s.charAt(i - 1)))
+ replace = ";";
+ else
+ replace = "";
+ break;
+ case '\u0300': replace = ""; break; // COMBINING GRAVE ACCENT
+ case '\u0301': replace = ""; break; // COMBINING ACUTE ACCENT
+ case '\u0302': replace = ""; break; // COMBINING CIRCUMFLEX ACCENT
+
+ case '\u00c0': replace = "A"; break; // LATIN CAPITAL LETTER A GRAVE
+ case '\u00c1': replace = "A"; break; // LATIN CAPITAL LETTER A ACUTE
+ case '\u00c2': replace = "A"; break; // LATIN CAPITAL LETTER A CIRCUMFLEX
+ case '\u00c4': replace = "A"; break; // LATIN CAPITAL LETTER A DIAERESIS
+ case '\u00c6': replace = "Ae"; break; // LATIN CAPITAL LETTER A E
+ case '\u00c7': replace = "C"; break; // LATIN CAPITAL LETTER C CEDILLA
+ case '\u00c8': replace = "E"; break; // LATIN CAPITAL LETTER E GRAVE
+ case '\u00c9': replace = "E"; break; // LATIN CAPITAL LETTER E ACUTE
+ case '\u00ca': replace = "E"; break; // LATIN CAPITAL LETTER E CIRCUMFLEX
+ case '\u00cb': replace = "E"; break; // LATIN CAPITAL LETTER E DIAERESIS
+ case '\u00cc': replace = "I"; break; // LATIN CAPITAL LETTER I GRAVE;
+ case '\u00cd': replace = "I"; break; // LATIN CAPITAL LETTER I ACUTE
+ case '\u00ce': replace = "I"; break; // LATIN CAPITAL LETTER I CIRCUMFLEX
+ case '\u00cf': replace = "I"; break; // LATIN CAPITAL LETTER I DIAERESIS
+ case '\u00d2': replace = "O"; break; // LATIN CAPITAL LETTER O GRAVE
+ case '\u00d3': replace = "O"; break; // LATIN CAPITAL LETTER O ACUTE
+ case '\u00d4': replace = "O"; break; // LATIN CAPITAL LETTER O CIRCUMFLEX
+ case '\u00d6': replace = "O"; break; // LATIN CAPITAL LETTER O DIAERESIS
+ case '\u00d9': replace = "U"; break; // LATIN CAPITAL LETTER U GRAVE
+ case '\u00da': replace = "U"; break; // LATIN CAPITAL LETTER U ACUTE
+ case '\u00db': replace = "U"; break; // LATIN CAPITAL LETTER U CIRCUMFLEX
+ case '\u00dc': replace = "U"; break; // LATIN CAPITAL LETTER U DIAERESIS
+ case '\u00e0': replace = "a"; break; // LATIN SMALL LETTER A GRAVE
+ case '\u00e1': replace = "a"; break; // LATIN SMALL LETTER A ACUTE
+ case '\u00e2': replace = "a"; break; // LATIN SMALL LETTER A CIRCUMFLEX
+ case '\u00e4': replace = "a"; break; // LATIN SMALL LETTER A DIAERESIS
+ case '\u00e6': replace = "ae"; break; // LATIN SMALL LETTER A E
+ case '\u00e7': replace = "c"; break; // LATIN SMALL LETTER C CEDILLA
+ case '\u00e8': replace = "e"; break; // LATIN SMALL LETTER E GRAVE
+ case '\u00e9': replace = "e"; break; // LATIN SMALL LETTER E ACUTE
+ case '\u00ea': replace = "e"; break; // LATIN SMALL LETTER E CIRCUMFLEX
+ case '\u00eb': replace = "e"; break; // LATIN SMALL LETTER E DIAERESIS
+ case '\u00ec': replace = "i"; break; // LATIN SMALL LETTER I GRAVE
+ case '\u00ed': replace = "i"; break; // LATIN SMALL LETTER I ACUTE
+ case '\u00ee': replace = "i"; break; // LATIN SMALL LETTER I CIRCUMFLEX
+ case '\u00ef': replace = "i"; break; // LATIN SMALL LETTER I DIAERESIS
+ case '\u00f2': replace = "o"; break; // LATIN SMALL LETTER O GRAVE
+ case '\u00f3': replace = "o"; break; // LATIN SMALL LETTER O ACUTE
+ case '\u00f4': replace = "o"; break; // LATIN SMALL LETTER O CIRCUMFLEX
+ case '\u00f6': replace = "o"; break; // LATIN SMALL LETTER O DIAERESIS
+ case '\u00f9': replace = "u"; break; // LATIN SMALL LETTER U GRAVE
+ case '\u00fa': replace = "u"; break; // LATIN SMALL LETTER U ACUTE
+ case '\u00fb': replace = "u"; break; // LATIN SMALL LETTER U CIRCUMFLEX
+ case '\u00fc': replace = "u"; break; // LATIN SMALL LETTER U DIAERESIS
+ case '\u0100': replace = "A"; break; // LATIN CAPITAL LETTER A MACRON
+ case '\u0101': replace = "a"; break; // LATIN SMALL LETTER A MACRON
+ case '\u0102': replace = "A"; break; // LATIN CAPITAL LETTER A BREVE
+ case '\u0103': replace = "a"; break; // LATIN SMALL LETTER A BREVE
+ case '\u0112': replace = "E"; break; // LATIN CAPITAL LETTER E MACRON
+ case '\u0113': replace = "e"; break; // LATIN SMALL LETTER E MACRON
+ case '\u0114': replace = "E"; break; // LATIN CAPITAL LETTER E BREVE
+ case '\u0115': replace = "e"; break; // LATIN SMALL LETTER E BREVE
+ case '\u0118': replace = "Ae"; break; // LATIN CAPITAL LETTER E OGONEK
+ case '\u0119': replace = "ae"; break; // LATIN SMALL LETTER E OGONEK
+ case '\u012a': replace = "I"; break; // LATIN CAPITAL LETTER I MACRON
+ case '\u012b': replace = "i"; break; // LATIN SMALL LETTER I MACRON
+ case '\u012c': replace = "I"; break; // LATIN CAPITAL LETTER I BREVE
+ case '\u012d': replace = "i"; break; // LATIN SMALL LETTER I BREVE
+ case '\u014c': replace = "O"; break; // LATIN CAPITAL LETTER O MACRON
+ case '\u014d': replace = "o"; break; // LATIN SMALL LETTER O MACRON
+ case '\u014e': replace = "O"; break; // LATIN CAPITAL LETTER O BREVE
+ case '\u014f': replace = "o"; break; // LATIN SMALL LETTER O BREVE
+ case '\u0152': replace = "Oe"; break; // LATIN CAPITAL LETTER O E
+ case '\u0153': replace = "oe"; break; // LATIN SMALL LETTER O E
+ case '\u016a': replace = "U"; break; // LATIN CAPITAL LETTER U MACRON
+ case '\u016b': replace = "u"; break; // LATIN SMALL LETTER U MACRON
+ case '\u016c': replace = "U"; break; // LATIN CAPITAL LETTER U BREVE
+ case '\u016d': replace = "u"; break; // LATIN SMALL LETTER U BREVE
+ case '\u017f': replace = "s"; break; // LATIN SMALL LETTER LONG S
+ case '\u00df': replace = "ss"; break; // LATIN SMALL LETTER SHARP S
+ case '\u00ad': break; // soft hyphen
+ // new in MPDL project by J. Willenborg
+ case '\u1e14': replace = "e"; break; // LATIN ... LETTER E WITH ...
+ case '\u1e15': replace = "e"; break; // LATIN ... LETTER E WITH ...
+ case '\u1e16': replace = "e"; break; // LATIN ... LETTER E WITH ...
+ case '\u1e17': replace = "e"; break; // LATIN ... LETTER E WITH ...
+ case '\u1e18': replace = "e"; break; // LATIN ... LETTER E WITH ...
+ case '\u1e19': replace = "e"; break; // LATIN ... LETTER E WITH ...
+ case '\u1e1a': replace = "e"; break; // LATIN ... LETTER E WITH ...
+ case '\u1e1b': replace = "e"; break; // LATIN ... LETTER E WITH ...
+ case '\u1e1c': replace = "e"; break; // LATIN ... LETTER E WITH ...
+ case '\u1e1d': replace = "e"; break; // LATIN ... LETTER E WITH ...
+ case '\u1eb8': replace = "e"; break; // LATIN ... LETTER E WITH ...
+ case '\u1eb9': replace = "e"; break; // LATIN ... LETTER E WITH ...
+ case '\u1eba': replace = "e"; break; // LATIN ... LETTER E WITH ...
+ case '\u1ebb': replace = "e"; break; // LATIN ... LETTER E WITH ...
+ case '\u1ebc': replace = "e"; break; // LATIN ... LETTER E WITH ...
+ case '\u1ebd': replace = "e"; break; // LATIN ... LETTER E WITH ...
+ case '\u1ebe': replace = "e"; break; // LATIN ... LETTER E WITH ...
+ case '\u1ebf': replace = "e"; break; // LATIN ... LETTER E WITH ...
+ case '\u1ec0': replace = "e"; break; // LATIN ... LETTER E WITH ...
+ case '\u1ec1': replace = "e"; break; // LATIN ... LETTER E WITH ...
+ case '\u1ec2': replace = "e"; break; // LATIN ... LETTER E WITH ...
+ case '\u1ec3': replace = "e"; break; // LATIN ... LETTER E WITH ...
+ case '\u1ec4': replace = "e"; break; // LATIN ... LETTER E WITH ...
+ case '\u1ec5': replace = "e"; break; // LATIN ... LETTER E WITH ...
+ case '\u1ec6': replace = "e"; break; // LATIN ... LETTER E WITH ...
+ case '\u1ec7': replace = "e"; break; // LATIN ... LETTER E WITH ...
+ // by Malcolm
+ case '\u2329': break; // BRA
+ case '\u232a': break; // KET
+ default: replace += c; break;
+ }
+ buf.append(replace);
+ // update offsets if replacement is a different length
+ if (offsets != null) {
+ int r = replace.length();
+ if (r == 0)
+ this.offsets = arrayKill(this.offsets, i - n);
+ else if (r == 2)
+ this.offsets = arrayInsert(this.offsets, i - n + 1, this.offsets[i - n], r - 1);
+ n += 1 - r;
+ }
+ }
+ return buf.toString();
+ } else if (language.equals("it")) {
+ // New MPDL code added by J. Willenborg: parts of Malcolm's original code produced errors and were removed; the Latin replacements were imported here as well.
+ StringBuffer buf = new StringBuffer();
+ int n = 0;
+ for (int i = 0; i < s.length(); i++) {
+ char c = s.charAt(i);
+ String replace = new String();
+ switch (c) {
+ case '\u00c0': replace = "A"; break; // LATIN CAPITAL LETTER A GRAVE
+ case '\u00c1': replace = "A"; break; // LATIN CAPITAL LETTER A ACUTE
+ case '\u00c2': replace = "A"; break; // LATIN CAPITAL LETTER A CIRCUMFLEX
+ case '\u00c4': replace = "A"; break; // LATIN CAPITAL LETTER A DIAERESIS
+ case '\u00c6': replace = "Ae"; break; // LATIN CAPITAL LETTER A E
+ case '\u00c7': replace = "C"; break; // LATIN CAPITAL LETTER C CEDILLA
+ case '\u00c8': replace = "E"; break; // LATIN CAPITAL LETTER E GRAVE
+ case '\u00c9': replace = "E"; break; // LATIN CAPITAL LETTER E ACUTE
+ case '\u00ca': replace = "E"; break; // LATIN CAPITAL LETTER E CIRCUMFLEX
+ case '\u00cb': replace = "E"; break; // LATIN CAPITAL LETTER E DIAERESIS
+ case '\u00cc': replace = "I"; break; // LATIN CAPITAL LETTER I GRAVE;
+ case '\u00cd': replace = "I"; break; // LATIN CAPITAL LETTER I ACUTE
+ case '\u00ce': replace = "I"; break; // LATIN CAPITAL LETTER I CIRCUMFLEX
+ case '\u00cf': replace = "I"; break; // LATIN CAPITAL LETTER I DIAERESIS
+ case '\u00d2': replace = "O"; break; // LATIN CAPITAL LETTER O GRAVE
+ case '\u00d3': replace = "O"; break; // LATIN CAPITAL LETTER O ACUTE
+ case '\u00d4': replace = "O"; break; // LATIN CAPITAL LETTER O CIRCUMFLEX
+ case '\u00d6': replace = "O"; break; // LATIN CAPITAL LETTER O DIAERESIS
+ case '\u00d9': replace = "U"; break; // LATIN CAPITAL LETTER U GRAVE
+ case '\u00da': replace = "U"; break; // LATIN CAPITAL LETTER U ACUTE
+ case '\u00db': replace = "U"; break; // LATIN CAPITAL LETTER U CIRCUMFLEX
+ case '\u00dc': replace = "U"; break; // LATIN CAPITAL LETTER U DIAERESIS
+ case '\u00e0': replace = "a"; break; // LATIN SMALL LETTER A GRAVE
+ case '\u00e1': replace = "a"; break; // LATIN SMALL LETTER A ACUTE
+ case '\u00e2': replace = "a"; break; // LATIN SMALL LETTER A CIRCUMFLEX
+ case '\u00e4': replace = "a"; break; // LATIN SMALL LETTER A DIAERESIS
+ case '\u00e6': replace = "ae"; break; // LATIN SMALL LETTER A E
+ case '\u00e7': replace = "c"; break; // LATIN SMALL LETTER C CEDILLA
+ case '\u00e8': replace = "e"; break; // LATIN SMALL LETTER E GRAVE
+ case '\u00e9': replace = "e"; break; // LATIN SMALL LETTER E ACUTE
+ case '\u00ea': replace = "e"; break; // LATIN SMALL LETTER E CIRCUMFLEX
+ case '\u00eb': replace = "e"; break; // LATIN SMALL LETTER E DIAERESIS
+ case '\u00ec': replace = "i"; break; // LATIN SMALL LETTER I GRAVE
+ case '\u00ed': replace = "i"; break; // LATIN SMALL LETTER I ACUTE
+ case '\u00ee': replace = "i"; break; // LATIN SMALL LETTER I CIRCUMFLEX
+ case '\u00ef': replace = "i"; break; // LATIN SMALL LETTER I DIAERESIS
+ case '\u00f2': replace = "o"; break; // LATIN SMALL LETTER O GRAVE
+ case '\u00f3': replace = "o"; break; // LATIN SMALL LETTER O ACUTE
+ case '\u00f4': replace = "o"; break; // LATIN SMALL LETTER O CIRCUMFLEX
+ case '\u00f6': replace = "o"; break; // LATIN SMALL LETTER O DIAERESIS
+ case '\u00f9': replace = "u"; break; // LATIN SMALL LETTER U GRAVE
+ case '\u00fa': replace = "u"; break; // LATIN SMALL LETTER U ACUTE
+ case '\u00fb': replace = "u"; break; // LATIN SMALL LETTER U CIRCUMFLEX
+ case '\u00fc': replace = "u"; break; // LATIN SMALL LETTER U DIAERESIS
+ case '\u0100': replace = "A"; break; // LATIN CAPITAL LETTER A MACRON
+ case '\u0101': replace = "a"; break; // LATIN SMALL LETTER A MACRON
+ case '\u0102': replace = "A"; break; // LATIN CAPITAL LETTER A BREVE
+ case '\u0103': replace = "a"; break; // LATIN SMALL LETTER A BREVE
+ case '\u0112': replace = "E"; break; // LATIN CAPITAL LETTER E MACRON
+ case '\u0113': replace = "e"; break; // LATIN SMALL LETTER E MACRON
+ case '\u0114': replace = "E"; break; // LATIN CAPITAL LETTER E BREVE
+ case '\u0115': replace = "e"; break; // LATIN SMALL LETTER E BREVE
+ case '\u0118': replace = "Ae"; break; // LATIN CAPITAL LETTER E OGONEK
+ case '\u0119': replace = "ae"; break; // LATIN SMALL LETTER E OGONEK
+ case '\u012a': replace = "I"; break; // LATIN CAPITAL LETTER I MACRON
+ case '\u012b': replace = "i"; break; // LATIN SMALL LETTER I MACRON
+ case '\u012c': replace = "I"; break; // LATIN CAPITAL LETTER I BREVE
+ case '\u012d': replace = "i"; break; // LATIN SMALL LETTER I BREVE
+ case '\u014c': replace = "O"; break; // LATIN CAPITAL LETTER O MACRON
+ case '\u014d': replace = "o"; break; // LATIN SMALL LETTER O MACRON
+ case '\u014e': replace = "O"; break; // LATIN CAPITAL LETTER O BREVE
+ case '\u014f': replace = "o"; break; // LATIN SMALL LETTER O BREVE
+ case '\u0152': replace = "Oe"; break; // LATIN CAPITAL LETTER O E
+ case '\u0153': replace = "oe"; break; // LATIN SMALL LETTER O E
+ case '\u016a': replace = "U"; break; // LATIN CAPITAL LETTER U MACRON
+ case '\u016b': replace = "u"; break; // LATIN SMALL LETTER U MACRON
+ case '\u016c': replace = "U"; break; // LATIN CAPITAL LETTER U BREVE
+ case '\u016d': replace = "u"; break; // LATIN SMALL LETTER U BREVE
+ case '\u017f': replace = "s"; break; // LATIN SMALL LETTER LONG S
+ case '\u00df': replace = "ss"; break; // LATIN SMALL LETTER SHARP S
+ // new in MPDL project by J. Willenborg
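+ case '\u1e8d': replace = "e"; break; // NOTE(review): U+1E8D is LATIN SMALL LETTER X WITH DIAERESIS; LATIN SMALL LETTER E WITH TILDE is U+1EBD - confirm the intended code point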
+ default: replace += c; break;
+ }
+ buf.append(replace);
+ // update offsets if replacement is a different length
+ if (offsets != null) {
+ int r = replace.length();
+ if (r == 0) this.offsets = arrayKill(this.offsets, i - n);
+ else if (r == 2)
+ this.offsets = arrayInsert(this.offsets, i - n + 1, this.offsets[i - n], r - 1);
+ n += 1 - r;
+ }
+ }
+ return buf.toString();
+ // New MPDL code added by J. Willenborg: most of the Latin replacements are also applied to English.
+ } else if (language.equals("en")) {
+ StringBuffer buf = new StringBuffer();
+ int n = 0;
+ for (int i = 0; i < s.length(); i++) {
+ char c = s.charAt(i);
+ String replace = new String();
+ switch (c) {
+ case '\u0300': replace = ""; break; // COMBINING GRAVE ACCENT
+ case '\u0301': replace = ""; break; // COMBINING ACUTE ACCENT
+ case '\u0302': replace = ""; break; // COMBINING CIRCUMFLEX ACCENT
+
+ case '\u00c0': replace = "A"; break; // LATIN CAPITAL LETTER A GRAVE
+ case '\u00c1': replace = "A"; break; // LATIN CAPITAL LETTER A ACUTE
+ case '\u00c2': replace = "A"; break; // LATIN CAPITAL LETTER A CIRCUMFLEX
+ case '\u00c4': replace = "A"; break; // LATIN CAPITAL LETTER A DIAERESIS
+ case '\u00c6': replace = "Ae"; break; // LATIN CAPITAL LETTER A E
+ case '\u00c7': replace = "C"; break; // LATIN CAPITAL LETTER C CEDILLA
+ case '\u00c8': replace = "E"; break; // LATIN CAPITAL LETTER E GRAVE
+ case '\u00c9': replace = "E"; break; // LATIN CAPITAL LETTER E ACUTE
+ case '\u00ca': replace = "E"; break; // LATIN CAPITAL LETTER E CIRCUMFLEX
+ case '\u00cb': replace = "E"; break; // LATIN CAPITAL LETTER E DIAERESIS
+ case '\u00cc': replace = "I"; break; // LATIN CAPITAL LETTER I GRAVE;
+ case '\u00cd': replace = "I"; break; // LATIN CAPITAL LETTER I ACUTE
+ case '\u00ce': replace = "I"; break; // LATIN CAPITAL LETTER I CIRCUMFLEX
+ case '\u00cf': replace = "I"; break; // LATIN CAPITAL LETTER I DIAERESIS
+ case '\u00d2': replace = "O"; break; // LATIN CAPITAL LETTER O GRAVE
+ case '\u00d3': replace = "O"; break; // LATIN CAPITAL LETTER O ACUTE
+ case '\u00d4': replace = "O"; break; // LATIN CAPITAL LETTER O CIRCUMFLEX
+ case '\u00d6': replace = "O"; break; // LATIN CAPITAL LETTER O DIAERESIS
+ case '\u00d9': replace = "U"; break; // LATIN CAPITAL LETTER U GRAVE
+ case '\u00da': replace = "U"; break; // LATIN CAPITAL LETTER U ACUTE
+ case '\u00db': replace = "U"; break; // LATIN CAPITAL LETTER U CIRCUMFLEX
+ case '\u00dc': replace = "U"; break; // LATIN CAPITAL LETTER U DIAERESIS
+ case '\u00e0': replace = "a"; break; // LATIN SMALL LETTER A GRAVE
+ case '\u00e1': replace = "a"; break; // LATIN SMALL LETTER A ACUTE
+ case '\u00e2': replace = "a"; break; // LATIN SMALL LETTER A CIRCUMFLEX
+ case '\u00e4': replace = "a"; break; // LATIN SMALL LETTER A DIAERESIS
+ case '\u00e6': replace = "ae"; break; // LATIN SMALL LETTER A E
+ case '\u00e7': replace = "c"; break; // LATIN SMALL LETTER C CEDILLA
+ case '\u00e8': replace = "e"; break; // LATIN SMALL LETTER E GRAVE
+ case '\u00e9': replace = "e"; break; // LATIN SMALL LETTER E ACUTE
+ case '\u00ea': replace = "e"; break; // LATIN SMALL LETTER E CIRCUMFLEX
+ case '\u00eb': replace = "e"; break; // LATIN SMALL LETTER E DIAERESIS
+ case '\u00ec': replace = "i"; break; // LATIN SMALL LETTER I GRAVE
+ case '\u00ed': replace = "i"; break; // LATIN SMALL LETTER I ACUTE
+ case '\u00ee': replace = "i"; break; // LATIN SMALL LETTER I CIRCUMFLEX
+ case '\u00ef': replace = "i"; break; // LATIN SMALL LETTER I DIAERESIS
+ case '\u00f2': replace = "o"; break; // LATIN SMALL LETTER O GRAVE
+ case '\u00f3': replace = "o"; break; // LATIN SMALL LETTER O ACUTE
+ case '\u00f4': replace = "o"; break; // LATIN SMALL LETTER O CIRCUMFLEX
+ case '\u00f6': replace = "o"; break; // LATIN SMALL LETTER O DIAERESIS
+ case '\u00f9': replace = "u"; break; // LATIN SMALL LETTER U GRAVE
+ case '\u00fa': replace = "u"; break; // LATIN SMALL LETTER U ACUTE
+ case '\u00fb': replace = "u"; break; // LATIN SMALL LETTER U CIRCUMFLEX
+ case '\u00fc': replace = "u"; break; // LATIN SMALL LETTER U DIAERESIS
+ case '\u0100': replace = "A"; break; // LATIN CAPITAL LETTER A MACRON
+ case '\u0101': replace = "a"; break; // LATIN SMALL LETTER A MACRON
+ case '\u0102': replace = "A"; break; // LATIN CAPITAL LETTER A BREVE
+ case '\u0103': replace = "a"; break; // LATIN SMALL LETTER A BREVE
+ case '\u0112': replace = "E"; break; // LATIN CAPITAL LETTER E MACRON
+ case '\u0113': replace = "e"; break; // LATIN SMALL LETTER E MACRON
+ case '\u0114': replace = "E"; break; // LATIN CAPITAL LETTER E BREVE
+ case '\u0115': replace = "e"; break; // LATIN SMALL LETTER E BREVE
+ case '\u0118': replace = "Ae"; break; // LATIN CAPITAL LETTER E OGONEK
+ case '\u0119': replace = "ae"; break; // LATIN SMALL LETTER E OGONEK
+ case '\u012a': replace = "I"; break; // LATIN CAPITAL LETTER I MACRON
+ case '\u012b': replace = "i"; break; // LATIN SMALL LETTER I MACRON
+ case '\u012c': replace = "I"; break; // LATIN CAPITAL LETTER I BREVE
+ case '\u012d': replace = "i"; break; // LATIN SMALL LETTER I BREVE
+ case '\u014c': replace = "O"; break; // LATIN CAPITAL LETTER O MACRON
+ case '\u014d': replace = "o"; break; // LATIN SMALL LETTER O MACRON
+ case '\u014e': replace = "O"; break; // LATIN CAPITAL LETTER O BREVE
+ case '\u014f': replace = "o"; break; // LATIN SMALL LETTER O BREVE
+ case '\u0152': replace = "Oe"; break; // LATIN CAPITAL LETTER O E
+ case '\u0153': replace = "oe"; break; // LATIN SMALL LETTER O E
+ case '\u016a': replace = "U"; break; // LATIN CAPITAL LETTER U MACRON
+ case '\u016b': replace = "u"; break; // LATIN SMALL LETTER U MACRON
+ case '\u016c': replace = "U"; break; // LATIN CAPITAL LETTER U BREVE
+ case '\u016d': replace = "u"; break; // LATIN SMALL LETTER U BREVE
+ case '\u017f': replace = "s"; break; // LATIN SMALL LETTER LONG S
+ case '\u00df': replace = "ss"; break; // LATIN SMALL LETTER SHARP S
+ // new in MPDL project by J. Willenborg
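+ case '\u1e8d': replace = "e"; break; // NOTE(review): U+1E8D is LATIN SMALL LETTER X WITH DIAERESIS; LATIN SMALL LETTER E WITH TILDE is U+1EBD - confirm the intended code point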
+ // by Malcolm
+ case '\u00ad': break; // soft hyphen
+ case '\u2329': break; // BRA
+ case '\u232a': break; // KET
+ default: replace += c; break;
+ }
+ buf.append(replace);
+ // update offsets if replacement is a different length
+ if (offsets != null) {
+ int r = replace.length();
+ if (r == 0)
+ this.offsets = arrayKill(this.offsets, i - n);
+ else if (r == 2)
+ this.offsets = arrayInsert(this.offsets, i - n + 1, this.offsets[i - n], r - 1);
+ n += 1 - r;
+ }
+ }
+ return buf.toString();
+ } else if (language.equals("fr")) {
+ // New MPDL code added by J. Willenborg: parts of Malcolm's original code produced errors and were removed.
+ StringBuffer buf = new StringBuffer();
+ int n = 0;
+ for (int i = 0; i < s.length(); i++) {
+ char c = s.charAt(i);
+ String replace = new String();
+ switch (c) {
+ case '\u00e6': replace = "ae"; break; // LATIN SMALL LETTER A E
+ case '\u017f': replace = "s"; break; // LATIN SMALL LETTER LONG S
+ case '\u00df': replace = "ss"; break; // LATIN SMALL LETTER SHARP S
+ case '\u00ad': break; // soft hyphen
+ case '-': break;
+ default: replace += c; break;
+ }
+ buf.append(replace);
+ // update offsets if replacement is a different length
+ if (offsets != null) {
+ int r = replace.length();
+ if (r == 0)
+ this.offsets = arrayKill(this.offsets, i - n);
+ else if (r == 2)
+ this.offsets = arrayInsert(this.offsets, i - n + 1, this.offsets[i - n], r - 1);
+ n += 1 - r;
+ }
+ }
+ return buf.toString();
+ } else if (language.equals("de")) {
+ StringBuffer buf = new StringBuffer();
+ int n = 0;
+ for (int i = 0; i < s.length(); i++) {
+ char c = s.charAt(i);
+ String replace = new String();
+ switch (c) {
+ case '\u00c4': replace = "Ae"; break;
+ case '\u00d6': replace = "Oe"; break;
+ case '\u00dc': replace = "Ue"; break;
+ case '\u00df': replace = "ss"; break;
+ case '\u00e4': replace = "ae"; break;
+ case '\u00f6': replace = "oe"; break;
+ case '\u00fc': replace = "ue"; break;
+ case '\u00ad': break; // soft hyphen
+ case '\u00e9': replace = "e"; break;
+ // new in MPDL project by J. Willenborg
+ case '\u017f': replace = "s"; break; // LATIN SMALL LETTER LONG S
+ // case '-': break;
+ default: replace += c; break;
+ }
+ buf.append(replace);
+ // update offsets if replacement is a different length
+ if (offsets != null) {
+ int r = replace.length();
+ if (r == 0)
+ this.offsets = arrayKill(this.offsets, i - n);
+ else if (r == 2)
+ this.offsets = arrayInsert(this.offsets, i - n + 1, this.offsets[i - n], r - 1);
+ n += 1 - r;
+ }
+ }
+ return buf.toString();
+ } else if (language.equals("zh")) {
+ StringBuffer buf = new StringBuffer();
+ int n = 0;
+ for (int i = 0; i < s.length(); i++) {
+ char c = s.charAt(i);
+ String replace = new String();
+ switch (c) {
+ case '\u00b9': replace = "1"; break;
+ case '\u00b2': replace = "2"; break;
+ case '\u00b3': replace = "3"; break;
+ case '\u2074': replace = "4"; break;
+ case '\u2075': replace = "5"; break;
+ // original by Malcolm Hyman: with the following replacements
+ // case '\u3000': replace = " "; break;
+ // case '\u3001': replace = ","; break;
+ // case '\u3002': replace = "."; break;
+ // case '\u200b': break; // BREAKS EVERYTHING!
+ default: replace += c; break;
+ }
+ buf.append(replace);
+ // update offsets if replacement is a different length
+ if (offsets != null) {
+ int r = replace.length();
+ if (r == 0)
+ this.offsets = arrayKill(this.offsets, i - n);
+ else if (r == 2)
+ this.offsets = arrayInsert(this.offsets, i - n + 1, this.offsets[i - n], r - 1);
+ n += 1 - r;
+ }
+ }
+ return buf.toString();
+ } else if (language.equals("akk") ||
+ language.equals("qam") ||
+ language.equals("qpc") ||
+ language.equals("elx") ||
+ language.equals("sux") ||
+ language.equals("hit") ||
+ language.equals("qhu") ||
+ language.equals("peo") ||
+ language.equals("uga") ||
+ language.equals("ura") ||
+ language.equals("qcu")) {
+ StringBuffer buf = new StringBuffer();
+ int n = 0;
+ char last = '\u0000';
+ for (int i = 0; i < s.length(); i++) {
+ char c = s.charAt(i);
+ c = Character.toLowerCase(c);
+ String replace = new String();
+ switch (c) {
+ case '{': replace += "-"; break;
+ case '}': replace += "-"; break;
+ // These are from PSD::ATF::Unicode by Steve Tinney
+ case '\u0161': replace += "sz"; break;
+ case '\u1e63': replace += "s,"; break;
+ case '\u1e6d': replace += "t,"; break;
+ case '\u014b': replace += "j"; break;
+ case '\u015b': replace += "s'"; break;
+ case '\u2080': replace += "0"; break;
+ case '\u2081': replace += "1"; break;
+ case '\u2082': replace += "2"; break;
+ case '\u2083': replace += "3"; break;
+ case '\u2084': replace += "4"; break;
+ case '\u2085': replace += "5"; break;
+ case '\u2086': replace += "6"; break;
+ case '\u2087': replace += "7"; break;
+ case '\u2088': replace += "8"; break;
+ case '\u2089': replace += "9"; break;
+
+ case 'c': // shin (except where used as modifier)
+ if ((i > 0) && ((last == '~') || (last == '@')))
+ replace += "c";
+ else replace += "sz";
+ break;
+ default: replace += c; break;
+ }
+ // suppress grapheme boundary before or after word boundary
+ if (replace.equals("-")) {
+ if ((i + 1 == s.length()) || (s.charAt(i + 1) == ' ') || (i == 0) || (s.charAt(i - 1) == ' '))
+ replace = "";
+ }
+ last = c;
+ buf.append(replace);
+ // update offsets if replacement is a different length
+ if (offsets != null) {
+ int r = replace.length();
+ if (r == 0)
+ this.offsets = arrayKill(this.offsets, i - n);
+ else if (r == 2)
+ this.offsets = arrayInsert(this.offsets, i - n + 1, this.offsets[i - n], r - 1);
+ n += 1 - r;
+ }
+ }
+ return buf.toString();
+ } else if (language.equals("el") || language.equals("grc")) {
+ StringBuffer buf = new StringBuffer();
+ int n = 0;
+ for (int i = 0; i < s.length(); i++) {
+ char c = s.charAt(i);
+ String replace = new String();
+ switch (c) {
+ case '\u03c2': replace = "\u03c3"; break; // GREEK SMALL LETTER FINAL SIGMA
+ case '<': break;
+ case '>': break;
+ case '[': break;
+ case ']': break;
+ case '1': break;
+ case '2': break;
+ case '\u03ac': replace = "\u1f71"; break;
+ case '\u03ad': replace = "\u1f73"; break;
+ case '\u03ae': replace = "\u1f75"; break;
+ case '\u03af': replace = "\u1f77"; break;
+ case '\u03cc': replace = "\u1f79"; break;
+ case '\u03cd': replace = "\u1f7b"; break;
+ case '\u03ce': replace = "\u1f7d"; break;
+ case '-': break; // same treatment as soft hyphen
+ case '\u00ad': break; // soft hyphen
+ default: replace += c; break;
+ }
+ buf.append(replace);
+ // update offsets if replacement is a different length
+ if (offsets != null) {
+ int r = replace.length();
+ if (r == 0)
+ this.offsets = arrayKill(this.offsets, i - n);
+ else if (r == 2)
+ this.offsets = arrayInsert(this.offsets, i - n + 1, this.offsets[i - n], r - 1);
+ n += 1 - r;
+ }
+ }
+ return buf.toString();
+ } else if (language.equals("el_atonic")) {
+ StringBuffer buf = new StringBuffer();
+ int n = 0;
+ for (int i = 0; i < s.length(); i++) {
+ char c = s.charAt(i);
+ String replace = new String();
+ switch (c) {
+ case '\u03c2': replace = "\u03c3"; break; // GREEK SMALL LETTER FINAL SIGMA
+ // map characters with diacritics to their plain equivalent
+ // cf. BetaCode.java
+ case '\u03aa': replace = "\u0399"; break;
+ case '\u03ab': replace = "\u03a5"; break;
+ case '\u03ac': replace = "\u0381"; break;
+ case '\u03ad': replace = "\u0385"; break;
+ case '\u03ae': replace = "\u0387"; break;
+ case '\u03af': replace = "\u0389"; break;
+ case '\u03ca': replace = "\u03b9"; break;
+ case '\u03cb': replace = "\u03c5"; break;
+ case '\u03cc': replace = "\u03bf"; break;
+ case '\u03cd': replace = "\u03c5"; break;
+ case '\u03ce': replace = "\u03c9"; break;
+ case '\u1f00': replace = "\u03b1"; break;
+ case '\u1f01': replace = "\u03b1"; break;
+ case '\u1f02': replace = "\u03b1"; break;
+ case '\u1f03': replace = "\u03b1"; break;
+ case '\u1f04': replace = "\u03b1"; break;
+ case '\u1f05': replace = "\u03b1"; break;
+ case '\u1f06': replace = "\u03b1"; break;
+ case '\u1f07': replace = "\u03b1"; break;
+ case '\u1f08': replace = "\u0391"; break;
+ case '\u1f09': replace = "\u0391"; break;
+ case '\u1f0a': replace = "\u0391"; break;
+ case '\u1f0b': replace = "\u0391"; break;
+ case '\u1f0c': replace = "\u0391"; break;
+ case '\u1f0d': replace = "\u0391"; break;
+ case '\u1f0e': replace = "\u0391"; break;
+ case '\u1f0f': replace = "\u0391"; break;
+ case '\u1f10': replace = "\u03b5"; break;
+ case '\u1f11': replace = "\u03b5"; break;
+ case '\u1f12': replace = "\u03b5"; break;
+ case '\u1f13': replace = "\u03b5"; break;
+ case '\u1f14': replace = "\u03b5"; break;
+ case '\u1f15': replace = "\u03b5"; break;
+ case '\u1f18': replace = "\u0395"; break;
+ case '\u1f19': replace = "\u0395"; break;
+ case '\u1f1a': replace = "\u0395"; break;
+ case '\u1f1b': replace = "\u0395"; break;
+ case '\u1f1c': replace = "\u0395"; break;
+ case '\u1f1d': replace = "\u0395"; break;
+ case '\u1f20': replace = "\u03b7"; break;
+ case '\u1f21': replace = "\u03b7"; break;
+ case '\u1f22': replace = "\u03b7"; break;
+ case '\u1f23': replace = "\u03b7"; break;
+ case '\u1f24': replace = "\u03b7"; break;
+ case '\u1f25': replace = "\u03b7"; break;
+ case '\u1f26': replace = "\u03b7"; break;
+ case '\u1f27': replace = "\u03b7"; break;
+ case '\u1f28': replace = "\u0397"; break;
+ case '\u1f29': replace = "\u0397"; break;
+ case '\u1f2a': replace = "\u0397"; break;
+ case '\u1f2b': replace = "\u0397"; break;
+ case '\u1f2c': replace = "\u0397"; break;
+ case '\u1f2d': replace = "\u0397"; break;
+ case '\u1f2e': replace = "\u0397"; break;
+ case '\u1f2f': replace = "\u0397"; break;
+ case '\u1f30': replace = "\u03b9"; break;
+ case '\u1f31': replace = "\u03b9"; break;
+ case '\u1f32': replace = "\u03b9"; break;
+ case '\u1f33': replace = "\u03b9"; break;
+ case '\u1f34': replace = "\u03b9"; break;
+ case '\u1f35': replace = "\u03b9"; break;
+ case '\u1f36': replace = "\u03b9"; break;
+ case '\u1f37': replace = "\u03b9"; break;
+ case '\u1f38': replace = "\u0399"; break;
+ case '\u1f39': replace = "\u0399"; break;
+ case '\u1f3a': replace = "\u0399"; break;
+ case '\u1f3b': replace = "\u0399"; break;
+ case '\u1f3c': replace = "\u0399"; break;
+ case '\u1f3d': replace = "\u0399"; break;
+ case '\u1f3e': replace = "\u0399"; break;
+ case '\u1f3f': replace = "\u0399"; break;
+ case '\u1f40': replace = "\u03bf"; break;
+ case '\u1f41': replace = "\u03bf"; break;
+ case '\u1f42': replace = "\u03bf"; break;
+ case '\u1f43': replace = "\u03bf"; break;
+ case '\u1f44': replace = "\u03bf"; break;
+ case '\u1f45': replace = "\u03bf"; break;
+ case '\u1f48': replace = "\u039f"; break;
+ case '\u1f49': replace = "\u039f"; break;
+ case '\u1f4a': replace = "\u039f"; break;
+ case '\u1f4b': replace = "\u039f"; break;
+ case '\u1f4c': replace = "\u039f"; break;
+ case '\u1f4d': replace = "\u039f"; break;
+ case '\u1f50': replace = "\u03c5"; break;
+ case '\u1f51': replace = "\u03c5"; break;
+ case '\u1f52': replace = "\u03c5"; break;
+ case '\u1f53': replace = "\u03c5"; break;
+ case '\u1f54': replace = "\u03c5"; break;
+ case '\u1f55': replace = "\u03c5"; break;
+ case '\u1f56': replace = "\u03c5"; break;
+ case '\u1f57': replace = "\u03c5"; break;
+ case '\u1f58': replace = "\u03a5"; break;
+ case '\u1f59': replace = "\u03a5"; break;
+ case '\u1f5a': replace = "\u03a5"; break;
+ case '\u1f5b': replace = "\u03a5"; break;
+ case '\u1f5c': replace = "\u03a5"; break;
+ case '\u1f5d': replace = "\u03a5"; break;
+ case '\u1f5e': replace = "\u03a5"; break;
+ case '\u1f5f': replace = "\u03a5"; break;
+ case '\u1f60': replace = "\u03c9"; break;
+ case '\u1f61': replace = "\u03c9"; break;
+ case '\u1f62': replace = "\u03c9"; break;
+ case '\u1f63': replace = "\u03c9"; break;
+ case '\u1f64': replace = "\u03c9"; break;
+ case '\u1f65': replace = "\u03c9"; break;
+ case '\u1f66': replace = "\u03c9"; break;
+ case '\u1f67': replace = "\u03c9"; break;
+ case '\u1f68': replace = "\u03a9"; break;
+ case '\u1f69': replace = "\u03a9"; break;
+ case '\u1f6a': replace = "\u03a9"; break;
+ case '\u1f6b': replace = "\u03a9"; break;
+ case '\u1f6c': replace = "\u03a9"; break;
+ case '\u1f6d': replace = "\u03a9"; break;
+ case '\u1f6e': replace = "\u03a9"; break;
+ case '\u1f6f': replace = "\u03a9"; break;
+ case '\u1f70': replace = "\u03b1"; break;
+ case '\u1f71': replace = "\u03b1"; break;
+ case '\u1f72': replace = "\u03b5"; break;
+ case '\u1f73': replace = "\u03b5"; break;
+ case '\u1f74': replace = "\u03b7"; break;
+ case '\u1f75': replace = "\u03b7"; break;
+ case '\u1f76': replace = "\u03b9"; break;
+ case '\u1f77': replace = "\u03b9"; break;
+ case '\u1f78': replace = "\u03bf"; break;
+ case '\u1f79': replace = "\u03bf"; break;
+ case '\u1f7a': replace = "\u03c5"; break;
+ case '\u1f7b': replace = "\u03c5"; break;
+ case '\u1f7c': replace = "\u03c9"; break;
+ case '\u1f7d': replace = "\u03c9"; break;
+ case '\u1f80': replace = "\u03b1"; break;
+ case '\u1f81': replace = "\u03b1"; break;
+ case '\u1f82': replace = "\u03b1"; break;
+ case '\u1f83': replace = "\u03b1"; break;
+ case '\u1f84': replace = "\u03b1"; break;
+ case '\u1f85': replace = "\u03b1"; break;
+ case '\u1f86': replace = "\u03b1"; break;
+ case '\u1f87': replace = "\u03b1"; break;
+ case '\u1f88': replace = "\u0391"; break;
+ case '\u1f89': replace = "\u0391"; break;
+ case '\u1f8a': replace = "\u0391"; break;
+ case '\u1f8b': replace = "\u0391"; break;
+ case '\u1f8c': replace = "\u0391"; break;
+ case '\u1f8d': replace = "\u0391"; break;
+ case '\u1f8e': replace = "\u0391"; break;
+ case '\u1f8f': replace = "\u0391"; break;
+ case '\u1f90': replace = "\u03b7"; break;
+ case '\u1f91': replace = "\u03b7"; break;
+ case '\u1f92': replace = "\u03b7"; break;
+ case '\u1f93': replace = "\u03b7"; break;
+ case '\u1f94': replace = "\u03b7"; break;
+ case '\u1f95': replace = "\u03b7"; break;
+ case '\u1f96': replace = "\u03b7"; break;
+ case '\u1f97': replace = "\u03b7"; break;
+ case '\u1f98': replace = "\u0397"; break;
+ case '\u1f99': replace = "\u0397"; break;
+ case '\u1f9a': replace = "\u0397"; break;
+ case '\u1f9b': replace = "\u0397"; break;
+ case '\u1f9c': replace = "\u0397"; break;
+ case '\u1f9d': replace = "\u0397"; break;
+ case '\u1f9e': replace = "\u0397"; break;
+ case '\u1f9f': replace = "\u0397"; break;
+ case '\u1fa0': replace = "\u03c9"; break;
+ case '\u1fa1': replace = "\u03c9"; break;
+ case '\u1fa2': replace = "\u03c9"; break;
+ case '\u1fa3': replace = "\u03c9"; break;
+ case '\u1fa4': replace = "\u03c9"; break;
+ case '\u1fa5': replace = "\u03c9"; break;
+ case '\u1fa6': replace = "\u03c9"; break;
+ case '\u1fa7': replace = "\u03c9"; break;
+ case '\u1fa8': replace = "\u03a9"; break;
+ case '\u1fa9': replace = "\u03a9"; break;
+ case '\u1faa': replace = "\u03a9"; break;
+ case '\u1fab': replace = "\u03a9"; break;
+ case '\u1fac': replace = "\u03a9"; break;
+ case '\u1fad': replace = "\u03a9"; break;
+ case '\u1fae': replace = "\u03a9"; break;
+ case '\u1faf': replace = "\u03a9"; break;
+ case '\u1fb2': replace = "\u03b1"; break;
+ case '\u1fb3': replace = "\u03b1"; break;
+ case '\u1fb4': replace = "\u03b1"; break;
+ case '\u1fb6': replace = "\u03b1"; break;
+ case '\u1fb7': replace = "\u03b1"; break;
+ case '\u1fba': replace = "\u0391"; break;
+ case '\u1fbb': replace = "\u0391"; break;
+ case '\u1fbc': replace = "\u0391"; break;
+ case '\u1fc2': replace = "\u03b7"; break;
+ case '\u1fc3': replace = "\u03b7"; break;
+ case '\u1fc4': replace = "\u03b7"; break;
+ case '\u1fc6': replace = "\u03b7"; break;
+ case '\u1fc7': replace = "\u03b7"; break;
+ case '\u1fca': replace = "\u0397"; break;
+ case '\u1fcb': replace = "\u0397"; break;
+ case '\u1fcc': replace = "\u0397"; break;
+ case '\u1fd2': replace = "\u03b9"; break;
+ case '\u1fd3': replace = "\u03b9"; break;
+ case '\u1fd6': replace = "\u03b9"; break;
+ case '\u1fd7': replace = "\u03b9"; break;
+ case '\u1fda': replace = "\u0399"; break;
+ case '\u1fdb': replace = "\u039f"; break;
+ case '\u1fe2': replace = "\u03c5"; break;
+ case '\u1fe3': replace = "\u03c5"; break;
+ case '\u1fe4': replace = "\u03c1"; break;
+ case '\u1fe5': replace = "\u03c1"; break;
+ case '\u1fe6': replace = "\u03c5"; break;
+ case '\u1fe7': replace = "\u03c5"; break;
+ case '\u1fea': replace = "\u03a5"; break;
+ case '\u1feb': replace = "\u03a5"; break;
+ case '\u1fec': replace = "\u03a1"; break;
+ case '\u1ff2': replace = "\u03c9"; break;
+ case '\u1ff3': replace = "\u03c9"; break;
+ case '\u1ff4': replace = "\u03c9"; break;
+ case '\u1ff6': replace = "\u03c9"; break;
+ case '\u1ff7': replace = "\u03c9"; break;
+ case '\u1ff8': replace = "\u039f"; break;
+ case '\u1ff9': replace = "\u039f"; break;
+ case '\u1ffa': replace = "\u03a9"; break;
+ case '\u1ffb': replace = "\u03a9"; break;
+ case '\u1ffc': replace = "\u03a9"; break;
+
+ case '<': break;
+ case '>': break;
+ case '-': break; // same treatment as soft hyphen
+ case '\u00ad': break; // soft hyphen
+ default: replace += c; break;
+ }
+ buf.append(replace);
+ // update offsets if replacement is a different length
+ if (offsets != null) {
+ int r = replace.length();
+ if (r == 0)
+ this.offsets = arrayKill(this.offsets, i - n);
+ else if (r == 2)
+ this.offsets = arrayInsert(this.offsets, i - n + 1, this.offsets[i - n], r - 1);
+ n += 1 - r;
+ }
+ }
+ return buf.toString();
+ } else { // unknown or no language
+ return s;
+ }
+ }
+
+ /*
+ // explicit words
+ normStr = normStr.replaceAll("aliàs", "alias");
+ normStr = normStr.replaceAll("hîc", "hic");
+ normStr = normStr.replaceAll("quòd", "quod");
+ normStr = normStr.replaceAll("Quòd", "Quod");
+ normStr = normStr.replaceAll("QVòd", "Quod");
+ normStr = normStr.replaceAll("Cùmque", "Cumque");
+ normStr = normStr.replaceAll("aër", "aer");
+ // ij
+ normStr = normStr.replaceAll("ij", "ii");
+ // qu/qv
+ normStr = normStr.replaceAll("qv", "qu");
+ // normStr = normStr.replaceAll("qV", "qU");
+ normStr = normStr.replaceAll("Qv", "Qu");
+ normStr = normStr.replaceAll("QV", "QU");
+ // u/v
+ String vowels = getVowels();
+ String consonants = getConsonants();
+ normStr = normStr.replaceAll("([" + vowels + "])([-]*)u([" + vowels +"])", "$1$2v$3"); // vowel + u + vowel --> vowel + v + vowel
+ normStr = normStr.replaceAll("([" + vowels + "])([-]*)U([" + vowels +"])", "$1$2V$3"); // vowel + U + vowel --> vowel + V + vowel
+ normStr = normStr.replaceAll("([" + consonants + "])([-]*)v([" + consonants +"])", "$1$2u$3"); // consonant + v + consonant --> consonant + u + consonant
+ normStr = normStr.replaceAll("([" + consonants + "])([-]*)V([" + consonants +"])", "$1$2U$3"); // consonant + V + consonant --> consonant + U + consonant
+ normStr = normStr.replaceAll("^v([" + consonants +"])", "u$1"); // v + consonant --> u + consonant
+ normStr = normStr.replaceAll("^V([" + consonants +"])", "U$1"); // V + consonant --> U + consonant
+ // end of word: diacritica
+ normStr = normStr.replaceAll("à$", "a");
+ normStr = normStr.replaceAll("è$", "e");
+ normStr = normStr.replaceAll("ò$", "o");
+ normStr = normStr.replaceAll("àm$", "am");
+ normStr = normStr.replaceAll("ùm$", "um");
+ String normStrTmp = normStr;
+ normStr = "";
+ for (int i = 0; i < normStrTmp.length(); i++) {
+ char c = normStrTmp.charAt(i);
+ String replace = "";
+ switch (c) {
+ case 'ſ': replace = "s"; break;
+ case 'ß': replace = "ss"; break;
+ case 'æ': replace = "ae"; break;
+ case 'Æ': replace = "AE"; break;
+ case 'ę': replace = "ae"; break;
+ case 'œ': replace = "oe"; break;
+ default: replace += c; break;
+ }
+ normStr = normStr + replace;
+ }
+
+
+ private String getVowels() {
+ String retStr = null;
+ if (Language.getInstance().isItalian(language)) {
+ retStr = "AEIOUaeiou" +
+ "\u00c6\u00e6" + // AE ligatures
+ "\u0152\u0153"; // OE ligatures
+ } else if (Language.getInstance().isLatin(language)) {
+ retStr = "AEIOUaeiouÆœęàèòù";
+ }
+ return retStr;
+ }
+
+ private String getConsonants() {
+ String retStr = null;
+ if (Language.getInstance().isItalian(language)) {
+ retStr = "BCDFGHKLMNPQRSTVWXZ" +
+ "bcdfghklmnpqrstvwxz" +
+ "ſß"; // long/sharp S
+ } else if (Language.getInstance().isLatin(language)) {
+ retStr = "BCDFGHKLMNPQRSTVWXZ" +
+ "bcdfghklmnpqrstvwxz" +
+ "ſß"; // long/sharp S
+ }
+ return retStr;
+ }
+
+
+
+
+
+ *
+ *
+ *
+ *
+ */
+
+
+
+
+
+
+ /**
+  * Produces a shortened copy of an integer array in which the slot at
+  * <code>index</code> has been dropped ("killed"). The array that is
+  * passed in is not modified.
+  *
+  * @param array integer array to copy from
+  * @param index position of the element to leave out
+  * @return new array holding the remaining elements in their original order
+  */
+ private int[] arrayKill(int[] array, int index) {
+   int tailLength = array.length - index - 1;
+   int[] shortened = new int[array.length - 1];
+   // everything in front of the dropped slot keeps its position
+   System.arraycopy(array, 0, shortened, 0, index);
+   // everything behind the dropped slot moves down by one
+   System.arraycopy(array, index + 1, shortened, index, tailLength);
+   return shortened;
+ }
+
+ /**
+  * Produces an enlarged copy of an integer array: <code>count</code> new
+  * slots are opened at position <code>index</code> and each of them is set
+  * to <code>value</code>. The array that is passed in is not modified.
+  *
+  * @param array integer array to copy from
+  * @param index position at which the new slots start
+  * @param value value written into every new slot
+  * @param count number of new slots to open
+  * @return new array of length <code>array.length + count</code>
+  */
+ private int[] arrayInsert(int[] array, int index, int value, int count) {
+   int[] enlarged = new int[array.length + count];
+   // leading part stays where it was
+   System.arraycopy(array, 0, enlarged, 0, index);
+   // fill the freshly opened gap
+   int gapEnd = index + count;
+   for (int pos = index; pos < gapEnd; pos++) {
+     enlarged[pos] = value;
+   }
+   // trailing part is shifted behind the gap
+   System.arraycopy(array, index, enlarged, gapEnd, array.length - index);
+   return enlarged;
+ }
+
+}
\ No newline at end of file
diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexAR.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexAR.java Wed Nov 09 15:32:05 2011 +0100
@@ -0,0 +1,584 @@
+/* The following code was generated by JFlex 1.4.3 on 21.07.11 11:22 */
+
+/*
+ * Normalization rules for Arabic text
+ * [this is a JFlex specification]
+ *
+ * Wolfgang Schmidle
+ * version 2011-02-28
+ *
+ */
+
+package de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang;
+
+
+/**
+ * This class is a scanner generated by
+ * JFlex 1.4.3
+ * on 21.07.11 11:22 from the specification file
+ * MpdlNormalizerLexAR.lex
+ *
+ * The scanner collects the matched input in the fields original and
+ * normalized (see add()); at present both receive the same text because
+ * every action passes yytext() to add(). When a line feed is matched in
+ * one of the states DISP, DICT or SEARCH, yylex() returns the collected
+ * text; which variant is returned depends on the state and on the
+ * problem flag.
+ *
+ * NOTE(review): this file is generated. Lasting changes belong in the
+ * .lex specification, not here.
+ */
+public class MpdlNormalizerLexAR {
+
+ /** This character denotes the end of file */
+ public static final int YYEOF = -1;
+
+ /** initial size of the lookahead buffer */
+ private static final int ZZ_BUFFERSIZE = 16384;
+
+ /** lexical states */
+ public static final int SEARCH = 6;
+ public static final int DICT = 4;
+ public static final int YYINITIAL = 0;
+ public static final int DISP = 2;
+
+ /**
+ * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l
+ * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l
+ * at the beginning of a line
+ * l is of the form l = 2*k, k a non negative integer
+ */
+ private static final int ZZ_LEXSTATE[] = {
+ 0, 0, 1, 1, 2, 2, 3, 3
+ };
+
+ /**
+ * Translates characters to character classes
+ * (packed form: pairs of repeat count and class number,
+ * expanded by zzUnpackCMap)
+ */
+ private static final String ZZ_CMAP_PACKED =
+ "\12\0\1\3\25\0\1\2\14\0\1\1\2\0\1\1\17\0\1\4"+
+ "\40\0\1\1\2\0\1\1\20\0\1\1\5\0\1\1\1\0\1\1"+
+ "\uff82\0";
+
+ /**
+ * Translates characters to character classes
+ */
+ private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED);
+
+ /**
+ * Translates DFA states to action switch labels.
+ */
+ private static final int [] ZZ_ACTION = zzUnpackAction();
+
+ private static final String ZZ_ACTION_PACKED_0 =
+ "\4\0\2\1\1\2\1\3\1\4\1\5";
+
+ private static int [] zzUnpackAction() { // expands ZZ_ACTION_PACKED_0 into one entry per DFA state
+ int [] result = new int[10];
+ int offset = 0;
+ offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
+ return result;
+ }
+
+ private static int zzUnpackAction(String packed, int offset, int [] result) { // run-length decoder: (count, value) pairs
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.length();
+ while (i < l) {
+ int count = packed.charAt(i++);
+ int value = packed.charAt(i++);
+ do result[j++] = value; while (--count > 0);
+ }
+ return j;
+ }
+
+
+ /**
+ * Translates a state to a row index in the transition table
+ */
+ private static final int [] ZZ_ROWMAP = zzUnpackRowMap();
+
+ private static final String ZZ_ROWMAP_PACKED_0 =
+ "\0\0\0\5\0\12\0\17\0\24\0\31\0\24\0\24"+
+ "\0\24\0\24";
+
+ private static int [] zzUnpackRowMap() { // expands ZZ_ROWMAP_PACKED_0 into one row offset per DFA state
+ int [] result = new int[10];
+ int offset = 0;
+ offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
+ return result;
+ }
+
+ private static int zzUnpackRowMap(String packed, int offset, int [] result) { // each entry is stored as two chars: high 16 bits, low 16 bits
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.length();
+ while (i < l) {
+ int high = packed.charAt(i++) << 16;
+ result[j++] = high | packed.charAt(i++);
+ }
+ return j;
+ }
+
+ /**
+ * The transition table of the DFA
+ */
+ private static final int [] ZZ_TRANS = zzUnpackTrans();
+
+ private static final String ZZ_TRANS_PACKED_0 =
+ "\1\5\1\6\1\5\1\0\1\7\1\5\1\6\1\5"+
+ "\1\10\1\7\1\5\1\6\1\5\1\11\1\7\1\5"+
+ "\1\6\1\5\1\12\1\7\7\0\1\5\2\0";
+
+ private static int [] zzUnpackTrans() { // expands ZZ_TRANS_PACKED_0 into the flat transition table
+ int [] result = new int[30];
+ int offset = 0;
+ offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
+ return result;
+ }
+
+ private static int zzUnpackTrans(String packed, int offset, int [] result) { // run-length decoder: (count, value) pairs
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.length();
+ while (i < l) {
+ int count = packed.charAt(i++);
+ int value = packed.charAt(i++);
+ value--; // stored values are shifted by one so that "no transition" (-1) can be packed as 0
+ do result[j++] = value; while (--count > 0);
+ }
+ return j;
+ }
+
+
+ /* error codes */
+ private static final int ZZ_UNKNOWN_ERROR = 0;
+ private static final int ZZ_NO_MATCH = 1;
+ private static final int ZZ_PUSHBACK_2BIG = 2;
+
+ /* error messages for the codes above */
+ private static final String ZZ_ERROR_MSG[] = {
+ "Unkown internal scanner error",
+ "Error: could not match input",
+ "Error: pushback value was too large"
+ };
+
+ /**
+ * ZZ_ATTRIBUTE[aState] contains the attributes of state aState
+ * (yylex() tests bit 1 to recognise an accepting state and bit 8 to
+ * stop scanning immediately after accepting)
+ */
+ private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
+
+ private static final String ZZ_ATTRIBUTE_PACKED_0 =
+ "\4\0\1\11\1\1\4\11";
+
+ private static int [] zzUnpackAttribute() { // expands ZZ_ATTRIBUTE_PACKED_0 into one entry per DFA state
+ int [] result = new int[10];
+ int offset = 0;
+ offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
+ return result;
+ }
+
+ private static int zzUnpackAttribute(String packed, int offset, int [] result) { // run-length decoder: (count, value) pairs
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.length();
+ while (i < l) {
+ int count = packed.charAt(i++);
+ int value = packed.charAt(i++);
+ do result[j++] = value; while (--count > 0);
+ }
+ return j;
+ }
+
+ /** the input device */
+ private java.io.Reader zzReader;
+
+ /** the current state of the DFA */
+ private int zzState;
+
+ /** the current lexical state */
+ private int zzLexicalState = YYINITIAL;
+
+ /** this buffer contains the current text to be matched and is
+ the source of the yytext() string */
+ private char zzBuffer[] = new char[ZZ_BUFFERSIZE];
+
+ /** the text position at the last accepting state */
+ private int zzMarkedPos;
+
+ /** the current text position in the buffer */
+ private int zzCurrentPos;
+
+ /** startRead marks the beginning of the yytext() string in the buffer */
+ private int zzStartRead;
+
+ /** endRead marks the last character in the buffer, that has been read
+ from input */
+ private int zzEndRead;
+
+ /** number of newlines encountered up to the start of the matched text */
+ private int yyline;
+
+ /** the number of characters up to the start of the matched text */
+ private int yychar;
+
+ /**
+ * the number of characters from the last newline up to the start of the
+ * matched text
+ */
+ private int yycolumn;
+
+ /**
+ * zzAtBOL == true <=> the scanner is currently at the beginning of a line
+ */
+ private boolean zzAtBOL = true;
+
+ /** zzAtEOF == true <=> the scanner is at the EOF */
+ private boolean zzAtEOF;
+
+ /** denotes if the user-EOF-code has already been executed */
+ private boolean zzEOFDone;
+
+ /* user code: */
+ private String original = ""; // all matched input text, appended by add()
+ private String normalized = ""; // all strings handed to add(), appended in order
+ private int problem = 0; // set to 1 by action 2; selects what the END actions return
+
+ private void add (String norm) { // records the current match and the given replacement text
+ original += yytext();
+ normalized += norm;
+ }
+
+ private static final String LB = "[\u002d\u00ad] "; // regex for replaceAll(): hyphen or soft hyphen followed by one space
+
+
+ /**
+ * Creates a new scanner
+ * There is also a java.io.InputStream version of this constructor.
+ *
+ * @param in the java.io.Reader to read input from.
+ */
+ public MpdlNormalizerLexAR(java.io.Reader in) {
+ this.zzReader = in;
+ }
+
+ /**
+ * Creates a new scanner.
+ * There is also a java.io.Reader version of this constructor.
+ *
+ * NOTE(review): the stream is wrapped in an InputStreamReader without
+ * an explicit charset, so the platform default encoding is used.
+ *
+ * @param in the java.io.InputStream to read input from.
+ */
+ public MpdlNormalizerLexAR(java.io.InputStream in) {
+ this(new java.io.InputStreamReader(in));
+ }
+
+ /**
+ * Unpacks the compressed character translation table.
+ *
+ * @param packed the packed character translation table
+ * @return the unpacked character translation table
+ */
+ private static char [] zzUnpackCMap(String packed) {
+ char [] map = new char[0x10000];
+ int i = 0; /* index in packed string */
+ int j = 0; /* index in unpacked array */
+ while (i < 42) { // 42 = number of chars in ZZ_CMAP_PACKED (21 count/value pairs)
+ int count = packed.charAt(i++);
+ char value = packed.charAt(i++);
+ do map[j++] = value; while (--count > 0);
+ }
+ return map;
+ }
+
+
+ /**
+ * Refills the input buffer.
+ *
+ * @return <code>false</code>, iff there was new input.
+ *
+ * @exception java.io.IOException if any I/O-Error occurs
+ */
+ private boolean zzRefill() throws java.io.IOException {
+
+ /* first: make room (if you can) */
+ if (zzStartRead > 0) {
+ System.arraycopy(zzBuffer, zzStartRead,
+ zzBuffer, 0,
+ zzEndRead-zzStartRead);
+
+ /* translate stored positions */
+ zzEndRead-= zzStartRead;
+ zzCurrentPos-= zzStartRead;
+ zzMarkedPos-= zzStartRead;
+ zzStartRead = 0;
+ }
+
+ /* is the buffer big enough? */
+ if (zzCurrentPos >= zzBuffer.length) {
+ /* if not: blow it up */
+ char newBuffer[] = new char[zzCurrentPos*2];
+ System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length);
+ zzBuffer = newBuffer;
+ }
+
+ /* finally: fill the buffer with new input */
+ int numRead = zzReader.read(zzBuffer, zzEndRead,
+ zzBuffer.length-zzEndRead);
+
+ if (numRead > 0) {
+ zzEndRead+= numRead;
+ return false;
+ }
+ // unlikely but not impossible: read 0 characters, but not at end of stream
+ if (numRead == 0) {
+ int c = zzReader.read();
+ if (c == -1) {
+ return true;
+ } else {
+ zzBuffer[zzEndRead++] = (char) c;
+ return false;
+ }
+ }
+
+ // numRead < 0
+ return true;
+ }
+
+
+ /**
+ * Closes the input stream.
+ */
+ public final void yyclose() throws java.io.IOException {
+ zzAtEOF = true; /* indicate end of file */
+ zzEndRead = zzStartRead; /* invalidate buffer */
+
+ if (zzReader != null)
+ zzReader.close();
+ }
+
+
+ /**
+ * Resets the scanner to read from a new input stream.
+ * Does not close the old reader.
+ *
+ * All internal variables are reset, the old input stream
+ * cannot be reused (internal buffer is discarded and lost).
+ * Lexical state is set to YYINITIAL.
+ *
+ * NOTE(review): the user-code fields original, normalized and problem
+ * are not touched here, so text collected before the reset is still
+ * present afterwards.
+ *
+ * @param reader the new input stream
+ */
+ public final void yyreset(java.io.Reader reader) {
+ zzReader = reader;
+ zzAtBOL = true;
+ zzAtEOF = false;
+ zzEOFDone = false;
+ zzEndRead = zzStartRead = 0;
+ zzCurrentPos = zzMarkedPos = 0;
+ yyline = yychar = yycolumn = 0;
+ zzLexicalState = YYINITIAL;
+ }
+
+
+ /**
+ * Returns the current lexical state.
+ */
+ public final int yystate() {
+ return zzLexicalState;
+ }
+
+
+ /**
+ * Enters a new lexical state
+ *
+ * @param newState the new lexical state
+ */
+ public final void yybegin(int newState) {
+ zzLexicalState = newState;
+ }
+
+
+ /**
+ * Returns the text matched by the current regular expression.
+ */
+ public final String yytext() {
+ return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead );
+ }
+
+
+ /**
+ * Returns the character at position pos from the
+ * matched text.
+ *
+ * It is equivalent to yytext().charAt(pos), but faster
+ *
+ * @param pos the position of the character to fetch.
+ * A value from 0 to yylength()-1.
+ *
+ * @return the character at position pos
+ */
+ public final char yycharat(int pos) {
+ return zzBuffer[zzStartRead+pos];
+ }
+
+
+ /**
+ * Returns the length of the matched text region.
+ */
+ public final int yylength() {
+ return zzMarkedPos-zzStartRead;
+ }
+
+
+ /**
+ * Reports an error that occurred while scanning.
+ *
+ * In a wellformed scanner (no or only correct usage of
+ * yypushback(int) and a match-all fallback rule) this method
+ * will only be called with things that "Can't Possibly Happen".
+ * If this method is called, something is seriously wrong
+ * (e.g. a JFlex bug producing a faulty scanner etc.).
+ *
+ * Usual syntax/scanner level error handling should be done
+ * in error fallback rules.
+ *
+ * The message is delivered by throwing a java.lang.Error.
+ *
+ * @param errorCode the code of the error message to display
+ */
+ private void zzScanError(int errorCode) {
+ String message;
+ try {
+ message = ZZ_ERROR_MSG[errorCode];
+ }
+ catch (ArrayIndexOutOfBoundsException e) {
+ message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR];
+ }
+
+ throw new Error(message);
+ }
+
+
+ /**
+ * Pushes the specified amount of characters back into the input stream.
+ *
+ * They will be read again by the next call of the scanning method
+ *
+ * @param number the number of characters to be read again.
+ * This number must not be greater than yylength()!
+ */
+ public void yypushback(int number) {
+ if ( number > yylength() )
+ zzScanError(ZZ_PUSHBACK_2BIG);
+
+ zzMarkedPos -= number;
+ }
+
+
+ /**
+ * Resumes scanning until the next regular expression is matched,
+ * the end of input is encountered or an I/O-Error occurs.
+ *
+ * Actions of this scanner (see the switch at the end of the loop):
+ * action 1 appends the matched text via add(); action 2 does the same
+ * and sets problem = 1; neither of them returns, so scanning continues.
+ * Actions 3, 4 and 5 return a String chosen by the problem flag.
+ * If no action matched, null is returned at the end of the input;
+ * otherwise zzScanError(ZZ_NO_MATCH) is called.
+ *
+ * @return the next token
+ * @exception java.io.IOException if any I/O-Error occurs
+ */
+ public java.lang.String yylex() throws java.io.IOException {
+ int zzInput;
+ int zzAction;
+
+ // cached fields:
+ int zzCurrentPosL;
+ int zzMarkedPosL;
+ int zzEndReadL = zzEndRead;
+ char [] zzBufferL = zzBuffer;
+ char [] zzCMapL = ZZ_CMAP;
+
+ int [] zzTransL = ZZ_TRANS;
+ int [] zzRowMapL = ZZ_ROWMAP;
+ int [] zzAttrL = ZZ_ATTRIBUTE;
+
+ while (true) {
+ zzMarkedPosL = zzMarkedPos;
+
+ zzAction = -1;
+
+ zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL;
+
+ zzState = ZZ_LEXSTATE[zzLexicalState];
+
+
+ zzForAction: {
+ while (true) {
+
+ if (zzCurrentPosL < zzEndReadL)
+ zzInput = zzBufferL[zzCurrentPosL++];
+ else if (zzAtEOF) {
+ zzInput = YYEOF;
+ break zzForAction;
+ }
+ else {
+ // store back cached positions
+ zzCurrentPos = zzCurrentPosL;
+ zzMarkedPos = zzMarkedPosL;
+ boolean eof = zzRefill();
+ // get translated positions and possibly new buffer
+ zzCurrentPosL = zzCurrentPos;
+ zzMarkedPosL = zzMarkedPos;
+ zzBufferL = zzBuffer;
+ zzEndReadL = zzEndRead;
+ if (eof) {
+ zzInput = YYEOF;
+ break zzForAction;
+ }
+ else {
+ zzInput = zzBufferL[zzCurrentPosL++];
+ }
+ }
+ int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ];
+ if (zzNext == -1) break zzForAction;
+ zzState = zzNext;
+
+ int zzAttributes = zzAttrL[zzState];
+ if ( (zzAttributes & 1) == 1 ) {
+ zzAction = zzState;
+ zzMarkedPosL = zzCurrentPosL;
+ if ( (zzAttributes & 8) == 8 ) break zzForAction;
+ }
+
+ }
+ }
+
+ // store back cached position
+ zzMarkedPos = zzMarkedPosL;
+
+ switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
+ case 5:
+ { switch (problem) {
+ case 1: return original;
+ default: return normalized.replaceAll(LB, "");
+ }
+ }
+ case 6: break;
+ case 4:
+ { switch (problem) {
+ case 1: return "";
+ default: return normalized.replaceAll(LB, "");
+ }
+ }
+ case 7: break;
+ case 2:
+ { problem = 1; add(yytext());
+ }
+ case 8: break; // reached by fall-through from case 2: leaves the switch, the outer loop scans on
+ case 3:
+ { switch (problem) {
+ case 1: return original;
+ default: return normalized;
+ }
+ }
+ case 9: break;
+ case 1:
+ { add(yytext());
+ }
+ case 10: break; // reached by fall-through from case 1: leaves the switch, the outer loop scans on
+ default:
+ if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
+ zzAtEOF = true;
+ return null;
+ }
+ else {
+ zzScanError(ZZ_NO_MATCH);
+ }
+ }
+ }
+ }
+
+
+}
diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexAR.lex
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexAR.lex Wed Nov 09 15:32:05 2011 +0100
@@ -0,0 +1,90 @@
+/*
+ * Normalization rules for Arabic text
+ * [this is a JFlex specification]
+ *
+ * Wolfgang Schmidle
+ * version 2011-02-28
+ *
+ */
+
+package de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang;
+
+%%
+
+%public
+%class MpdlNormalizerLexAR
+%type java.lang.String
+%unicode
+
+// Arabic: ar
+
+// one lexical state per output variant; the caller selects it with yybegin()
+%states DISP, DICT, SEARCH
+
+%{
+ private String original = ""; // all matched input text, appended by add()
+ private String normalized = ""; // all strings handed to add(), appended in order
+ private int problem = 0; // set to 1 when an '@' is matched
+
+ // records the current match and the text that replaces it
+ private void add (String norm) {
+ original += yytext();
+ normalized += norm;
+ }
+
+ // Java regex for replaceAll(): hyphen or soft hyphen followed by one space
+ private static final String LB = "[\u002d\u00ad] ";
+%}
+
+// JFlex 1.4.3 only knows the \uXXXX escape; the braced form \u{00ad} is
+// read there as the literal characters u { 0 a d } and the soft hyphen
+// itself is not matched.
+hyphen = [-\u00ad] // hyphen and soft hyphen
+LB = {hyphen} \u0020
+// lb = ({hyphen} \u0020)?
+
+END = \n
+
+%%
+
+// rules without a state prefix: collect the input character by character
+@ { problem = 1; add(yytext()); }
+{LB} { add(yytext()); }
+. { add(yytext()); }
+
+
+// at the end of the word the collected text is returned;
+// the variant depends on the lexical state and on the problem flag
+
+<DISP> {
+
+{END} {
+ switch (problem) {
+ case 1: return original;
+ default: return normalized;
+ }
+ }
+}
+
+<DICT> {
+
+{END} {
+ switch (problem) {
+ case 1: return "";
+ default: return normalized.replaceAll(LB, "");
+ }
+ }
+}
+
+<SEARCH> {
+
+{END} {
+ switch (problem) {
+ case 1: return original;
+ default: return normalized.replaceAll(LB, "");
+ }
+ }
+}
+
+
+/*
+
+Assumptions:
+- the routine is called word by word, with a \n at the end of the string
+- line breaks within the word are marked as hyphen/soft hyphen plus space
+
+TO DO:
+
+AR: still missing
+
+*/
diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexAll.lex
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexAll.lex Wed Nov 09 15:32:05 2011 +0100
@@ -0,0 +1,143 @@
+/*
+ * Normalization rules for all languages
+ * [this is a JFlex specification]
+ *
+ * Wolfgang Schmidle
+ * 2011-01-25
+ *
+ */
+
+package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang;
+
+%%
+
+// scanner options: public class MpdlNormalizerLexAll, yylex() returns a String
+%public
+%class MpdlNormalizerLexAll
+%type java.lang.String
+%unicode
+// %debug
+
+// lexical states; presumably one per language code (la, zh) - TODO confirm against the caller
+%states LA, ZH
+
+%{
+ int cv = 0; // consonant = 1, vowel = 2, everything else = 0
+%}
+
+// character classes used by the u/v rules
+VOWEL=[AEIOUaeiouÆæęàèòùœ]
+CONS=[BCDFGHKLMNPQRSTVWXZbcdfghklmnpqrstvwxzſß]
+LR=[lLrR]
+// optional enclitic "que" in front of the end of the word
+QUE=(que)?
+END=\n
+
+%%
+
+ {
+
+// 1. simple replacements
+
+// 1.1 single characters
+ſ { cv = 1; return "s"; }
+ß { cv = 1; return "ss"; }
+[æę] { cv = 2; return "ae"; }
+Æ { cv = 2; return "AE"; }
+œ { cv = 2; return "oe"; }
+// 1.2 character combinations
+ij { cv = 2; return "ii"; }
+
+// 2. diacritics
+
+// 2.1 superfluous diacritics in single words
+^ hîc {END} { return "hic"; }
+
+// 2.2 superfluous diacritics at the end of a word
+// 2.2.1 common cases
+à / {QUE} {END} { return "a"; }
+àm / {QUE} {END} { return "am"; }
+às / {QUE} {END} { return "as"; } // (-àsque will likely never occur)
+// à / [ms]? {QUE} {END} { return "a"; }
+è / {QUE} {END} { return "e"; }
+ò / {QUE} {END} { return "o"; }
+òd / {QUE} {END} { return "od"; }
+ùm / {QUE} {END} { return "um"; }
+ùs / {QUE} {END} { return "us"; }
+
+// 2.3 superfluous diacritics within a word
+// 2.3.1 common cases
+aë { cv = 2; return "ae"; }
+oë { cv = 2; return "oe"; }
+// 2.3.2 rare cases
+oï { cv = 2; return "oi"; }
+uï { cv = 2; return "ui"; }
+// 2.3.3 extremely rare cases
+uü { cv = 2; return "uu"; }
+
+// 3. rules for u and v
+
+// 3.1 rules for u
+
+u/{VOWEL} {
+ switch(cv) {
+ case 2: return "v";
+ default: cv = 2; return "u";
+ }
+ }
+U/{VOWEL} {
+ switch(cv) {
+ case 2: return "V";
+ default: cv = 2; return "U";
+ }
+ }
+
+// 3.2 rules for v
+
+qv { cv = 1; return "qu"; } // the replaced v still counts as consonant
+Qv { cv = 1; return "Qu"; }
+QV { cv = 1; return "QU"; }
+
+{LR}v {
+ switch(cv) {
+ case 1: return yytext().replace("v", "u");
+ default: cv = 1; return yytext();
+ }
+ }
+{LR}V {
+ switch(cv) {
+ case 1: return yytext().replace("V", "U");
+ default: cv = 1; return yytext();
+ }
+ }
+
+v/{CONS} { cv = 1; return "u"; }
+V/{CONS} { cv = 1; return "U"; }
+
+
+// default
+
+{VOWEL} { cv = 2; return yytext(); }
+{CONS} { cv = 1; return yytext(); }
+\n { cv = 0; return ""; }
+. { cv = 0; return yytext(); }
+
+}
+
+ {
+
+// Codepoint < FFFF
+
+竒 { return "奇"; } // 7AD2 --> 5947
+旹 { return "時"; } // 65F9 --> 6642
+歴 { return "歷"; } // 6B74 --> 6B77
+精 { return "精"; } // FA1D --> 7CBE (FA1D is a compatibility ideograph)
+
+// Codepoint > FFFF
+
+庶 { return "庶"; } // 2F88D --> 5EB6 (2F88D is a compatibility ideograph)
+
+
+}
+
+
+// default (can be overridden by individual languages)
+
+\n { return ""; }
+. { return yytext(); }
diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexDE.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexDE.java Wed Nov 09 15:32:05 2011 +0100
@@ -0,0 +1,648 @@
+/* The following code was generated by JFlex 1.4.3 on 05.09.11 10:34 */
+
+/*
+ * Normalization rules for German text
+ * [this is a JFlex specification]
+ *
+ * Wolfgang Schmidle
+ * version 2011-08-10
+ *
+ */
+
+package de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang;
+
+
+/**
+ * This class is a scanner generated by
+ * JFlex 1.4.3
+ * on 05.09.11 10:34 from the specification file
+ * MpdlNormalizerLexDE.lex
+ */
+public class MpdlNormalizerLexDE {
+
+ /** This character denotes the end of file */
+ public static final int YYEOF = -1;
+
+ /** initial size of the lookahead buffer */
+ private static final int ZZ_BUFFERSIZE = 16384;
+
+ /** lexical states */
+ public static final int SEARCH = 10;
+ public static final int DICT_ASCII = 6;
+ public static final int SEARCH_ASCII = 12;
+ public static final int DICT = 4;
+ public static final int YYINITIAL = 0;
+ public static final int DISP = 2;
+ public static final int GRIMM = 8;
+
+ /**
+ * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l
+ * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l
+ * at the beginning of a line
+ * l is of the form l = 2*k, k a non negative integer
+ */
+ private static final int ZZ_LEXSTATE[] = {
+ 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6
+ };
+
+ /**
+ * Translates characters to character classes
+ */
+ private static final String ZZ_CMAP_PACKED =
+ "\12\0\1\3\25\0\1\2\14\0\1\1\2\0\1\1\17\0\1\20"+
+ "\32\4\6\0\1\11\2\4\1\5\12\4\1\13\5\4\1\7\5\4"+
+ "\1\1\1\0\1\1\106\0\1\14\21\0\1\15\5\0\1\16\2\0"+
+ "\1\17\4\0\1\14\21\0\1\15\5\0\1\16\202\0\1\6\u01e4\0"+
+ "\1\12\1\0\1\10\ufc99\0";
+
+ /**
+ * Translates characters to character classes
+ */
+ private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED);
+
+ /**
+ * Translates DFA states to action switch labels.
+ */
+ private static final int [] ZZ_ACTION = zzUnpackAction();
+
+ private static final String ZZ_ACTION_PACKED_0 =
+ "\7\0\2\1\1\2\1\3\1\4\3\1\1\5\1\3"+
+ "\3\1\1\6\1\7\1\10\1\11\1\12\1\13\1\14"+
+ "\1\15\1\16\1\17";
+
+ private static int [] zzUnpackAction() { // builds the table assigned to ZZ_ACTION
+ int [] result = new int[30]; // fixed-size table, filled by the decoder overload
+ int offset = 0;
+ offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result); // returned end index is not used afterwards
+ return result;
+ }
+
+ private static int zzUnpackAction(String packed, int offset, int [] result) { // run-length decoder over (count, value) char pairs
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.length();
+ while (i < l) {
+ int count = packed.charAt(i++); // how many times the value repeats
+ int value = packed.charAt(i++);
+ do result[j++] = value; while (--count > 0); // writes the value at least once
+ }
+ return j; // index just past the last element written
+ }
+
+
+ /**
+ * Translates a state to a row index in the transition table
+ */
+ private static final int [] ZZ_ROWMAP = zzUnpackRowMap();
+
+ private static final String ZZ_ROWMAP_PACKED_0 =
+ "\0\0\0\21\0\42\0\63\0\104\0\125\0\146\0\167"+
+ "\0\210\0\167\0\167\0\167\0\231\0\252\0\273\0\167"+
+ "\0\210\0\314\0\335\0\356\0\167\0\167\0\167\0\167"+
+ "\0\167\0\167\0\167\0\167\0\167\0\167";
+
+ private static int [] zzUnpackRowMap() { // builds the table assigned to ZZ_ROWMAP
+ int [] result = new int[30]; // fixed-size table, filled by the decoder overload
+ int offset = 0;
+ offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result); // returned end index is not used afterwards
+ return result;
+ }
+
+ private static int zzUnpackRowMap(String packed, int offset, int [] result) { // decodes one int from every two chars
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.length();
+ while (i < l) {
+ int high = packed.charAt(i++) << 16; // first char of a pair: upper 16 bits
+ result[j++] = high | packed.charAt(i++); // second char of a pair: lower 16 bits
+ }
+ return j; // index just past the last element written
+ }
+
+ /**
+ * The transition table of the DFA
+ */
+ private static final int [] ZZ_TRANS = zzUnpackTrans();
+
+ private static final String ZZ_TRANS_PACKED_0 =
+ "\1\10\1\11\1\10\1\0\1\10\1\11\1\12\1\11"+
+ "\1\10\1\11\6\10\1\13\1\10\1\11\1\10\1\14"+
+ "\1\10\1\11\1\12\1\15\1\10\1\16\1\10\1\17"+
+ "\4\10\1\13\1\10\1\11\1\10\1\20\1\10\1\11"+
+ "\1\12\1\15\1\10\1\16\1\10\1\17\4\10\2\13"+
+ "\1\21\1\13\1\20\1\10\1\11\1\12\1\22\1\13"+
+ "\1\23\1\13\1\24\1\25\1\26\1\27\1\30\1\13"+
+ "\1\10\1\11\1\10\1\20\1\10\1\11\1\12\1\15"+
+ "\1\10\1\16\1\10\1\17\3\10\1\31\1\13\1\10"+
+ "\1\11\1\10\1\32\1\10\1\11\1\12\1\15\1\10"+
+ "\1\16\1\10\1\17\4\10\2\13\1\21\1\13\1\32"+
+ "\1\10\1\11\1\12\1\22\1\13\1\23\1\13\1\24"+
+ "\1\25\1\26\1\27\1\30\1\13\23\0\1\10\20\0"+
+ "\1\10\5\0\1\33\1\0\1\34\10\0\1\10\7\0"+
+ "\1\35\20\0\1\36\10\0\1\10\5\0\1\33\1\0"+
+ "\1\27\10\0\1\10\7\0\1\25\20\0\1\26\6\0";
+
+ private static int [] zzUnpackTrans() { // builds the table assigned to ZZ_TRANS
+ int [] result = new int[255]; // fixed-size table, filled by the decoder overload
+ int offset = 0;
+ offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result); // returned end index is not used afterwards
+ return result;
+ }
+
+ private static int zzUnpackTrans(String packed, int offset, int [] result) { // run-length decoder over (count, value) char pairs
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.length();
+ while (i < l) {
+ int count = packed.charAt(i++); // how many times the value repeats
+ int value = packed.charAt(i++);
+ value--; // packed values are stored +1; a packed 0 becomes -1, which yylex() tests for to stop matching
+ do result[j++] = value; while (--count > 0); // writes the value at least once
+ }
+ return j; // index just past the last element written
+ }
+
+
+ /* error codes */
+ private static final int ZZ_UNKNOWN_ERROR = 0;
+ private static final int ZZ_NO_MATCH = 1;
+ private static final int ZZ_PUSHBACK_2BIG = 2;
+
+ /* error messages for the codes above */
+ private static final String ZZ_ERROR_MSG[] = {
+ "Unkown internal scanner error",
+ "Error: could not match input",
+ "Error: pushback value was too large"
+ };
+
+ /**
+ * ZZ_ATTRIBUTE[aState] contains the attributes of state aState
+ */
+ private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
+
+ private static final String ZZ_ATTRIBUTE_PACKED_0 =
+ "\7\0\1\11\1\1\3\11\3\1\1\11\4\1\12\11";
+
+ private static int [] zzUnpackAttribute() { // builds the table assigned to ZZ_ATTRIBUTE
+ int [] result = new int[30]; // fixed-size table, filled by the decoder overload
+ int offset = 0;
+ offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result); // returned end index is not used afterwards
+ return result;
+ }
+
+ private static int zzUnpackAttribute(String packed, int offset, int [] result) { // run-length decoder over (count, value) char pairs
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.length();
+ while (i < l) {
+ int count = packed.charAt(i++); // how many times the value repeats
+ int value = packed.charAt(i++);
+ do result[j++] = value; while (--count > 0); // writes the value at least once
+ }
+ return j; // index just past the last element written
+ }
+
+ /** the input device */
+ private java.io.Reader zzReader;
+
+ /** the current state of the DFA */
+ private int zzState;
+
+ /** the current lexical state */
+ private int zzLexicalState = YYINITIAL;
+
+ /** this buffer contains the current text to be matched and is
+ the source of the yytext() string */
+ private char zzBuffer[] = new char[ZZ_BUFFERSIZE];
+
+ /** the textposition at the last accepting state */
+ private int zzMarkedPos;
+
+ /** the current text position in the buffer */
+ private int zzCurrentPos;
+
+ /** startRead marks the beginning of the yytext() string in the buffer */
+ private int zzStartRead;
+
+ /** endRead marks the last character in the buffer, that has been read
+ from input */
+ private int zzEndRead;
+
+ /** number of newlines encountered up to the start of the matched text */
+ private int yyline;
+
+ /** the number of characters up to the start of the matched text */
+ private int yychar;
+
+ /**
+ * the number of characters from the last newline up to the start of the
+ * matched text
+ */
+ private int yycolumn;
+
+ /**
+ * zzAtBOL == true <=> the scanner is currently at the beginning of a line
+ */
+ private boolean zzAtBOL = true;
+
+ /** zzAtEOF == true <=> the scanner is at the EOF */
+ private boolean zzAtEOF;
+
+ /** denotes if the user-EOF-code has already been executed */
+ private boolean zzEOFDone;
+
+ /* user code: */
+ public static final int CELEX = DICT_ASCII;
+
+ private String original = "";
+ private String normalized = "";
+ private int problem = 0;
+
+ private void add (String norm) { // records one match: its raw text and the replacement chosen by the rule
+ original += yytext(); // raw matched text, appended unchanged
+ normalized += norm; // normalized form supplied by the calling action
+ }
+
+ private static final String LB = "[\u002d\u00ad] ";
+
+
+ /**
+ * Creates a new scanner that reads its input from the given Reader.
+ * There is also a java.io.InputStream version of this constructor.
+ *
+ * @param in the java.io.Reader to read input from.
+ */
+ public MpdlNormalizerLexDE(java.io.Reader in) {
+ this.zzReader = in; // the reader is only stored here; nothing is read yet
+ }
+
+ /**
+ * Creates a new scanner that reads its input from the given InputStream.
+ * There is also a java.io.Reader version of this constructor.
+ *
+ * @param in the java.io.InputStream to read input from.
+ */
+ public MpdlNormalizerLexDE(java.io.InputStream in) {
+ this(new java.io.InputStreamReader(in)); // NOTE: this InputStreamReader constructor decodes with the platform default charset
+ }
+
+ /**
+ * Unpacks the run-length compressed character translation table.
+ *
+ * @param packed the packed character translation table
+ * @return the unpacked character translation table
+ */
+ private static char [] zzUnpackCMap(String packed) {
+ char [] map = new char[0x10000]; // one entry for every possible char value
+ int i = 0; /* index in packed string */
+ int j = 0; /* index in unpacked array */
+ while (i < 88) { // 88 = number of chars in ZZ_CMAP_PACKED (44 count/value pairs)
+ int count = packed.charAt(i++);
+ char value = packed.charAt(i++);
+ do map[j++] = value; while (--count > 0); // writes the class at least once
+ }
+ return map;
+ }
+
+
+ /**
+ * Refills the input buffer.
+ *
+ * @return false, iff there was new input; true once the end of input is reached.
+ *
+ * @exception java.io.IOException if any I/O-Error occurs
+ */
+ private boolean zzRefill() throws java.io.IOException {
+
+ /* first: make room (if you can) */
+ if (zzStartRead > 0) {
+ System.arraycopy(zzBuffer, zzStartRead,
+ zzBuffer, 0,
+ zzEndRead-zzStartRead);
+
+ /* translate stored positions */
+ zzEndRead-= zzStartRead;
+ zzCurrentPos-= zzStartRead;
+ zzMarkedPos-= zzStartRead;
+ zzStartRead = 0;
+ }
+
+ /* is the buffer big enough? */
+ if (zzCurrentPos >= zzBuffer.length) {
+ /* if not: grow it to twice the current position */
+ char newBuffer[] = new char[zzCurrentPos*2];
+ System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length);
+ zzBuffer = newBuffer;
+ }
+
+ /* finally: fill the buffer with new input */
+ int numRead = zzReader.read(zzBuffer, zzEndRead,
+ zzBuffer.length-zzEndRead);
+
+ if (numRead > 0) {
+ zzEndRead+= numRead;
+ return false;
+ }
+ // unlikely but not impossible: read 0 characters, but not at end of stream
+ if (numRead == 0) {
+ int c = zzReader.read(); // single-char read to tell "nothing yet" apart from end of stream
+ if (c == -1) {
+ return true;
+ } else {
+ zzBuffer[zzEndRead++] = (char) c;
+ return false;
+ }
+ }
+
+ // numRead < 0
+ return true;
+ }
+
+
+ /**
+ * Marks the scanner as being at end of file and closes the input reader, if there is one.
+ */
+ public final void yyclose() throws java.io.IOException {
+ zzAtEOF = true; /* indicate end of file */
+ zzEndRead = zzStartRead; /* invalidate buffer */
+
+ if (zzReader != null)
+ zzReader.close();
+ }
+
+
+ /**
+ * Resets the scanner to read from a new input stream.
+ * Does not close the old reader.
+ *
+ * All scanner position variables are reset, the old input stream
+ * cannot be reused (buffered but unread input is discarded and lost).
+ * Lexical state is set to YYINITIAL.
+ *
+ * @param reader the new input stream
+ */
+ public final void yyreset(java.io.Reader reader) {
+ zzReader = reader;
+ zzAtBOL = true;
+ zzAtEOF = false;
+ zzEOFDone = false;
+ zzEndRead = zzStartRead = 0;
+ zzCurrentPos = zzMarkedPos = 0;
+ yyline = yychar = yycolumn = 0;
+ zzLexicalState = YYINITIAL; // NOTE: the user-code fields original, normalized and problem are not touched here
+ }
+
+
+ /**
+ * Returns the current lexical state: YYINITIAL at first, afterwards whatever yybegin() or yyreset() last set.
+ */
+ public final int yystate() {
+ return zzLexicalState;
+ }
+
+
+ /**
+ * Enters a new lexical state.
+ *
+ * @param newState the new lexical state
+ */
+ public final void yybegin(int newState) {
+ zzLexicalState = newState; // yylex() reads this at the start of each match; the value is not validated here
+ }
+
+
+ /**
+ * Returns the text matched by the current regular expression as a new String.
+ */
+ public final String yytext() {
+ return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead ); // copies the span from zzStartRead up to (not including) zzMarkedPos
+ }
+
+
+ /**
+ * Returns the character at position pos from the
+ * matched text.
+ *
+ * It is equivalent to yytext().charAt(pos), but faster
+ *
+ * @param pos the position of the character to fetch.
+ * A value from 0 to yylength()-1.
+ *
+ * @return the character at position pos
+ */
+ public final char yycharat(int pos) {
+ return zzBuffer[zzStartRead+pos]; // direct buffer access; pos is not checked against yylength()
+ }
+
+
+ /**
+ * Returns the length of the matched text region, in chars.
+ */
+ public final int yylength() {
+ return zzMarkedPos-zzStartRead; // distance from the start of the match to the marked position
+ }
+
+
+ /**
+ * Reports an error that occurred while scanning.
+ *
+ * In a well-formed scanner (no or only correct usage of
+ * yypushback(int) and a match-all fallback rule) this method
+ * will only be called with things that "Can't Possibly Happen".
+ * If this method is called, something is seriously wrong
+ * (e.g. a JFlex bug producing a faulty scanner etc.).
+ *
+ * Usual syntax/scanner level error handling should be done
+ * in error fallback rules.
+ *
+ * @param errorCode the code of the error message to display
+ */
+ private void zzScanError(int errorCode) {
+ String message;
+ try {
+ message = ZZ_ERROR_MSG[errorCode];
+ }
+ catch (ArrayIndexOutOfBoundsException e) {
+ message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR]; // unknown code: fall back to the generic message
+ }
+
+ throw new Error(message); // always throws java.lang.Error; never returns normally
+ }
+
+
+ /**
+ * Pushes the specified amount of characters back into the input stream.
+ *
+ * They will be read again by the next call of the scanning method.
+ *
+ * @param number the number of characters to be read again.
+ * This number must not be greater than yylength()!
+ */
+ public void yypushback(int number) {
+ if ( number > yylength() )
+ zzScanError(ZZ_PUSHBACK_2BIG); // throws, so the decrement below is not reached
+
+ zzMarkedPos -= number; // the next match in yylex() starts at zzMarkedPos
+ }
+
+
+ /**
+ * Resumes scanning until a rule whose action returns a value is matched,
+ * the end of input is encountered or an I/O-Error occurs.
+ *
+ * @return the string returned by a matched rule's action, or null at end of input
+ * @exception java.io.IOException if any I/O-Error occurs
+ */
+ public java.lang.String yylex() throws java.io.IOException {
+ int zzInput;
+ int zzAction;
+
+ // cached fields:
+ int zzCurrentPosL;
+ int zzMarkedPosL;
+ int zzEndReadL = zzEndRead;
+ char [] zzBufferL = zzBuffer;
+ char [] zzCMapL = ZZ_CMAP;
+
+ int [] zzTransL = ZZ_TRANS;
+ int [] zzRowMapL = ZZ_ROWMAP;
+ int [] zzAttrL = ZZ_ATTRIBUTE;
+
+ while (true) {
+ zzMarkedPosL = zzMarkedPos;
+
+ zzAction = -1;
+
+ zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL;
+
+ zzState = ZZ_LEXSTATE[zzLexicalState];
+
+
+ zzForAction: {
+ while (true) {
+
+ if (zzCurrentPosL < zzEndReadL)
+ zzInput = zzBufferL[zzCurrentPosL++];
+ else if (zzAtEOF) {
+ zzInput = YYEOF;
+ break zzForAction;
+ }
+ else {
+ // store back cached positions
+ zzCurrentPos = zzCurrentPosL;
+ zzMarkedPos = zzMarkedPosL;
+ boolean eof = zzRefill();
+ // get translated positions and possibly new buffer
+ zzCurrentPosL = zzCurrentPos;
+ zzMarkedPosL = zzMarkedPos;
+ zzBufferL = zzBuffer;
+ zzEndReadL = zzEndRead;
+ if (eof) {
+ zzInput = YYEOF;
+ break zzForAction;
+ }
+ else {
+ zzInput = zzBufferL[zzCurrentPosL++];
+ }
+ }
+ int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ]; // row of the current state plus column of the char's class
+ if (zzNext == -1) break zzForAction; // no transition: stop and use the last accepting state, if any
+ zzState = zzNext;
+
+ int zzAttributes = zzAttrL[zzState];
+ if ( (zzAttributes & 1) == 1 ) { // bit 1 set: remember this state as the latest match
+ zzAction = zzState;
+ zzMarkedPosL = zzCurrentPosL;
+ if ( (zzAttributes & 8) == 8 ) break zzForAction; // bit 8 set: stop scanning immediately
+ }
+
+ }
+ }
+
+ // store back cached position
+ zzMarkedPos = zzMarkedPosL;
+
+ switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) { // each action block falls through to the bare "case N: break;" after it
+ case 10:
+ { add("sz");
+ }
+ case 16: break;
+ case 3:
+ { problem = 1; add(yytext()); // flag a problem; the text itself is kept unchanged
+ }
+ case 17: break;
+ case 6:
+ { add("ae");
+ }
+ case 18: break;
+ case 2:
+ { add("s");
+ }
+ case 19: break;
+ case 4:
+ { switch (problem) { // problem flagged: accumulated original text, otherwise the normalized text
+ case 1: return original;
+ default: return normalized;
+ }
+ }
+ case 20: break;
+ case 13:
+ { add("ü");
+ }
+ case 21: break;
+ case 8:
+ { add("ue");
+ }
+ case 22: break;
+ case 11:
+ { switch (problem) { // problem flagged: original text, otherwise normalized text without LB matches, lower-cased
+ case 1: return original;
+ default: return normalized.replaceAll(LB, "").toLowerCase();
+ }
+ }
+ case 23: break;
+ case 12:
+ { add("u");
+ }
+ case 24: break;
+ case 14:
+ { add("ä");
+ }
+ case 25: break;
+ case 1:
+ { add(yytext());
+ }
+ case 26: break;
+ case 9:
+ { add("ss");
+ }
+ case 27: break;
+ case 7:
+ { add("oe");
+ }
+ case 28: break;
+ case 15:
+ { add("ö");
+ }
+ case 29: break;
+ case 5:
+ { switch (problem) { // problem flagged: empty string, otherwise normalized text without LB matches
+ case 1: return "";
+ default: return normalized.replaceAll(LB, "");
+ }
+ }
+ case 30: break;
+ default:
+ if (zzInput == YYEOF && zzStartRead == zzCurrentPos) { // nothing left to match: clean end of input
+ zzAtEOF = true;
+ return null;
+ }
+ else {
+ zzScanError(ZZ_NO_MATCH); // input that no rule matches
+ }
+ }
+ }
+ }
+
+
+}
diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexDE.lex
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexDE.lex Wed Nov 09 15:32:05 2011 +0100
@@ -0,0 +1,134 @@
+/*
+ * Normalization rules for German text
+ * [this is a JFlex specification]
+ *
+ * Wolfgang Schmidle
+ * version 2011-07-12
+ *
+ */
+
+package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang; // NOTE(review): the generated MpdlNormalizerLexDE.java declares package ...mpdl.lt.text.norm.lang -- confirm which one is current
+
+%%
+
+%public
+%class MpdlNormalizerLexDE
+%type java.lang.String
+%unicode
+
+// German: de, deu, ger
+
+%states DISP, DICT, SEARCH
+%state CELEX, GRIMM
+
+%{
+ private String original = "";
+ private String normalized = "";
+ private int problem = 0;
+
+ // records one match: its raw text and the replacement chosen by the rule
+ private void add (String norm) {
+ original += yytext();
+ normalized += norm;
+ }
+
+ // regex used by the END rules to strip in-word line breaks (hyphen or soft hyphen plus space)
+ private static final String LB = "[\u002d\u00ad] ";
+%}
+
+hyphen = [-\u{00ad}] // hyphen and soft hyphen
+LB = {hyphen} \u0020
+// lb = ({hyphen} \u0020)?
+
+END = \n
+
+Alphabet = [abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ]
+
+%%
+
+ſ { add("s"); }
+
+// Fraktur
+
 { // NOTE(review): the "<STATE,...>" list that should precede this brace is missing here -- restore it from the original spec
+
+uͦ {add("u"); }
+aͤ {add("ä"); }
+oͤ {add("ö"); }
+uͤ {add("ü"); }
+
+}
+
 { // NOTE(review): state list missing before this brace; judging by the comment below, presumably <CELEX> -- confirm
+
+// normalize ä ö ü ß only for Celex!
+
+ä | Ä | aͤ { add("ae"); }
+ö | Ö | oͤ { add("oe"); }
+ü | Ü | uͤ { add("ue"); }
+uͦ {add("u"); }
+ß { add("ss"); }
+
+{Alphabet} { add(yytext()); }
+
+. { problem = 1; add(yytext()); }
+
+}
+
 { // NOTE(review): state list missing before this brace -- restore it from the original spec
+
+ß { add("sz"); }
+
+}
+
+
+// default
+
+@ { problem = 1; add(yytext()); }
+{LB} { add(yytext()); }
+. { add(yytext()); }
+
+
 { // NOTE(review): state list missing before this brace -- restore it from the original spec
+
+{END} {
+ switch (problem) {
+ case 1: return original;
+ default: return normalized;
+ }
+ }
+}
+
 { // NOTE(review): state list missing before this brace -- restore it from the original spec
+
+{END} {
+ switch (problem) {
+ case 1: return "";
+ default: return normalized.replaceAll(LB, "");
+ }
+ }
+}
+
 { // NOTE(review): state list missing before this brace -- restore it from the original spec
+
+{END} {
+ switch (problem) {
+ case 1: return original;
+ default: return normalized.replaceAll(LB, "").toLowerCase();
+ }
+ }
+}
+
+
+
+/*
+
+Assumptions:
+- the routine is called word by word, with a \n at the end of the string
+- line breaks within a word are marked as hyphen/soft hyphen plus space
+
+TO DO:
+
+DE: separate German and Fraktur?
+DE: Celex: drop the hyphens?
+
+*/
diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexEL.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexEL.java Wed Nov 09 15:32:05 2011 +0100
@@ -0,0 +1,711 @@
+/* The following code was generated by JFlex 1.4.3 on 05.09.11 10:35 */
+
+/*
+ * Normalization rules for Greek text
+ * [this is a JFlex specification]
+ *
+ * Wolfgang Schmidle
+ * version 2011-08-03
+ *
+ */
+
+package de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang;
+
+
+/**
+ * This class is a scanner generated by
+ * JFlex 1.4.3
+ * on 05.09.11 10:35 from the specification file
+ * MpdlNormalizerLexEL.lex
+ */
+public class MpdlNormalizerLexEL {
+
+ /** This character denotes the end of file */
+ public static final int YYEOF = -1;
+
+ /** initial size of the lookahead buffer */
+ private static final int ZZ_BUFFERSIZE = 16384;
+
+ /** lexical states */
+ public static final int SEARCH = 6;
+ public static final int DICT = 4;
+ public static final int YYINITIAL = 0;
+ public static final int SIGMA = 8;
+ public static final int DISP = 2;
+
+ /**
+ * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l
+ * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l
+ * at the beginning of a line
+ * l is of the form l = 2*k, k a non negative integer
+ */
+ private static final int ZZ_LEXSTATE[] = {
+ 0, 0, 1, 1, 2, 2, 3, 3, 4, 4
+ };
+
+ /**
+ * Translates characters to character classes
+ */
+ private static final String ZZ_CMAP_PACKED =
+ "\12\0\1\3\25\0\1\2\14\0\1\1\2\0\1\1\17\0\1\5"+
+ "\32\5\6\0\1\6\2\5\1\6\20\5\1\6\5\5\1\1\1\0"+
+ "\1\1\u032e\0\1\7\1\10\1\11\1\12\15\0\1\4\3\0\1\4"+
+ "\1\30\11\0\1\13\1\14\1\15\u1ba1\0\1\16\1\0\1\20\1\0"+
+ "\1\21\1\0\1\23\1\0\1\24\1\0\1\25\1\0\1\26\65\0"+
+ "\1\17\17\0\1\22\57\0\1\27\ue00d\0";
+
+ /**
+ * Translates characters to character classes
+ */
+ private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED);
+
+ /**
+ * Translates DFA states to action switch labels.
+ */
+ private static final int [] ZZ_ACTION = zzUnpackAction();
+
+ private static final String ZZ_ACTION_PACKED_0 =
+ "\5\0\2\1\2\2\1\3\1\4\1\5\1\6\1\7"+
+ "\1\10\1\11\1\12\1\13\12\1\1\14\1\15\1\16"+
+ "\1\0\1\17\1\0\1\20\1\0\1\21\1\0\1\22"+
+ "\1\0\1\23\1\0\1\24\1\0\1\25\1\0\1\26"+
+ "\1\0\1\27\1\0";
+
+ private static int [] zzUnpackAction() { // builds the table assigned to ZZ_ACTION
+ int [] result = new int[50]; // fixed-size table, filled by the decoder overload
+ int offset = 0;
+ offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result); // returned end index is not used afterwards
+ return result;
+ }
+
+ private static int zzUnpackAction(String packed, int offset, int [] result) { // run-length decoder over (count, value) char pairs
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.length();
+ while (i < l) {
+ int count = packed.charAt(i++); // how many times the value repeats
+ int value = packed.charAt(i++);
+ do result[j++] = value; while (--count > 0); // writes the value at least once
+ }
+ return j; // index just past the last element written
+ }
+
+
+ /**
+ * Translates a state to a row index in the transition table
+ */
+ private static final int [] ZZ_ROWMAP = zzUnpackRowMap();
+
+ private static final String ZZ_ROWMAP_PACKED_0 =
+ "\0\0\0\31\0\62\0\113\0\144\0\175\0\226\0\175"+
+ "\0\226\0\175\0\175\0\175\0\175\0\175\0\175\0\175"+
+ "\0\175\0\175\0\257\0\310\0\341\0\372\0\u0113\0\u012c"+
+ "\0\u0145\0\u015e\0\u0177\0\u0190\0\175\0\175\0\175\0\u01a9"+
+ "\0\175\0\u01c2\0\175\0\u01db\0\175\0\u01f4\0\175\0\u020d"+
+ "\0\175\0\u0226\0\175\0\u023f\0\175\0\u0258\0\175\0\u0271"+
+ "\0\175\0\u028a";
+
+ private static int [] zzUnpackRowMap() { // builds the table assigned to ZZ_ROWMAP
+ int [] result = new int[50]; // fixed-size table, filled by the decoder overload
+ int offset = 0;
+ offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result); // returned end index is not used afterwards
+ return result;
+ }
+
+ private static int zzUnpackRowMap(String packed, int offset, int [] result) { // decodes one int from every two chars
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.length();
+ while (i < l) {
+ int high = packed.charAt(i++) << 16; // first char of a pair: upper 16 bits
+ result[j++] = high | packed.charAt(i++); // second char of a pair: lower 16 bits
+ }
+ return j; // index just past the last element written
+ }
+
+ /**
+ * The transition table of the DFA
+ */
+ private static final int [] ZZ_TRANS = zzUnpackTrans();
+
+ private static final String ZZ_TRANS_PACKED_0 =
+ "\1\6\1\7\1\6\1\0\1\6\1\10\1\11\1\12"+
+ "\1\13\1\14\1\15\1\16\1\17\1\20\14\6\1\7"+
+ "\1\6\1\21\1\6\1\10\1\11\1\12\1\13\1\14"+
+ "\1\15\1\16\1\17\1\20\14\6\1\7\1\6\1\22"+
+ "\1\6\1\10\1\11\1\12\1\13\1\14\1\15\1\16"+
+ "\1\17\1\20\1\23\1\24\1\25\1\26\1\27\1\30"+
+ "\1\31\1\32\1\33\1\34\2\6\1\7\1\6\1\35"+
+ "\1\6\1\10\1\11\1\12\1\13\1\14\1\15\1\16"+
+ "\1\17\1\20\1\23\1\24\1\25\1\26\1\27\1\30"+
+ "\1\31\1\32\1\33\1\34\2\6\1\7\1\6\1\22"+
+ "\1\6\1\10\1\11\1\12\1\13\1\14\1\15\1\16"+
+ "\1\17\1\20\1\23\1\24\1\25\1\26\1\27\1\30"+
+ "\1\31\1\32\1\33\1\34\1\36\33\0\1\6\31\0"+
+ "\1\37\1\40\23\0\1\40\3\0\1\41\1\42\23\0"+
+ "\1\42\3\0\1\43\1\44\23\0\1\44\3\0\1\45"+
+ "\1\46\23\0\1\46\3\0\1\47\1\50\23\0\1\50"+
+ "\3\0\1\51\1\52\23\0\1\52\3\0\1\53\1\54"+
+ "\23\0\1\54\3\0\1\55\1\56\23\0\1\56\3\0"+
+ "\1\57\1\60\23\0\1\60\3\0\1\61\1\62\23\0"+
+ "\1\62\3\0\1\37\30\0\1\41\30\0\1\43\30\0"+
+ "\1\45\30\0\1\47\30\0\1\51\30\0\1\53\30\0"+
+ "\1\55\30\0\1\57\30\0\1\61\25\0";
+
+ private static int [] zzUnpackTrans() { // builds the table assigned to ZZ_TRANS
+ int [] result = new int[675]; // fixed-size table, filled by the decoder overload
+ int offset = 0;
+ offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result); // returned end index is not used afterwards
+ return result;
+ }
+
+ private static int zzUnpackTrans(String packed, int offset, int [] result) { // run-length decoder over (count, value) char pairs
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.length();
+ while (i < l) {
+ int count = packed.charAt(i++); // how many times the value repeats
+ int value = packed.charAt(i++);
+ value--; // packed values are stored +1; a packed 0 becomes -1, which yylex() tests for to stop matching
+ do result[j++] = value; while (--count > 0); // writes the value at least once
+ }
+ return j; // index just past the last element written
+ }
+
+
+ /* error codes */
+ private static final int ZZ_UNKNOWN_ERROR = 0;
+ private static final int ZZ_NO_MATCH = 1;
+ private static final int ZZ_PUSHBACK_2BIG = 2;
+
+ /* error messages for the codes above */
+ private static final String ZZ_ERROR_MSG[] = {
+ "Unkown internal scanner error",
+ "Error: could not match input",
+ "Error: pushback value was too large"
+ };
+
+ /**
+ * ZZ_ATTRIBUTE[aState] contains the attributes of state aState
+ */
+ private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
+
+ private static final String ZZ_ATTRIBUTE_PACKED_0 =
+ "\5\0\1\11\1\1\1\11\1\1\11\11\12\1\3\11"+
+ "\1\0\1\11\1\0\1\11\1\0\1\11\1\0\1\11"+
+ "\1\0\1\11\1\0\1\11\1\0\1\11\1\0\1\11"+
+ "\1\0\1\11\1\0";
+
+ private static int [] zzUnpackAttribute() { // builds the table assigned to ZZ_ATTRIBUTE
+ int [] result = new int[50]; // fixed-size table, filled by the decoder overload
+ int offset = 0;
+ offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result); // returned end index is not used afterwards
+ return result;
+ }
+
+ private static int zzUnpackAttribute(String packed, int offset, int [] result) { // run-length decoder over (count, value) char pairs
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.length();
+ while (i < l) {
+ int count = packed.charAt(i++); // how many times the value repeats
+ int value = packed.charAt(i++);
+ do result[j++] = value; while (--count > 0); // writes the value at least once
+ }
+ return j; // index just past the last element written
+ }
+
+ /** the input device */
+ private java.io.Reader zzReader;
+
+ /** the current state of the DFA */
+ private int zzState;
+
+ /** the current lexical state */
+ private int zzLexicalState = YYINITIAL;
+
+ /** this buffer contains the current text to be matched and is
+ the source of the yytext() string */
+ private char zzBuffer[] = new char[ZZ_BUFFERSIZE];
+
+ /** the textposition at the last accepting state */
+ private int zzMarkedPos;
+
+ /** the current text position in the buffer */
+ private int zzCurrentPos;
+
+ /** startRead marks the beginning of the yytext() string in the buffer */
+ private int zzStartRead;
+
+ /** endRead marks the last character in the buffer, that has been read
+ from input */
+ private int zzEndRead;
+
+ /** number of newlines encountered up to the start of the matched text */
+ private int yyline;
+
+ /** the number of characters up to the start of the matched text */
+ private int yychar;
+
+ /**
+ * the number of characters from the last newline up to the start of the
+ * matched text
+ */
+ private int yycolumn;
+
+ /**
+ * zzAtBOL == true <=> the scanner is currently at the beginning of a line
+ */
+ private boolean zzAtBOL = true;
+
+ /** zzAtEOF == true <=> the scanner is at the EOF */
+ private boolean zzAtEOF;
+
+ /** denotes if the user-EOF-code has already been executed */
+ private boolean zzEOFDone;
+
+ /* user code: */
+ private String original = "";
+ private String normalized = "";
+ private int problem = 0;
+
+ private void add (String norm) { // records one match: its raw text and the replacement chosen by the rule
+ original += yytext(); // raw matched text, appended unchanged
+ normalized += norm; // normalized form supplied by the calling action
+ }
+
+ private static final String LB = "[\u002d\u00ad] ";
+
+
+ /**
+ * Creates a new scanner that reads its input from the given Reader.
+ * There is also a java.io.InputStream version of this constructor.
+ *
+ * @param in the java.io.Reader to read input from.
+ */
+ public MpdlNormalizerLexEL(java.io.Reader in) {
+ this.zzReader = in; // the reader is only stored here; nothing is read yet
+ }
+
+ /**
+ * Creates a new scanner that reads its input from the given InputStream.
+ * There is also a java.io.Reader version of this constructor.
+ *
+ * @param in the java.io.InputStream to read input from.
+ */
+ public MpdlNormalizerLexEL(java.io.InputStream in) {
+ this(new java.io.InputStreamReader(in)); // NOTE: this InputStreamReader constructor decodes with the platform default charset
+ }
+
+ /**
+ * Unpacks the run-length compressed character translation table.
+ *
+ * @param packed the packed character translation table
+ * @return the unpacked character translation table
+ */
+ private static char [] zzUnpackCMap(String packed) {
+ char [] map = new char[0x10000]; // one entry for every possible char value
+ int i = 0; /* index in packed string */
+ int j = 0; /* index in unpacked array */
+ while (i < 112) { // 112 = number of chars in ZZ_CMAP_PACKED (56 count/value pairs)
+ int count = packed.charAt(i++);
+ char value = packed.charAt(i++);
+ do map[j++] = value; while (--count > 0); // writes the class at least once
+ }
+ return map;
+ }
+
+
+ /**
+ * Refills the input buffer.
+ *
+ * @return false, iff there was new input; true once the end of input is reached.
+ *
+ * @exception java.io.IOException if any I/O-Error occurs
+ */
+ private boolean zzRefill() throws java.io.IOException {
+
+ /* first: make room (if you can) */
+ if (zzStartRead > 0) {
+ System.arraycopy(zzBuffer, zzStartRead,
+ zzBuffer, 0,
+ zzEndRead-zzStartRead);
+
+ /* translate stored positions */
+ zzEndRead-= zzStartRead;
+ zzCurrentPos-= zzStartRead;
+ zzMarkedPos-= zzStartRead;
+ zzStartRead = 0;
+ }
+
+ /* is the buffer big enough? */
+ if (zzCurrentPos >= zzBuffer.length) {
+ /* if not: grow it to twice the current position */
+ char newBuffer[] = new char[zzCurrentPos*2];
+ System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length);
+ zzBuffer = newBuffer;
+ }
+
+ /* finally: fill the buffer with new input */
+ int numRead = zzReader.read(zzBuffer, zzEndRead,
+ zzBuffer.length-zzEndRead);
+
+ if (numRead > 0) {
+ zzEndRead+= numRead;
+ return false;
+ }
+ // unlikely but not impossible: read 0 characters, but not at end of stream
+ if (numRead == 0) {
+ int c = zzReader.read(); // single-char read to tell "nothing yet" apart from end of stream
+ if (c == -1) {
+ return true;
+ } else {
+ zzBuffer[zzEndRead++] = (char) c;
+ return false;
+ }
+ }
+
+ // numRead < 0
+ return true;
+ }
+
+
+ /**
+ * Marks the scanner as being at end of file and closes the input reader, if there is one.
+ */
+ public final void yyclose() throws java.io.IOException {
+ zzAtEOF = true; /* indicate end of file */
+ zzEndRead = zzStartRead; /* invalidate buffer */
+
+ if (zzReader != null)
+ zzReader.close();
+ }
+
+
+ /**
+ * Resets the scanner to read from a new input stream.
+ * Does not close the old reader.
+ *
+ * All scanner position variables are reset, the old input stream
+ * cannot be reused (buffered but unread input is discarded and lost).
+ * Lexical state is set to YYINITIAL.
+ *
+ * @param reader the new input stream
+ */
+ public final void yyreset(java.io.Reader reader) {
+ zzReader = reader;
+ zzAtBOL = true;
+ zzAtEOF = false;
+ zzEOFDone = false;
+ zzEndRead = zzStartRead = 0;
+ zzCurrentPos = zzMarkedPos = 0;
+ yyline = yychar = yycolumn = 0;
+ zzLexicalState = YYINITIAL; // NOTE: the user-code fields original, normalized and problem are not touched here
+ }
+
+
+ /**
+ * Returns the current lexical state: YYINITIAL at first, afterwards whatever yybegin() or yyreset() last set.
+ */
+ public final int yystate() {
+ return zzLexicalState;
+ }
+
+
+ /**
+ * Enters a new lexical state.
+ *
+ * @param newState the new lexical state
+ */
+ public final void yybegin(int newState) {
+ zzLexicalState = newState; // stored as given; the value is not validated here
+ }
+
+
+ /**
+ * Returns the text matched by the current regular expression as a new String.
+ */
+ public final String yytext() {
+ return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead ); // copies the span from zzStartRead up to (not including) zzMarkedPos
+ }
+
+
+ /**
+ * Returns the character at position pos from the
+ * matched text.
+ *
+ * It is equivalent to yytext().charAt(pos), but faster
+ *
+ * @param pos the position of the character to fetch.
+ * A value from 0 to yylength()-1.
+ *
+ * @return the character at position pos
+ */
+ public final char yycharat(int pos) {
+ return zzBuffer[zzStartRead+pos]; // direct buffer access; pos is not checked against yylength()
+ }
+
+
+ /**
+ * Returns the length of the matched text region, in chars.
+ */
+ public final int yylength() {
+ return zzMarkedPos-zzStartRead; // distance from the start of the match to the marked position
+ }
+
+
+ /**
+ * Reports an error that occurred while scanning.
+ *
+ * In a well-formed scanner (no or only correct usage of
+ * yypushback(int) and a match-all fallback rule) this method
+ * will only be called with things that "Can't Possibly Happen".
+ * If this method is called, something is seriously wrong
+ * (e.g. a JFlex bug producing a faulty scanner etc.).
+ *
+ * Usual syntax/scanner level error handling should be done
+ * in error fallback rules.
+ *
+ * @param errorCode the code of the error message to display
+ */
+ private void zzScanError(int errorCode) {
+ String message;
+ try {
+ message = ZZ_ERROR_MSG[errorCode];
+ }
+ catch (ArrayIndexOutOfBoundsException e) {
+ message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR]; // unknown code: fall back to the generic message
+ }
+
+ throw new Error(message); // always throws java.lang.Error; never returns normally
+ }
+
+
+ /**
+ * Pushes the specified amount of characters back into the input stream.
+ *
+ * They will be read again by the next call of the scanning method.
+ *
+ * @param number the number of characters to be read again.
+ * This number must not be greater than yylength()!
+ */
+ public void yypushback(int number) {
+ if ( number > yylength() )
+ zzScanError(ZZ_PUSHBACK_2BIG); // throws, so the decrement below is not reached
+
+ zzMarkedPos -= number; // moves the end of the current match back by number chars
+ }
+
+
+ /**
+ * Resumes scanning until the next regular expression is matched,
+ * the end of input is encountered or an I/O-Error occurs.
+ *
+ * @return the next token
+ * @exception java.io.IOException if any I/O-Error occurs
+ */
+ public java.lang.String yylex() throws java.io.IOException {
+ int zzInput;
+ int zzAction;
+
+ // cached fields:
+ int zzCurrentPosL;
+ int zzMarkedPosL;
+ int zzEndReadL = zzEndRead;
+ char [] zzBufferL = zzBuffer;
+ char [] zzCMapL = ZZ_CMAP;
+
+ int [] zzTransL = ZZ_TRANS;
+ int [] zzRowMapL = ZZ_ROWMAP;
+ int [] zzAttrL = ZZ_ATTRIBUTE;
+
+ while (true) {
+ zzMarkedPosL = zzMarkedPos;
+
+ zzAction = -1;
+
+ zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL;
+
+ zzState = ZZ_LEXSTATE[zzLexicalState];
+
+
+ zzForAction: {
+ while (true) {
+
+ if (zzCurrentPosL < zzEndReadL)
+ zzInput = zzBufferL[zzCurrentPosL++];
+ else if (zzAtEOF) {
+ zzInput = YYEOF;
+ break zzForAction;
+ }
+ else {
+ // store back cached positions
+ zzCurrentPos = zzCurrentPosL;
+ zzMarkedPos = zzMarkedPosL;
+ boolean eof = zzRefill();
+ // get translated positions and possibly new buffer
+ zzCurrentPosL = zzCurrentPos;
+ zzMarkedPosL = zzMarkedPos;
+ zzBufferL = zzBuffer;
+ zzEndReadL = zzEndRead;
+ if (eof) {
+ zzInput = YYEOF;
+ break zzForAction;
+ }
+ else {
+ zzInput = zzBufferL[zzCurrentPosL++];
+ }
+ }
+ int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ];
+ if (zzNext == -1) break zzForAction;
+ zzState = zzNext;
+
+ int zzAttributes = zzAttrL[zzState];
+ if ( (zzAttributes & 1) == 1 ) {
+ zzAction = zzState;
+ zzMarkedPosL = zzCurrentPosL;
+ if ( (zzAttributes & 8) == 8 ) break zzForAction;
+ }
+
+ }
+ }
+
+ // store back cached position
+ zzMarkedPos = zzMarkedPosL;
+
+ switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
+ case 23:
+ // lookahead expression with fixed base length
+ zzMarkedPos = zzStartRead + 1;
+ { add("ῴ");
+ }
+ case 24: break;
+ case 5:
+ { add("ή");
+ }
+ case 25: break;
+ case 17:
+ // lookahead expression with fixed base length
+ zzMarkedPos = zzStartRead + 1;
+ { add("ή");
+ }
+ case 26: break;
+ case 13:
+ { add("σ");
+ }
+ case 27: break;
+ case 6:
+ { add("ί");
+ }
+ case 28: break;
+ case 1:
+ { add(yytext());
+ }
+ case 29: break;
+ case 22:
+ // lookahead expression with fixed base length
+ zzMarkedPos = zzStartRead + 1;
+ { add("ώ");
+ }
+ case 30: break;
+ case 11:
+ { switch (problem) {
+ case 1: return "";
+ default: return normalized.replaceAll(LB, "");
+ }
+ }
+ case 31: break;
+ case 19:
+ // lookahead expression with fixed base length
+ zzMarkedPos = zzStartRead + 1;
+ { add("ί");
+ }
+ case 32: break;
+ case 15:
+ // lookahead expression with fixed base length
+ zzMarkedPos = zzStartRead + 1;
+ { add("ᾴ");
+ }
+ case 33: break;
+ case 7:
+ { add("ό");
+ }
+ case 34: break;
+ case 14:
+ // lookahead expression with fixed base length
+ zzMarkedPos = zzStartRead + 1;
+ { add("ά");
+ }
+ case 35: break;
+ case 12:
+ { switch (problem) {
+ case 1: return original;
+ default: return normalized.replaceAll(LB, "").toLowerCase();
+ }
+ }
+ case 36: break;
+ case 8:
+ { add("ύ");
+ }
+ case 37: break;
+ case 2:
+ { problem = 1; add(yytext());
+ }
+ case 38: break;
+ case 20:
+ // lookahead expression with fixed base length
+ zzMarkedPos = zzStartRead + 1;
+ { add("ό");
+ }
+ case 39: break;
+ case 3:
+ { add("ά");
+ }
+ case 40: break;
+ case 10:
+ { switch (problem) {
+ case 1: return original;
+ default: return normalized;
+ }
+ }
+ case 41: break;
+ case 9:
+ { add("ώ");
+ }
+ case 42: break;
+ case 16:
+ // lookahead expression with fixed base length
+ zzMarkedPos = zzStartRead + 1;
+ { add("έ");
+ }
+ case 43: break;
+ case 18:
+ // lookahead expression with fixed base length
+ zzMarkedPos = zzStartRead + 1;
+ { add("ῄ");
+ }
+ case 44: break;
+ case 4:
+ { add("έ");
+ }
+ case 45: break;
+ case 21:
+ // lookahead expression with fixed base length
+ zzMarkedPos = zzStartRead + 1;
+ { add("ύ");
+ }
+ case 46: break;
+ default:
+ if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
+ zzAtEOF = true;
+ return null;
+ }
+ else {
+ zzScanError(ZZ_NO_MATCH);
+ }
+ }
+ }
+ }
+
+
+}
diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexEL.lex
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexEL.lex Wed Nov 09 15:32:05 2011 +0100
@@ -0,0 +1,139 @@
+/*
+ * Normalization rules for Greek text
+ * [this is a JFlex specification]
+ *
+ * Wolfgang Schmidle
+ * version 2011-08-03
+ *
+ */
+
+package de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang; // must match the directory text/norm/lang this spec and its generated class live in
+
+%%
+
+%public
+%class MpdlNormalizerLexEL
+%type java.lang.String
+%unicode
+
+// Greek: el, grc
+
+%states DISP, DICT, SEARCH
+%state SIGMA
+
+%{
+ private String original = "";
+ private String normalized = "";
+ private int problem = 0;
+
+ private void add (String norm) {
+ original += yytext();
+ normalized += norm;
+ }
+
+ private static final String LB = "[\u002d\u00ad] ";
+%}
+
+hyphen = [-\u00ad] // hyphen and soft hyphen (4-digit escape; JFlex 1.4.3 reads the braced form as literal set members)
+LB = {hyphen} \u0020
+// lb = ({hyphen} \u0020)?
+
+END = \n
+
+wordend = [νρς]? {END}
+
+Latin = [abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ]
+
+
+%%
+
+
+// always replace tonos by oxia
+// (although this should really be corrected in the text rather than normalized)
+ά { add("ά"); }
+έ { add("έ"); }
+ή { add("ή"); }
+ί { add("ί"); }
+ό { add("ό"); }
+ύ { add("ύ"); }
+ώ { add("ώ"); }
+
+
+ {
+
+ὰ / {wordend} { add("ά"); }
+ᾲ / {wordend} { add("ᾴ"); }
+ὲ / {wordend} { add("έ"); }
+ὴ / {wordend} { add("ή"); }
+ῂ / {wordend} { add("ῄ"); }
+ὶ / {wordend} { add("ί"); }
+ὸ / {wordend} { add("ό"); }
+ὺ / {wordend} { add("ύ"); }
+ὼ / {wordend} { add("ώ"); }
+ῲ / {wordend} { add("ῴ"); }
+
+// other candidates: Ὰ Ὲ Ὴ Ὶ Ὺ Ὸ Ὼ
+
+}
+
+ {
+
+ς { add("σ"); }
+
+}
+
+// default
+
+@ { problem = 1; add(yytext()); }
+{Latin} { problem = 1; add(yytext()); }
+
+{LB} { add(yytext()); }
+. { add(yytext()); }
+
+
+<DISP> {
+
+{END} {
+ switch (problem) {
+ case 1: return original; // a problem was flagged: hand back the unmodified input
+ default: return normalized; // otherwise the normalized text, line-break markers kept
+ }
+ }
+}
+
+<DICT> {
+
+{END} {
+ switch (problem) {
+ case 1: return ""; // a problem was flagged: return the empty string
+ default: return normalized.replaceAll(LB, ""); // strip hyphen/soft hyphen + space line breaks
+ }
+ }
+}
+
+<SEARCH> {
+
+{END} {
+ switch (problem) {
+ case 1: return original; // a problem was flagged: hand back the unmodified input
+ default: return normalized.replaceAll(LB, "").toLowerCase(java.util.Locale.ROOT); // locale-independent lower-casing
+ }
+ }
+}
+
+
+/*
+
+Annahmen:
+- die Routine wird wortweise aufgerufen, mit einem \n am Ende des Strings
+- Zeilenumbrüche innerhalb des Wortes sind als hyphen/soft hyphen plus space markiert
+
+TO DO:
+
+EL: tonos --> oxia wieder rausnehmen, weil es im Text geändert werden muss?
+EL: gibt es noch weitere Fälle, wo legitimerweise ein Gravis vorkommen kann?
+EL: kommen Großbuchstaben mit Gravis bei uns jemals vor, und sollen sie normalisiert werden?
+EL: neuer State BETACODE ?
+EL: nicht falsche Zeichen definieren, sondern erlaubte Zeichen
+
+*/
diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexEN.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexEN.java Wed Nov 09 15:32:05 2011 +0100
@@ -0,0 +1,589 @@
+/* The following code was generated by JFlex 1.4.3 on 21.07.11 11:22 */
+
+/*
+ * Normalization rules for English text
+ * [this is a JFlex specification]
+ *
+ * Wolfgang Schmidle
+ * version 2011-07-12
+ *
+ */
+
+package de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang;
+
+
+/**
+ * This class is a scanner generated by
+ * JFlex 1.4.3
+ * on 21.07.11 11:22 from the specification file
+ * MpdlNormalizerLexEN.lex
+ */
+public class MpdlNormalizerLexEN {
+
+ /** This character denotes the end of file */
+ public static final int YYEOF = -1;
+
+ /** initial size of the lookahead buffer */
+ private static final int ZZ_BUFFERSIZE = 16384;
+
+ /** lexical states */
+ public static final int SEARCH = 6;
+ public static final int DICT = 4;
+ public static final int YYINITIAL = 0;
+ public static final int DISP = 2;
+
+ /**
+ * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l
+ * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l
+ * at the beginning of a line
+ * l is of the form l = 2*k, k a non negative integer
+ */
+ private static final int ZZ_LEXSTATE[] = {
+ 0, 0, 1, 1, 2, 2, 3, 3
+ };
+
+ /**
+ * Translates characters to character classes
+ */
+ private static final String ZZ_CMAP_PACKED =
+ "\12\0\1\3\25\0\1\2\14\0\1\1\2\0\1\1\17\0\1\5"+
+ "\40\0\1\1\2\0\1\1\20\0\1\1\5\0\1\1\1\0\1\1"+
+ "\u0101\0\1\4\ufe80\0";
+
+ /**
+ * Translates characters to character classes
+ */
+ private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED);
+
+ /**
+ * Translates DFA states to action switch labels.
+ */
+ private static final int [] ZZ_ACTION = zzUnpackAction();
+
+ private static final String ZZ_ACTION_PACKED_0 =
+ "\4\0\2\1\1\2\1\3\1\4\1\5\1\6";
+
+ private static int [] zzUnpackAction() {
+ int [] result = new int[11];
+ int offset = 0;
+ offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
+ return result;
+ }
+
+ private static int zzUnpackAction(String packed, int offset, int [] result) {
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.length();
+ while (i < l) {
+ int count = packed.charAt(i++);
+ int value = packed.charAt(i++);
+ do result[j++] = value; while (--count > 0);
+ }
+ return j;
+ }
+
+
+ /**
+ * Translates a state to a row index in the transition table
+ */
+ private static final int [] ZZ_ROWMAP = zzUnpackRowMap();
+
+ private static final String ZZ_ROWMAP_PACKED_0 =
+ "\0\0\0\6\0\14\0\22\0\30\0\36\0\30\0\30"+
+ "\0\30\0\30\0\30";
+
+ private static int [] zzUnpackRowMap() {
+ int [] result = new int[11];
+ int offset = 0;
+ offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
+ return result;
+ }
+
+ private static int zzUnpackRowMap(String packed, int offset, int [] result) {
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.length();
+ while (i < l) {
+ int high = packed.charAt(i++) << 16;
+ result[j++] = high | packed.charAt(i++);
+ }
+ return j;
+ }
+
+ /**
+ * The transition table of the DFA
+ */
+ private static final int [] ZZ_TRANS = zzUnpackTrans();
+
+ private static final String ZZ_TRANS_PACKED_0 =
+ "\1\5\1\6\1\5\1\0\1\5\1\7\1\5\1\6"+
+ "\1\5\1\10\1\11\1\7\1\5\1\6\1\5\1\12"+
+ "\1\11\1\7\1\5\1\6\1\5\1\13\1\11\1\7"+
+ "\10\0\1\5\3\0";
+
+ private static int [] zzUnpackTrans() {
+ int [] result = new int[36];
+ int offset = 0;
+ offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
+ return result;
+ }
+
+ private static int zzUnpackTrans(String packed, int offset, int [] result) {
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.length();
+ while (i < l) {
+ int count = packed.charAt(i++);
+ int value = packed.charAt(i++);
+ value--;
+ do result[j++] = value; while (--count > 0);
+ }
+ return j;
+ }
+
+
+ /* error codes */
+ private static final int ZZ_UNKNOWN_ERROR = 0;
+ private static final int ZZ_NO_MATCH = 1;
+ private static final int ZZ_PUSHBACK_2BIG = 2;
+
+ /* error messages for the codes above */
+ private static final String ZZ_ERROR_MSG[] = {
+ "Unkown internal scanner error",
+ "Error: could not match input",
+ "Error: pushback value was too large"
+ };
+
+ /**
+ * ZZ_ATTRIBUTE[aState] contains the attributes of state aState
+ */
+ private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
+
+ private static final String ZZ_ATTRIBUTE_PACKED_0 =
+ "\4\0\1\11\1\1\5\11";
+
+ private static int [] zzUnpackAttribute() {
+ int [] result = new int[11];
+ int offset = 0;
+ offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
+ return result;
+ }
+
+ private static int zzUnpackAttribute(String packed, int offset, int [] result) {
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.length();
+ while (i < l) {
+ int count = packed.charAt(i++);
+ int value = packed.charAt(i++);
+ do result[j++] = value; while (--count > 0);
+ }
+ return j;
+ }
+
+ /** the input device */
+ private java.io.Reader zzReader;
+
+ /** the current state of the DFA */
+ private int zzState;
+
+ /** the current lexical state */
+ private int zzLexicalState = YYINITIAL;
+
+ /** this buffer contains the current text to be matched and is
+ the source of the yytext() string */
+ private char zzBuffer[] = new char[ZZ_BUFFERSIZE];
+
+ /** the textposition at the last accepting state */
+ private int zzMarkedPos;
+
+ /** the current text position in the buffer */
+ private int zzCurrentPos;
+
+ /** startRead marks the beginning of the yytext() string in the buffer */
+ private int zzStartRead;
+
+ /** endRead marks the last character in the buffer, that has been read
+ from input */
+ private int zzEndRead;
+
+ /** number of newlines encountered up to the start of the matched text */
+ private int yyline;
+
+ /** the number of characters up to the start of the matched text */
+ private int yychar;
+
+ /**
+ * the number of characters from the last newline up to the start of the
+ * matched text
+ */
+ private int yycolumn;
+
+ /**
+ * zzAtBOL == true <=> the scanner is currently at the beginning of a line
+ */
+ private boolean zzAtBOL = true;
+
+ /** zzAtEOF == true <=> the scanner is at the EOF */
+ private boolean zzAtEOF;
+
+ /** denotes if the user-EOF-code has already been executed */
+ private boolean zzEOFDone;
+
+ /* user code: */
+ private String original = "";
+ private String normalized = "";
+ private int problem = 0;
+
+ private void add (String norm) {
+ original += yytext();
+ normalized += norm;
+ }
+
+ private static final String LB = "[\u002d\u00ad] ";
+
+
+ /**
+ * Creates a new scanner
+ * There is also a java.io.InputStream version of this constructor.
+ *
+ * @param in the java.io.Reader to read input from.
+ */
+ public MpdlNormalizerLexEN(java.io.Reader in) {
+ this.zzReader = in;
+ }
+
+ /**
+ * Creates a new scanner that decodes the stream with the platform default charset.
+ * There is also a java.io.Reader version of this constructor; use it to choose the encoding.
+ *
+ * @param in the java.io.InputStream to read input from.
+ */
+ public MpdlNormalizerLexEN(java.io.InputStream in) {
+ this(new java.io.InputStreamReader(in));
+ }
+
+ /**
+ * Unpacks the compressed character translation table.
+ * The packed form is run-length encoded as (count, value) char pairs.
+ * @param packed the packed character translation table
+ * @return the unpacked character translation table (0x10000 entries, indexed by char)
+ */
+ private static char [] zzUnpackCMap(String packed) {
+ char [] map = new char[0x10000];
+ int i = 0; /* index in packed string */
+ int j = 0; /* index in unpacked array */
+ while (i < 46) { // 46 = number of chars in ZZ_CMAP_PACKED (23 count/value pairs)
+ int count = packed.charAt(i++);
+ char value = packed.charAt(i++);
+ do map[j++] = value; while (--count > 0);
+ }
+ return map;
+ }
+
+
+ /**
+ * Refills the input buffer.
+ *
+ * @return {@code false} iff there was new input; {@code true} when the reader is exhausted.
+ *
+ * @exception java.io.IOException if any I/O-Error occurs
+ */
+ private boolean zzRefill() throws java.io.IOException {
+
+ /* first: make room (if you can) */
+ if (zzStartRead > 0) {
+ System.arraycopy(zzBuffer, zzStartRead,
+ zzBuffer, 0,
+ zzEndRead-zzStartRead);
+
+ /* translate stored positions */
+ zzEndRead-= zzStartRead;
+ zzCurrentPos-= zzStartRead;
+ zzMarkedPos-= zzStartRead;
+ zzStartRead = 0;
+ }
+
+ /* is the buffer big enough? */
+ if (zzCurrentPos >= zzBuffer.length) {
+ /* if not: double its size, keeping the current contents */
+ char newBuffer[] = new char[zzCurrentPos*2];
+ System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length);
+ zzBuffer = newBuffer;
+ }
+
+ /* finally: fill the buffer with new input */
+ int numRead = zzReader.read(zzBuffer, zzEndRead,
+ zzBuffer.length-zzEndRead);
+
+ if (numRead > 0) {
+ zzEndRead+= numRead;
+ return false;
+ }
+ // unlikely but not impossible: read 0 characters, but not at end of stream
+ if (numRead == 0) {
+ int c = zzReader.read(); // single-char read to tell "nothing yet" from end of stream
+ if (c == -1) {
+ return true;
+ } else {
+ zzBuffer[zzEndRead++] = (char) c;
+ return false;
+ }
+ }
+
+ // numRead < 0
+ return true;
+ }
+
+
+ /**
+ * Closes the input stream.
+ */
+ public final void yyclose() throws java.io.IOException {
+ zzAtEOF = true; /* indicate end of file */
+ zzEndRead = zzStartRead; /* invalidate buffer */
+
+ if (zzReader != null)
+ zzReader.close();
+ }
+
+
+ /**
+ * Resets the scanner to read from a new input stream.
+ * Does not close the old reader.
+ *
+ * All zz/yy scanner variables are reset (input still buffered from the old stream is lost),
+ * but the user-code fields original, normalized and problem are NOT cleared here.
+ * Lexical state is set to YYINITIAL.
+ *
+ * @param reader the new input stream
+ */
+ public final void yyreset(java.io.Reader reader) {
+ zzReader = reader;
+ zzAtBOL = true;
+ zzAtEOF = false;
+ zzEOFDone = false;
+ zzEndRead = zzStartRead = 0;
+ zzCurrentPos = zzMarkedPos = 0;
+ yyline = yychar = yycolumn = 0;
+ zzLexicalState = YYINITIAL;
+ }
+
+
+ /**
+ * Returns the current lexical state.
+ */
+ public final int yystate() {
+ return zzLexicalState;
+ }
+
+
+ /**
+ * Enters a new lexical state
+ *
+ * @param newState the new lexical state
+ */
+ public final void yybegin(int newState) {
+ zzLexicalState = newState;
+ }
+
+
+ /**
+ * Returns the text matched by the current regular expression.
+ */
+ public final String yytext() {
+ return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead );
+ }
+
+
+ /**
+ * Returns the character at position pos from the
+ * matched text.
+ *
+ * It is equivalent to yytext().charAt(pos), but faster
+ *
+ * @param pos the position of the character to fetch.
+ * A value from 0 to yylength()-1.
+ *
+ * @return the character at position pos
+ */
+ public final char yycharat(int pos) {
+ return zzBuffer[zzStartRead+pos];
+ }
+
+
+ /**
+ * Returns the length of the matched text region.
+ */
+ public final int yylength() {
+ return zzMarkedPos-zzStartRead;
+ }
+
+
+ /**
+ * Reports an error that occurred while scanning.
+ *
+ * In a wellformed scanner (no or only correct usage of
+ * yypushback(int) and a match-all fallback rule) this method
+ * will only be called with things that "Can't Possibly Happen".
+ * If this method is called, something is seriously wrong
+ * (e.g. a JFlex bug producing a faulty scanner etc.).
+ *
+ * Usual syntax/scanner level error handling should be done
+ * in error fallback rules.
+ * This method never returns normally: it always throws java.lang.Error.
+ * @param errorCode the code of the error message to display (index into ZZ_ERROR_MSG)
+ */
+ private void zzScanError(int errorCode) {
+ String message;
+ try {
+ message = ZZ_ERROR_MSG[errorCode];
+ }
+ catch (ArrayIndexOutOfBoundsException e) { // unknown code: fall back to the generic message
+ message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR];
+ }
+
+ throw new Error(message);
+ }
+
+
+ /**
+ * Pushes the specified amount of characters back into the input stream.
+ *
+ * They will be read again by the next call of the scanning method.
+ * Implemented by moving zzMarkedPos back; negative values are not rejected.
+ * @param number the number of characters to be read again.
+ * This number must not be greater than yylength()!
+ */
+ public void yypushback(int number) {
+ if ( number > yylength() )
+ zzScanError(ZZ_PUSHBACK_2BIG); // throws, so the decrement below is skipped
+
+ zzMarkedPos -= number;
+ }
+
+
+ /**
+ * Resumes scanning until an action returns a value,
+ * the end of input is encountered or an I/O-Error occurs.
+ *
+ * @return the String returned by the first returning action, or null at end of input
+ * @exception java.io.IOException if any I/O-Error occurs
+ */
+ public java.lang.String yylex() throws java.io.IOException {
+ int zzInput;
+ int zzAction;
+
+ // cached fields:
+ int zzCurrentPosL;
+ int zzMarkedPosL;
+ int zzEndReadL = zzEndRead;
+ char [] zzBufferL = zzBuffer;
+ char [] zzCMapL = ZZ_CMAP;
+
+ int [] zzTransL = ZZ_TRANS;
+ int [] zzRowMapL = ZZ_ROWMAP;
+ int [] zzAttrL = ZZ_ATTRIBUTE;
+
+ while (true) {
+ zzMarkedPosL = zzMarkedPos;
+
+ zzAction = -1;
+
+ zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL;
+
+ zzState = ZZ_LEXSTATE[zzLexicalState];
+
+
+ zzForAction: {
+ while (true) {
+
+ if (zzCurrentPosL < zzEndReadL)
+ zzInput = zzBufferL[zzCurrentPosL++];
+ else if (zzAtEOF) {
+ zzInput = YYEOF;
+ break zzForAction;
+ }
+ else {
+ // store back cached positions
+ zzCurrentPos = zzCurrentPosL;
+ zzMarkedPos = zzMarkedPosL;
+ boolean eof = zzRefill();
+ // get translated positions and possibly new buffer
+ zzCurrentPosL = zzCurrentPos;
+ zzMarkedPosL = zzMarkedPos;
+ zzBufferL = zzBuffer;
+ zzEndReadL = zzEndRead;
+ if (eof) {
+ zzInput = YYEOF;
+ break zzForAction;
+ }
+ else {
+ zzInput = zzBufferL[zzCurrentPosL++];
+ }
+ }
+ int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ];
+ if (zzNext == -1) break zzForAction; // no transition: stop with the last recorded action
+ zzState = zzNext;
+
+ int zzAttributes = zzAttrL[zzState];
+ if ( (zzAttributes & 1) == 1 ) { // state carries an action: remember it and the match end
+ zzAction = zzState;
+ zzMarkedPosL = zzCurrentPosL;
+ if ( (zzAttributes & 8) == 8 ) break zzForAction;
+ }
+
+ }
+ }
+
+ // store back cached position
+ zzMarkedPos = zzMarkedPosL;
+
+ switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
+ case 5:
+ { switch (problem) {
+ case 1: return ""; // a problem was flagged: return the empty string
+ default: return normalized.replaceAll(LB, ""); // strip hyphen/soft hyphen + space line breaks
+ }
+ }
+ case 7: break;
+ case 2:
+ { problem = 1; add(yytext());
+ }
+ case 8: break;
+ case 4:
+ { add("s");
+ }
+ case 9: break;
+ case 3:
+ { switch (problem) {
+ case 1: return original; // a problem was flagged: hand back the unmodified input
+ default: return normalized;
+ }
+ }
+ case 10: break;
+ case 6:
+ { switch (problem) {
+ case 1: return original;
+ default: return normalized.replaceAll(LB, "").toLowerCase(java.util.Locale.ROOT); // locale-independent: default-locale lower-casing turns "I" into dotless "ı" under tr/az
+ }
+ }
+ case 11: break;
+ case 1:
+ { add(yytext());
+ }
+ case 12: break;
+ default:
+ if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
+ zzAtEOF = true;
+ return null;
+ }
+ else {
+ zzScanError(ZZ_NO_MATCH);
+ }
+ }
+ }
+ }
+
+
+}
diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexEN.lex
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexEN.lex Wed Nov 09 15:32:05 2011 +0100
@@ -0,0 +1,99 @@
+/*
+ * Normalization rules for English text
+ * [this is a JFlex specification]
+ *
+ * Wolfgang Schmidle
+ * version 2011-07-12
+ *
+ */
+
+package de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang; // must match the directory text/norm/lang and the package of the generated MpdlNormalizerLexEN class
+
+%%
+
+%public
+%class MpdlNormalizerLexEN
+%type java.lang.String
+%unicode
+
+// 1.5 English: en
+
+%states DISP, DICT, SEARCH
+
+%{
+ private String original = "";
+ private String normalized = "";
+ private int problem = 0;
+
+ private void add (String norm) {
+ original += yytext();
+ normalized += norm;
+ }
+
+ private static final String LB = "[\u002d\u00ad] ";
+%}
+
+hyphen = [-\u00ad] // hyphen and soft hyphen (4-digit escape; JFlex 1.4.3 reads the braced form as literal set members)
+LB = {hyphen} \u0020
+// lb = ({hyphen} \u0020)?
+
+END = \n
+
+%%
+
+<DISP, DICT, SEARCH> {
+
+ſ { add("s"); } // long s is normalized to a plain s in every output state
+
+}
+
+
+// default
+
+@ { problem = 1; add(yytext()); }
+{LB} { add(yytext()); }
+. { add(yytext()); }
+
+
+<DISP> {
+
+{END} {
+ switch (problem) {
+ case 1: return original; // a problem was flagged: hand back the unmodified input
+ default: return normalized; // otherwise the normalized text, line-break markers kept
+ }
+ }
+}
+
+<DICT> {
+
+{END} {
+ switch (problem) {
+ case 1: return ""; // a problem was flagged: return the empty string
+ default: return normalized.replaceAll(LB, ""); // strip hyphen/soft hyphen + space line breaks
+ }
+ }
+}
+
+<SEARCH> {
+
+{END} {
+ switch (problem) {
+ case 1: return original; // a problem was flagged: hand back the unmodified input
+ default: return normalized.replaceAll(LB, "").toLowerCase(java.util.Locale.ROOT); // locale-independent: default-locale lower-casing turns "I" into dotless "ı" under tr/az
+ }
+ }
+}
+
+
+/*
+
+Annahmen:
+- die Routine wird wortweise aufgerufen, mit einem \n am Ende des Strings
+- Zeilenumbrüche innerhalb des Wortes sind als hyphen/soft hyphen plus space markiert
+
+TO DO:
+
+EN: vollständig?
+
+*/
diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexFR.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexFR.java Wed Nov 09 15:32:05 2011 +0100
@@ -0,0 +1,635 @@
+/* The following code was generated by JFlex 1.4.3 on 05.09.11 10:35 */
+
+/*
+ * Normalization rules for French text
+ * [this is a JFlex specification]
+ *
+ * Wolfgang Schmidle
+ * version 2011-08-10
+ *
+ */
+
+package de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang;
+
+
+/**
+ * This class is a scanner generated by
+ * JFlex 1.4.3
+ * on 05.09.11 10:35 from the specification file
+ * MpdlNormalizerLexFR.lex
+ */
+public class MpdlNormalizerLexFR {
+
+ /** This character denotes the end of file */
+ public static final int YYEOF = -1;
+
+ /** initial size of the lookahead buffer */
+ private static final int ZZ_BUFFERSIZE = 16384;
+
+ /** lexical states */
+ public static final int DICT_ASCII = 8;
+ public static final int SEARCH = 6;
+ public static final int DICT = 4;
+ public static final int YYINITIAL = 0;
+ public static final int DISP = 2;
+
+ /**
+ * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l
+ * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l
+ * at the beginning of a line
+ * l is of the form l = 2*k, k a non negative integer
+ */
+ private static final int ZZ_LEXSTATE[] = {
+ 0, 0, 1, 1, 2, 2, 3, 3, 4, 4
+ };
+
+ /**
+ * Translates characters to character classes
+ */
+ private static final String ZZ_CMAP_PACKED =
+ "\12\0\1\3\25\0\1\2\14\0\1\1\2\0\1\1\17\0\1\20"+
+ "\32\4\6\0\1\5\2\4\1\5\20\4\1\5\5\4\1\1\1\0"+
+ "\1\1\141\0\1\7\3\12\3\0\1\10\1\0\3\13\1\0\3\14"+
+ "\3\0\3\15\4\0\3\16\126\0\2\11\53\0\1\6\u1e99\0\1\17"+
+ "\udfe6\0";
+
+ /**
+ * Translates characters to character classes
+ */
+ private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED);
+
+ /**
+ * Translates DFA states to action switch labels.
+ */
+ private static final int [] ZZ_ACTION = zzUnpackAction();
+
+ private static final String ZZ_ACTION_PACKED_0 =
+ "\5\0\2\1\1\2\1\3\1\4\1\5\1\6\1\7"+
+ "\1\10\1\2\1\11\1\12\1\13\1\14\1\15\1\16"+
+ "\1\17";
+
+ private static int [] zzUnpackAction() {
+ int [] result = new int[22];
+ int offset = 0;
+ offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
+ return result;
+ }
+
+ private static int zzUnpackAction(String packed, int offset, int [] result) {
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.length();
+ while (i < l) {
+ int count = packed.charAt(i++);
+ int value = packed.charAt(i++);
+ do result[j++] = value; while (--count > 0);
+ }
+ return j;
+ }
+
+
+ /**
+ * Translates a state to a row index in the transition table
+ */
+ private static final int [] ZZ_ROWMAP = zzUnpackRowMap();
+
+ private static final String ZZ_ROWMAP_PACKED_0 =
+ "\0\0\0\21\0\42\0\63\0\104\0\125\0\146\0\125"+
+ "\0\125\0\125\0\125\0\125\0\125\0\125\0\146\0\125"+
+ "\0\125\0\125\0\125\0\125\0\125\0\125";
+
+ private static int [] zzUnpackRowMap() {
+ int [] result = new int[22];
+ int offset = 0;
+ offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
+ return result;
+ }
+
+ private static int zzUnpackRowMap(String packed, int offset, int [] result) {
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.length();
+ while (i < l) {
+ int high = packed.charAt(i++) << 16;
+ result[j++] = high | packed.charAt(i++);
+ }
+ return j;
+ }
+
+ /**
+ * The transition table of the DFA
+ */
+ private static final int [] ZZ_TRANS = zzUnpackTrans();
+
+ private static final String ZZ_TRANS_PACKED_0 =
+ "\1\6\1\7\1\6\1\0\1\6\1\7\12\6\1\10"+
+ "\1\6\1\7\1\6\1\11\1\6\1\7\1\12\1\13"+
+ "\1\14\7\6\1\10\1\6\1\7\1\6\1\15\1\6"+
+ "\1\7\1\12\1\13\1\14\7\6\1\10\1\6\1\7"+
+ "\1\6\1\16\1\6\1\7\1\12\1\13\1\14\7\6"+
+ "\2\10\1\17\1\10\1\15\1\6\1\7\1\12\1\13"+
+ "\1\14\1\20\1\21\1\22\1\23\1\24\1\25\1\26"+
+ "\1\10\23\0\1\6\16\0";
+
+ private static int [] zzUnpackTrans() {
+ int [] result = new int[119];
+ int offset = 0;
+ offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
+ return result;
+ }
+
+ private static int zzUnpackTrans(String packed, int offset, int [] result) {
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.length();
+ while (i < l) {
+ int count = packed.charAt(i++);
+ int value = packed.charAt(i++);
+ value--;
+ do result[j++] = value; while (--count > 0);
+ }
+ return j;
+ }
+
+
+ /* error codes */
+ private static final int ZZ_UNKNOWN_ERROR = 0;
+ private static final int ZZ_NO_MATCH = 1;
+ private static final int ZZ_PUSHBACK_2BIG = 2;
+
+ /* error messages for the codes above */
+ private static final String ZZ_ERROR_MSG[] = {
+ "Unkown internal scanner error",
+ "Error: could not match input",
+ "Error: pushback value was too large"
+ };
+
+ /**
+ * ZZ_ATTRIBUTE[aState] contains the attributes of state aState
+ */
+ private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
+
+ private static final String ZZ_ATTRIBUTE_PACKED_0 =
+ "\5\0\1\11\1\1\7\11\1\1\7\11";
+
+ private static int [] zzUnpackAttribute() {
+ int [] result = new int[22];
+ int offset = 0;
+ offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
+ return result;
+ }
+
+ private static int zzUnpackAttribute(String packed, int offset, int [] result) {
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.length();
+ while (i < l) {
+ int count = packed.charAt(i++);
+ int value = packed.charAt(i++);
+ do result[j++] = value; while (--count > 0);
+ }
+ return j;
+ }
+
+ /** the input device */
+ private java.io.Reader zzReader;
+
+ /** the current state of the DFA */
+ private int zzState;
+
+ /** the current lexical state */
+ private int zzLexicalState = YYINITIAL;
+
+ /** this buffer contains the current text to be matched and is
+ the source of the yytext() string */
+ private char zzBuffer[] = new char[ZZ_BUFFERSIZE];
+
+ /** the textposition at the last accepting state */
+ private int zzMarkedPos;
+
+ /** the current text position in the buffer */
+ private int zzCurrentPos;
+
+ /** startRead marks the beginning of the yytext() string in the buffer */
+ private int zzStartRead;
+
+ /** endRead marks the last character in the buffer, that has been read
+ from input */
+ private int zzEndRead;
+
+ /** number of newlines encountered up to the start of the matched text */
+ private int yyline;
+
+ /** the number of characters up to the start of the matched text */
+ private int yychar;
+
+ /**
+ * the number of characters from the last newline up to the start of the
+ * matched text
+ */
+ private int yycolumn;
+
+ /**
+ * zzAtBOL == true <=> the scanner is currently at the beginning of a line
+ */
+ private boolean zzAtBOL = true;
+
+ /** zzAtEOF == true <=> the scanner is at the EOF */
+ private boolean zzAtEOF;
+
+ /** denotes if the user-EOF-code has already been executed */
+ private boolean zzEOFDone;
+
+ /* user code: */
+ private String original = "";
+ private String normalized = "";
+ private int problem = 0;
+
+ private void add (String norm) {
+ original += yytext();
+ normalized += norm;
+ }
+
+ private static final String LB = "[\u002d\u00ad] ";
+
+
+ /**
+ * Creates a new scanner
+ * There is also a java.io.InputStream version of this constructor.
+ *
+ * @param in the java.io.Reader to read input from.
+ */
+ public MpdlNormalizerLexFR(java.io.Reader in) {
+ this.zzReader = in;
+ }
+
+ /**
+ * Creates a new scanner that decodes the stream with the platform default charset.
+ * There is also a java.io.Reader version of this constructor; use it to choose the encoding.
+ *
+ * @param in the java.io.InputStream to read input from.
+ */
+ public MpdlNormalizerLexFR(java.io.InputStream in) {
+ this(new java.io.InputStreamReader(in));
+ }
+
+ /**
+ * Unpacks the compressed character translation table.
+ * The packed form is run-length encoded as (count, value) char pairs.
+ * @param packed the packed character translation table
+ * @return the unpacked character translation table (0x10000 entries, indexed by char)
+ */
+ private static char [] zzUnpackCMap(String packed) {
+ char [] map = new char[0x10000];
+ int i = 0; /* index in packed string */
+ int j = 0; /* index in unpacked array */
+ while (i < 82) { // 82 = number of chars in ZZ_CMAP_PACKED (41 count/value pairs)
+ int count = packed.charAt(i++);
+ char value = packed.charAt(i++);
+ do map[j++] = value; while (--count > 0);
+ }
+ return map;
+ }
+
+
+ /**
+ * Refills the input buffer.
+ *
+ * @return {@code false} iff there was new input; {@code true} when the reader is exhausted.
+ *
+ * @exception java.io.IOException if any I/O-Error occurs
+ */
+ private boolean zzRefill() throws java.io.IOException {
+
+ /* first: make room (if you can) */
+ if (zzStartRead > 0) {
+ System.arraycopy(zzBuffer, zzStartRead,
+ zzBuffer, 0,
+ zzEndRead-zzStartRead);
+
+ /* translate stored positions */
+ zzEndRead-= zzStartRead;
+ zzCurrentPos-= zzStartRead;
+ zzMarkedPos-= zzStartRead;
+ zzStartRead = 0;
+ }
+
+ /* is the buffer big enough? */
+ if (zzCurrentPos >= zzBuffer.length) {
+ /* if not: double its size, keeping the current contents */
+ char newBuffer[] = new char[zzCurrentPos*2];
+ System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length);
+ zzBuffer = newBuffer;
+ }
+
+ /* finally: fill the buffer with new input */
+ int numRead = zzReader.read(zzBuffer, zzEndRead,
+ zzBuffer.length-zzEndRead);
+
+ if (numRead > 0) {
+ zzEndRead+= numRead;
+ return false;
+ }
+ // unlikely but not impossible: read 0 characters, but not at end of stream
+ if (numRead == 0) {
+ int c = zzReader.read(); // single-char read to tell "nothing yet" from end of stream
+ if (c == -1) {
+ return true;
+ } else {
+ zzBuffer[zzEndRead++] = (char) c;
+ return false;
+ }
+ }
+
+ // numRead < 0
+ return true;
+ }
+
+
+ /**
+ * Closes the input stream and marks the scanner as being at end of file.
+ */
+ public final void yyclose() throws java.io.IOException {
+ zzAtEOF = true; /* indicate end of file */
+ zzEndRead = zzStartRead; /* invalidate buffer */
+
+ if (zzReader != null) // the reader is whatever was handed to the constructor or yyreset()
+ zzReader.close();
+ }
+
+
+ /**
+ * Resets the scanner to read from a new input stream.
+ * Does not close the old reader.
+ *
+ * All scanner-internal (zz/yy) variables are reset, the old input stream
+ * cannot be reused (internal buffer is discarded and lost).
+ * Lexical state is set to YYINITIAL.
+ *
+ * @param reader the new input stream
+ */
+ public final void yyreset(java.io.Reader reader) {
+ zzReader = reader;
+ zzAtBOL = true;
+ zzAtEOF = false;
+ zzEOFDone = false;
+ zzEndRead = zzStartRead = 0;
+ zzCurrentPos = zzMarkedPos = 0;
+ yyline = yychar = yycolumn = 0;
+ zzLexicalState = YYINITIAL; // NOTE: the user-code fields original, normalized and problem are not reset here
+ }
+
+
+ /**
+ * Returns the current lexical state (the value of zzLexicalState).
+ */
+ public final int yystate() {
+ return zzLexicalState;
+ }
+
+
+ /**
+ * Enters a new lexical state; the next match in yylex() starts from it.
+ *
+ * @param newState the new lexical state
+ */
+ public final void yybegin(int newState) {
+ zzLexicalState = newState; // stored without validation
+ }
+
+
+ /**
+ * Returns the text matched by the current regular expression.
+ */
+ public final String yytext() {
+ return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead ); // fresh copy of the buffer range from zzStartRead up to zzMarkedPos
+ }
+
+
+ /**
+ * Returns the character at position pos from the
+ * matched text.
+ *
+ * It is equivalent to yytext().charAt(pos), but faster
+ *
+ * @param pos the position of the character to fetch.
+ * A value from 0 to yylength()-1.
+ *
+ * @return the character at position pos
+ */
+ public final char yycharat(int pos) {
+ return zzBuffer[zzStartRead+pos]; // pos is not checked against yylength()
+ }
+
+
+ /**
+ * Returns the length of the matched text region.
+ */
+ public final int yylength() {
+ return zzMarkedPos-zzStartRead; // end of the match minus its start
+ }
+
+
+ /**
+ * Reports an error that occurred while scanning.
+ *
+ * In a wellformed scanner (no or only correct usage of
+ * yypushback(int) and a match-all fallback rule) this method
+ * will only be called with things that "Can't Possibly Happen".
+ * If this method is called, something is seriously wrong
+ * (e.g. a JFlex bug producing a faulty scanner etc.).
+ *
+ * Usual syntax/scanner level error handling should be done
+ * in error fallback rules.
+ *
+ * @param errorCode the code of the error message to display
+ */
+ private void zzScanError(int errorCode) {
+ String message;
+ try {
+ message = ZZ_ERROR_MSG[errorCode];
+ }
+ catch (ArrayIndexOutOfBoundsException e) { // unknown code: fall back to the generic message
+ message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR];
+ }
+
+ throw new Error(message); // always throws; java.lang.Error is not caught by catch (Exception)
+ }
+
+
+ /**
+ * Pushes the specified amount of characters back into the input stream.
+ *
+ * They will be read again by the next call of the scanning method
+ *
+ * @param number the number of characters to be read again.
+ * This number must not be greater than yylength()!
+ */
+ public void yypushback(int number) {
+ if ( number > yylength() )
+ zzScanError(ZZ_PUSHBACK_2BIG); // throws, so the assignment below is not reached in that case
+
+ zzMarkedPos -= number; // shortens the current match from its end
+ }
+
+
+ /**
+ * Resumes scanning until the next regular expression is matched,
+ * the end of input is encountered or an I/O-Error occurs.
+ *
+ * @return the string returned by one of the returning actions below, or null once the input is exhausted
+ * @exception java.io.IOException if any I/O-Error occurs
+ */
+ public java.lang.String yylex() throws java.io.IOException {
+ int zzInput;
+ int zzAction;
+
+ // cached fields:
+ int zzCurrentPosL;
+ int zzMarkedPosL;
+ int zzEndReadL = zzEndRead;
+ char [] zzBufferL = zzBuffer;
+ char [] zzCMapL = ZZ_CMAP;
+
+ int [] zzTransL = ZZ_TRANS;
+ int [] zzRowMapL = ZZ_ROWMAP;
+ int [] zzAttrL = ZZ_ATTRIBUTE;
+
+ while (true) { // one iteration per match; only a returning action or EOF leaves the loop
+ zzMarkedPosL = zzMarkedPos;
+
+ zzAction = -1; // -1 = nothing matched yet
+
+ zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL; // the new match starts where the previous one ended
+
+ zzState = ZZ_LEXSTATE[zzLexicalState];
+
+
+ zzForAction: {
+ while (true) {
+
+ if (zzCurrentPosL < zzEndReadL)
+ zzInput = zzBufferL[zzCurrentPosL++];
+ else if (zzAtEOF) {
+ zzInput = YYEOF;
+ break zzForAction;
+ }
+ else {
+ // store back cached positions
+ zzCurrentPos = zzCurrentPosL;
+ zzMarkedPos = zzMarkedPosL;
+ boolean eof = zzRefill();
+ // get translated positions and possibly new buffer
+ zzCurrentPosL = zzCurrentPos;
+ zzMarkedPosL = zzMarkedPos;
+ zzBufferL = zzBuffer;
+ zzEndReadL = zzEndRead;
+ if (eof) {
+ zzInput = YYEOF;
+ break zzForAction;
+ }
+ else {
+ zzInput = zzBufferL[zzCurrentPosL++];
+ }
+ }
+ int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ]; // transition on the character class of zzInput
+ if (zzNext == -1) break zzForAction; // no transition: stop and use the last recorded match
+ zzState = zzNext;
+
+ int zzAttributes = zzAttrL[zzState];
+ if ( (zzAttributes & 1) == 1 ) { // bit 1: record this state and position as the current match
+ zzAction = zzState;
+ zzMarkedPosL = zzCurrentPosL;
+ if ( (zzAttributes & 8) == 8 ) break zzForAction; // bit 8: leave the scan loop right away
+ }
+
+ }
+ }
+
+ // store back cached position
+ zzMarkedPos = zzMarkedPosL;
+
+ switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
+ case 2: // flag the word as problematic and keep the matched text unchanged
+ { problem = 1; add(yytext());
+ }
+ case 16: break; // each action falls through to a generated label like this one, which only breaks
+ case 6:
+ { add("ae");
+ }
+ case 17: break;
+ case 4:
+ { add("s");
+ }
+ case 18: break;
+ case 13:
+ { add("o");
+ }
+ case 19: break;
+ case 3: // returning action: original text if flagged, otherwise the normalized text as is
+ { switch (problem) {
+ case 1: return original;
+ default: return normalized;
+ }
+ }
+ case 20: break;
+ case 8: // returning action: original text if flagged, otherwise normalized text without LB matches, lower-cased
+ { switch (problem) {
+ case 1: return original;
+ default: return normalized.replaceAll(LB, "").toLowerCase();
+ }
+ }
+ case 21: break;
+ case 14:
+ { add("u");
+ }
+ case 22: break;
+ case 1: // matched text is kept unchanged
+ { add(yytext());
+ }
+ case 23: break;
+ case 12:
+ { add("i");
+ }
+ case 24: break;
+ case 15: // matched text is dropped from the normalized form
+ { add("");
+ }
+ case 25: break;
+ case 11:
+ { add("e");
+ }
+ case 26: break;
+ case 10:
+ { add("a");
+ }
+ case 27: break;
+ case 9:
+ { add("oe");
+ }
+ case 28: break;
+ case 5:
+ { add("ss");
+ }
+ case 29: break;
+ case 7: // returning action: empty string if flagged, otherwise normalized text without LB matches
+ { switch (problem) {
+ case 1: return "";
+ default: return normalized.replaceAll(LB, "");
+ }
+ }
+ case 30: break;
+ default:
+ if (zzInput == YYEOF && zzStartRead == zzCurrentPos) { // EOF reached with nothing left to match
+ zzAtEOF = true;
+ return null;
+ }
+ else {
+ zzScanError(ZZ_NO_MATCH);
+ }
+ }
+ }
+ }
+
+
+}
diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexFR.lex
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexFR.lex Wed Nov 09 15:32:05 2011 +0100
@@ -0,0 +1,119 @@
+/*
+ * Normalization rules for French text
+ * [this is a JFlex specification]
+ *
+ * Wolfgang Schmidle
+ * version 2011-07-12
+ *
+ */
+
+package de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang; // must match the directory text/norm/lang that holds the generated scanner
+
+%%
+
+%public
+%class MpdlNormalizerLexFR
+%type java.lang.String
+%unicode
+
+// French: fr
+
+%states DISP, DICT, SEARCH
+%state CELEX
+
+%{
+ private String original = ""; // concatenation of every matched text that went through add()
+ private String normalized = ""; // concatenation of the replacement strings passed to add()
+ private int problem = 0; // set to 1 by the rules that flag the word; checked by the END rules
+
+ private void add (String norm) { // records the current match and the replacement chosen for it
+ original += yytext();
+ normalized += norm;
+ }
+
+ private static final String LB = "[\u002d\u00ad] "; // regex: hyphen or soft hyphen followed by a space
+%}
+
+hyphen = [\u002d\u00ad] // hyphen and soft hyphen (four-digit escapes, same form as the LB constant above)
+LB = {hyphen} \u0020
+// lb = ({hyphen} \u0020)?
+
+END = \n
+
+Alphabet = [abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ]
+
+%%
+
+ {
+
+ſ { add("s"); }
+ß { add("ss"); }
+æ { add("ae"); }
+
+}
+
+ {
+
+[œŒ] { add("oe"); }
+[áàâ] { add("a"); }
+[éèê] { add("e"); }
+[íìî] { add("i"); }
+[óòô] { add("o"); }
+[úùû] { add("u"); }
+’ { add(""); }
+
+{Alphabet} { add(yytext()); }
+
+. { problem = 1; add(yytext()); } // in particular "@"
+
+}
+
+// default
+
+@ { problem = 1; add(yytext()); }
+{LB} { add(yytext()); }
+. { add(yytext()); }
+
+
+ {
+
+{END} {
+ switch (problem) {
+ case 1: return original;
+ default: return normalized;
+ }
+ }
+}
+
+ {
+
+{END} {
+ switch (problem) {
+ case 1: return "";
+ default: return normalized.replaceAll(LB, "");
+ }
+ }
+}
+
+ {
+
+{END} {
+ switch (problem) {
+ case 1: return original;
+ default: return normalized.replaceAll(LB, "").toLowerCase();
+ }
+ }
+}
+
+
+/*
+
+Assumptions:
+- the routine is called word by word, with a \n at the end of the string
+- line breaks within a word are marked as hyphen/soft hyphen plus space
+
+TO DO:
+
+FR: correct? complete?
+
+*/
diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexIT.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexIT.java Wed Nov 09 15:32:05 2011 +0100
@@ -0,0 +1,887 @@
+/* The following code was generated by JFlex 1.4.3 on 21.07.11 11:22 */
+
+/*
+ * Normalization rules for Italian text
+ * [this is a JFlex specification]
+ *
+ * Wolfgang Schmidle
+ * version 2011-07-12
+ *
+ */
+
+package de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang;
+
+
+/**
+ * This class is a scanner generated by
+ * JFlex 1.4.3
+ * on 21.07.11 11:22 from the specification file
+ * MpdlNormalizerLexIT.lex
+ */
+public class MpdlNormalizerLexIT {
+
+ /** This character denotes the end of file */
+ public static final int YYEOF = -1;
+
+ /** initial size of the lookahead buffer */
+ private static final int ZZ_BUFFERSIZE = 16384;
+
+ /** lexical states */
+ public static final int SEARCH = 6;
+ public static final int DICT = 4;
+ public static final int YYINITIAL = 0;
+ public static final int DISP = 2;
+
+ /**
+ * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l
+ * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l
+ * at the beginning of a line
+ * l is of the form l = 2*k, k a non negative integer
+ */
+ private static final int ZZ_LEXSTATE[] = {
+ 0, 0, 1, 2, 3, 4, 5, 6
+ };
+
+ /**
+ * Translates characters to character classes
+ */
+ private static final String ZZ_CMAP_PACKED =
+ "\12\0\1\6\25\0\1\5\14\0\1\4\22\0\1\52\1\1\3\2"+
+ "\1\1\3\2\1\41\1\0\1\2\1\3\2\2\1\42\1\2\1\50"+
+ "\1\3\1\2\1\40\1\45\1\51\2\2\1\0\1\2\6\0\1\44"+
+ "\3\2\1\12\2\2\1\43\1\7\1\36\1\2\1\3\1\2\1\10"+
+ "\1\37\1\14\1\46\1\13\1\2\1\11\1\16\1\47\2\2\1\0"+
+ "\1\2\62\0\1\4\22\0\1\17\5\0\1\33\1\0\1\20\3\0"+
+ "\1\21\5\0\1\22\6\0\1\23\5\0\1\31\1\24\5\0\1\32"+
+ "\1\0\1\25\3\0\1\26\5\0\1\27\6\0\1\30\37\0\1\1"+
+ "\70\0\1\35\1\34\53\0\1\15\ufe80\0";
+
+ /**
+ * Translates characters to character classes
+ */
+ private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED);
+
+ /**
+ * Translates DFA states to action switch labels.
+ */
+ private static final int [] ZZ_ACTION = zzUnpackAction();
+
+ private static final String ZZ_ACTION_PACKED_0 =
+ "\11\0\1\1\1\2\2\3\1\1\1\4\1\2\1\3"+
+ "\1\5\1\2\1\6\1\7\1\10\1\11\1\12\5\3"+
+ "\1\13\1\2\1\3\1\5\1\2\1\14\1\15\1\16"+
+ "\1\17\1\20\1\21\1\22\1\23\1\24\1\25\1\26"+
+ "\1\27\1\30\4\0\1\31\1\32\1\33\1\0\1\34"+
+ "\1\0\1\35\1\36\1\0\1\37\1\40\1\41\4\0"+
+ "\1\42\6\0\1\43\1\44\4\0\1\45\1\0\1\46"+
+ "\10\0\1\47\4\0\1\45\2\0\1\50";
+
+ private static int [] zzUnpackAction() { // builds ZZ_ACTION from its packed string form
+ int [] result = new int[100]; // generated size; ZZ_ACTION is indexed by DFA state
+ int offset = 0;
+ offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
+ return result;
+ }
+
+ private static int zzUnpackAction(String packed, int offset, int [] result) { // returns the index after the last entry written
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.length();
+ while (i < l) {
+ int count = packed.charAt(i++); // run length
+ int value = packed.charAt(i++); // value stored for the whole run
+ do result[j++] = value; while (--count > 0); // do-while: always writes at least one entry
+ }
+ return j;
+ }
+
+
+ /**
+ * Translates a state to a row index in the transition table
+ */
+ private static final int [] ZZ_ROWMAP = zzUnpackRowMap();
+
+ private static final String ZZ_ROWMAP_PACKED_0 =
+ "\0\0\0\53\0\126\0\201\0\254\0\327\0\u0102\0\u012d"+
+ "\0\u0158\0\0\0\0\0\0\0\u0183\0\u01ae\0\0\0\u01d9"+
+ "\0\u0204\0\0\0\u022f\0\0\0\0\0\0\0\0\0\0"+
+ "\0\u025a\0\u0285\0\u02b0\0\u02db\0\u0306\0\0\0\u0331\0\u035c"+
+ "\0\u0387\0\u03b2\0\u03dd\0\0\0\0\0\0\0\0\0\0"+
+ "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\u0408"+
+ "\0\u0433\0\u045e\0\u0489\0\0\0\0\0\0\0\u04b4\0\0"+
+ "\0\u04df\0\0\0\0\0\u050a\0\0\0\0\0\0\0\u0535"+
+ "\0\u0560\0\u058b\0\u05b6\0\0\0\u05e1\0\u060c\0\u0637\0\u0662"+
+ "\0\u068d\0\0\0\0\0\0\0\u06b8\0\u06e3\0\u070e\0\u035c"+
+ "\0\u0739\0\u0764\0\0\0\u078f\0\u07ba\0\u07e5\0\0\0\u0810"+
+ "\0\u083b\0\u0866\0\u0891\0\0\0\u08bc\0\u08e7\0\u0912\0\u093d"+
+ "\0\0\0\u0968\0\u0993\0\0";
+
+ private static int [] zzUnpackRowMap() { // builds ZZ_ROWMAP from its packed string form
+ int [] result = new int[100]; // generated size; ZZ_ROWMAP is indexed by DFA state
+ int offset = 0;
+ offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
+ return result;
+ }
+
+ private static int zzUnpackRowMap(String packed, int offset, int [] result) { // returns the index after the last entry written
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.length();
+ while (i < l) {
+ int high = packed.charAt(i++) << 16; // first char of each pair holds the upper 16 bits
+ result[j++] = high | packed.charAt(i++); // second char holds the lower 16 bits
+ }
+ return j;
+ }
+
+ /**
+ * The transition table of the DFA
+ */
+ private static final int [] ZZ_TRANS = zzUnpackTrans();
+
+ private static final String ZZ_TRANS_PACKED_0 =
+ "\53\0\1\12\1\13\1\14\1\15\1\16\1\12\1\17"+
+ "\1\20\1\14\1\21\1\13\1\15\1\14\1\22\1\23"+
+ "\5\12\2\13\1\12\2\13\1\24\1\25\1\26\1\27"+
+ "\1\30\1\12\1\13\1\31\2\13\1\14\1\13\1\23"+
+ "\1\32\1\33\1\34\1\35\1\36\1\12\1\13\1\14"+
+ "\1\15\1\16\1\12\1\17\1\37\1\14\1\21\1\13"+
+ "\1\15\1\40\1\41\1\42\5\12\2\13\1\12\2\13"+
+ "\1\24\1\25\1\26\1\27\1\30\1\12\1\13\1\31"+
+ "\2\13\1\43\1\13\1\42\1\32\1\33\1\34\1\35"+
+ "\1\36\1\12\1\13\1\14\1\15\1\16\1\12\1\44"+
+ "\1\20\1\14\1\21\1\13\1\15\1\14\1\22\1\23"+
+ "\1\45\1\46\1\47\1\50\1\51\1\52\1\53\1\54"+
+ "\1\55\1\56\1\24\1\25\1\26\1\27\1\30\1\12"+
+ "\1\13\1\31\2\13\1\14\1\13\1\23\1\32\1\33"+
+ "\1\34\1\35\1\36\1\12\1\13\1\14\1\15\1\16"+
+ "\1\12\1\44\1\37\1\14\1\21\1\13\1\15\1\40"+
+ "\1\41\1\42\1\45\1\46\1\47\1\50\1\51\1\52"+
+ "\1\53\1\54\1\55\1\56\1\24\1\25\1\26\1\27"+
+ "\1\30\1\12\1\13\1\31\2\13\1\43\1\13\1\42"+
+ "\1\32\1\33\1\34\1\35\1\36\1\12\1\13\1\14"+
+ "\1\15\1\16\1\12\1\57\1\20\1\14\1\21\1\13"+
+ "\1\15\1\14\1\22\1\23\1\45\1\46\1\47\1\50"+
+ "\1\51\1\52\1\53\1\54\1\55\1\56\1\24\1\25"+
+ "\1\26\1\27\1\30\1\12\1\13\1\31\2\13\1\14"+
+ "\1\13\1\23\1\32\1\33\1\34\1\35\1\36\1\12"+
+ "\1\13\1\14\1\15\1\16\1\12\1\57\1\37\1\14"+
+ "\1\21\1\13\1\15\1\40\1\41\1\42\1\45\1\46"+
+ "\1\47\1\50\1\51\1\52\1\53\1\54\1\55\1\56"+
+ "\1\24\1\25\1\26\1\27\1\30\1\12\1\13\1\31"+
+ "\2\13\1\43\1\13\1\42\1\32\1\33\1\34\1\35"+
+ "\1\36\7\0\1\60\4\0\1\61\1\62\42\0\1\63"+
+ "\114\0\1\64\1\0\1\64\6\0\1\65\103\0\1\66"+
+ "\23\0\1\67\44\0\1\70\5\0\1\70\2\0\1\70"+
+ "\3\0\1\70\5\0\2\70\1\0\2\70\1\0\3\70"+
+ "\2\0\1\70\1\0\2\70\1\0\2\70\46\0\1\71"+
+ "\60\0\1\72\5\0\2\73\1\74\3\0\2\73\1\0"+
+ "\3\73\13\0\1\73\6\0\1\73\2\0\1\73\2\0"+
+ "\4\73\50\0\1\75\1\0\1\76\3\0\2\77\1\100"+
+ "\3\0\2\77\1\0\3\77\13\0\1\77\6\0\1\77"+
+ "\2\0\1\77\2\0\4\77\11\0\1\101\25\0\1\66"+
+ "\26\0\1\102\52\0\1\102\3\0\1\103\35\0\1\104"+
+ "\5\0\1\104\2\0\1\104\3\0\1\104\5\0\2\104"+
+ "\1\0\2\104\1\0\3\104\2\0\1\104\1\0\2\104"+
+ "\1\0\2\104\44\0\1\105\4\0\1\106\16\0\1\107"+
+ "\54\0\1\110\52\0\1\110\3\0\1\111\40\0\1\112"+
+ "\105\0\1\113\55\0\1\114\15\0\1\115\52\0\1\116"+
+ "\51\0\1\117\4\0\1\120\54\0\1\121\43\0\1\122"+
+ "\7\0\1\120\44\0\1\123\52\0\1\123\1\124\1\125"+
+ "\46\0\1\126\4\0\1\61\54\0\1\127\43\0\1\130"+
+ "\7\0\1\61\40\0\2\73\4\0\2\73\1\0\3\73"+
+ "\13\0\1\73\6\0\1\73\2\0\1\73\2\0\4\73"+
+ "\3\0\2\77\4\0\2\77\1\0\3\77\13\0\1\77"+
+ "\6\0\1\77\2\0\1\77\2\0\4\77\6\0\1\131"+
+ "\51\0\1\132\53\0\1\133\53\0\1\134\50\0\1\135"+
+ "\3\0\1\136\47\0\1\137\52\0\1\140\56\0\1\120"+
+ "\46\0\1\141\61\0\1\120\43\0\1\142\104\0\1\143"+
+ "\24\0\1\61\55\0\1\61\46\0\1\136\50\0\1\144"+
+ "\44\0";
+
+ private static int [] zzUnpackTrans() { // builds ZZ_TRANS from its packed string form
+ int [] result = new int[2494]; // generated size of the transition table
+ int offset = 0;
+ offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
+ return result;
+ }
+
+ private static int zzUnpackTrans(String packed, int offset, int [] result) { // returns the index after the last entry written
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.length();
+ while (i < l) {
+ int count = packed.charAt(i++); // run length
+ int value = packed.charAt(i++);
+ value--; // values are stored shifted by one, so a packed 0 becomes -1, which yylex() treats as "no transition"
+ do result[j++] = value; while (--count > 0); // do-while: always writes at least one entry
+ }
+ return j;
+ }
+
+
+ /* error codes */
+ private static final int ZZ_UNKNOWN_ERROR = 0;
+ private static final int ZZ_NO_MATCH = 1;
+ private static final int ZZ_PUSHBACK_2BIG = 2;
+
+ /* error messages for the codes above */
+ private static final String ZZ_ERROR_MSG[] = {
+ "Unkown internal scanner error",
+ "Error: could not match input",
+ "Error: pushback value was too large"
+ };
+
+ /**
+ * ZZ_ATTRIBUTE[aState] contains the attributes of state aState
+ */
+ private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
+
+ private static final String ZZ_ATTRIBUTE_PACKED_0 =
+ "\1\10\7\0\1\1\3\11\2\1\1\11\2\1\1\11"+
+ "\1\1\5\11\5\1\1\11\5\1\14\11\4\0\3\11"+
+ "\1\0\1\11\1\0\2\11\1\0\3\11\4\0\1\11"+
+ "\5\0\3\11\4\0\1\1\1\0\1\11\3\0\1\11"+
+ "\4\0\1\11\4\0\1\11\2\0\1\11";
+
+ private static int [] zzUnpackAttribute() { // builds ZZ_ATTRIBUTE from its packed string form
+ int [] result = new int[100]; // generated size; ZZ_ATTRIBUTE is indexed by DFA state
+ int offset = 0;
+ offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
+ return result;
+ }
+
+ private static int zzUnpackAttribute(String packed, int offset, int [] result) { // returns the index after the last entry written
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.length();
+ while (i < l) {
+ int count = packed.charAt(i++); // run length
+ int value = packed.charAt(i++); // attribute bits stored for the whole run
+ do result[j++] = value; while (--count > 0); // do-while: always writes at least one entry
+ }
+ return j;
+ }
+
+ /** the input device */
+ private java.io.Reader zzReader;
+
+ /** the current state of the DFA */
+ private int zzState;
+
+ /** the current lexical state */
+ private int zzLexicalState = YYINITIAL;
+
+ /** this buffer contains the current text to be matched and is
+ the source of the yytext() string */
+ private char zzBuffer[] = new char[ZZ_BUFFERSIZE]; // replaced by a larger array in zzRefill() when it is full
+
+ /** the text position at the last accepting state */
+ private int zzMarkedPos;
+
+ /** the current text position in the buffer */
+ private int zzCurrentPos;
+
+ /** startRead marks the beginning of the yytext() string in the buffer */
+ private int zzStartRead;
+
+ /** endRead marks the position just past the last character in the buffer
+ that has been read from input */
+ private int zzEndRead;
+
+ /** number of newlines encountered up to the start of the matched text */
+ private int yyline; // only ever assigned in yyreset() in this class
+
+ /** the number of characters up to the start of the matched text */
+ private int yychar; // only ever assigned in yyreset() in this class
+
+ /**
+ * the number of characters from the last newline up to the start of the
+ * matched text
+ */
+ private int yycolumn; // only ever assigned in yyreset() in this class
+
+ /**
+ * zzAtBOL == true <=> the scanner is currently at the beginning of a line
+ */
+ private boolean zzAtBOL = true;
+
+ /** zzAtEOF == true <=> the scanner is at the EOF */
+ private boolean zzAtEOF;
+
+ /** denotes if the user-EOF-code has already been executed */
+ private boolean zzEOFDone;
+
+ /** For the backwards DFA of general lookahead statements */
+ private boolean [] zzFin = new boolean [ZZ_BUFFERSIZE+1]; // re-allocated in yylex() when the buffer has grown
+
+ /* user code (copied from the %{ ... %} section of the JFlex specification): */
+ private static final int CONS = 1;
+ private static final int VOWEL = 2;
+ private int cv = 0; // consonant = 1, vowel = 2, everything else = 0; set by the actions, read by the u/v actions
+
+ private String original = ""; // concatenation of every matched text that went through add()
+ private String normalized = ""; // concatenation of the replacement strings passed to add()
+ private int problem = 0; // set to 1 by the action that flags the word; checked when a result is returned
+
+ private void add (String norm) { // records the current match and the replacement chosen for it
+ original += yytext();
+ normalized += norm;
+ }
+
+ private static final String LB = "[\u002d\u00ad] "; // regex: hyphen or soft hyphen followed by a space
+
+
+ /**
+ * Creates a new scanner.
+ * There is also a java.io.InputStream version of this constructor.
+ *
+ * @param in the java.io.Reader to read input from.
+ */
+ public MpdlNormalizerLexIT(java.io.Reader in) {
+ this.zzReader = in; // only stored here; nothing is read until yylex() needs input
+ }
+
+ /**
+ * Creates a new scanner.
+ * There is also a java.io.Reader version of this constructor.
+ *
+ * @param in the java.io.InputStream to read input from.
+ */
+ public MpdlNormalizerLexIT(java.io.InputStream in) {
+ this(new java.io.InputStreamReader(in)); // NOTE: no charset given, so the platform default encoding decodes the bytes
+ }
+
+ /**
+ * Unpacks the compressed character translation table.
+ * The packed string is read as consecutive (count, value) character pairs.
+ *
+ * @param packed the packed character translation table
+ * @return the unpacked character translation table
+ */
+ private static char [] zzUnpackCMap(String packed) {
+ char [] map = new char[0x10000]; // one entry per UTF-16 code unit
+ int i = 0; /* index in packed string */
+ int j = 0; /* index in unpacked array */
+ while (i < 172) { // generated constant; presumably the length of the packed string
+ int count = packed.charAt(i++); // run length
+ char value = packed.charAt(i++); // character class stored for the whole run
+ do map[j++] = value; while (--count > 0); // do-while: always writes at least one entry
+ }
+ return map;
+ }
+
+
+ /**
+ * Refills the input buffer.
+ *
+ * @return <code>false</code>, iff there was new input.
+ *
+ * @exception java.io.IOException if any I/O-Error occurs
+ */
+ private boolean zzRefill() throws java.io.IOException {
+
+ /* first: make room (if you can) by dropping everything before zzStartRead */
+ if (zzStartRead > 0) {
+ System.arraycopy(zzBuffer, zzStartRead,
+ zzBuffer, 0,
+ zzEndRead-zzStartRead);
+
+ /* translate stored positions so they still point at the same characters */
+ zzEndRead-= zzStartRead;
+ zzCurrentPos-= zzStartRead;
+ zzMarkedPos-= zzStartRead;
+ zzStartRead = 0;
+ }
+
+ /* is the buffer big enough? */
+ if (zzCurrentPos >= zzBuffer.length) {
+ /* if not: blow it up (new capacity is twice the current position) */
+ char newBuffer[] = new char[zzCurrentPos*2];
+ System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length);
+ zzBuffer = newBuffer;
+ }
+
+ /* finally: fill the buffer with new input */
+ int numRead = zzReader.read(zzBuffer, zzEndRead,
+ zzBuffer.length-zzEndRead);
+
+ if (numRead > 0) {
+ zzEndRead+= numRead;
+ return false; // new input is available
+ }
+ // unlikely but not impossible: read 0 characters, but not at end of stream
+ if (numRead == 0) {
+ int c = zzReader.read(); // single-character read to find out whether the stream has ended
+ if (c == -1) {
+ return true;
+ } else {
+ zzBuffer[zzEndRead++] = (char) c;
+ return false;
+ }
+ }
+
+ // numRead < 0: end of stream
+ return true;
+ }
+
+
+ /**
+ * Closes the input stream and marks the scanner as being at end of file.
+ */
+ public final void yyclose() throws java.io.IOException {
+ zzAtEOF = true; /* indicate end of file */
+ zzEndRead = zzStartRead; /* invalidate buffer */
+
+ if (zzReader != null) // the reader is whatever was handed to the constructor or yyreset()
+ zzReader.close();
+ }
+
+
+ /**
+ * Resets the scanner to read from a new input stream.
+ * Does not close the old reader.
+ *
+ * All scanner-internal (zz/yy) variables are reset, the old input stream
+ * cannot be reused (internal buffer is discarded and lost).
+ * Lexical state is set to YYINITIAL.
+ *
+ * @param reader the new input stream
+ */
+ public final void yyreset(java.io.Reader reader) {
+ zzReader = reader;
+ zzAtBOL = true;
+ zzAtEOF = false;
+ zzEOFDone = false;
+ zzEndRead = zzStartRead = 0;
+ zzCurrentPos = zzMarkedPos = 0;
+ yyline = yychar = yycolumn = 0;
+ zzLexicalState = YYINITIAL; // NOTE: the user-code fields cv, original, normalized and problem are not reset here
+ }
+
+
+ /**
+ * Returns the current lexical state (the value of zzLexicalState).
+ */
+ public final int yystate() {
+ return zzLexicalState;
+ }
+
+
+ /**
+ * Enters a new lexical state; the next match in yylex() starts from it.
+ *
+ * @param newState the new lexical state
+ */
+ public final void yybegin(int newState) {
+ zzLexicalState = newState; // stored without validation
+ }
+
+
+ /**
+ * Returns the text matched by the current regular expression.
+ */
+ public final String yytext() {
+ return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead ); // fresh copy of the buffer range from zzStartRead up to zzMarkedPos
+ }
+
+
+ /**
+ * Returns the character at position pos from the
+ * matched text.
+ *
+ * It is equivalent to yytext().charAt(pos), but faster
+ *
+ * @param pos the position of the character to fetch.
+ * A value from 0 to yylength()-1.
+ *
+ * @return the character at position pos
+ */
+ public final char yycharat(int pos) {
+ return zzBuffer[zzStartRead+pos]; // pos is not checked against yylength()
+ }
+
+
+ /**
+ * Returns the length of the matched text region.
+ */
+ public final int yylength() {
+ return zzMarkedPos-zzStartRead; // end of the match minus its start
+ }
+
+
+ /**
+ * Reports an error that occurred while scanning.
+ *
+ * In a wellformed scanner (no or only correct usage of
+ * yypushback(int) and a match-all fallback rule) this method
+ * will only be called with things that "Can't Possibly Happen".
+ * If this method is called, something is seriously wrong
+ * (e.g. a JFlex bug producing a faulty scanner etc.).
+ *
+ * Usual syntax/scanner level error handling should be done
+ * in error fallback rules.
+ *
+ * @param errorCode the code of the error message to display
+ */
+ private void zzScanError(int errorCode) {
+ String message;
+ try {
+ message = ZZ_ERROR_MSG[errorCode];
+ }
+ catch (ArrayIndexOutOfBoundsException e) { // unknown code: fall back to the generic message
+ message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR];
+ }
+
+ throw new Error(message); // always throws; java.lang.Error is not caught by catch (Exception)
+ }
+
+
+ /**
+ * Pushes the specified amount of characters back into the input stream.
+ *
+ * They will be read again by the next call of the scanning method
+ *
+ * @param number the number of characters to be read again.
+ * This number must not be greater than yylength()!
+ */
+ public void yypushback(int number) {
+ if ( number > yylength() )
+ zzScanError(ZZ_PUSHBACK_2BIG); // throws, so the assignment below is not reached in that case
+
+ zzMarkedPos -= number; // shortens the current match from its end
+ }
+
+
+ /**
+ * Resumes scanning until the next regular expression is matched,
+ * the end of input is encountered or an I/O-Error occurs.
+ *
+ * @return the string returned by one of the returning actions below, or null once the input is exhausted
+ * @exception java.io.IOException if any I/O-Error occurs
+ */
+ public java.lang.String yylex() throws java.io.IOException {
+ int zzInput;
+ int zzAction;
+
+ // cached fields:
+ int zzCurrentPosL;
+ int zzMarkedPosL;
+ int zzEndReadL = zzEndRead;
+ char [] zzBufferL = zzBuffer;
+ char [] zzCMapL = ZZ_CMAP;
+
+ int [] zzTransL = ZZ_TRANS;
+ int [] zzRowMapL = ZZ_ROWMAP;
+ int [] zzAttrL = ZZ_ATTRIBUTE;
+
+ while (true) { // one iteration per match; only a returning action or EOF leaves the loop
+ zzMarkedPosL = zzMarkedPos;
+
+ if (zzMarkedPosL > zzStartRead) { // previous match was non-empty: update zzAtBOL from its last character
+ switch (zzBufferL[zzMarkedPosL-1]) {
+ case '\n':
+ case '\u000B':
+ case '\u000C':
+ case '\u0085':
+ case '\u2028':
+ case '\u2029':
+ zzAtBOL = true;
+ break;
+ case '\r': // a carriage return only counts as a line end when no line feed follows it
+ if (zzMarkedPosL < zzEndReadL)
+ zzAtBOL = zzBufferL[zzMarkedPosL] != '\n';
+ else if (zzAtEOF)
+ zzAtBOL = false;
+ else {
+ boolean eof = zzRefill();
+ zzMarkedPosL = zzMarkedPos;
+ zzEndReadL = zzEndRead;
+ zzBufferL = zzBuffer;
+ if (eof)
+ zzAtBOL = false;
+ else
+ zzAtBOL = zzBufferL[zzMarkedPosL] != '\n';
+ }
+ break;
+ default:
+ zzAtBOL = false;
+ }
+ }
+ zzAction = -1; // -1 = nothing matched yet
+
+ zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL; // the new match starts where the previous one ended
+
+ if (zzAtBOL)
+ zzState = ZZ_LEXSTATE[zzLexicalState+1]; // odd slots hold the start state used at the beginning of a line
+ else
+ zzState = ZZ_LEXSTATE[zzLexicalState];
+
+
+ zzForAction: {
+ while (true) {
+
+ if (zzCurrentPosL < zzEndReadL)
+ zzInput = zzBufferL[zzCurrentPosL++];
+ else if (zzAtEOF) {
+ zzInput = YYEOF;
+ break zzForAction;
+ }
+ else {
+ // store back cached positions
+ zzCurrentPos = zzCurrentPosL;
+ zzMarkedPos = zzMarkedPosL;
+ boolean eof = zzRefill();
+ // get translated positions and possibly new buffer
+ zzCurrentPosL = zzCurrentPos;
+ zzMarkedPosL = zzMarkedPos;
+ zzBufferL = zzBuffer;
+ zzEndReadL = zzEndRead;
+ if (eof) {
+ zzInput = YYEOF;
+ break zzForAction;
+ }
+ else {
+ zzInput = zzBufferL[zzCurrentPosL++];
+ }
+ }
+ int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ]; // transition on the character class of zzInput
+ if (zzNext == -1) break zzForAction; // no transition: stop and use the last recorded match
+ zzState = zzNext;
+
+ int zzAttributes = zzAttrL[zzState];
+ if ( (zzAttributes & 1) == 1 ) { // bit 1: record this state and position as the current match
+ zzAction = zzState;
+ zzMarkedPosL = zzCurrentPosL;
+ if ( (zzAttributes & 8) == 8 ) break zzForAction; // bit 8: leave the scan loop right away
+ }
+
+ }
+ }
+
+ // store back cached position
+ zzMarkedPos = zzMarkedPosL;
+
+ switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
+ case 33:
+ // lookahead expression with fixed base length
+ zzMarkedPos = zzStartRead + 1;
+ { cv = CONS; add("U");
+ }
+ case 41: break; // each action falls through to a generated label like this one, which only breaks
+ case 14:
+ { add("Á");
+ }
+ case 42: break;
+ case 40:
+ // lookahead expression with fixed lookahead length
+ yypushback(1);
+ { add(yytext());
+ }
+ case 43: break;
+ case 39:
+ // lookahead expression with fixed base length
+ zzMarkedPos = zzStartRead + 3;
+ { add(yytext());
+ }
+ case 44: break;
+ case 38:
+ // lookahead expression with fixed base length
+ zzMarkedPos = zzStartRead + 2;
+ { add(yytext());
+ }
+ case 45: break;
+ case 26:
+ { add(yytext());
+ }
+ case 46: break;
+ case 21:
+ { add("í");
+ }
+ case 47: break;
+ case 8:
+ { cv = VOWEL; add("AE");
+ }
+ case 48: break;
+ case 11: // flag the word as problematic and keep the matched text unchanged
+ { problem = 1; cv = 0; add(yytext());
+ }
+ case 49: break;
+ case 4: // returning action: original text if flagged, otherwise the normalized text as is
+ { switch (problem) {
+ case 1: return original;
+ default: return normalized;
+ }
+ }
+ case 50: break;
+ case 30:
+ // lookahead expression with fixed base length
+ zzMarkedPos = zzStartRead + 1;
+ { cv = CONS; add("u");
+ }
+ case 51: break;
+ case 19:
+ { add("á");
+ }
+ case 52: break;
+ case 1:
+ { cv = 0; add(yytext());
+ }
+ case 53: break;
+ case 24: // returning action: original text if flagged, otherwise normalized text without LB matches, lower-cased
+ { switch (problem) {
+ case 1: return original;
+ default: return normalized.replaceAll(LB, "").toLowerCase();
+ }
+ }
+ case 54: break;
+ case 34:
+ // lookahead expression with fixed base length
+ zzMarkedPos = zzStartRead + 1;
+ { cv = VOWEL; add(yytext().replaceAll("u", "v").replaceAll("U", "V"));
+ }
+ case 55: break;
+ case 35:
+ { cv = VOWEL; add("zio");
+ }
+ case 56: break;
+ case 10:
+ { cv = VOWEL; add("OE");
+ }
+ case 57: break;
+ case 18:
+ { add("Ú");
+ }
+ case 58: break;
+ case 37:
+ // general lookahead, find correct zzMarkedPos
+ { int zzFState = 7;
+ int zzFPos = zzStartRead;
+ if (zzFin.length <= zzBufferL.length) { zzFin = new boolean[zzBufferL.length+1]; }
+ boolean zzFinL[] = zzFin;
+ while (zzFState != -1 && zzFPos < zzMarkedPos) { // forward pass from the match start, marking positions in zzFinL
+ if ((zzAttrL[zzFState] & 1) == 1) { zzFinL[zzFPos] = true; }
+ zzInput = zzBufferL[zzFPos++];
+ zzFState = zzTransL[ zzRowMapL[zzFState] + zzCMapL[zzInput] ];
+ }
+ if (zzFState != -1 && (zzAttrL[zzFState] & 1) == 1) { zzFinL[zzFPos] = true; }
+
+ zzFState = 8;
+ zzFPos = zzMarkedPos;
+ while (!zzFinL[zzFPos] || (zzAttrL[zzFState] & 1) != 1) { // backward pass from the match end until both passes agree
+ zzInput = zzBufferL[--zzFPos];
+ zzFState = zzTransL[ zzRowMapL[zzFState] + zzCMapL[zzInput] ];
+ };
+ zzMarkedPos = zzFPos;
+ }
+ { cv = VOWEL; add(yytext().replace("ſ", "s"));
+ }
+ case 59: break;
+ case 3:
+ { cv = CONS; add(yytext());
+ }
+ case 60: break;
+ case 32:
+ { cv = CONS; add("QU");
+ }
+ case 61: break;
+ case 15:
+ { add("É");
+ }
+ case 62: break;
+ case 28:
+ // lookahead expression with fixed base length
+ zzMarkedPos = zzStartRead + 1;
+ { switch(cv) { // u becomes v only when cv is VOWEL at this point
+ case VOWEL: add(yytext().replace("u", "v").replace("U", "V")); break;
+ default: cv = VOWEL; add(yytext()); break;
+ }
+ }
+ case 63: break;
+ case 6:
+ { cv = CONS; add("ss");
+ }
+ case 64: break;
+ case 5:
+ { cv = CONS; add("s");
+ }
+ case 65: break;
+ case 13: // returning action: empty string if flagged, otherwise normalized text without LB matches
+ { switch (problem) {
+ case 1: return "";
+ default: return normalized.replaceAll(LB, "");
+ }
+ }
+ case 66: break;
+ case 36:
+ { cv = VOWEL; add("ZIO");
+ }
+ case 67: break;
+ case 2:
+ { cv = VOWEL; add(yytext());
+ }
+ case 68: break;
+ case 17:
+ { add("Ó");
+ }
+ case 69: break;
+ case 23:
+ { add("ú");
+ }
+ case 70: break;
+ case 31:
+ { cv = CONS; add("Qu");
+ }
+ case 71: break;
+ case 20:
+ { add("é");
+ }
+ case 72: break;
+ case 7:
+ { cv = VOWEL; add("ae");
+ }
+ case 73: break;
+ case 12: // matched text is dropped from the normalized form
+ { add("");
+ }
+ case 74: break;
+ case 22:
+ { add("ó");
+ }
+ case 75: break;
+ case 9:
+ { cv = VOWEL; add("oe");
+ }
+ case 76: break;
+ case 29:
+ { cv = CONS; add("qu");
+ }
+ case 77: break;
+ case 25:
+ { switch(cv) { // v becomes u only when cv is CONS at this point
+ case CONS: add(yytext().replace("v", "u").replace("V", "U")); break;
+ default: cv = CONS; add(yytext()); break;
+ }
+ }
+ case 78: break;
+ case 27:
+ { cv = VOWEL; add("ii");
+ }
+ case 79: break;
+ case 16:
+ { add("Í");
+ }
+ case 80: break;
+ default:
+ if (zzInput == YYEOF && zzStartRead == zzCurrentPos) { // EOF reached with nothing left to match
+ zzAtEOF = true;
+ return null;
+ }
+ else {
+ zzScanError(ZZ_NO_MATCH);
+ }
+ }
+ }
+ }
+
+
+}
diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexIT.lex
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexIT.lex Wed Nov 09 15:32:05 2011 +0100
@@ -0,0 +1,183 @@
+/*
+ * Normalization rules for Italian text
+ * [this is a JFlex specification]
+ *
+ * Wolfgang Schmidle
+ * version 2011-07-12
+ *
+ */
+// package of the generated scanner; it has to match the source directory .../lt/text/norm/lang
+package de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang;
+
+%%
+
+%public
+%class MpdlNormalizerLexIT
+%type java.lang.String
+%unicode
+
+// Italian: it, ita
+// lexical states of the scanner (a state is entered with yybegin)
+%states DISP, DICT, SEARCH
+
+%{
+  private static final int CONS = 1;
+  private static final int VOWEL = 2;
+  private int cv = 0; // class of the last scanned character: consonant = 1, vowel = 2, everything else = 0
+  // text consumed so far: exactly as matched (original) and after the rule replacements (normalized)
+  private String original = "";
+  private String normalized = "";
+  private int problem = 0; // set to 1 by the rule for '@'; the END rules check it before returning
+  // records one match: the matched text goes to 'original', its replacement to 'normalized'
+  private void add (String norm) {
+    original += yytext();
+    normalized += norm;
+  }
+  // regular expression for a line break inside a word (hyphen or soft hyphen, then a space)
+  private static final String LB = "[\u002d\u00ad] ";
+%}
+
+Vowel = [AEIOUaeiouÆæęàèòùœ]   // a match of the {Vowel} rule sets cv = VOWEL
+Cons = [BCDFGHKLMNPQRSTVWXZbcdfghklmnpqrstvwxzſß]   // a match of the {Cons} rule sets cv = CONS
+LR = [lLrR]   // left context of the {LR} [vV] rule
+
+// line-break handling: a break inside a word is written as hyphen (or soft hyphen) plus space
+hyphen = [\u002d\u00ad] // hyphen and soft hyphen
+LB = {hyphen} \u0020
+lb = ({hyphen} \u0020)?   // optional line break
+
+END = \n   // the newline that terminates the input word
+
+prefixCons = (in{lb}ter | per | ſu{lb}per | ſer)   // prefixes matched at the start of the input by the first u/v rule
+
+%%
+
+<DICT> {
+// dictionary form only: a grave accent is replaced by an acute accent
+À { add("Á"); }
+È { add("É"); }
+Ì { add("Í"); }
+Ò { add("Ó"); }
+Ù { add("Ú"); }
+à { add("á"); }
+è { add("é"); }
+ì { add("í"); }
+ò { add("ó"); }
+ù { add("ú"); }
+
+}
+
+<DISP, DICT, SEARCH> {
+// replacements shared by all three forms
+ſ { cv = CONS; add("s"); }
+ß { cv = CONS; add("ss"); }
+æ { cv = VOWEL; add("ae"); }
+Æ { cv = VOWEL; add("AE"); }
+œ { cv = VOWEL; add("oe"); }
+Œ { cv = VOWEL; add("OE"); }
+
+ij { cv = VOWEL; add("ii"); }
+
+tio { cv = VOWEL; add("zio"); }
+TIO { cv = VOWEL; add("ZIO"); }
+
+// h rules taken from Arboreal: the whole words ha, hai, hanno, ho are kept; any other h at the start of the input is dropped
+^ ha / {END} { add(yytext()); }
+^ hai / {END} { add(yytext()); }
+^ han{lb}no / {END} { add(yytext()); }
+^ ho / {END} { add(yytext()); }
+^ h { add(""); }
+
+
+// u/v rules are taken from MpdlNormalizerLexLA.lex
+
+// 1. rules for u --> v
+
+^ {prefixCons} / {lb} { cv = VOWEL; add(yytext().replace("ſ", "s")); }
+
+^ [uU] / {Vowel} { cv = VOWEL; add(yytext().replaceAll("u", "v").replaceAll("U", "V")); }
+
+// u between two vowels becomes v; after anything else it is kept and counts as a vowel
+[uU] / {Vowel} {
+    switch(cv) {
+      case VOWEL: add(yytext().replace("u", "v").replace("U", "V")); break;
+      default: cv = VOWEL; add(yytext()); break;
+    }
+  }
+
+// 2. rules for v --> u
+
+qv { cv = CONS; add("qu"); } // the replaced v still counts as consonant
+Qv { cv = CONS; add("Qu"); }
+QV { cv = CONS; add("QU"); }
+// l or r plus v directly after a consonant: the v becomes u
+{LR} [vV] {
+    switch(cv) {
+      case CONS: add(yytext().replace("v", "u").replace("V", "U")); break;
+      default: cv = CONS; add(yytext()); break;
+    }
+  }
+
+v / {lb} {Cons} { cv = CONS; add("u"); }
+V / {lb} {Cons} { cv = CONS; add("U"); }
+
+// 3. override default rule for .
+
+{Vowel} { cv = VOWEL; add(yytext()); }
+{Cons} { cv = CONS; add(yytext()); }
+@ { problem = 1; cv = 0; add(yytext()); }
+{LB} { add(yytext()); }
+. { cv = 0; add(yytext()); }
+
+}
+
+
+<DISP> {
+// display form: the normalized text; the unchanged input if a problem was flagged
+{END} {
+    switch (problem) {
+      case 1: return original;
+      default: return normalized;
+    }
+  }
+}
+
+<DICT> {
+// dictionary form: normalized text without line breaks; the empty string if a problem was flagged
+{END} {
+    switch (problem) {
+      case 1: return "";
+      default: return normalized.replaceAll(LB, "");
+    }
+  }
+}
+
+<SEARCH> {
+// search form: normalized text without line breaks, lower-cased; the unchanged input if a problem was flagged
+{END} {
+    switch (problem) {
+      case 1: return original;
+      default: return normalized.replaceAll(LB, "").toLowerCase();
+    }
+  }
+}
+
+
+/*
+
+Assumptions:
+- the routine is called word by word, with a \n at the end of the string
+- line breaks inside a word are marked as hyphen/soft hyphen plus space
+
+TO DO:
+
+IT: all these rules are taken from Arboreal; do we need them all?
+IT: correct? complete?
+IT: Are the u/v rules really exactly the same as in LA? In particular: same vowel class?
+IT: adopt the changes made to the Latin u/v rules?
+IT: give Italian example words for the u/v rules
+IT: Do we really need the grave-accent rules from Arboreal in DICT?
+IT: if so: does À --> Á etc. belong in the dictionary layer? And should it be restricted to the last syllable?
+IT: is prefixCons = (inter | per | ſuper | ſer) also valid for Italian?
+
+*/
diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexLA.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexLA.java Wed Nov 09 15:32:05 2011 +0100
@@ -0,0 +1,1024 @@
+/* The following code was generated by JFlex 1.4.3 on 05.09.11 10:35 */
+
+/*
+ * Normalization rules for Latin text
+ * [this is a JFlex specification]
+ *
+ * Wolfgang Schmidle
+ * version 2011-07-12
+ *
+ */
+
+package de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang;
+
+
+/**
+ * This class is a scanner generated by
+ * JFlex 1.4.3
+ * on 05.09.11 10:35 from the specification file
+ * MpdlNormalizerLexLA.lex
+ */
+public class MpdlNormalizerLexLA {
+
+ /** This character denotes the end of file */
+ public static final int YYEOF = -1;
+
+ /** initial size of the lookahead buffer */
+ private static final int ZZ_BUFFERSIZE = 16384;
+
+ /** lexical states; each value is an even index l into ZZ_LEXSTATE */
+ public static final int RENAISSANCE_DICT = 8;
+ public static final int SEARCH = 10;
+ public static final int RENAISSANCE_DISP = 4;
+ public static final int DICT = 6;
+ public static final int YYINITIAL = 0;
+ public static final int RENAISSANCE_SEARCH = 12;
+ public static final int DISP = 2;
+
+ /**
+ * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l
+ * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l
+ * at the beginning of a line
+ * l is of the form l = 2*k, k a non negative integer
+ */
+ private static final int ZZ_LEXSTATE[] = {
+ 0, 0, 1, 2, 1, 2, 3, 4, 3, 4, 5, 6, 5, 6 // each RENAISSANCE_* state has the same entries as its plain counterpart
+ };
+
+ /**
+ * Translates characters to character classes
+ */
+ private static final String ZZ_CMAP_PACKED =
+ "\12\0\1\6\25\0\1\5\14\0\1\4\22\0\1\0\1\1\3\2"+
+ "\1\1\2\2\1\53\1\1\1\0\1\2\1\3\2\2\1\1\1\2"+
+ "\1\46\1\3\2\2\1\64\1\65\2\2\1\66\1\2\6\0\1\57"+
+ "\1\2\1\47\1\43\1\11\2\2\1\51\1\14\1\27\1\2\1\50"+
+ "\1\40\1\13\1\61\1\17\1\7\1\16\1\32\1\15\1\10\1\12"+
+ "\2\2\1\66\1\2\62\0\1\4\30\0\1\25\30\0\1\23\1\37"+
+ "\1\31\1\55\3\0\1\24\1\0\1\41\1\33\1\0\1\60\1\45"+
+ "\1\34\1\52\1\62\2\0\1\42\1\35\1\54\4\0\1\44\1\36"+
+ "\1\56\1\63\34\0\1\24\71\0\1\26\53\0\1\20\u0181\0\1\30"+
+ "\ud4fe\0\1\21\u0590\0\1\22\u226e\0";
+
+ /**
+ * Translates characters to character classes
+ */
+ private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED);
+
+ /**
+ * Translates DFA states to action switch labels.
+ */
+ private static final int [] ZZ_ACTION = zzUnpackAction();
+
+ private static final String ZZ_ACTION_PACKED_0 =
+ "\12\0\1\1\1\2\2\3\1\1\1\4\1\3\1\2"+
+ "\1\3\1\2\1\5\1\1\1\6\1\7\1\10\1\11"+
+ "\11\1\1\3\2\1\3\2\1\3\1\12\1\3\2\2"+
+ "\1\3\1\5\3\3\1\1\1\2\1\13\1\14\4\0"+
+ "\1\15\1\16\1\17\1\20\1\0\1\21\1\22\1\23"+
+ "\1\24\1\0\1\25\20\0\1\26\3\0\1\27\3\0"+
+ "\1\30\1\0\1\31\3\0\1\32\1\33\1\34\1\0"+
+ "\1\35\1\36\2\0\1\37\20\0\1\40\1\0\1\41"+
+ "\1\0\1\42\1\0\1\43\1\44\1\45\1\46\1\0"+
+ "\1\47\1\0\1\50\1\0\1\51\1\0\1\52\4\0"+
+ "\1\53\10\0\1\54\6\0\1\55\3\0\1\56\1\57"+
+ "\1\60\2\0\1\61\5\0\1\53";
+
+ private static int [] zzUnpackAction() {
+ int [] result = new int[179];
+ int offset = 0;
+ offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
+ return result;
+ }
+
+ private static int zzUnpackAction(String packed, int offset, int [] result) {
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.length();
+ while (i < l) {
+ int count = packed.charAt(i++);
+ int value = packed.charAt(i++);
+ do result[j++] = value; while (--count > 0);
+ }
+ return j;
+ }
+
+
+ /**
+ * Translates a state to a row index in the transition table
+ */
+ private static final int [] ZZ_ROWMAP = zzUnpackRowMap();
+
+ private static final String ZZ_ROWMAP_PACKED_0 =
+ "\0\0\0\67\0\156\0\245\0\334\0\u0113\0\u014a\0\u0181"+
+ "\0\u01b8\0\u01ef\0\u0226\0\u0226\0\u0226\0\u025d\0\u0294\0\u0226"+
+ "\0\u02cb\0\u0302\0\u0339\0\u0370\0\u0226\0\u01ef\0\u0226\0\u0226"+
+ "\0\u0226\0\u0226\0\u03a7\0\u03de\0\u0415\0\u044c\0\u0483\0\u04ba"+
+ "\0\u04f1\0\u0528\0\u055f\0\u0596\0\u05cd\0\u0604\0\u063b\0\u0672"+
+ "\0\u06a9\0\u06e0\0\u0226\0\u0717\0\u074e\0\u0785\0\u07bc\0\u07f3"+
+ "\0\u082a\0\u0861\0\u0898\0\u08cf\0\u0906\0\u0226\0\u0226\0\u093d"+
+ "\0\u0974\0\u09ab\0\u09e2\0\u0226\0\u0226\0\u0226\0\u0226\0\u0a19"+
+ "\0\u0226\0\u0226\0\u0226\0\u0226\0\u0a50\0\u0226\0\u0a87\0\u0abe"+
+ "\0\u0af5\0\u0b2c\0\u0b63\0\u0b9a\0\u0bd1\0\u0c08\0\u0c3f\0\u0c76"+
+ "\0\u0cad\0\u0ce4\0\u0d1b\0\u0d52\0\u0d89\0\u0dc0\0\u0226\0\u0df7"+
+ "\0\u0e2e\0\u0e65\0\u0226\0\u0e9c\0\u0ed3\0\u0f0a\0\u0226\0\u0f41"+
+ "\0\u0226\0\u0f78\0\u0faf\0\u0fe6\0\u0226\0\u0226\0\u0226\0\u101d"+
+ "\0\u0226\0\u0226\0\u1054\0\u108b\0\u0226\0\u10c2\0\u10f9\0\u1130"+
+ "\0\u1167\0\u119e\0\u11d5\0\u120c\0\u1243\0\u127a\0\u0226\0\u12b1"+
+ "\0\u12e8\0\u131f\0\u1356\0\u138d\0\u08cf\0\u0226\0\u13c4\0\u0226"+
+ "\0\u13fb\0\u0226\0\u1432\0\u0226\0\u0226\0\u0226\0\u0226\0\u1469"+
+ "\0\u0226\0\u14a0\0\u0226\0\u14d7\0\u0226\0\u150e\0\u0226\0\u1545"+
+ "\0\u157c\0\u15b3\0\u07bc\0\u15ea\0\u1621\0\u1658\0\u168f\0\u16c6"+
+ "\0\u16fd\0\u0226\0\u1734\0\u176b\0\u0226\0\u17a2\0\u17d9\0\u1810"+
+ "\0\u1847\0\u187e\0\u18b5\0\u0226\0\u18ec\0\u1923\0\u195a\0\u0226"+
+ "\0\u0226\0\u0226\0\u1991\0\u19c8\0\u0226\0\u19ff\0\u1a36\0\u1a6d"+
+ "\0\u1aa4\0\u1adb\0\u0226";
+
+ private static int [] zzUnpackRowMap() {
+ int [] result = new int[179];
+ int offset = 0;
+ offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
+ return result;
+ }
+
+ private static int zzUnpackRowMap(String packed, int offset, int [] result) {
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.length();
+ while (i < l) {
+ int high = packed.charAt(i++) << 16;
+ result[j++] = high | packed.charAt(i++);
+ }
+ return j;
+ }
+
+ /**
+ * The transition table of the DFA
+ */
+ private static final int [] ZZ_TRANS = zzUnpackTrans();
+
+ private static final String ZZ_TRANS_PACKED_0 =
+ "\21\0\1\12\45\0\1\13\1\14\1\15\1\16\1\17"+
+ "\1\13\1\20\1\21\1\22\1\14\1\23\1\15\1\24"+
+ "\1\15\1\16\1\15\1\25\1\26\1\13\1\27\1\30"+
+ "\1\31\1\32\2\13\1\33\1\15\1\34\1\35\1\36"+
+ "\1\37\1\40\1\15\1\41\1\42\1\15\1\43\1\13"+
+ "\1\44\1\15\1\16\1\15\1\13\1\15\1\13\1\45"+
+ "\1\46\1\47\1\13\1\50\2\13\1\51\1\52\1\53"+
+ "\1\13\1\14\1\15\1\16\1\17\1\13\1\20\1\54"+
+ "\1\55\1\14\1\23\1\15\1\56\1\15\1\16\1\57"+
+ "\1\60\1\26\1\13\1\27\1\30\1\31\1\32\2\13"+
+ "\1\33\1\15\1\34\1\35\1\36\1\37\1\40\1\15"+
+ "\1\41\1\42\1\15\1\43\1\13\1\61\1\15\1\16"+
+ "\1\62\1\13\1\63\1\64\1\45\1\46\1\47\1\13"+
+ "\1\50\2\13\1\65\1\52\1\53\1\13\1\14\1\15"+
+ "\1\16\1\17\1\13\1\66\1\21\1\22\1\14\1\23"+
+ "\1\15\1\24\1\15\1\16\1\15\1\25\1\26\1\13"+
+ "\1\27\1\30\1\31\1\32\2\13\1\33\1\15\1\34"+
+ "\1\35\1\36\1\37\1\40\1\15\1\41\1\42\1\15"+
+ "\1\43\1\13\1\44\1\15\1\16\1\15\1\13\1\15"+
+ "\1\13\1\45\1\46\1\47\1\13\1\50\2\13\1\51"+
+ "\1\52\1\53\1\13\1\14\1\15\1\16\1\17\1\13"+
+ "\1\66\1\54\1\55\1\14\1\23\1\15\1\56\1\15"+
+ "\1\16\1\57\1\60\1\26\1\13\1\27\1\30\1\31"+
+ "\1\32\2\13\1\33\1\15\1\34\1\35\1\36\1\37"+
+ "\1\40\1\15\1\41\1\42\1\15\1\43\1\13\1\61"+
+ "\1\15\1\16\1\62\1\13\1\63\1\64\1\45\1\46"+
+ "\1\47\1\13\1\50\2\13\1\65\1\52\1\53\1\13"+
+ "\1\14\1\15\1\16\1\17\1\13\1\67\1\21\1\22"+
+ "\1\14\1\23\1\15\1\24\1\15\1\16\1\15\1\25"+
+ "\1\26\1\13\1\27\1\30\1\31\1\32\2\13\1\33"+
+ "\1\15\1\34\1\35\1\36\1\37\1\40\1\15\1\41"+
+ "\1\42\1\15\1\43\1\13\1\44\1\15\1\16\1\15"+
+ "\1\13\1\15\1\13\1\45\1\46\1\47\1\13\1\50"+
+ "\2\13\1\51\1\52\1\53\1\13\1\14\1\15\1\16"+
+ "\1\17\1\13\1\67\1\54\1\55\1\14\1\23\1\15"+
+ "\1\56\1\15\1\16\1\57\1\60\1\26\1\13\1\27"+
+ "\1\30\1\31\1\32\2\13\1\33\1\15\1\34\1\35"+
+ "\1\36\1\37\1\40\1\15\1\41\1\42\1\15\1\43"+
+ "\1\13\1\61\1\15\1\16\1\62\1\13\1\63\1\64"+
+ "\1\45\1\46\1\47\1\13\1\50\2\13\1\65\1\52"+
+ "\1\53\14\0\1\70\2\0\1\71\1\72\53\0\1\73"+
+ "\103\0\1\74\145\0\1\75\52\0\1\75\6\0\1\76"+
+ "\73\0\1\77\15\0\1\100\37\0\1\101\6\0\2\101"+
+ "\2\0\1\101\7\0\3\101\30\0\1\101\1\0\1\101"+
+ "\1\102\1\103\1\101\4\0\2\104\1\105\2\0\1\104"+
+ "\2\0\2\104\1\0\4\104\2\0\1\104\6\0\1\104"+
+ "\5\0\1\104\2\0\1\104\2\0\4\104\1\0\1\104"+
+ "\11\0\1\104\30\0\1\106\46\0\1\107\2\0\2\110"+
+ "\1\0\2\111\13\0\1\111\5\0\1\111\35\0\1\112"+
+ "\2\0\2\113\1\0\2\114\13\0\1\114\5\0\1\114"+
+ "\35\0\1\115\2\0\2\116\1\0\2\117\13\0\1\117"+
+ "\5\0\1\117\35\0\1\120\2\0\2\121\1\0\2\122"+
+ "\13\0\1\122\5\0\1\122\35\0\1\123\1\0\1\124"+
+ "\2\125\1\0\2\126\13\0\1\126\5\0\1\126\34\0"+
+ "\1\127\1\107\22\0\1\130\5\0\1\131\6\0\1\132"+
+ "\25\0\1\133\1\112\5\0\1\134\1\135\13\0\1\136"+
+ "\42\0\1\137\1\120\33\0\1\140\31\0\1\141\23\0"+
+ "\1\142\5\0\1\143\7\0\1\144\30\0\1\145\52\0"+
+ "\1\146\7\0\1\127\1\107\6\0\1\147\102\0\1\150"+
+ "\114\0\1\30\66\0\1\32\1\0\1\151\5\0\1\101"+
+ "\6\0\2\101\2\0\1\101\7\0\3\101\30\0\1\101"+
+ "\1\0\1\101\2\0\1\101\4\0\2\152\1\153\2\0"+
+ "\1\152\2\0\2\152\1\0\4\152\2\0\1\152\6\0"+
+ "\1\152\5\0\1\152\2\0\1\152\2\0\4\152\1\0"+
+ "\1\152\11\0\1\152\11\0\1\154\1\0\1\77\15\0"+
+ "\1\100\37\0\1\155\6\0\2\155\2\0\1\155\7\0"+
+ "\3\155\30\0\1\155\1\0\1\155\1\102\1\103\1\155"+
+ "\15\0\1\156\13\0\1\106\50\0\1\157\65\0\1\160"+
+ "\1\157\65\0\1\161\1\0\1\145\52\0\1\146\53\0"+
+ "\1\162\66\0\1\163\22\0\1\137\61\0\1\155\6\0"+
+ "\2\155\2\0\1\155\7\0\3\155\30\0\1\155\1\0"+
+ "\1\155\2\0\1\155\15\0\1\164\64\0\1\165\65\0"+
+ "\1\166\1\165\61\0\1\167\72\0\1\170\63\0\1\171"+
+ "\71\0\1\110\67\0\1\172\64\0\1\107\2\0\2\110"+
+ "\63\0\1\113\67\0\1\173\64\0\1\112\2\0\2\113"+
+ "\63\0\1\116\67\0\1\174\64\0\1\115\2\0\2\116"+
+ "\63\0\1\121\67\0\1\175\64\0\1\120\2\0\2\121"+
+ "\63\0\1\125\64\0\1\176\71\0\1\177\64\0\1\123"+
+ "\2\0\2\125\61\0\1\200\1\201\65\0\1\202\1\203"+
+ "\65\0\1\204\66\0\1\205\66\0\1\206\66\0\1\207"+
+ "\1\210\65\0\1\211\1\212\65\0\1\213\1\214\65\0"+
+ "\1\215\1\216\65\0\1\217\66\0\1\213\65\0\1\220"+
+ "\126\0\1\221\25\0\1\222\10\0\1\223\67\0\1\224"+
+ "\54\0\1\225\12\0\1\223\114\0\1\226\70\0\1\227"+
+ "\66\0\1\230\23\0\1\231\10\0\1\71\67\0\1\232"+
+ "\54\0\1\233\12\0\1\71\60\0\1\234\57\0\2\104"+
+ "\3\0\1\104\2\0\2\104\1\0\4\104\2\0\1\104"+
+ "\6\0\1\104\5\0\1\104\2\0\1\104\2\0\4\104"+
+ "\1\0\1\104\11\0\1\104\7\0\1\127\66\0\1\133"+
+ "\66\0\1\235\66\0\1\141\70\0\1\236\66\0\1\237"+
+ "\66\0\1\240\66\0\1\241\66\0\1\242\66\0\1\243"+
+ "\60\0\2\152\3\0\1\152\2\0\2\152\1\0\4\152"+
+ "\2\0\1\152\6\0\1\152\5\0\1\152\2\0\1\152"+
+ "\2\0\4\152\1\0\1\152\11\0\1\152\7\0\1\244"+
+ "\65\0\1\245\65\0\1\246\67\0\1\247\67\0\1\250"+
+ "\66\0\1\251\66\0\1\252\65\0\1\253\66\0\1\254"+
+ "\67\0\1\255\71\0\1\256\66\0\1\257\66\0\1\260"+
+ "\66\0\1\261\66\0\1\150\66\0\1\262\72\0\1\223"+
+ "\56\0\1\263\100\0\1\223\64\0\1\71\70\0\1\71"+
+ "\55\0\1\200\66\0\1\202\66\0\1\207\66\0\1\211"+
+ "\66\0\1\215\60\0";
+
+ private static int [] zzUnpackTrans() {
+ int [] result = new int[6930];
+ int offset = 0;
+ offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
+ return result;
+ }
+
+ private static int zzUnpackTrans(String packed, int offset, int [] result) {
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.length();
+ while (i < l) {
+ int count = packed.charAt(i++);
+ int value = packed.charAt(i++);
+ value--;
+ do result[j++] = value; while (--count > 0);
+ }
+ return j;
+ }
+
+
+ /* error codes */
+ private static final int ZZ_UNKNOWN_ERROR = 0;
+ private static final int ZZ_NO_MATCH = 1;
+ private static final int ZZ_PUSHBACK_2BIG = 2;
+
+ /* error messages for the codes above */
+ private static final String ZZ_ERROR_MSG[] = {
+ "Unkown internal scanner error",
+ "Error: could not match input",
+ "Error: pushback value was too large"
+ };
+
+ /**
+ * ZZ_ATTRIBUTE[aState] contains the attributes of state aState
+ */
+ private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
+
+ private static final String ZZ_ATTRIBUTE_PACKED_0 =
+ "\10\0\1\1\1\0\3\11\2\1\1\11\4\1\1\11"+
+ "\1\1\4\11\20\1\1\11\12\1\2\11\4\0\4\11"+
+ "\1\0\4\11\1\0\1\11\20\0\1\11\3\0\1\11"+
+ "\3\0\1\11\1\0\1\11\3\0\3\11\1\0\2\11"+
+ "\2\0\1\11\11\0\1\11\6\0\1\11\1\0\1\11"+
+ "\1\0\1\11\1\0\4\11\1\0\1\11\1\0\1\11"+
+ "\1\0\1\11\1\0\1\11\4\0\1\1\5\0\1\11"+
+ "\2\0\1\11\6\0\1\11\3\0\3\11\2\0\1\11"+
+ "\5\0\1\11";
+
+ private static int [] zzUnpackAttribute() {
+ int [] result = new int[179];
+ int offset = 0;
+ offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
+ return result;
+ }
+
+ private static int zzUnpackAttribute(String packed, int offset, int [] result) {
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.length();
+ while (i < l) {
+ int count = packed.charAt(i++);
+ int value = packed.charAt(i++);
+ do result[j++] = value; while (--count > 0);
+ }
+ return j;
+ }
+
+ /** the input device */
+ private java.io.Reader zzReader;
+
+ /** the current state of the DFA */
+ private int zzState;
+
+ /** the current lexical state */
+ private int zzLexicalState = YYINITIAL;
+
+ /** this buffer contains the current text to be matched and is
+ the source of the yytext() string */
+ private char zzBuffer[] = new char[ZZ_BUFFERSIZE];
+
+ /** the textposition at the last accepting state */
+ private int zzMarkedPos;
+
+ /** the current text position in the buffer */
+ private int zzCurrentPos;
+
+ /** startRead marks the beginning of the yytext() string in the buffer */
+ private int zzStartRead;
+
+ /** endRead marks the last character in the buffer, that has been read
+ from input */
+ private int zzEndRead;
+
+ /** number of newlines encountered up to the start of the matched text */
+ private int yyline;
+
+ /** the number of characters up to the start of the matched text */
+ private int yychar;
+
+ /**
+ * the number of characters from the last newline up to the start of the
+ * matched text
+ */
+ private int yycolumn;
+
+ /**
+ * zzAtBOL == true <=> the scanner is currently at the beginning of a line
+ */
+ private boolean zzAtBOL = true;
+
+ /** zzAtEOF == true <=> the scanner is at the EOF */
+ private boolean zzAtEOF;
+
+ /** denotes if the user-EOF-code has already been executed */
+ private boolean zzEOFDone;
+
+ /** For the backwards DFA of general lookahead statements */
+ private boolean [] zzFin = new boolean [ZZ_BUFFERSIZE+1];
+
+ /* user code: */
+ private static final int CONS = 1;
+ private static final int VOWEL = 2;
+ private int cv = 0; // consonant = 1, vowel = 2, everything else = 0
+ // text consumed so far: exactly as matched (original) and after the rule replacements (normalized)
+ private String original = "";
+ private String normalized = "";
+ private int problem = 0;
+ // records one match: the matched text goes to 'original', its replacement 'norm' to 'normalized'
+ private void add (String norm) {
+ original += yytext();
+ normalized += norm;
+ }
+ // regular expression passed to String.replaceAll in yylex: hyphen or soft hyphen followed by a space
+ private static final String LB = "[\u002d\u00ad] ";
+
+
+ /**
+ * Creates a new scanner that reads its input from the given reader.
+ * There is also a java.io.InputStream version of this constructor.
+ * The constructor only stores the reader.
+ * @param in the java.io.Reader to read input from.
+ */
+ public MpdlNormalizerLexLA(java.io.Reader in) {
+ this.zzReader = in;
+ }
+
+ /**
+ * Creates a new scanner that reads from an input stream.
+ * The stream is wrapped in an InputStreamReader without an explicit charset, i.e. it is
+ * decoded with the platform default charset; pass a Reader instead to choose the encoding.
+ * @param in the java.io.Inputstream to read input from.
+ */
+ public MpdlNormalizerLexLA(java.io.InputStream in) {
+ this(new java.io.InputStreamReader(in));
+ }
+
+ /**
+ * Unpacks the compressed character translation table.
+ *
+ * @param packed the packed character translation table
+ * @return the unpacked character translation table
+ */
+ private static char [] zzUnpackCMap(String packed) {
+ char [] map = new char[0x10000];
+ int i = 0; /* index in packed string */
+ int j = 0; /* index in unpacked array */
+ while (i < 190) {
+ int count = packed.charAt(i++);
+ char value = packed.charAt(i++);
+ do map[j++] = value; while (--count > 0);
+ }
+ return map;
+ }
+
+
+ /**
+ * Refills the input buffer.
+ *
+ * @return <code>false</code>, iff there was new input.
+ *
+ * @exception java.io.IOException if any I/O-Error occurs
+ */
+ private boolean zzRefill() throws java.io.IOException {
+ // Returns false when at least one new char was buffered, true when the reader delivered nothing more.
+ /* first: make room (if you can) by moving the unread part to the front of the buffer */
+ if (zzStartRead > 0) {
+ System.arraycopy(zzBuffer, zzStartRead,
+ zzBuffer, 0,
+ zzEndRead-zzStartRead);
+
+ /* translate stored positions by the distance the content was moved */
+ zzEndRead-= zzStartRead;
+ zzCurrentPos-= zzStartRead;
+ zzMarkedPos-= zzStartRead;
+ zzStartRead = 0;
+ }
+
+ /* is the buffer big enough? */
+ if (zzCurrentPos >= zzBuffer.length) {
+ /* if not: replace it by one of twice the current position, keeping the content */
+ char newBuffer[] = new char[zzCurrentPos*2];
+ System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length);
+ zzBuffer = newBuffer;
+ }
+
+ /* finally: fill the free tail of the buffer with new input */
+ int numRead = zzReader.read(zzBuffer, zzEndRead,
+ zzBuffer.length-zzEndRead);
+
+ if (numRead > 0) {
+ zzEndRead+= numRead;
+ return false;
+ }
+ // unlikely but not impossible: read 0 characters, but not at end of stream
+ if (numRead == 0) {
+ int c = zzReader.read(); // single-char read to tell "nothing yet" from end of stream
+ if (c == -1) {
+ return true;
+ } else {
+ zzBuffer[zzEndRead++] = (char) c;
+ return false;
+ }
+ }
+
+ // numRead < 0: the reader reported end of stream
+ return true;
+ }
+
+
+ /**
+ * Marks the scanner as being at end of file and closes the reader, if there is one.
+ */
+ public final void yyclose() throws java.io.IOException {
+   zzAtEOF = true;           /* scanning sees end of file from now on */
+   zzEndRead = zzStartRead;  /* nothing in the buffer is readable any more */
+   // a scanner without a reader has nothing to close
+   if (zzReader != null) { zzReader.close(); }
+   // an exception thrown by close() is passed on to the caller
+ }
+
+
+ /**
+ * Points the scanner at a new reader; the old reader is not closed.
+ *
+ * The scanner positions and flags are reset, the buffered input of the old
+ * reader is no longer reachable, and the lexical state becomes YYINITIAL.
+ * The user-code fields (original, normalized, problem, cv) are not touched
+ * here, so text collected before the reset is still present afterwards.
+ *
+ * @param reader the new input stream
+ */
+ public final void yyreset(java.io.Reader reader) {
+   zzReader = reader;
+   zzLexicalState = YYINITIAL;
+   // flags: at the beginning of a line, not at end of file
+   zzAtBOL = true;
+   zzAtEOF = zzEOFDone = false;
+   // buffer positions and position counters all restart at 0
+   zzStartRead = zzEndRead = zzCurrentPos = zzMarkedPos = 0;
+   yyline = yychar = yycolumn = 0;
+ }
+
+
+ /**
+ * Returns the current lexical state, i.e. the value last set by yybegin or yyreset (initially YYINITIAL).
+ */
+ public final int yystate() {
+ return zzLexicalState;
+ }
+
+
+ /**
+ * Enters a new lexical state. The value is stored unchecked and is used by
+ * yylex as an index into ZZ_LEXSTATE when the next match starts.
+ * @param newState the new lexical state (one of the public state constants)
+ */
+ public final void yybegin(int newState) {
+ zzLexicalState = newState;
+ }
+
+
+ /**
+ * Returns the text matched by the current regular expression as a new String (buffer chars from zzStartRead up to zzMarkedPos).
+ */
+ public final String yytext() {
+ return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead );
+ }
+
+
+ /**
+ * Returns the character at position pos from the
+ * matched text.
+ *
+ * It is equivalent to yytext().charAt(pos), but faster
+ * because no String is created; pos is not range-checked against yylength().
+ * @param pos the position of the character to fetch.
+ * A value from 0 to yylength()-1.
+ *
+ * @return the character at position pos
+ */
+ public final char yycharat(int pos) {
+ return zzBuffer[zzStartRead+pos];
+ }
+
+
+ /**
+ * Returns the length of the matched text region (zzMarkedPos minus zzStartRead).
+ */
+ public final int yylength() {
+ return zzMarkedPos-zzStartRead;
+ }
+
+
+ /**
+ * Reports an error that occured while scanning.
+ *
+ * In a wellformed scanner (no or only correct usage of
+ * yypushback(int) and a match-all fallback rule) this method
+ * will only be called with things that "Can't Possibly Happen".
+ * If this method is called, something is seriously wrong
+ * (e.g. a JFlex bug producing a faulty scanner etc.).
+ *
+ * Usual syntax/scanner level error handling should be done
+ * in error fallback rules.
+ *
+ * @param errorCode the code of the errormessage to display
+ */
+ private void zzScanError(int errorCode) {
+ String message;
+ try {
+ message = ZZ_ERROR_MSG[errorCode];
+ }
+ catch (ArrayIndexOutOfBoundsException e) {
+ message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR];
+ }
+
+ throw new Error(message);
+ }
+
+
+ /**
+ * Gives back the last 'number' characters of the current match by moving
+ * the end of the match (zzMarkedPos) to the left.
+ *
+ * The characters are scanned again by the next call of the scanning method.
+ *
+ * @param number how many characters to give back; a value greater than yylength() is a scanner error
+ */
+ public void yypushback(int number) {
+   if (yylength() < number) {
+     zzScanError(ZZ_PUSHBACK_2BIG);  // always throws
+   }
+   zzMarkedPos = zzMarkedPos - number;
+ }
+
+
+ /**
+ * Scans the input and runs the action of each matched rule. Most actions only
+ * collect text via add(); scanning goes on until an action executes a return.
+ *
+ * @return the string returned by such an action, or null at the end of the input
+ * @exception java.io.IOException if any I/O-Error occurs
+ */
+ public java.lang.String yylex() throws java.io.IOException {
+ int zzInput;
+ int zzAction;
+
+ // cached fields (local copies of the scanner fields used in the inner loop):
+ int zzCurrentPosL;
+ int zzMarkedPosL;
+ int zzEndReadL = zzEndRead;
+ char [] zzBufferL = zzBuffer;
+ char [] zzCMapL = ZZ_CMAP;
+
+ int [] zzTransL = ZZ_TRANS;
+ int [] zzRowMapL = ZZ_ROWMAP;
+ int [] zzAttrL = ZZ_ATTRIBUTE;
+
+ while (true) {
+ zzMarkedPosL = zzMarkedPos;
+
+ if (zzMarkedPosL > zzStartRead) { // the previous match was not empty: derive zzAtBOL from its last char
+ switch (zzBufferL[zzMarkedPosL-1]) {
+ case '\n':
+ case '\u000B':
+ case '\u000C':
+ case '\u0085':
+ case '\u2028':
+ case '\u2029':
+ zzAtBOL = true;
+ break;
+ case '\r':
+ if (zzMarkedPosL < zzEndReadL)
+ zzAtBOL = zzBufferL[zzMarkedPosL] != '\n';
+ else if (zzAtEOF)
+ zzAtBOL = false;
+ else {
+ boolean eof = zzRefill();
+ zzMarkedPosL = zzMarkedPos;
+ zzEndReadL = zzEndRead;
+ zzBufferL = zzBuffer;
+ if (eof)
+ zzAtBOL = false;
+ else
+ zzAtBOL = zzBufferL[zzMarkedPosL] != '\n';
+ }
+ break;
+ default:
+ zzAtBOL = false;
+ }
+ }
+ zzAction = -1; // no accepting state reached yet
+
+ zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL;
+
+ if (zzAtBOL) // the odd ZZ_LEXSTATE entry is the start state at the beginning of a line
+ zzState = ZZ_LEXSTATE[zzLexicalState+1];
+ else
+ zzState = ZZ_LEXSTATE[zzLexicalState];
+
+
+ zzForAction: {
+ while (true) {
+
+ if (zzCurrentPosL < zzEndReadL)
+ zzInput = zzBufferL[zzCurrentPosL++];
+ else if (zzAtEOF) {
+ zzInput = YYEOF;
+ break zzForAction;
+ }
+ else {
+ // store back cached positions
+ zzCurrentPos = zzCurrentPosL;
+ zzMarkedPos = zzMarkedPosL;
+ boolean eof = zzRefill();
+ // get translated positions and possibly new buffer
+ zzCurrentPosL = zzCurrentPos;
+ zzMarkedPosL = zzMarkedPos;
+ zzBufferL = zzBuffer;
+ zzEndReadL = zzEndRead;
+ if (eof) {
+ zzInput = YYEOF;
+ break zzForAction;
+ }
+ else {
+ zzInput = zzBufferL[zzCurrentPosL++];
+ }
+ }
+ int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ];
+ if (zzNext == -1) break zzForAction; // no transition: stop with the last recorded accepting state
+ zzState = zzNext;
+
+ int zzAttributes = zzAttrL[zzState];
+ if ( (zzAttributes & 1) == 1 ) { // bit 0 set: record zzState as accepting and mark the position
+ zzAction = zzState;
+ zzMarkedPosL = zzCurrentPosL;
+ if ( (zzAttributes & 8) == 8 ) break zzForAction; // bit 3 set: stop scanning here
+ }
+
+ }
+ }
+
+ // store back cached position
+ zzMarkedPos = zzMarkedPosL;
+
+ switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) { // no accepting state (-1) ends up in the default branch
+ case 41:
+ // lookahead expression with fixed base length
+ zzMarkedPos = zzStartRead + 2;
+ { add("um");
+ }
+ case 50: break;
+ case 30:
+ // lookahead expression with fixed base length
+ zzMarkedPos = zzStartRead + 1;
+ { cv = CONS; add("U");
+ }
+ case 51: break;
+ case 15:
+ { add(yytext());
+ }
+ case 52: break;
+ case 48:
+ // lookahead expression with fixed base length
+ zzMarkedPos = zzStartRead + 3;
+ { add("Hic");
+ }
+ case 53: break;
+ case 8:
+ { cv = VOWEL; add("AE");
+ }
+ case 54: break;
+ case 1:
+ { problem = 1; cv = 0; add(yytext());
+ }
+ case 55: break;
+ case 4:
+ { switch (problem) {
+ case 1: return original;
+ default: return normalized;
+ }
+ }
+ case 56: break;
+ case 20:
+ // lookahead expression with fixed base length
+ zzMarkedPos = zzStartRead + 1;
+ { cv = CONS; add("u");
+ }
+ case 57: break;
+ case 10:
+ { cv = 0; add(yytext());
+ }
+ case 58: break;
+ case 12:
+ { switch (problem) {
+ case 1: return original;
+ default: return normalized.replaceAll(LB, "").toLowerCase();
+ }
+ }
+ case 59: break;
+ case 36:
+ // lookahead expression with fixed base length
+ zzMarkedPos = zzStartRead + 2;
+ { add("et");
+ }
+ case 60: break;
+ case 23:
+ // lookahead expression with fixed base length
+ zzMarkedPos = zzStartRead + 1;
+ { add("e");
+ }
+ case 61: break;
+ case 31:
+ // lookahead expression with fixed base length
+ zzMarkedPos = zzStartRead + 1;
+ { cv = VOWEL; add(yytext().replaceAll("u", "v").replaceAll("U", "V"));
+ }
+ case 62: break;
+ case 43:
+ // general lookahead, find correct zzMarkedPos
+ { int zzFState = 7;
+ int zzFPos = zzStartRead;
+ if (zzFin.length <= zzBufferL.length) { zzFin = new boolean[zzBufferL.length+1]; }
+ boolean zzFinL[] = zzFin;
+ while (zzFState != -1 && zzFPos < zzMarkedPos) {
+ if ((zzAttrL[zzFState] & 1) == 1) { zzFinL[zzFPos] = true; }
+ zzInput = zzBufferL[zzFPos++];
+ zzFState = zzTransL[ zzRowMapL[zzFState] + zzCMapL[zzInput] ];
+ }
+ if (zzFState != -1 && (zzAttrL[zzFState] & 1) == 1) { zzFinL[zzFPos] = true; }
+
+ zzFState = 8;
+ zzFPos = zzMarkedPos;
+ while (!zzFinL[zzFPos] || (zzAttrL[zzFState] & 1) != 1) {
+ zzInput = zzBufferL[--zzFPos];
+ zzFState = zzTransL[ zzRowMapL[zzFState] + zzCMapL[zzInput] ];
+ };
+ zzMarkedPos = zzFPos;
+ }
+ { cv = VOWEL; add(yytext().replace("ſ", "s"));
+ }
+ case 63: break;
+ case 3:
+ { cv = CONS; add(yytext());
+ }
+ case 64: break;
+ case 29:
+ { cv = VOWEL; add("oi");
+ }
+ case 65: break;
+ case 27:
+ { cv = CONS; add("QU");
+ }
+ case 66: break;
+ case 17:
+ // lookahead expression with fixed base length
+ zzMarkedPos = zzStartRead + 1;
+ { switch(cv) {
+ case VOWEL: add(yytext().replace("u", "v").replace("U", "V")); break;
+ default: cv = VOWEL; add(yytext()); break;
+ }
+ }
+ case 67: break;
+ case 6:
+ { cv = CONS; add("ss");
+ }
+ case 68: break;
+ case 5:
+ { cv = CONS; add("s");
+ }
+ case 69: break;
+ case 11:
+ { switch (problem) {
+ case 1: return "";
+ default: return normalized.replaceAll(LB, "");
+ }
+ }
+ case 70: break;
+ case 24:
+ // lookahead expression with fixed base length
+ zzMarkedPos = zzStartRead + 1;
+ { add("o");
+ }
+ case 71: break;
+ case 35:
+ // lookahead expression with fixed base length
+ zzMarkedPos = zzStartRead + 2;
+ { add("ac");
+ }
+ case 72: break;
+ case 2:
+ { cv = VOWEL; add(yytext());
+ }
+ case 73: break;
+ case 45:
+ // lookahead expression with fixed base length
+ zzMarkedPos = zzStartRead + 3;
+ { add("qui");
+ }
+ case 74: break;
+ case 37:
+ // lookahead expression with fixed base length
+ zzMarkedPos = zzStartRead + 2;
+ { add("er");
+ }
+ case 75: break;
+ case 26:
+ { cv = CONS; add("Qu");
+ }
+ case 76: break;
+ case 32:
+ // lookahead expression with fixed base length
+ zzMarkedPos = zzStartRead + 2;
+ { add("ve");
+ }
+ case 77: break;
+ case 40:
+ // lookahead expression with fixed base length
+ zzMarkedPos = zzStartRead + 2;
+ { add("us");
+ }
+ case 78: break;
+ case 34:
+ // lookahead expression with fixed base length
+ zzMarkedPos = zzStartRead + 2;
+ { add("am");
+ }
+ case 79: break;
+ case 7:
+ { cv = VOWEL; add("ae");
+ }
+ case 80: break;
+ case 28:
+ { add("ar");
+ }
+ case 81: break;
+ case 47:
+ // lookahead expression with fixed base length
+ zzMarkedPos = zzStartRead + 3;
+ { add("hic");
+ }
+ case 82: break;
+ case 19:
+ { cv = VOWEL; add("uu");
+ }
+ case 83: break;
+ case 42:
+ // lookahead expression with fixed base length
+ zzMarkedPos = zzStartRead + 2;
+ { add("ul");
+ }
+ case 84: break;
+ case 22:
+ // lookahead expression with fixed base length
+ zzMarkedPos = zzStartRead + 1;
+ { add("a");
+ }
+ case 85: break;
+ case 9:
+ { cv = VOWEL; add("oe");
+ }
+ case 86: break;
+ case 18:
+ { cv = VOWEL; add("ui");
+ }
+ case 87: break;
+ case 16:
+ { cv = CONS; add("qu");
+ }
+ case 88: break;
+ case 49:
+ // lookahead expression with fixed base length
+ zzMarkedPos = zzStartRead + 4;
+ { add("que");
+ }
+ case 89: break;
+ case 25:
+ // lookahead expression with fixed base length
+ zzMarkedPos = zzStartRead + 1;
+ { add("u");
+ }
+ case 90: break;
+ case 38:
+ // lookahead expression with fixed base length
+ zzMarkedPos = zzStartRead + 2;
+ { add("es");
+ }
+ case 91: break;
+ case 46:
+ // lookahead expression with fixed base length
+ zzMarkedPos = zzStartRead + 3;
+ { add("Qui");
+ }
+ case 92: break;
+ case 44:
+ // lookahead expression with fixed base length
+ zzMarkedPos = zzStartRead + 1;
+ { add("i");
+ }
+ case 93: break;
+ case 13:
+ { add("X");
+ }
+ case 94: break;
+ case 14:
+ { switch(cv) {
+ case CONS: add(yytext().replace("v", "u").replace("V", "U")); break;
+ default: cv = CONS; add(yytext()); break;
+ }
+ }
+ case 95: break;
+ case 21:
+ { cv = VOWEL; add("ii");
+ }
+ case 96: break;
+ case 33:
+ // lookahead expression with fixed base length
+ zzMarkedPos = zzStartRead + 2;
+ { add("as");
+ }
+ case 97: break;
+ case 39:
+ // lookahead expression with fixed base length
+ zzMarkedPos = zzStartRead + 2;
+ { add("od");
+ }
+ case 98: break;
+ default:
+ if (zzInput == YYEOF && zzStartRead == zzCurrentPos) { // end of input with nothing left to match
+ zzAtEOF = true;
+ return null;
+ }
+ else {
+ zzScanError(ZZ_NO_MATCH); // throws an Error: no rule matched the input
+ }
+ }
+ }
+ }
+
+
+}
diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexLA.lex
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexLA.lex Wed Nov 09 15:32:05 2011 +0100
@@ -0,0 +1,228 @@
+/*
+ * Normalization rules for Latin text
+ * [this is a JFlex specification]
+ *
+ * Wolfgang Schmidle
+ * version 2011-07-12
+ *
+ */
+
+package de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang; // must match this file's source directory (text/norm/lang), where the generated scanner lives
+
+%%
+
+%public
+%class MpdlNormalizerLexLA
+%type java.lang.String
+%unicode
+
+// Latin: la, lat
+
+%states DISP, DICT, SEARCH
+%states RENAISSANCE_DISP, RENAISSANCE_DICT, RENAISSANCE_SEARCH
+
+%{
+ private static final int CONS = 1;
+ private static final int VOWEL = 2;
+ private int cv = 0; // class of the text matched by the latest rule that sets it: consonant = 1, vowel = 2, everything else = 0
+
+ private String original = ""; // input text consumed so far, exactly as matched
+ private String normalized = ""; // normalized counterpart of 'original', built in parallel
+ private int problem = 0; // set to 1 by the '@' and catch-all '.' rules; the {END} rules then do not return the normalized text
+
+ private void add (String norm) { // appends the current match to 'original' and its replacement 'norm' to 'normalized'
+ original += yytext();
+ normalized += norm;
+ }
+
+ private static final String LB = "[\u002d\u00ad] "; // regex: hyphen or soft hyphen followed by a space; removed from the result by two of the {END} rules
+%}
+
+Vowel = [AEIOUaeiouÆæęœ] // without àèòù etc.
+Cons = [BCDFGHKLMNPQRSTVWXZbcdfghklmnpqrstvwxzſß]
+// y counts neither as Vowel nor as Cons, see the default rule below: [yY] { cv = 0; add(yytext()); }
+
+LR = [lLrR]
+
+hyphen = [\u002d\u00ad] // hyphen and soft hyphen
+LB = {hyphen} \u0020
+lb = ({hyphen} \u0020)?
+
+END = \n
+
+que = (que)? // optional -que
+enclitic = (que | ve | ne)
+prefixCons = (in{lb}ter | per | ſu{lb}per | ſer) // "ſer" for forms of ſervare
+
+%%
+
+
+// TEST, siehe Benedetti Seite 444
+𐆑 { add("X"); } // (U+10191; D800+DD91)
+
+
+ {
+
+// 1. simple replacements
+
+// 1.1 single characters
+ſ { cv = CONS; add("s"); }
+ß { cv = CONS; add("ss"); }
+[æę] { cv = VOWEL; add("ae"); }
+Æ { cv = VOWEL; add("AE"); }
+œ { cv = VOWEL; add("oe"); }
+
+// 1.2 character combinations
+ij { cv = VOWEL; add("ii"); }
+
+// 2. superfluous diacritics
+
+// 2.1 acute accent
+q́ue / {END} { add("que"); } // G
+á / [mrst]? {enclitic} {END} { add("a"); } // G
+é / [mrst]? {enclitic} {END} { add("e"); } // G
+í / [mrst]? {enclitic} {END} { add("i"); } // G
+ó / [mrst]? {enclitic} {END} { add("o"); } // G
+ú / [mrst]? {enclitic} {END} { add("u"); } // G
+
+úe / {END} { add("ve"); } // W ??
+
+// 2.2 grave accent
+à / {que} {END} { add("a"); } // W G
+àm / {que} {END} { add("am"); } // W (G)
+às / {que} {END} { add("as"); } // W (G) (-àsque will likely never occur)
+è / {que} {END} { add("e"); } // W G
+ò / {que} {END} { add("o"); } // W G
+òd / {que} {END} { add("od"); } // W (G)
+ùm / {que} {END} { add("um"); } // W (G)
+ùs / {que} {END} { add("us"); } // W G
+
+ès / {que} {END} { add("es"); } // (G)
+^ quì / {END} { add("qui"); } // W ??
+^ Quì / {END} { add("Qui"); } // W ??
+àc / {END} { add("ac"); } // W ??
+èr / {END} { add("er"); } // W ??
+èt / {END} { add("et"); } // W ??
+ù / {END} { add("u"); } // W ??
+ùl / {END} { add("ul"); } // W ??
+
+// 2.3 circumflex accent
+^ hîc / {END} { add("hic"); } // W G
+^ Hîc / {END} { add("Hic"); } // W G
+^ ô / {END} { add("o"); } // G
+â / {que} {END} { add("a"); } // W G
+ûs / {END} { add("us"); } // W G
+âr { add("ar"); } // W (G) --> this is only a rough approximation!
+
+// 2.4 trema
+// 2.4.1 common cases
+aë { cv = VOWEL; add("ae"); }
+oë { cv = VOWEL; add("oe"); }
+// 2.4.2 rare cases
+oï { cv = VOWEL; add("oi"); }
+uï { cv = VOWEL; add("ui"); }
+// 2.4.3 extremely rare cases
+uü { cv = VOWEL; add("uu"); }
+
+
+// 3. rules for u and v
+
+// 3.1 rules for u --> v
+
+// peruenias --> pervenias, interuallum --> intervallum
+^ {prefixCons} / {lb} { cv = VOWEL; add(yytext().replace("ſ", "s")); } // not cv = CONS !
+
+// uellet --> vellet
+^ [uU] / {Vowel} { cv = VOWEL; add(yytext().replaceAll("u", "v").replaceAll("U", "V")); }
+
+// diuidatur --> dividatur
+// ut, volui: unchanged
+// no rule for veruina because we cannot distinguish it from volui
+[uU] / {Vowel} {
+ switch(cv) {
+ case VOWEL: add(yytext().replace("u", "v").replace("U", "V")); break;
+ default: cv = VOWEL; add(yytext()); break;
+ }
+ }
+
+// 3.2 rules for v --> u
+
+// qvam --> quam
+qv { cv = CONS; add("qu"); } // the replaced v still counts as consonant
+Qv { cv = CONS; add("Qu"); }
+QV { cv = CONS; add("QU"); }
+
+// febrvarius --> februarius
+// curva: unchanged
+{LR} [vV] {
+ switch(cv) {
+ case CONS: add(yytext().replace("v", "u").replace("V", "U")); break;
+ default: cv = CONS; add(yytext()); break;
+ }
+ }
+
+// februarivs --> februarius
+v / {lb} {Cons} { cv = CONS; add("u"); }
+V / {lb} {Cons} { cv = CONS; add("U"); }
+
+// 3.3 override default rule for .
+
+{Vowel} { cv = VOWEL; add(yytext()); }
+{Cons} { cv = CONS; add(yytext()); }
+[yY] { cv = 0; add(yytext()); }
+
+@ { problem = 1; cv = 0; add(yytext()); }
+{LB} { add(yytext()); }
+. { problem = 1; cv = 0; add(yytext()); } // in particular from Arboreal: "〈" (2329), "〉" (232A), Ç, ç
+
+}
+
+
+ { // NOTE(review): the lexical-state list before "{" is missing here (presumably DISP, judging by the Dutch spec) - confirm against the original .lex file
+
+{END} { // returns the normalized text, or the unmodified input if a problem was flagged
+ switch (problem) {
+ case 1: return original;
+ default: return normalized;
+ }
+ }
+}
+
+ { // NOTE(review): state list missing (presumably DICT) - confirm against the original .lex file
+
+{END} { // returns the normalized text without line-break markers, or an empty string if a problem was flagged
+ switch (problem) {
+ case 1: return "";
+ default: return normalized.replaceAll(LB, "");
+ }
+ }
+}
+
+ { // NOTE(review): state list missing (presumably SEARCH) - confirm against the original .lex file
+
+{END} { // returns the lower-cased normalized text without line-break markers, or the unmodified input if a problem was flagged
+ switch (problem) {
+ case 1: return original;
+ default: return normalized.replaceAll(LB, "").toLowerCase();
+ }
+ }
+}
+
+
+/*
+
+Assumptions:
+- the routine is called word by word, with a \n at the end of the string
+- line breaks within a word are marked as hyphen/soft hyphen plus space
+
+
+TO DO:
+
+LA: Reconsider whether Ææęàèòùœ can be left out of the vowel class. They do no harm, though. (Or do they!?) Distinguish vowel classes before and after the u?
+LA: Go through the diacritics with Paul once more
+LA: The disambiguations by means of the diacritics are still missing.
+LA: is J really a problem case?
+LA: are there words like super-rv... or super-lv... in lower-case or upper-case letters?
+
+*/
diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexNL.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexNL.java Wed Nov 09 15:32:05 2011 +0100
@@ -0,0 +1,589 @@
+/* The following code was generated by JFlex 1.4.3 on 21.07.11 11:22 */
+
+/*
+ * Normalization rules for Dutch text
+ * [this is a JFlex specification]
+ *
+ * Wolfgang Schmidle
+ * version 2011-07-12
+ *
+ */
+
+package de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang;
+
+
+/**
+ * This class is a scanner generated by
+ * JFlex 1.4.3
+ * on 21.07.11 11:22 from the specification file
+ * MpdlNormalizerLexNL.lex
+ */
+public class MpdlNormalizerLexNL {
+
+ /** This character denotes the end of file */
+ public static final int YYEOF = -1;
+
+ /** initial size of the lookahead buffer */
+ private static final int ZZ_BUFFERSIZE = 16384;
+
+ /** lexical states */
+ public static final int SEARCH = 6;
+ public static final int DICT = 4;
+ public static final int YYINITIAL = 0;
+ public static final int DISP = 2;
+
+ /**
+ * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l
+ * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l
+ * at the beginning of a line
+ * l is of the form l = 2*k, k a non negative integer
+ */
+ private static final int ZZ_LEXSTATE[] = {
+ 0, 0, 1, 1, 2, 2, 3, 3
+ };
+
+ /**
+ * Translates characters to character classes
+ */
+ private static final String ZZ_CMAP_PACKED =
+ "\12\0\1\3\25\0\1\2\14\0\1\1\2\0\1\1\17\0\1\5"+
+ "\40\0\1\1\2\0\1\1\20\0\1\1\5\0\1\1\1\0\1\1"+
+ "\u0101\0\1\4\ufe80\0";
+
+ /**
+ * Translates characters to character classes
+ */
+ private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED);
+
+ /**
+ * Translates DFA states to action switch labels.
+ */
+ private static final int [] ZZ_ACTION = zzUnpackAction();
+
+ private static final String ZZ_ACTION_PACKED_0 =
+ "\4\0\2\1\1\2\1\3\1\4\1\5\1\6";
+
+ private static int [] zzUnpackAction() {
+ int [] result = new int[11];
+ int offset = 0;
+ offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
+ return result;
+ }
+
+ private static int zzUnpackAction(String packed, int offset, int [] result) {
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.length();
+ while (i < l) {
+ int count = packed.charAt(i++);
+ int value = packed.charAt(i++);
+ do result[j++] = value; while (--count > 0);
+ }
+ return j;
+ }
+
+
+ /**
+ * Translates a state to a row index in the transition table
+ */
+ private static final int [] ZZ_ROWMAP = zzUnpackRowMap();
+
+ private static final String ZZ_ROWMAP_PACKED_0 =
+ "\0\0\0\6\0\14\0\22\0\30\0\36\0\30\0\30"+
+ "\0\30\0\30\0\30";
+
+ private static int [] zzUnpackRowMap() {
+ int [] result = new int[11];
+ int offset = 0;
+ offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
+ return result;
+ }
+
+ private static int zzUnpackRowMap(String packed, int offset, int [] result) {
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.length();
+ while (i < l) {
+ int high = packed.charAt(i++) << 16;
+ result[j++] = high | packed.charAt(i++);
+ }
+ return j;
+ }
+
+ /**
+ * The transition table of the DFA
+ */
+ private static final int [] ZZ_TRANS = zzUnpackTrans();
+
+ private static final String ZZ_TRANS_PACKED_0 =
+ "\1\5\1\6\1\5\1\0\1\5\1\7\1\5\1\6"+
+ "\1\5\1\10\1\11\1\7\1\5\1\6\1\5\1\12"+
+ "\1\11\1\7\1\5\1\6\1\5\1\13\1\11\1\7"+
+ "\10\0\1\5\3\0";
+
+ private static int [] zzUnpackTrans() {
+ int [] result = new int[36];
+ int offset = 0;
+ offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
+ return result;
+ }
+
+ private static int zzUnpackTrans(String packed, int offset, int [] result) {
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.length();
+ while (i < l) {
+ int count = packed.charAt(i++);
+ int value = packed.charAt(i++);
+ value--;
+ do result[j++] = value; while (--count > 0);
+ }
+ return j;
+ }
+
+
+ /* error codes */
+ private static final int ZZ_UNKNOWN_ERROR = 0;
+ private static final int ZZ_NO_MATCH = 1;
+ private static final int ZZ_PUSHBACK_2BIG = 2;
+
+ /* error messages for the codes above */
+ private static final String ZZ_ERROR_MSG[] = {
+ "Unkown internal scanner error",
+ "Error: could not match input",
+ "Error: pushback value was too large"
+ };
+
+ /**
+ * ZZ_ATTRIBUTE[aState] contains the attributes of state aState
+ */
+ private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
+
+ private static final String ZZ_ATTRIBUTE_PACKED_0 =
+ "\4\0\1\11\1\1\5\11";
+
+ private static int [] zzUnpackAttribute() {
+ int [] result = new int[11];
+ int offset = 0;
+ offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
+ return result;
+ }
+
+ private static int zzUnpackAttribute(String packed, int offset, int [] result) {
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.length();
+ while (i < l) {
+ int count = packed.charAt(i++);
+ int value = packed.charAt(i++);
+ do result[j++] = value; while (--count > 0);
+ }
+ return j;
+ }
+
+ /** the input device */
+ private java.io.Reader zzReader;
+
+ /** the current state of the DFA */
+ private int zzState;
+
+ /** the current lexical state */
+ private int zzLexicalState = YYINITIAL;
+
+ /** this buffer contains the current text to be matched and is
+ the source of the yytext() string */
+ private char zzBuffer[] = new char[ZZ_BUFFERSIZE];
+
+ /** the textposition at the last accepting state */
+ private int zzMarkedPos;
+
+ /** the current text position in the buffer */
+ private int zzCurrentPos;
+
+ /** startRead marks the beginning of the yytext() string in the buffer */
+ private int zzStartRead;
+
+ /** endRead marks the last character in the buffer, that has been read
+ from input */
+ private int zzEndRead;
+
+ /** number of newlines encountered up to the start of the matched text */
+ private int yyline;
+
+ /** the number of characters up to the start of the matched text */
+ private int yychar;
+
+ /**
+ * the number of characters from the last newline up to the start of the
+ * matched text
+ */
+ private int yycolumn;
+
+ /**
+ * zzAtBOL == true <=> the scanner is currently at the beginning of a line
+ */
+ private boolean zzAtBOL = true;
+
+ /** zzAtEOF == true <=> the scanner is at the EOF */
+ private boolean zzAtEOF;
+
+ /** denotes if the user-EOF-code has already been executed */
+ private boolean zzEOFDone;
+
+ /* user code: */
+ private String original = "";
+ private String normalized = "";
+ private int problem = 0;
+
+ private void add (String norm) {
+ original += yytext();
+ normalized += norm;
+ }
+
+ private static final String LB = "[\u002d\u00ad] ";
+
+
+ /**
+ * Creates a new scanner
+ * There is also a java.io.InputStream version of this constructor.
+ *
+ * @param in the java.io.Reader to read input from.
+ */
+ public MpdlNormalizerLexNL(java.io.Reader in) {
+ this.zzReader = in;
+ }
+
+ /**
+ * Creates a new scanner.
+ * There is also java.io.Reader version of this constructor.
+ *
+ * @param in the java.io.Inputstream to read input from.
+ */
+ public MpdlNormalizerLexNL(java.io.InputStream in) {
+ this(new java.io.InputStreamReader(in));
+ }
+
+ /**
+ * Unpacks the compressed character translation table.
+ *
+ * @param packed the packed character translation table
+ * @return the unpacked character translation table
+ */
+ private static char [] zzUnpackCMap(String packed) {
+ char [] map = new char[0x10000];
+ int i = 0; /* index in packed string */
+ int j = 0; /* index in unpacked array */
+ while (i < 46) {
+ int count = packed.charAt(i++);
+ char value = packed.charAt(i++);
+ do map[j++] = value; while (--count > 0);
+ }
+ return map;
+ }
+
+
+ /**
+ * Refills the input buffer.
+ *
+ * @return <code>false</code>, iff there was new input.
+ *
+ * @exception java.io.IOException if any I/O-Error occurs
+ */
+ private boolean zzRefill() throws java.io.IOException {
+
+ /* first: make room (if you can) */
+ if (zzStartRead > 0) {
+ System.arraycopy(zzBuffer, zzStartRead,
+ zzBuffer, 0,
+ zzEndRead-zzStartRead);
+
+ /* translate stored positions */
+ zzEndRead-= zzStartRead;
+ zzCurrentPos-= zzStartRead;
+ zzMarkedPos-= zzStartRead;
+ zzStartRead = 0;
+ }
+
+ /* is the buffer big enough? */
+ if (zzCurrentPos >= zzBuffer.length) {
+ /* if not: blow it up */
+ char newBuffer[] = new char[zzCurrentPos*2];
+ System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length);
+ zzBuffer = newBuffer;
+ }
+
+ /* finally: fill the buffer with new input */
+ int numRead = zzReader.read(zzBuffer, zzEndRead,
+ zzBuffer.length-zzEndRead);
+
+ if (numRead > 0) {
+ zzEndRead+= numRead;
+ return false;
+ }
+ // unlikely but not impossible: read 0 characters, but not at end of stream
+ if (numRead == 0) {
+ int c = zzReader.read();
+ if (c == -1) {
+ return true;
+ } else {
+ zzBuffer[zzEndRead++] = (char) c;
+ return false;
+ }
+ }
+
+ // numRead < 0
+ return true;
+ }
+
+
+ /**
+ * Closes the input stream.
+ */
+ public final void yyclose() throws java.io.IOException {
+ zzAtEOF = true; /* indicate end of file */
+ zzEndRead = zzStartRead; /* invalidate buffer */
+
+ if (zzReader != null)
+ zzReader.close();
+ }
+
+
+ /**
+ * Resets the scanner to read from a new input stream.
+ * Does not close the old reader.
+ *
+ * All internal variables are reset, the old input stream
+ * cannot be reused (internal buffer is discarded and lost).
+ * Lexical state is set to ZZ_INITIAL.
+ *
+ * @param reader the new input stream
+ */
+ public final void yyreset(java.io.Reader reader) {
+ zzReader = reader;
+ zzAtBOL = true;
+ zzAtEOF = false;
+ zzEOFDone = false;
+ zzEndRead = zzStartRead = 0;
+ zzCurrentPos = zzMarkedPos = 0;
+ yyline = yychar = yycolumn = 0;
+ zzLexicalState = YYINITIAL;
+ }
+
+
+ /**
+ * Returns the current lexical state.
+ */
+ public final int yystate() {
+ return zzLexicalState;
+ }
+
+
+ /**
+ * Enters a new lexical state
+ *
+ * @param newState the new lexical state
+ */
+ public final void yybegin(int newState) {
+ zzLexicalState = newState;
+ }
+
+
+ /**
+ * Returns the text matched by the current regular expression.
+ */
+ public final String yytext() {
+ return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead );
+ }
+
+
+ /**
+ * Returns the character at position pos from the
+ * matched text.
+ *
+ * It is equivalent to yytext().charAt(pos), but faster
+ *
+ * @param pos the position of the character to fetch.
+ * A value from 0 to yylength()-1.
+ *
+ * @return the character at position pos
+ */
+ public final char yycharat(int pos) {
+ return zzBuffer[zzStartRead+pos];
+ }
+
+
+ /**
+ * Returns the length of the matched text region.
+ */
+ public final int yylength() {
+ return zzMarkedPos-zzStartRead;
+ }
+
+
+ /**
+ * Reports an error that occured while scanning.
+ *
+ * In a wellformed scanner (no or only correct usage of
+ * yypushback(int) and a match-all fallback rule) this method
+ * will only be called with things that "Can't Possibly Happen".
+ * If this method is called, something is seriously wrong
+ * (e.g. a JFlex bug producing a faulty scanner etc.).
+ *
+ * Usual syntax/scanner level error handling should be done
+ * in error fallback rules.
+ *
+ * @param errorCode the code of the errormessage to display
+ */
+ private void zzScanError(int errorCode) {
+ String message;
+ try {
+ message = ZZ_ERROR_MSG[errorCode];
+ }
+ catch (ArrayIndexOutOfBoundsException e) {
+ message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR];
+ }
+
+ throw new Error(message);
+ }
+
+
+ /**
+ * Pushes the specified amount of characters back into the input stream.
+ *
+ * They will be read again by then next call of the scanning method
+ *
+ * @param number the number of characters to be read again.
+ * This number must not be greater than yylength()!
+ */
+ public void yypushback(int number) {
+ if ( number > yylength() )
+ zzScanError(ZZ_PUSHBACK_2BIG);
+
+ zzMarkedPos -= number;
+ }
+
+
+ /**
+ * Resumes scanning until the next regular expression is matched,
+ * the end of input is encountered or an I/O-Error occurs.
+ *
+ * @return the next token
+ * @exception java.io.IOException if any I/O-Error occurs
+ */
+ public java.lang.String yylex() throws java.io.IOException {
+ int zzInput;
+ int zzAction;
+
+ // cached fields:
+ int zzCurrentPosL;
+ int zzMarkedPosL;
+ int zzEndReadL = zzEndRead;
+ char [] zzBufferL = zzBuffer;
+ char [] zzCMapL = ZZ_CMAP;
+
+ int [] zzTransL = ZZ_TRANS;
+ int [] zzRowMapL = ZZ_ROWMAP;
+ int [] zzAttrL = ZZ_ATTRIBUTE;
+
+ while (true) {
+ zzMarkedPosL = zzMarkedPos;
+
+ zzAction = -1;
+
+ zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL;
+
+ zzState = ZZ_LEXSTATE[zzLexicalState];
+
+
+ zzForAction: {
+ while (true) {
+
+ if (zzCurrentPosL < zzEndReadL)
+ zzInput = zzBufferL[zzCurrentPosL++];
+ else if (zzAtEOF) {
+ zzInput = YYEOF;
+ break zzForAction;
+ }
+ else {
+ // store back cached positions
+ zzCurrentPos = zzCurrentPosL;
+ zzMarkedPos = zzMarkedPosL;
+ boolean eof = zzRefill();
+ // get translated positions and possibly new buffer
+ zzCurrentPosL = zzCurrentPos;
+ zzMarkedPosL = zzMarkedPos;
+ zzBufferL = zzBuffer;
+ zzEndReadL = zzEndRead;
+ if (eof) {
+ zzInput = YYEOF;
+ break zzForAction;
+ }
+ else {
+ zzInput = zzBufferL[zzCurrentPosL++];
+ }
+ }
+ int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ];
+ if (zzNext == -1) break zzForAction;
+ zzState = zzNext;
+
+ int zzAttributes = zzAttrL[zzState];
+ if ( (zzAttributes & 1) == 1 ) {
+ zzAction = zzState;
+ zzMarkedPosL = zzCurrentPosL;
+ if ( (zzAttributes & 8) == 8 ) break zzForAction;
+ }
+
+ }
+ }
+
+ // store back cached position
+ zzMarkedPos = zzMarkedPosL;
+
+ switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
+ case 5:
+ { switch (problem) {
+ case 1: return "";
+ default: return normalized.replaceAll(LB, "");
+ }
+ }
+ case 7: break;
+ case 2:
+ { problem = 1; add(yytext());
+ }
+ case 8: break;
+ case 4:
+ { add("s");
+ }
+ case 9: break;
+ case 3:
+ { switch (problem) {
+ case 1: return original;
+ default: return normalized;
+ }
+ }
+ case 10: break;
+ case 6:
+ { switch (problem) {
+ case 1: return original;
+ default: return normalized.replaceAll(LB, "").toLowerCase();
+ }
+ }
+ case 11: break;
+ case 1:
+ { add(yytext());
+ }
+ case 12: break;
+ default:
+ if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
+ zzAtEOF = true;
+ return null;
+ }
+ else {
+ zzScanError(ZZ_NO_MATCH);
+ }
+ }
+ }
+ }
+
+
+}
diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexNL.lex
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexNL.lex Wed Nov 09 15:32:05 2011 +0100
@@ -0,0 +1,99 @@
+/*
+ * Normalization rules for Dutch text
+ * [this is a JFlex specification]
+ *
+ * Wolfgang Schmidle
+ * version 2011-07-12
+ *
+ */
+
+package de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang; // must match the package of the checked-in generated scanner MpdlNormalizerLexNL.java
+
+%%
+
+%public
+%class MpdlNormalizerLexNL
+%type java.lang.String
+%unicode
+
+// Dutch: nl
+
+%states DISP, DICT, SEARCH
+
+%{
+ private String original = ""; // input text consumed so far, exactly as matched
+ private String normalized = ""; // normalized counterpart of 'original', built in parallel
+ private int problem = 0; // set to 1 by the '@' rule; the {END} rules then do not return the normalized text
+
+ private void add (String norm) { // appends the current match to 'original' and its replacement 'norm' to 'normalized'
+ original += yytext();
+ normalized += norm;
+ }
+
+ private static final String LB = "[\u002d\u00ad] "; // regex: hyphen or soft hyphen followed by a space; removed from the result by two of the {END} rules
+%}
+
+hyphen = [\u002d\u00ad] // hyphen and soft hyphen; 4-digit escapes as in the Latin spec, because JFlex 1.4.3 reads the braced escape form as the literal characters u { 0 a d }
+LB = {hyphen} \u0020 // line break inside a word: hyphen or soft hyphen followed by a space
+// lb = ({hyphen} \u0020)?
+
+END = \n // the input word is expected to end with a newline
+
+%%
+
+<DISP, DICT, SEARCH> {
+
+ſ { add("s"); } // long s is normalized to a plain s
+
+}
+
+
+// default
+
+@ { problem = 1; add(yytext()); } // '@' flags the word as problematic
+{LB} { add(yytext()); } // line-break markers are kept here and stripped in the {END} rules where required
+. { add(yytext()); } // every other character is copied unchanged
+
+
+<DISP> {
+
+{END} { // returns the normalized text, or the unmodified input if a problem was flagged
+ switch (problem) {
+ case 1: return original;
+ default: return normalized;
+ }
+ }
+}
+
+<DICT> {
+
+{END} { // returns the normalized text without line-break markers, or an empty string if a problem was flagged
+ switch (problem) {
+ case 1: return "";
+ default: return normalized.replaceAll(LB, "");
+ }
+ }
+}
+
+<SEARCH> {
+
+{END} { // returns the lower-cased normalized text without line-break markers, or the unmodified input if a problem was flagged
+ switch (problem) {
+ case 1: return original;
+ default: return normalized.replaceAll(LB, "").toLowerCase();
+ }
+ }
+}
+
+
+/*
+
+Assumptions:
+- the routine is called word by word, with a \n at the end of the string
+- line breaks within a word are marked as hyphen/soft hyphen plus space
+
+TO DO:
+
+NL: complete?
+
+*/
diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexTemplate.lex
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexTemplate.lex Wed Nov 09 15:32:05 2011 +0100
@@ -0,0 +1,89 @@
+/*
+ * Template for normalization rules
+ * [this is a JFlex specification]
+ *
+ * Wolfgang Schmidle
+ * version 2011-07-12
+ *
+ */
+
+package de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang; // must match this file's source directory (text/norm/lang), where generated scanners live
+
+%%
+
+%public
+%class MpdlNormalizerLexTemplate
+%type java.lang.String
+%unicode
+
+// Language: list of ISO codes
+
+%states DISP, DICT, SEARCH
+
+%{
+ private String original = ""; // input text consumed so far, exactly as matched
+ private String normalized = ""; // normalized counterpart of 'original', built in parallel
+ private int problem = 0; // set to 1 by the '@' rule; the {END} rules then do not return the normalized text
+
+ private void add (String norm) { // appends the current match to 'original' and its replacement 'norm' to 'normalized'
+ original += yytext();
+ normalized += norm;
+ }
+
+ private static final String LB = "[\u002d\u00ad] "; // regex: hyphen or soft hyphen followed by a space; removed from the result by two of the {END} rules
+%}
+
+hyphen = [\u002d\u00ad] // hyphen and soft hyphen; 4-digit escapes as in the Latin spec, because JFlex 1.4.3 reads the braced escape form as the literal characters u { 0 a d }
+LB = {hyphen} \u0020 // line break inside a word: hyphen or soft hyphen followed by a space
+// lb = ({hyphen} \u0020)?
+
+END = \n // the input word is expected to end with a newline
+
+%%
+
+<DISP, DICT, SEARCH> {
+
+ſ { add("s"); } // sample rule
+
+}
+
+
+// default rules
+
+@ { problem = 1; add(yytext()); } // '@' flags the word as problematic
+{LB} { add(yytext()); } // line-break markers are kept here and stripped in the {END} rules where required
+. { add(yytext()); } // every other character is copied unchanged
+
+
+// at the end, determine which string to return
+
+<DISP> {
+
+{END} { // returns the normalized text, or the unmodified input if a problem was flagged
+ switch (problem) {
+ case 1: return original;
+ default: return normalized;
+ }
+ }
+}
+
+<DICT> {
+
+{END} { // returns the normalized text without line-break markers, or an empty string if a problem was flagged
+ switch (problem) {
+ case 1: return "";
+ default: return normalized.replaceAll(LB, "");
+ }
+ }
+}
+
+<SEARCH> {
+
+{END} { // returns the lower-cased normalized text without line-break markers, or the unmodified input if a problem was flagged
+ switch (problem) {
+ case 1: return original;
+ default: return normalized.replaceAll(LB, "").toLowerCase();
+ }
+ }
+}
+
diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexZH.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexZH.java Wed Nov 09 15:32:05 2011 +0100
@@ -0,0 +1,637 @@
+/* The following code was generated by JFlex 1.4.3 on 21.07.11 11:22 */
+
+/*
+ * Normalization rules for Chinese text
+ * [this is a JFlex specification]
+ *
+ * Wolfgang Schmidle
+ * version 2011-02-28
+ *
+ */
+
+package de.mpg.mpiwg.berlin.mpdl.lt.text.norm.lang;
+
+
+/**
+ * This class is a scanner generated by
+ * JFlex 1.4.3
+ * on 21.07.11 11:22 from the specification file
+ * MpdlNormalizerLexZH.lex
+ */
+public class MpdlNormalizerLexZH {
+
+ /** This character denotes the end of file */
+ public static final int YYEOF = -1;
+
+ /** initial size of the lookahead buffer */
+ private static final int ZZ_BUFFERSIZE = 16384;
+
+ /** lexical states */
+ public static final int SEARCH = 6;
+ public static final int DICT = 4;
+ public static final int YYINITIAL = 0;
+ public static final int DISP = 2;
+
+ /**
+ * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l
+ * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l
+ * at the beginning of a line
+ * l is of the form l = 2*k, k a non negative integer
+ */
+ private static final int ZZ_LEXSTATE[] = {
+ 0, 0, 1, 1, 2, 2, 3, 3
+ };
+
+ /**
+ * Translates characters to character classes
+ */
+ private static final String ZZ_CMAP_PACKED =
+ "\12\0\1\2\45\0\1\1\1\0\1\1\15\0\1\20\41\0\1\1"+
+ "\22\0\1\1\5\0\1\1\1\0\1\1\u4f84\0\1\3\176\0\1\4"+
+ "\u035a\0\1\4\u0a9a\0\1\6\u0781\0\1\10\u057a\0\1\11\u06bd\0\1\12"+
+ "\15\0\1\7\u0891\0\1\5\u1baf\0\1\13\340\0\1\14\u411a\0\1\16"+
+ "\u040e\0\1\17\u1d8f\0\1\15\u05e2\0";
+
+ /**
+ * Translates characters to character classes
+ */
+ private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED);
+
+ /**
+ * Translates DFA states to action switch labels.
+ */
+ private static final int [] ZZ_ACTION = zzUnpackAction();
+
+ private static final String ZZ_ACTION_PACKED_0 =
+ "\4\0\1\1\1\2\1\3\1\4\1\5\1\6\1\7"+
+ "\1\10\1\11\1\12\1\13\1\14\1\15\1\16\1\1"+
+ "\1\17\1\20\1\21";
+
+ private static int [] zzUnpackAction() {
+ int [] result = new int[22];
+ int offset = 0;
+ offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
+ return result;
+ }
+
+ private static int zzUnpackAction(String packed, int offset, int [] result) {
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.length();
+ while (i < l) {
+ int count = packed.charAt(i++);
+ int value = packed.charAt(i++);
+ do result[j++] = value; while (--count > 0);
+ }
+ return j;
+ }
+
+
+ /**
+ * Translates a state to a row index in the transition table
+ */
+ private static final int [] ZZ_ROWMAP = zzUnpackRowMap();
+
+ private static final String ZZ_ROWMAP_PACKED_0 =
+ "\0\0\0\21\0\42\0\63\0\104\0\104\0\104\0\104"+
+ "\0\104\0\104\0\104\0\104\0\104\0\104\0\104\0\104"+
+ "\0\104\0\104\0\125\0\104\0\104\0\104";
+
+ private static int [] zzUnpackRowMap() {
+ int [] result = new int[22];
+ int offset = 0;
+ offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
+ return result;
+ }
+
+ private static int zzUnpackRowMap(String packed, int offset, int [] result) {
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.length();
+ while (i < l) {
+ int high = packed.charAt(i++) << 16;
+ result[j++] = high | packed.charAt(i++);
+ }
+ return j;
+ }
+
+ /**
+ * The transition table of the DFA
+ */
+ private static final int [] ZZ_TRANS = zzUnpackTrans();
+
+ private static final String ZZ_TRANS_PACKED_0 =
+ "\2\5\1\0\15\5\1\6\2\5\1\7\1\10\1\11"+
+ "\1\12\1\13\1\14\1\15\1\16\1\17\1\20\1\21"+
+ "\1\22\1\23\1\5\1\6\1\5\1\24\1\25\1\10"+
+ "\1\11\1\12\1\13\1\14\1\15\1\16\1\17\1\20"+
+ "\1\21\1\22\1\23\1\5\1\6\1\5\1\24\1\7"+
+ "\1\10\1\11\1\12\1\13\1\14\1\15\1\16\1\17"+
+ "\1\20\1\21\1\22\1\23\1\5\1\6\40\0\1\26"+
+ "\1\0";
+
+ private static int [] zzUnpackTrans() {
+ int [] result = new int[102];
+ int offset = 0;
+ offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
+ return result;
+ }
+
+ private static int zzUnpackTrans(String packed, int offset, int [] result) {
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.length();
+ while (i < l) {
+ int count = packed.charAt(i++);
+ int value = packed.charAt(i++);
+ value--;
+ do result[j++] = value; while (--count > 0);
+ }
+ return j;
+ }
+
+
+ /* error codes */
+ private static final int ZZ_UNKNOWN_ERROR = 0;
+ private static final int ZZ_NO_MATCH = 1;
+ private static final int ZZ_PUSHBACK_2BIG = 2;
+
+ /* error messages for the codes above */
+ private static final String ZZ_ERROR_MSG[] = {
+ "Unkown internal scanner error",
+ "Error: could not match input",
+ "Error: pushback value was too large"
+ };
+
+ /**
+ * ZZ_ATTRIBUTE[aState] contains the attributes of state aState
+ */
+ private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
+
+ private static final String ZZ_ATTRIBUTE_PACKED_0 =
+ "\4\0\16\11\1\1\3\11";
+
+ private static int [] zzUnpackAttribute() {
+ int [] result = new int[22];
+ int offset = 0;
+ offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
+ return result;
+ }
+
+ private static int zzUnpackAttribute(String packed, int offset, int [] result) {
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.length();
+ while (i < l) {
+ int count = packed.charAt(i++);
+ int value = packed.charAt(i++);
+ do result[j++] = value; while (--count > 0);
+ }
+ return j;
+ }
+
+ /** the input device */
+ private java.io.Reader zzReader;
+
+ /** the current state of the DFA */
+ private int zzState;
+
+ /** the current lexical state */
+ private int zzLexicalState = YYINITIAL;
+
+ /** this buffer contains the current text to be matched and is
+ the source of the yytext() string */
+ private char zzBuffer[] = new char[ZZ_BUFFERSIZE];
+
+ /** the textposition at the last accepting state */
+ private int zzMarkedPos;
+
+ /** the current text position in the buffer */
+ private int zzCurrentPos;
+
+ /** startRead marks the beginning of the yytext() string in the buffer */
+ private int zzStartRead;
+
+ /** endRead marks the last character in the buffer, that has been read
+ from input */
+ private int zzEndRead;
+
+ /** number of newlines encountered up to the start of the matched text */
+ private int yyline;
+
+ /** the number of characters up to the start of the matched text */
+ private int yychar;
+
+ /**
+ * the number of characters from the last newline up to the start of the
+ * matched text
+ */
+ private int yycolumn;
+
+ /**
+ * zzAtBOL == true <=> the scanner is currently at the beginning of a line
+ */
+ private boolean zzAtBOL = true;
+
+ /** zzAtEOF == true <=> the scanner is at the EOF */
+ private boolean zzAtEOF;
+
+ /** denotes if the user-EOF-code has already been executed */
+ private boolean zzEOFDone;
+
+ /* user code: */
+ private String original = "";
+ private String normalized = "";
+ private int problem = 0;
+
+ private void add (String norm) {
+ original += yytext();
+ normalized += norm;
+ }
+
+
+ /**
+ * Creates a new scanner
+ * There is also a java.io.InputStream version of this constructor.
+ *
+ * @param in the java.io.Reader to read input from.
+ */
+ public MpdlNormalizerLexZH(java.io.Reader in) {
+ this.zzReader = in;
+ }
+
+ /**
+ * Creates a new scanner.
+ * There is also java.io.Reader version of this constructor.
+ *
+ * @param in the java.io.Inputstream to read input from.
+ */
+ public MpdlNormalizerLexZH(java.io.InputStream in) {
+ this(new java.io.InputStreamReader(in));
+ }
+
+ /**
+ * Unpacks the compressed character translation table.
+ *
+ * @param packed the packed character translation table
+ * @return the unpacked character translation table
+ */
+ private static char [] zzUnpackCMap(String packed) {
+ char [] map = new char[0x10000];
+ int i = 0; /* index in packed string */
+ int j = 0; /* index in unpacked array */
+ while (i < 90) {
+ int count = packed.charAt(i++);
+ char value = packed.charAt(i++);
+ do map[j++] = value; while (--count > 0);
+ }
+ return map;
+ }
+
+
+ /**
+ * Refills the input buffer.
+ *
+ * Unconsumed characters are first moved to the front of the buffer, the
+ * buffer is enlarged if the current position has reached its end, and then
+ * the reader is asked for more characters.
+ *
+ * @return {@code false} iff there was new input; {@code true} when the
+ * reader reported the end of the stream.
+ *
+ * @exception java.io.IOException if any I/O-Error occurs
+ */
+ private boolean zzRefill() throws java.io.IOException {
+
+ /* first: make room (if you can) */
+ if (zzStartRead > 0) {
+ System.arraycopy(zzBuffer, zzStartRead,
+ zzBuffer, 0,
+ zzEndRead-zzStartRead);
+
+ /* translate stored positions */
+ zzEndRead-= zzStartRead;
+ zzCurrentPos-= zzStartRead;
+ zzMarkedPos-= zzStartRead;
+ zzStartRead = 0;
+ }
+
+ /* is the buffer big enough? */
+ if (zzCurrentPos >= zzBuffer.length) {
+ /* if not: blow it up */
+ char newBuffer[] = new char[zzCurrentPos*2];
+ System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length);
+ zzBuffer = newBuffer;
+ }
+
+ /* finally: fill the buffer with new input */
+ int numRead = zzReader.read(zzBuffer, zzEndRead,
+ zzBuffer.length-zzEndRead);
+
+ if (numRead > 0) {
+ zzEndRead+= numRead;
+ return false;
+ }
+ // unlikely but not impossible: read 0 characters, but not at end of stream
+ if (numRead == 0) {
+ // fall back to a single-character read to tell "nothing yet" from end of stream
+ int c = zzReader.read();
+ if (c == -1) {
+ return true;
+ } else {
+ zzBuffer[zzEndRead++] = (char) c;
+ return false;
+ }
+ }
+
+ // numRead < 0
+ return true;
+ }
+
+
+ /**
+  * Marks the scanner as being at end of input, discards the buffered
+  * characters and closes the underlying reader (if there is one).
+  */
+ public final void yyclose() throws java.io.IOException {
+   zzAtEOF = true;            /* indicate end of file */
+   zzEndRead = zzStartRead;   /* invalidate buffer */
+
+   if (zzReader == null) {
+     return;
+   }
+   zzReader.close();
+ }
+
+
+ /**
+  * Points the scanner at a new input stream without closing the old reader.
+  *
+  * Every scanner position and flag is reset; whatever was still buffered
+  * from the previous stream is lost, so that stream cannot be resumed.
+  * The lexical state becomes YYINITIAL.
+  *
+  * @param reader the new input stream
+  */
+ public final void yyreset(java.io.Reader reader) {
+   zzReader = reader;
+
+   // end-of-input / line-start flags
+   zzAtBOL = true;
+   zzAtEOF = false;
+   zzEOFDone = false;
+
+   // buffer positions
+   zzStartRead = 0;
+   zzEndRead = 0;
+   zzMarkedPos = 0;
+   zzCurrentPos = 0;
+
+   // position counters
+   yycolumn = 0;
+   yychar = 0;
+   yyline = 0;
+
+   zzLexicalState = YYINITIAL;
+ }
+
+
+ /**
+  * Reports which lexical state the scanner is currently in.
+  */
+ public final int yystate() {
+   return this.zzLexicalState;
+ }
+
+
+ /**
+  * Switches the scanner to another lexical state.
+  *
+  * @param newState the new lexical state
+  */
+ public final void yybegin(int newState) {
+   this.zzLexicalState = newState;
+ }
+
+
+ /**
+  * Returns a copy of the text matched by the current regular expression.
+  */
+ public final String yytext() {
+   final int matchedLength = zzMarkedPos - zzStartRead;
+   return String.valueOf(zzBuffer, zzStartRead, matchedLength);
+ }
+
+
+ /**
+  * Returns a single character of the matched text.
+  *
+  * Gives the same result as yytext().charAt(pos) without building the
+  * intermediate string.
+  *
+  * @param pos the position of the character to fetch.
+  *            A value from 0 to yylength()-1.
+  *
+  * @return the character at position pos
+  */
+ public final char yycharat(int pos) {
+   final int bufferIndex = zzStartRead + pos;
+   return zzBuffer[bufferIndex];
+ }
+
+
+ /**
+  * Returns how many characters the matched text region spans.
+  */
+ public final int yylength() {
+   final int matchedLength = zzMarkedPos - zzStartRead;
+   return matchedLength;
+ }
+
+
+ /**
+ * Reports an error that occured while scanning.
+ *
+ * In a wellformed scanner (no or only correct usage of
+ * yypushback(int) and a match-all fallback rule) this method
+ * will only be called with things that "Can't Possibly Happen".
+ * If this method is called, something is seriously wrong
+ * (e.g. a JFlex bug producing a faulty scanner etc.).
+ *
+ * Usual syntax/scanner level error handling should be done
+ * in error fallback rules.
+ *
+ * @param errorCode the code of the errormessage to display
+ */
+ private void zzScanError(int errorCode) {
+ String message;
+ try {
+ message = ZZ_ERROR_MSG[errorCode];
+ }
+ catch (ArrayIndexOutOfBoundsException e) {
+ message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR];
+ }
+
+ throw new Error(message);
+ }
+
+
+ /**
+  * Returns the given number of matched characters to the input so that the
+  * next call of the scanning method reads them again.
+  *
+  * @param number the number of characters to be read again.
+  *               This number must not be greater than yylength()!
+  */
+ public void yypushback(int number) {
+   final boolean moreThanMatched = number > yylength();
+   if (moreThanMatched) {
+     zzScanError(ZZ_PUSHBACK_2BIG);
+   }
+
+   zzMarkedPos = zzMarkedPos - number;
+ }
+
+
+ /**
+ * Resumes scanning until the next regular expression is matched,
+ * the end of input is encountered or an I/O-Error occurs.
+ *
+ * Most actions only call add(...) and let scanning continue; a value is
+ * returned only by the two end-of-string actions (cases 3 and 16) and at
+ * end of input.
+ *
+ * @return the accumulated original or normalized string (or "" in case 16
+ * when a problem was flagged), or null once the end of input is reached
+ * @exception java.io.IOException if any I/O-Error occurs
+ */
+ public java.lang.String yylex() throws java.io.IOException {
+ int zzInput;
+ int zzAction;
+
+ // cached fields:
+ int zzCurrentPosL;
+ int zzMarkedPosL;
+ int zzEndReadL = zzEndRead;
+ char [] zzBufferL = zzBuffer;
+ char [] zzCMapL = ZZ_CMAP;
+
+ int [] zzTransL = ZZ_TRANS;
+ int [] zzRowMapL = ZZ_ROWMAP;
+ int [] zzAttrL = ZZ_ATTRIBUTE;
+
+ // one iteration per match; the loop is left only through a return or a scan error
+ while (true) {
+ zzMarkedPosL = zzMarkedPos;
+
+ // -1 means "no accepting state seen yet" for this match
+ zzAction = -1;
+
+ // the new match starts where the previous one ended
+ zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL;
+
+ zzState = ZZ_LEXSTATE[zzLexicalState];
+
+
+ // run the automaton until no transition is possible or the input ends
+ zzForAction: {
+ while (true) {
+
+ if (zzCurrentPosL < zzEndReadL)
+ zzInput = zzBufferL[zzCurrentPosL++];
+ else if (zzAtEOF) {
+ zzInput = YYEOF;
+ break zzForAction;
+ }
+ else {
+ // store back cached positions
+ zzCurrentPos = zzCurrentPosL;
+ zzMarkedPos = zzMarkedPosL;
+ boolean eof = zzRefill();
+ // get translated positions and possibly new buffer
+ zzCurrentPosL = zzCurrentPos;
+ zzMarkedPosL = zzMarkedPos;
+ zzBufferL = zzBuffer;
+ zzEndReadL = zzEndRead;
+ if (eof) {
+ zzInput = YYEOF;
+ break zzForAction;
+ }
+ else {
+ zzInput = zzBufferL[zzCurrentPosL++];
+ }
+ }
+ // look up the transition for (state, character class); -1 means there is none
+ int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ];
+ if (zzNext == -1) break zzForAction;
+ zzState = zzNext;
+
+ int zzAttributes = zzAttrL[zzState];
+ // bit 0 set: remember this state as the latest match and mark its end
+ if ( (zzAttributes & 1) == 1 ) {
+ zzAction = zzState;
+ zzMarkedPosL = zzCurrentPosL;
+ // bit 3 set: stop scanning right away instead of looking for a longer match
+ if ( (zzAttributes & 8) == 8 ) break zzForAction;
+ }
+
+ }
+ }
+
+ // store back cached position
+ zzMarkedPos = zzMarkedPosL;
+
+ // Each action label is followed by a label whose only statement is "break":
+ // an action that does not return falls through to it and scanning continues.
+ switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
+ case 17:
+ { add("庶");
+ }
+ case 18: break;
+ case 9:
+ { add("時");
+ }
+ case 19: break;
+ case 2:
+ // flags the input as problematic but still records the matched text unchanged
+ { problem = 1; add(yytext());
+ }
+ case 20: break;
+ case 3:
+ // end of string: fall back to the unnormalized text if a problem was flagged
+ { switch (problem) {
+ case 1: return original;
+ default: return normalized;
+ }
+ }
+ case 21: break;
+ case 10:
+ { add("歷");
+ }
+ case 22: break;
+ case 13:
+ { add("面");
+ }
+ case 23: break;
+ case 14:
+ { add("精");
+ }
+ case 24: break;
+ case 12:
+ { add("陰");
+ }
+ case 25: break;
+ case 8:
+ { add("床");
+ }
+ case 26: break;
+ case 1:
+ // default: the matched text is copied through unchanged
+ { add(yytext());
+ }
+ case 27: break;
+ case 15:
+ // the matched text is dropped from the normalized string
+ { add("");
+ }
+ case 28: break;
+ case 7:
+ { add("并");
+ }
+ case 29: break;
+ case 4:
+ { add("併");
+ }
+ case 30: break;
+ case 11:
+ { add("為");
+ }
+ case 31: break;
+ case 6:
+ { add("奇");
+ }
+ case 32: break;
+ case 5:
+ { add("叟");
+ }
+ case 33: break;
+ case 16:
+ // end of string: return the empty string if a problem was flagged
+ { switch (problem) {
+ case 1: return "";
+ default: return normalized;
+ }
+ }
+ case 34: break;
+ default:
+ // nothing matched: either a clean end of input or an internal scanner error
+ if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
+ zzAtEOF = true;
+ return null;
+ }
+ else {
+ zzScanError(ZZ_NO_MATCH);
+ }
+ }
+ }
+ }
+
+
+}
diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexZH.lex
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/norm/lang/MpdlNormalizerLexZH.lex Wed Nov 09 15:32:05 2011 +0100
@@ -0,0 +1,120 @@
+/*
+ * Normalization rules for Chinese text
+ * [this is a JFlex specification]
+ *
+ * Wolfgang Schmidle
+ * version 2011-02-28
+ *
+ */
+
+package de.mpg.mpiwg.berlin.mpdl.lt.analyzer.lang;
+
+%%
+
+%public
+%class MpdlNormalizerLexZH
+%type java.lang.String
+%unicode
+
+// classical Chinese: zh, zho, zho-Hant
+
+%states DISP, DICT, SEARCH
+
+%{
+ // raw text of everything matched so far
+ private String original = "";
+ // normalized replacement of everything matched so far
+ private String normalized = "";
+ // set to 1 by the "@" rule; the {END} rules then return a fallback instead of the normalized text
+ private int problem = 0;
+
+ // records one match: appends the matched text to original and its replacement to normalized
+ private void add (String norm) {
+ original += yytext();
+ normalized += norm;
+ }
+%}
+
+// U+200B ZERO WIDTH SPACE, written as a four-digit escape: that form is understood by every JFlex
+// version, whereas the braced form is not a Unicode escape in JFlex 1.4 and would be read as a
+// class of literal characters.
+ZWS = [\u200b]
+
+END = \n
+
+%%
+
+// Normalization in Chinese means that character variants will be replaced by their standard characters
+// if there is no doubt about what the standard character is.
+
+// The input is supposed to be a single Chinese character, but strings of characters are also handled correctly.
+
+// NOTE(review): the start-state list that should precede this "{" is missing here
+// (presumably <DISP, DICT, SEARCH>, the states declared with %states) - confirm against the original spec.
+ {
+
+// Codepoint < FFFF
+
+倂 { add("併"); } // 5002 --> 4F75
+傁 | 叜 { add("叟"); } // 5081, 53DC --> 53DF
+竒 { add("奇"); } // 7AD2 --> 5947
+幷 { add("并"); } // 5E77 --> 5E76
+牀 { add("床"); } // 7240 --> 5E8A
+旹 { add("時"); } // 65F9 --> 6642
+歴 { add("歷"); } // 6B74 --> 6B77
+爲 { add("為"); } // 7232 --> 70BA
+隂 { add("陰"); } // 9682 --> 9670
+靣 { add("面"); } // 9763 --> 9762
+// Both characters are written as escapes: the compatibility ideograph U+FA1D is replaced by U+7CBE
+// under Unicode normalization, which would silently turn this rule into a no-op if the file is normalized.
+\uFA1D { add("\u7CBE"); } // FA1D --> 7CBE (FA1D is a compatibility ideograph)
+
+// Codepoint > FFFF
+
+// note that [ABC] is not equivalent to A | B | C for codepoints above FFFF due to their internal encoding:
+// for example, 庶 (U+2F88D) is represented as a sequence of two codepoints: D87E DC8D
+// i.e. never use [ABC] but A | B | C
+
+庶 { add("庶"); } // 2F88D --> 5EB6 (2F88D is a compatibility ideograph)
+
+}
+
+// NOTE(review): the start-state list that should precede this "{" is missing here;
+// which of DISP, DICT, SEARCH it named cannot be told from this file - confirm against the original spec.
+ {
+
+// remove Zero Width Space (if there is any in the input string)
+
+{ZWS} { add(""); }
+
+}
+
+// default
+
+@ { problem = 1; add(yytext()); }
+. { add(yytext()); }
+
+
+// End of string: return the unnormalized text if a problem was flagged, the normalized text otherwise.
+// NOTE(review): the start-state list that should precede this "{" is missing here
+// (presumably <DISP, DICT>, since the other {END} rule returns "" on a problem) - confirm.
+ {
+
+{END} {
+ switch (problem) {
+ case 1: return original;
+ default: return normalized;
+ }
+ }
+}
+
+// End of string: return the empty string if a problem was flagged, the normalized text otherwise.
+// NOTE(review): the start-state list that should precede this "{" is missing here
+// (presumably <SEARCH>, where a problematic input should match nothing) - confirm.
+ {
+
+{END} {
+ switch (problem) {
+ case 1: return "";
+ default: return normalized;
+ }
+ }
+}
+
+
+/*
+
+Assumptions:
+- the routine is called character by character (or with more than one character), with a \n at the end of the string
+- there are no line breaks
+
+TO DO:
+
+ZH: extend the list
+ZH: what if one really wants to look up the variant that appears in the text? Then one probably has to copy the character out by hand.
+ZH: should Latin letters cause problem = 1?
+ZH: should line breaks be removed, even though they do not occur in correctly marked-up text?
+ZH: what if beijing is passed in and contains a line break? Does the wrapper rely on the number of characters staying the same, or does it insert a hyphen? What does ... or ... do? (the two names in this last question are missing from the original note)
+
+*/
diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/reg/DBRegularizationHandler.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/reg/DBRegularizationHandler.java Wed Nov 09 15:32:05 2011 +0100
@@ -0,0 +1,146 @@
+package de.mpg.mpiwg.berlin.mpdl.lt.text.reg;
+
+import java.io.UnsupportedEncodingException;
+import java.util.ArrayList;
+
+import com.sleepycat.je.Cursor;
+import com.sleepycat.je.Database;
+import com.sleepycat.je.DatabaseEntry;
+import com.sleepycat.je.DatabaseException;
+import com.sleepycat.je.LockMode;
+import com.sleepycat.je.OperationStatus;
+
+import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
+import de.mpg.mpiwg.berlin.mpdl.lt.general.Language;
+
+/**
+ * Stores and retrieves {@link Regularization} entries in two Berkeley DB (JE)
+ * databases, one keyed by the original form and one keyed by the normalized form.
+ *
+ * Every key has the shape <code>languageId + "###" + form</code>, where the
+ * language id is obtained from {@link Language#getLanguageId}; the value is the
+ * XML string of the regularization, UTF-8 encoded.
+ */
+public class DBRegularizationHandler {
+  /** Separator between the language id and the word form inside a key. */
+  private static final String KEY_SEPARATOR = "###";
+  /** Charset name used for every key and value. */
+  private static final String ENCODING = "utf-8";
+
+  private String dbDirectory;
+  private DbEnvRegularization regDbEnv;
+
+  /**
+   * @param dbDir directory of the Berkeley DB environment
+   */
+  public DBRegularizationHandler(String dbDir) {
+    this.dbDirectory = dbDir;
+  }
+
+  /** Creates and initializes the database environment; the databases are opened separately. */
+  public void start() throws ApplicationException {
+    regDbEnv = new DbEnvRegularization();
+    regDbEnv.setDataDir(dbDirectory);
+    regDbEnv.init(); // open databases in read/write mode
+  }
+
+  public void openDatabases() throws ApplicationException {
+    regDbEnv.openDatabases();
+  }
+
+  public void closeDatabases() throws ApplicationException {
+    regDbEnv.close();
+  }
+
+  public void deleteData() throws ApplicationException {
+    regDbEnv.removeDatabases();
+  }
+
+  /** Stores the regularization under its original form. */
+  public void writeOrigReg(Regularization reg) throws ApplicationException {
+    String key = buildKey(reg.getLanguage(), reg.getOrig());
+    put(regDbEnv.getOrigDB(), key, reg.getXmlString());
+  }
+
+  /** Stores the regularization under its normalized form. */
+  public void writeNormReg(Regularization reg) throws ApplicationException {
+    String key = buildKey(reg.getLanguage(), reg.getNorm());
+    put(regDbEnv.getNormDB(), key, reg.getXmlString());
+  }
+
+  /**
+   * Removes the entries stored under the original and the normalized form of
+   * the given regularization.
+   *
+   * Both keys are built from the language id, exactly as the write methods
+   * do; building the norm key from the raw language string would miss the
+   * stored entry whenever the two differ.
+   *
+   * NOTE(review): the databases are configured with sorted duplicates, so a
+   * delete by key is expected to remove every entry sharing that key, not
+   * only this regularization - confirm that this is intended.
+   */
+  public void deleteReg(Regularization reg) throws ApplicationException {
+    String keyOrig = buildKey(reg.getLanguage(), reg.getOrig());
+    String keyNorm = buildKey(reg.getLanguage(), reg.getNorm());
+    try {
+      regDbEnv.getOrigDB().delete(null, toEntry(keyOrig));
+      regDbEnv.getNormDB().delete(null, toEntry(keyNorm));
+    } catch (DatabaseException e) {
+      throw new ApplicationException(e);
+    }
+  }
+
+  /**
+   * @return all regularizations stored for the given language and original form (possibly empty)
+   */
+  public ArrayList<Regularization> readRegsByOrig(String lang, String orig) throws ApplicationException {
+    return readAll(regDbEnv.getOrigDB(), buildKey(lang, orig));
+  }
+
+  /**
+   * @return all regularizations stored for the given language and normalized form (possibly empty)
+   */
+  public ArrayList<Regularization> readRegsByNorm(String lang, String norm) throws ApplicationException {
+    return readAll(regDbEnv.getNormDB(), buildKey(lang, norm));
+  }
+
+  /** Builds the database key for a language and a word form. */
+  private static String buildKey(String lang, String form) throws ApplicationException {
+    String languageId = Language.getInstance().getLanguageId(lang);
+    return languageId + KEY_SEPARATOR + form;
+  }
+
+  /** Wraps the UTF-8 bytes of a string into a database entry. */
+  private static DatabaseEntry toEntry(String text) throws ApplicationException {
+    try {
+      return new DatabaseEntry(text.getBytes(ENCODING));
+    } catch (UnsupportedEncodingException e) {
+      throw new ApplicationException(e);
+    }
+  }
+
+  /** Writes one key/value pair without an explicit transaction. */
+  private static void put(Database db, String key, String value) throws ApplicationException {
+    DatabaseEntry keyEntry = toEntry(key);
+    DatabaseEntry valueEntry = toEntry(value);
+    try {
+      db.put(null, keyEntry, valueEntry);
+    } catch (DatabaseException e) {
+      throw new ApplicationException(e);
+    }
+  }
+
+  /** Reads every duplicate stored under the key and parses each value into a Regularization. */
+  private static ArrayList<Regularization> readAll(Database db, String key) throws ApplicationException {
+    ArrayList<Regularization> retRegs = new ArrayList<Regularization>();
+    DatabaseEntry dbEntryKey = toEntry(key);
+    try {
+      Cursor cursor = db.openCursor(null, null);
+      try {
+        DatabaseEntry foundValue = new DatabaseEntry();
+        OperationStatus operationStatus = cursor.getSearchKey(dbEntryKey, foundValue, LockMode.DEFAULT);
+        while (operationStatus == OperationStatus.SUCCESS) {
+          String foundValueStr = new String(foundValue.getData(), ENCODING);
+          retRegs.add(Regularization.getInstance(foundValueStr));
+          operationStatus = cursor.getNextDup(dbEntryKey, foundValue, LockMode.DEFAULT);
+        }
+      } finally {
+        // release the cursor even when reading or parsing fails
+        cursor.close();
+      }
+    } catch (DatabaseException e) {
+      throw new ApplicationException(e);
+    } catch (UnsupportedEncodingException e) {
+      throw new ApplicationException(e);
+    }
+    return retRegs;
+  }
+
+}
\ No newline at end of file
diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/reg/DbEnvRegularization.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/reg/DbEnvRegularization.java Wed Nov 09 15:32:05 2011 +0100
@@ -0,0 +1,100 @@
+package de.mpg.mpiwg.berlin.mpdl.lt.text.reg;
+
+import java.io.File;
+
+import com.sleepycat.je.Database;
+import com.sleepycat.je.DatabaseConfig;
+import com.sleepycat.je.DatabaseException;
+import com.sleepycat.je.Environment;
+import com.sleepycat.je.EnvironmentConfig;
+
+import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
+
+/**
+ * Holds the Berkeley DB (JE) environment and the two databases ("OrigDB" and
+ * "NormDB") used for regularizations.
+ *
+ * Expected call order: setDataDir, init, openDatabases, ..., close.
+ */
+public class DbEnvRegularization {
+  private static final String ORIG_DB_NAME = "OrigDB";
+  private static final String NORM_DB_NAME = "NormDB";
+
+  private String dataDir;
+  private File envPath;
+  private Environment env;
+  private EnvironmentConfig envConfig;
+  private DatabaseConfig dbConfig;
+  private Database origDB;
+  private Database normDB;
+
+  public DbEnvRegularization() {
+  }
+
+  public void setDataDir(String dataDir) {
+    this.dataDir = dataDir;
+  }
+
+  /**
+   * Opens the environment in the data directory: read/write, created on
+   * demand, transactional, with duplicate keys allowed in the databases.
+   */
+  public void init() throws ApplicationException {
+    try {
+      envConfig = new EnvironmentConfig();
+      dbConfig = new DatabaseConfig();
+      envConfig.setReadOnly(false);
+      dbConfig.setReadOnly(false);
+      envConfig.setAllowCreate(true);
+      dbConfig.setAllowCreate(true);
+      envConfig.setTransactional(true);
+      dbConfig.setTransactional(true);
+      // allow duplicates for keys
+      dbConfig.setSortedDuplicates(true);
+      envPath = new File(dataDir);
+      env = new Environment(envPath, envConfig);
+    } catch (DatabaseException e) {
+      throw new ApplicationException(e);
+    }
+  }
+
+  public void openDatabases() throws ApplicationException {
+    try {
+      // open databases (and create them if they do not exist)
+      origDB = env.openDatabase(null, ORIG_DB_NAME, dbConfig);
+      normDB = env.openDatabase(null, NORM_DB_NAME, dbConfig);
+    } catch (DatabaseException e) {
+      throw new ApplicationException(e);
+    }
+  }
+
+  /** Closes both database handles (if open) and removes both databases from the environment. */
+  public void removeDatabases() throws ApplicationException {
+    try {
+      closeDatabaseHandles();
+      env.removeDatabase(null, ORIG_DB_NAME);
+      env.removeDatabase(null, NORM_DB_NAME);
+    } catch (DatabaseException e) {
+      throw new ApplicationException(e);
+    }
+  }
+
+  public Environment getEnv() {
+    return env;
+  }
+
+  public Database getNormDB() {
+    return normDB;
+  }
+
+  public Database getOrigDB() {
+    return origDB;
+  }
+
+  /**
+   * Closes the databases and then the environment.
+   *
+   * The environment is closed even when closing a database fails, and all
+   * handles are cleared afterwards so that a second call does nothing.
+   */
+  public void close() throws ApplicationException {
+    if (env == null) {
+      return;
+    }
+    try {
+      try {
+        closeDatabaseHandles();
+      } finally {
+        env.close();
+      }
+    } catch (DatabaseException e) {
+      throw new ApplicationException(e);
+    } finally {
+      env = null;
+    }
+  }
+
+  /** Closes whichever database handles are open and clears them; the second is attempted even if the first fails. */
+  private void closeDatabaseHandles() throws DatabaseException {
+    try {
+      if (origDB != null) {
+        origDB.close();
+      }
+    } finally {
+      origDB = null;
+      try {
+        if (normDB != null) {
+          normDB.close();
+        }
+      } finally {
+        normDB = null;
+      }
+    }
+  }
+}
+
diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/reg/Regularization.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/reg/Regularization.java Wed Nov 09 15:32:05 2011 +0100
@@ -0,0 +1,89 @@
+package de.mpg.mpiwg.berlin.mpdl.lt.text.reg;
+
+import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
+import de.mpg.mpiwg.berlin.mpdl.util.StringUtils;
+import de.mpg.mpiwg.berlin.mpdl.xml.xquery.XQueryEvaluator;
+
+public class Regularization {
+ private String language;
+ private String orig;
+ private String norm;
+ private String source;
+ private int sourcePosition;
+
+ public Regularization(String language, String orig, String norm, String source) {
+ this.language = language;
+ this.orig = orig;
+ this.norm = norm;
+ this.source = source;
+ }
+
+ public static Regularization getInstance(String xmlStr) throws ApplicationException {
+ XQueryEvaluator xQueryEvaluator = new XQueryEvaluator();
+ String language = xQueryEvaluator.evaluateAsStringValue(xmlStr, "//language");
+ String orig = xQueryEvaluator.evaluateAsStringValue(xmlStr, "//orig");
+ String norm = xQueryEvaluator.evaluateAsStringValue(xmlStr, "//norm");
+ String source = xQueryEvaluator.evaluateAsStringValue(xmlStr, "//source");
+ String sourcePosStr = xQueryEvaluator.evaluateAsStringValue(xmlStr, "//source/@position");
+ int sourcePos = new Integer(sourcePosStr);
+ Regularization reg = new Regularization(language, orig, norm, source);
+ reg.setSourcePosition(sourcePos);
+ return reg;
+ }
+
+ public String getLanguage() {
+ return language;
+ }
+
+ public void setLanguage(String language) {
+ this.language = language;
+ }
+
+ public String getOrig() {
+ return orig;
+ }
+
+ public void setOrig(String orig) {
+ this.orig = orig;
+ }
+
+ public String getNorm() {
+ return norm;
+ }
+
+ public void setNorm(String norm) {
+ this.norm = norm;
+ }
+
+ public String getSource() {
+ return source;
+ }
+
+ public void setSource(String source) {
+ this.source = source;
+ }
+
+ public int getSourcePosition() {
+ return sourcePosition;
+ }
+
+ public void setSourcePosition(int sourcePosition) {
+ this.sourcePosition = sourcePosition;
+ }
+
+ public String getXmlString() {
+ String xmlString = "\n";
+ if (language != null)
+ xmlString += " " + language + "\n";
+ if (orig != null)
+ xmlString += " " + StringUtils.deresolveXmlEntities(orig) + "\n";
+ if (norm != null)
+ xmlString += " " + StringUtils.deresolveXmlEntities(norm) + "\n";
+ if (source != null)
+ xmlString += " \n";
+ xmlString += "\n";
+ return xmlString;
+ }
+
+
+}
diff -r dc5e9fcb3fdc -r 4a3641ae14d2 software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/reg/RegularizationManager.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/reg/RegularizationManager.java Wed Nov 09 15:32:05 2011 +0100
@@ -0,0 +1,118 @@
+package de.mpg.mpiwg.berlin.mpdl.lt.text.reg;
+
+import java.util.ArrayList;
+import java.util.Date;
+import java.util.Hashtable;
+import java.util.logging.Logger;
+
+import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
+import de.mpg.mpiwg.berlin.mpdl.lt.general.Constants;
+import de.mpg.mpiwg.berlin.mpdl.lucene.util.LuceneUtil;
+import de.mpg.mpiwg.berlin.mpdl.util.Util;
+
+public class RegularizationManager {
+ private static RegularizationManager instance;
+ private static String DATA_DIR = Constants.getInstance().getDataDir();
+ private static String REGULARIZATION_DB_DIR = DATA_DIR + "/dataBerkeleyDB/regularization";
+ private static Logger LOGGER = Logger.getLogger(RegularizationManager.class.getName());
+ private DBRegularizationHandler dbRegHandler;
+ private Hashtable> regsOrig;
+ private Hashtable> regsNorm;
+ private Date beginOfOperation;
+ private Date endOfOperation;
+
+ public static RegularizationManager getInstance() throws ApplicationException {
+ if (instance == null) {
+ instance = new RegularizationManager();
+ instance.init();
+ }
+ return instance;
+ }
+
+ // Manual smoke test: looks up two Latin normalized forms and prints the elapsed time.
+ public static void main(String[] args) throws ApplicationException {
+ getInstance();
+ instance.beginOperation();
+ System.out.print("Start ...");
+ ArrayList regs = instance.findRegsByNorm("la", "Illiusque");
+ ArrayList regs2 = instance.findRegsByNorm("la", "Itaque");
+ // NOTE(review): get(0) presumably fails when a lookup returns no entry; the two results are not used afterwards
+ Regularization bla = regs.get(0);
+ Regularization bla2 = regs2.get(0);
+
+ // NOTE(review): end() is not reached if a lookup above throws - presumably acceptable for a test driver
+ instance.end();
+ instance.endOperation();
+ Double elapsedTime = new Util().getSecondWithMillisecondsBetween(instance.beginOfOperation, instance.endOfOperation);
+ System.out.println("End.");
+ System.out.println("Needed time: " + elapsedTime + " seconds");
+ }
+
+ private void init() throws ApplicationException {
+ regsOrig = new Hashtable>();
+ regsNorm = new Hashtable