Mercurial > hg > openmind
annotate src/main/java/org/mpi/openmind/repository/utils/NormalizerUtils.java @ 89:8adfa8679991
new implementation of translit-to-romanization rules in RomanizationLoc with test(!).
author | Robert Casties <casties@mpiwg-berlin.mpg.de> |
---|---|
date | Mon, 26 Feb 2018 14:39:49 +0100 |
parents | ad505ef703ed |
children |
rev | line source |
---|---|
1 | 1 package org.mpi.openmind.repository.utils; |
2 | |
88
ad505ef703ed
new implementation of translit-to-romanization rules in RomanizationLoc.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
71
diff
changeset
|
3 import java.text.Normalizer; |
ad505ef703ed
new implementation of translit-to-romanization rules in RomanizationLoc.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
71
diff
changeset
|
4 import java.util.regex.Matcher; |
ad505ef703ed
new implementation of translit-to-romanization rules in RomanizationLoc.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
71
diff
changeset
|
5 import java.util.regex.Pattern; |
ad505ef703ed
new implementation of translit-to-romanization rules in RomanizationLoc.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
71
diff
changeset
|
6 |
1 | 7 public class NormalizerUtils { |
8 | |
88
ad505ef703ed
new implementation of translit-to-romanization rules in RomanizationLoc.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
71
diff
changeset
|
9 /** |
ad505ef703ed
new implementation of translit-to-romanization rules in RomanizationLoc.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
71
diff
changeset
|
10 * Returns String normalized for searching arabic or transliterated arabic. |
ad505ef703ed
new implementation of translit-to-romanization rules in RomanizationLoc.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
71
diff
changeset
|
11 * |
ad505ef703ed
new implementation of translit-to-romanization rules in RomanizationLoc.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
71
diff
changeset
|
12 * @param w |
ad505ef703ed
new implementation of translit-to-romanization rules in RomanizationLoc.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
71
diff
changeset
|
13 * @return |
ad505ef703ed
new implementation of translit-to-romanization rules in RomanizationLoc.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
71
diff
changeset
|
14 */ |
71
aeb29e362a67
New ArabicNormalizer. NormalizerUtils.normalize() now does both translit and arabic normalization.
casties
parents:
17
diff
changeset
|
15 public static String normalize(String w) { |
aeb29e362a67
New ArabicNormalizer. NormalizerUtils.normalize() now does both translit and arabic normalization.
casties
parents:
17
diff
changeset
|
16 String atn = ArabicTranslitNormalizer.normalize(w); |
aeb29e362a67
New ArabicNormalizer. NormalizerUtils.normalize() now does both translit and arabic normalization.
casties
parents:
17
diff
changeset
|
17 String an = ArabicNormalizer.normalize(atn); |
aeb29e362a67
New ArabicNormalizer. NormalizerUtils.normalize() now does both translit and arabic normalization.
casties
parents:
17
diff
changeset
|
18 return an; |
1 | 19 } |
20 | |
71
aeb29e362a67
New ArabicNormalizer. NormalizerUtils.normalize() now does both translit and arabic normalization.
casties
parents:
17
diff
changeset
|
21 /** |
88
ad505ef703ed
new implementation of translit-to-romanization rules in RomanizationLoc.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
71
diff
changeset
|
22 * Returns String normalized for searching arabic transliteration text. |
71
aeb29e362a67
New ArabicNormalizer. NormalizerUtils.normalize() now does both translit and arabic normalization.
casties
parents:
17
diff
changeset
|
23 * |
aeb29e362a67
New ArabicNormalizer. NormalizerUtils.normalize() now does both translit and arabic normalization.
casties
parents:
17
diff
changeset
|
24 * @see https://it-dev.mpiwg-berlin.mpg.de/tracs/OpenMind3/wiki/normalize_arabic_translit |
aeb29e362a67
New ArabicNormalizer. NormalizerUtils.normalize() now does both translit and arabic normalization.
casties
parents:
17
diff
changeset
|
25 * |
aeb29e362a67
New ArabicNormalizer. NormalizerUtils.normalize() now does both translit and arabic normalization.
casties
parents:
17
diff
changeset
|
26 * @param w |
aeb29e362a67
New ArabicNormalizer. NormalizerUtils.normalize() now does both translit and arabic normalization.
casties
parents:
17
diff
changeset
|
27 * @return |
aeb29e362a67
New ArabicNormalizer. NormalizerUtils.normalize() now does both translit and arabic normalization.
casties
parents:
17
diff
changeset
|
28 */ |
aeb29e362a67
New ArabicNormalizer. NormalizerUtils.normalize() now does both translit and arabic normalization.
casties
parents:
17
diff
changeset
|
29 public static String normalizeArabicTranslit(String w) { |
aeb29e362a67
New ArabicNormalizer. NormalizerUtils.normalize() now does both translit and arabic normalization.
casties
parents:
17
diff
changeset
|
30 return ArabicTranslitNormalizer.normalize(w); |
1 | 31 } |
32 | |
88
ad505ef703ed
new implementation of translit-to-romanization rules in RomanizationLoc.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
71
diff
changeset
|
33 /** |
ad505ef703ed
new implementation of translit-to-romanization rules in RomanizationLoc.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
71
diff
changeset
|
34 * Returns String normalized for searching arabic. |
ad505ef703ed
new implementation of translit-to-romanization rules in RomanizationLoc.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
71
diff
changeset
|
35 * |
ad505ef703ed
new implementation of translit-to-romanization rules in RomanizationLoc.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
71
diff
changeset
|
36 * The normalization consists in removing vowels and other diacritic marks. |
ad505ef703ed
new implementation of translit-to-romanization rules in RomanizationLoc.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
71
diff
changeset
|
37 * |
ad505ef703ed
new implementation of translit-to-romanization rules in RomanizationLoc.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
71
diff
changeset
|
38 * @param w |
ad505ef703ed
new implementation of translit-to-romanization rules in RomanizationLoc.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
71
diff
changeset
|
39 * @return |
ad505ef703ed
new implementation of translit-to-romanization rules in RomanizationLoc.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
71
diff
changeset
|
40 */ |
71
aeb29e362a67
New ArabicNormalizer. NormalizerUtils.normalize() now does both translit and arabic normalization.
casties
parents:
17
diff
changeset
|
41 public static String normalizeArabic(String w) { |
aeb29e362a67
New ArabicNormalizer. NormalizerUtils.normalize() now does both translit and arabic normalization.
casties
parents:
17
diff
changeset
|
42 return ArabicNormalizer.normalize(w); |
1 | 43 } |
88
ad505ef703ed
new implementation of translit-to-romanization rules in RomanizationLoc.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
71
diff
changeset
|
44 |
ad505ef703ed
new implementation of translit-to-romanization rules in RomanizationLoc.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
71
diff
changeset
|
45 /** |
ad505ef703ed
new implementation of translit-to-romanization rules in RomanizationLoc.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
71
diff
changeset
|
46 * Returns String in Unicode normalization (NFC). |
ad505ef703ed
new implementation of translit-to-romanization rules in RomanizationLoc.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
71
diff
changeset
|
47 * |
ad505ef703ed
new implementation of translit-to-romanization rules in RomanizationLoc.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
71
diff
changeset
|
48 * @param text |
ad505ef703ed
new implementation of translit-to-romanization rules in RomanizationLoc.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
71
diff
changeset
|
49 * @return |
ad505ef703ed
new implementation of translit-to-romanization rules in RomanizationLoc.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
71
diff
changeset
|
50 */ |
ad505ef703ed
new implementation of translit-to-romanization rules in RomanizationLoc.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
71
diff
changeset
|
51 public static String unicodeNormalize(String text) { |
ad505ef703ed
new implementation of translit-to-romanization rules in RomanizationLoc.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
71
diff
changeset
|
52 if (!Normalizer.isNormalized(text, Normalizer.Form.NFC)) { |
89
8adfa8679991
new implementation of translit-to-romanization rules in RomanizationLoc with test(!).
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
88
diff
changeset
|
53 text = Normalizer.normalize(text, Normalizer.Form.NFC); |
88
ad505ef703ed
new implementation of translit-to-romanization rules in RomanizationLoc.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
71
diff
changeset
|
54 } |
ad505ef703ed
new implementation of translit-to-romanization rules in RomanizationLoc.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
71
diff
changeset
|
55 return text; |
ad505ef703ed
new implementation of translit-to-romanization rules in RomanizationLoc.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
71
diff
changeset
|
56 } |
ad505ef703ed
new implementation of translit-to-romanization rules in RomanizationLoc.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
71
diff
changeset
|
57 |
ad505ef703ed
new implementation of translit-to-romanization rules in RomanizationLoc.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
71
diff
changeset
|
58 private static Pattern old_ayn_pattern = Pattern.compile("(\u2018|\u02BB)"); // ‘|ʻ |
ad505ef703ed
new implementation of translit-to-romanization rules in RomanizationLoc.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
71
diff
changeset
|
59 private static String new_ayn = "\u02BF"; // ʿ |
ad505ef703ed
new implementation of translit-to-romanization rules in RomanizationLoc.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
71
diff
changeset
|
60 private static Pattern old_hamza_pattern = Pattern.compile("(\u2019|\u02bc)"); // ’|ʼ |
ad505ef703ed
new implementation of translit-to-romanization rules in RomanizationLoc.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
71
diff
changeset
|
61 private static String new_hamza = "\u02BE"; // ʾ |
ad505ef703ed
new implementation of translit-to-romanization rules in RomanizationLoc.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
71
diff
changeset
|
62 |
ad505ef703ed
new implementation of translit-to-romanization rules in RomanizationLoc.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
71
diff
changeset
|
63 /** |
ad505ef703ed
new implementation of translit-to-romanization rules in RomanizationLoc.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
71
diff
changeset
|
64 * Normalize transliteration forms for ayn and hamza. |
ad505ef703ed
new implementation of translit-to-romanization rules in RomanizationLoc.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
71
diff
changeset
|
65 * |
ad505ef703ed
new implementation of translit-to-romanization rules in RomanizationLoc.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
71
diff
changeset
|
66 * @param text |
ad505ef703ed
new implementation of translit-to-romanization rules in RomanizationLoc.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
71
diff
changeset
|
67 * @return |
ad505ef703ed
new implementation of translit-to-romanization rules in RomanizationLoc.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
71
diff
changeset
|
68 */ |
ad505ef703ed
new implementation of translit-to-romanization rules in RomanizationLoc.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
71
diff
changeset
|
69 public static String aynHamzaNormalizer(String text) { |
ad505ef703ed
new implementation of translit-to-romanization rules in RomanizationLoc.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
71
diff
changeset
|
70 Matcher match_ayn = old_ayn_pattern.matcher(text); |
ad505ef703ed
new implementation of translit-to-romanization rules in RomanizationLoc.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
71
diff
changeset
|
71 text = match_ayn.replaceAll(new_ayn); |
ad505ef703ed
new implementation of translit-to-romanization rules in RomanizationLoc.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
71
diff
changeset
|
72 Matcher match_hamza = old_hamza_pattern.matcher(text); |
ad505ef703ed
new implementation of translit-to-romanization rules in RomanizationLoc.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
71
diff
changeset
|
73 text = match_hamza.replaceAll(new_hamza); |
ad505ef703ed
new implementation of translit-to-romanization rules in RomanizationLoc.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
71
diff
changeset
|
74 return text; |
ad505ef703ed
new implementation of translit-to-romanization rules in RomanizationLoc.
Robert Casties <casties@mpiwg-berlin.mpg.de>
parents:
71
diff
changeset
|
75 } |
1 | 76 } |