Mercurial > hg > openmind
view src/main/java/org/mpi/openmind/repository/utils/RomanizationLoC.java @ 89:8adfa8679991
new implementation of translit-to-romanization rules in RomanizationLoc with test(!).
author | Robert Casties <casties@mpiwg-berlin.mpg.de> |
---|---|
date | Mon, 26 Feb 2018 14:39:49 +0100 |
parents | ad505ef703ed |
children | 2c01cdc9b34a |
line wrap: on
line source
package org.mpi.openmind.repository.utils; import java.util.HashMap; import java.util.Map; import java.util.Map.Entry; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.lang.StringUtils; /** * Convert ISMI transliteration into LOC romanization/transcription. * * See document: translit-to-romanization-2.0.doc by Chantal Wahbi * http://www.loc.gov/catdir/cpso/romanization/arabic.pdf * * @author cwahbi, jurzua, casties * */ public class RomanizationLoC { /* * rule 1 */ private static Map<String, String> rule1_map = new HashMap<String, String>(); static{ //rules 1.a to 1.f rule1_map.put("\u1E6F", "\u0074\u0068");//ṯ -> th rule1_map.put("\u1E6E", "\u0054\u0068");//Ṯ -> Th rule1_map.put("\u1E2B", "\u006B\u0068");//ḫ -> kh rule1_map.put("\u1E2A", "\u004B\u0068");//Ḫ -> Kh rule1_map.put("\u1E0F", "\u0064\u0068");//ḏ -> dh rule1_map.put("\u1E0E", "\u0044\u0068");//Ḏ -> Dh rule1_map.put("\u0161", "\u0073\u0068");//š -> sh rule1_map.put("\u0160", "\u0053\u0068");//Š -> Sh rule1_map.put("\u0121", "\u0067\u0068");//ġ -> gh rule1_map.put("\u0120", "\u0047\u0068");//Ġ -> Gh rule1_map.put("\u1EF3", "\u00E1");//ỳ -> á //CONVERSIONMAP.put("\u1EF2", "\u00C1");//Ỳ -> Á } /* * rule 2 */ private static Pattern rule2a_pattern = Pattern.compile("\\b((?!al-)\\S+)ẗ(\\s+)(al-)"); private static Pattern rule2b_pattern = Pattern.compile("(\\S+)ẗ(\\s+|(?!al-)\\S*)"); private static Pattern rule2c_pattern = Pattern.compile("\\b(al-)(\\S+)ẗ\\b"); private static Pattern rule2d_pattern = Pattern.compile("(\\S+)ẗan\\b"); public static String ruleGroup2(String text) { /* * Rule 2c * * al-Xẗ => al-Xh */ Matcher matcher_c = rule2c_pattern.matcher(text); text = matcher_c.replaceAll("$1$2h"); /* * rule 2.d * * Xẗan -> Xtan */ Matcher matcher_d = rule2d_pattern.matcher(text); text = matcher_d.replaceAll("$1tan"); /* * rule 2a * * [Not beginnig with: al-] Xẗ al-X => Xt al-X */ Matcher matcher_a = rule2a_pattern.matcher(text); text = matcher_a.replaceAll("$1t$2$3"); /* * rule 2b * * Xẗ [Not followed by: al-X] => Xh */ Matcher matcher_b = rule2b_pattern.matcher(text); text = matcher_b.replaceAll("$1h$2"); return text; } /* * rule 3 */ //private static Pattern rule3a_pattern = Pattern.compile(begin_or_space + "(bi|wa|ka)(\\s+)(al-)(\\S+)"); private static Pattern rule3b_pattern = Pattern.compile("\\b(bi|wa|ka)(\\s+)(\\S+)"); public static String ruleGroup3(String text) { /* * rule 3.A * * P al-X; P=[ bi; wa; ka] => P-al-X */ /* rule 3a is subsumed by 3b Matcher matcher_a = rule3a_pattern.matcher(text); text = matcher_a.replaceAll("$1$2-$4"); */ /* * rule 3.B * * P X; P=[ bi; wa; ka] => P-X */ Matcher matcher_b = rule3b_pattern.matcher(text); text = matcher_b.replaceAll("$1-$3"); return text; } /* * rule 4 */ private static Pattern rule4a_pattern = Pattern.compile("\\b(li )(\\S+)"); private static Pattern rule4b_pattern = Pattern.compile("\\b(li al-|liʾl-|li-ʾl-|li-l-)(\\S+)"); public static String ruleGroup4(String text){ /* * rule 4.B * * [li al-X; li’l-X; li-’l-X; li-l-X] => lil-X */ Matcher matcher_b = rule4b_pattern.matcher(text); text = matcher_b.replaceAll("lil-$2"); /* * rule 4.A * * li X => li-X */ Matcher matcher_a = rule4a_pattern.matcher(text); text = matcher_a.replaceAll("li-$2"); return text; } /* * rule 5 */ private static Pattern rule5a_pattern = Pattern.compile("(-?ʾl-)(\\S+)"); private static Pattern rule5b_pattern = Pattern.compile("\\b(a|A)(t-(t)|ṯ-(ṯ)|d-(d)|ḏ-(ḏ)|r-(r)|z-(z)|s-(s)|š-(š)|ṣ-(ṣ)|ḍ-(ḍ)|ṭ-(ṭ)|ẓ-(ẓ)|l-(l)|n-(n))(\\S+)"); public static String ruleGroup5(String text){ /* * rule 5a * * [’l-X; X-’l-X] => al-X */ Matcher matcher_a = rule5a_pattern.matcher(text); text = matcher_a.replaceAll(" al-$2"); /* * rule 5b * * aY-YX; Y=Sun letters[t;ṯ;d;ḏ;r;z;s;š;ṣ;ḍ;ṭ;ẓ;l;n] => al-YX */ Matcher matcher_b = rule5b_pattern.matcher(text); // the groups 3-16 will be empty except the real match text = matcher_b.replaceAll("$1l-$3$4$5$6$7$8$9$10$11$12$13$14$15$16$17"); return text; } /* * rule 6 currently unused * * λh; λ= [t; k; d; s; g] => λʹh */ private static Pattern rule6_pattern = Pattern.compile("(\\S+)(t|k|d|s|g)h(\\S+)"); //private static Pattern rule6_pattern = Pattern.compile("([\\w|ā|ī|’|ā|š]*)(t|k|d|s|g)h([\\w|ā|ī|’|ā|š]*)"); public static String ruleGroup6(String text) { Matcher matcher = rule6_pattern.matcher(text); text = matcher.replaceAll("$1$2ʹh$3"); return text; } /* * rule 7 currently unused * * X[illāh; ullāh; allah; allāh; - Allāh; Allah; ullah] => X Allāh * * [ l; b; bism]illāh => [lillāh; billāh; bismillāh] (stay unchanged) */ public static String ruleGroup7(String text){ String rule_7_1_allah = "illāh|ullāh|allah|allāh|-Allāh|Allah|ullah|illah"; int count = 0; while(text.matches("(.*)(\\S+)(" + rule_7_1_allah + ")(.*)") && count<10){ text = text.replaceAll("(.*)(\\S+)(" + rule_7_1_allah + ")(.*)", "$1$2 Allāh$4"); count++; } return text; } /** * Convert (ISMI-) transliterated arabic text into (LoC romanized) transcribed text. * * @param text * @return */ public static String convert(final String text) { if (StringUtils.isEmpty(text)) return text; // make sure we have composed unicode String romanizedText = NormalizerUtils.unicodeNormalize(text); // make sure we have standard ayn and hamza romanizedText = NormalizerUtils.aynHamzaNormalizer(romanizedText); romanizedText = ruleGroup2(romanizedText); romanizedText = ruleGroup4(romanizedText); romanizedText = ruleGroup5(romanizedText); //romanizedText = ruleGroup6(romanizedText); romanizedText = ruleGroup3(romanizedText); // replacementText = ruleGroup7(replacementText); // rule 1 for (Entry<String, String> tr : rule1_map.entrySet()) { if (romanizedText.contains(tr.getKey())) { romanizedText = romanizedText.replace(tr.getKey(), tr.getValue()); } } return romanizedText; } }