Mercurial > hg > openmind
view src/main/java/org/mpi/openmind/repository/utils/OldRomanizationLoC.java @ 89:8adfa8679991
new implementation of translit-to-romanization rules in RomanizationLoc with test(!).
author | Robert Casties <casties@mpiwg-berlin.mpg.de> |
---|---|
date | Mon, 26 Feb 2018 14:39:49 +0100 |
parents | ad505ef703ed |
children |
line wrap: on
line source
package org.mpi.openmind.repository.utils;

import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringUtils;

/**
 * Rewrites a transliterated string into a romanized form by applying a fixed
 * sequence of regex-based rule groups followed by a character map
 * (see {@link #convert(String)}).
 *
 * See document: romanized_arabic_into_arabic.doc
 * http://www.loc.gov/catdir/cpso/romanization/arabic.pdf
 *
 * All members are static; the class holds no per-instance state.
 *
 * @author jurzua
 */
public class OldRomanizationLoC {

    /** Compile-time switch for the rule tracing printed to standard output. */
    private static final boolean DEBUG = false;

    /** Single-character replacements applied at the very end of {@link #convert(String)}. */
    private static final Map<String, String> CONVERSIONMAP = new HashMap<String, String>();

    // \w = A word character: [a-zA-Z_0-9]
    // NOTE(review): inside a character class '|' is a literal, so both classes
    // below also accept the pipe character itself. Kept as in the original.
    private static final String WORD_CHARS = "[\\w|ā|ī|’|ā|š|ṭ|ẗ]+";
    private static final String WORD_CHARS_OPTIONAL = "[\\w|ā|ī|’|ā|š]*";

    /** Prefix group: either start of text, or anything ending in a whitespace character. */
    private static final String BEGIN_SPACE = "(^|.*\\s)";

    /** Letter pairs "X-X" that follow the leading a/A of an article handled by rule 5.B. */
    private static final String DOUBLED_LETTER_PAIRS =
            "(t-t|ṯ-ṯ|d-d|ḏ-ḏ|r-r|z-z|s-s|š-š|ṣ-ṣ|ḍ-ḍ|ṭ-ṭ|ẓ-ẓ|l-l|n-n)";

    /** Spellings that rule group 7 splits off and rewrites as " Allāh". */
    private static final String ALLAH_FORMS = "illāh|ullāh|allah|allāh|-Allāh|Allah|ullah|illah";

    // Patterns are compiled once instead of on every loop iteration.
    private static final Pattern RULE_2C =
            Pattern.compile(BEGIN_SPACE + "(al-)(\\S+)ẗ(\\s+|$)(.*)");
    private static final Pattern RULE_2_OTHER =
            Pattern.compile("(" + WORD_CHARS + ")(ẗ)(\\s*)(al-)(" + WORD_CHARS + ")(ẗ)");
    private static final Pattern RULE_2D =
            Pattern.compile("(.*)(ẗan)(\\s+|$)(.*)");
    private static final Pattern RULE_2A =
            Pattern.compile("(^|\\s)((?!al-)\\S+)ẗ(\\s+al-)");
    private static final Pattern RULE_2B =
            Pattern.compile("ẗ(?!al-)");
    private static final Pattern RULE_3A =
            Pattern.compile(BEGIN_SPACE + "(bi|wa|ka)(\\s+)(al-)(.*)");
    private static final Pattern RULE_3B =
            Pattern.compile(BEGIN_SPACE + "(bi|wa|ka)(\\s+)(.*)");
    private static final Pattern RULE_4B =
            Pattern.compile(BEGIN_SPACE + "(li al-|li’l-|li-’l-|li-l-)(.*)");
    private static final Pattern RULE_4A =
            Pattern.compile(BEGIN_SPACE + "(li )(.*)");
    private static final Pattern RULE_5A =
            Pattern.compile("(.*)(-ʾl-)(.*)");
    private static final Pattern RULE_5B =
            Pattern.compile("(^|\\s)(a|A)" + DOUBLED_LETTER_PAIRS + "(?=\\S)");
    private static final Pattern RULE_6 =
            Pattern.compile("(" + WORD_CHARS_OPTIONAL + ")(t|k|d|s|g)h(" + WORD_CHARS_OPTIONAL + ")");
    private static final Pattern RULE_7 =
            Pattern.compile("(.*)(\\S+)(" + ALLAH_FORMS + ")(.*)");

    static {
        //rules 1.a to 1.f
        CONVERSIONMAP.put("\u1E6F", "\u0074\u0068");//ṯ -> th
        CONVERSIONMAP.put("\u1E6E", "\u0054\u0068");//Ṯ -> Th
        CONVERSIONMAP.put("\u1E2B", "\u006B\u0068");//ḫ -> kh
        CONVERSIONMAP.put("\u1E2A", "\u004B\u0068");//Ḫ -> Kh
        CONVERSIONMAP.put("\u1E0F", "\u0064\u0068");//ḏ -> dh
        CONVERSIONMAP.put("\u1E0E", "\u0044\u0068");//Ḏ -> Dh
        CONVERSIONMAP.put("\u0161", "\u0073\u0068");//š -> sh
        CONVERSIONMAP.put("\u0160", "\u0053\u0068");//Š -> Sh
        CONVERSIONMAP.put("\u0121", "\u0067\u0068");//ġ -> gh
        CONVERSIONMAP.put("\u0120", "\u0047\u0068");//Ġ -> Gh
        CONVERSIONMAP.put("\u1EF3", "\u00E1");//ỳ -> á
        //CONVERSIONMAP.put("\u1EF2", "\u00C1");//Ỳ -> Á
    }

    /** Target character (0x27) used by {@link #apostrophesNormalization(String)}. */
    public static char APOSTROPHE = 0x27;

    /**
     * Replaces every character listed in {@code OldNormalizerUtils.apostrophes}
     * with {@link #APOSTROPHE}.
     *
     * @param text the text to normalize
     * @return the text with all listed characters replaced
     */
    public static String apostrophesNormalization(String text) {
        String result = text;
        for (Character apostrophe : OldNormalizerUtils.apostrophes) {
            result = result.replace(apostrophe, APOSTROPHE);
        }
        return result;
    }

    /** Target character (0x61) used by {@link #aNormalization(String)}. */
    public static char a = 0x61;

    /**
     * Replaces every character listed in {@code OldNormalizerUtils.AList}
     * with {@link #a}.
     *
     * @param text the text to normalize
     * @return the text with all listed characters replaced
     */
    public static String aNormalization(String text) {
        String result = text;
        for (Character item : OldNormalizerUtils.AList) {
            result = result.replace(item, a);
        }
        return result;
    }

    /**
     * Rule group 7: where one of the spellings in {@link #ALLAH_FORMS} directly
     * follows a non-whitespace character, the spelling is replaced by
     * {@code " Allāh"} (with a leading space). The rule needs a match of the
     * whole text and runs at most 10 passes.
     *
     * @param text the text to rewrite
     * @return the rewritten text
     */
    public static String ruleGroup7(String text) {
        Matcher matcher = RULE_7.matcher(text);
        int passes = 0;
        while (passes < 10 && matcher.matches()) {
            // replaceAll resets the matcher before searching.
            text = matcher.replaceAll("$1$2 Allāh$4");
            trace("ruleGroup7", text);
            matcher = RULE_7.matcher(text);
            passes++;
        }
        return text;
    }

    /**
     * Rule group 6: if the whole text consists only of characters from
     * {@link #WORD_CHARS_OPTIONAL} and contains one of t, k, d, s, g followed
     * by h, a ’ is inserted between the consonant and the h.
     * Text containing any other character (for example a space) is returned
     * unchanged.
     *
     * @param text the text to rewrite
     * @return the rewritten text
     */
    public static String ruleGroup6(String text) {
        Matcher matcher = RULE_6.matcher(text);
        if (matcher.matches()) {
            text = matcher.replaceAll("$1$2’h$3");
        }
        return text;
    }

    /**
     * Rule group 5: rewrites {@code -ʾl-} as {@code " al-"} (5.A) and then
     * applies {@link #rule5B(String)}.
     *
     * @param text the text to rewrite
     * @return the rewritten text
     */
    public static String ruleGroup5(String text) {
        //wa-ʾl-nahār
        //wa al-nahār
        //5A
        text = rewriteWhileMatching(text, RULE_5A, "$1 al-$3", "5A(a)");
        //5.B
        return rule5B(text);
    }

    /**
     * Rule 5.B: an article written as a/A plus a doubled letter pair
     * (for example {@code aš-š}) at the start of the text or after whitespace,
     * and followed by at least one non-whitespace character, is rewritten with
     * {@code l-} (for example {@code aš-šams} becomes {@code al-šams}).
     *
     * Every occurrence is rewritten in place, so text around the matches,
     * including other lines of multi-line input, is preserved.
     *
     * @param text the text to rewrite
     * @return the rewritten text
     */
    public static String rule5B(String text) {
        //'t', 'ṯ', 'd', 'ḏ', 'r', 'z', 's', 'š', 'ṣ', 'ḍ', 'ṭ', 'ẓ', 'l', 'n'
        Matcher matcher = RULE_5B.matcher(text);
        StringBuffer rewritten = new StringBuffer();
        while (matcher.find()) {
            // Keep the prefix and the a/A, then emit "l-" plus the first letter of the pair.
            String replacement = matcher.group(1) + matcher.group(2) + "l-" + matcher.group(3).charAt(0);
            matcher.appendReplacement(rewritten, Matcher.quoteReplacement(replacement));
        }
        matcher.appendTail(rewritten);
        String result = rewritten.toString();
        if (!result.equals(text)) {
            trace("5.B", result);
        }
        return result;
    }

    /**
     * Rule group 4: rewrites {@code li al-}, {@code li’l-}, {@code li-’l-} and
     * {@code li-l-} as {@code lil-} (4.B), then {@code "li "} as {@code li-}
     * (4.A). Both apply only at the start of the text or after whitespace.
     *
     * @param text the text to rewrite
     * @return the rewritten text
     */
    public static String ruleGroup4(String text) {
        //4.B
        text = rewriteWhileMatching(text, RULE_4B, "$1lil-$3", "4.B");
        //4.A
        text = rewriteWhileMatching(text, RULE_4A, "$1li-$3", "4.A");
        return text;
    }

    /**
     * Rule group 3: joins {@code bi}, {@code wa} or {@code ka} to the following
     * word with a hyphen instead of whitespace, first before {@code al-} (3.A)
     * and then before any other text (3.B). Both apply only at the start of the
     * text or after whitespace.
     *
     * @param text the text to rewrite
     * @return the rewritten text
     */
    public static String ruleGroup3(String text) {
        //3.A
        text = rewriteWhileMatching(text, RULE_3A, "$1$2-$4$5", "3.A");
        // 3.B
        text = rewriteWhileMatching(text, RULE_3B, "$1$2-$4", "3.B");
        return text;
    }

    /**
     * Rule group 2: resolves every ẗ, in this order: 2.C, "Other", 2.D,
     * {@link #rule2A(String)}, {@link #rule2B(String)}.
     *
     * @param text the text to rewrite
     * @return the rewritten text
     */
    public static String ruleGroup2(String text) {
        //2.C: al-XXXXẗ -> al-XXXXh
        text = rewriteWhileMatching(text, RULE_2C, "$1$2$3h$4$5", "2.C");

        //Other XXXXẗ al-XXXXẗ -> XXXXt al-XXXXh (only when this is the whole text)
        Matcher other = RULE_2_OTHER.matcher(text);
        if (other.matches()) {
            text = other.replaceAll("$1t al-$5h");
            trace("2.Other", text);
        }

        //2.D XXXXẗan -> XXXXtan
        text = rewriteWhileMatching(text, RULE_2D, "$1tan$3$4", "2.D");

        //2A
        text = rule2A(text);
        //2B
        text = rule2B(text);
        return text;
    }

    /**
     * Rule 2.B: replaces every ẗ that is not immediately followed by
     * {@code al-} with h.
     *
     * All occurrences are replaced in one pass and in place, so there is no
     * limit on their number and text around the matches, including other lines
     * of multi-line input, is preserved.
     *
     * @param text the text to rewrite
     * @return the rewritten text
     */
    public static String rule2B(String text) {
        String result = RULE_2B.matcher(text).replaceAll("h");
        if (!result.equals(text)) {
            trace("2.B", result);
        }
        return result;
    }

    /**
     * Rule 2.A: a word that does not start with {@code al-}, ends in ẗ and is
     * followed by whitespace and {@code al-} gets its final ẗ replaced by t.
     * The word must begin at the start of the text or after whitespace and must
     * have at least one character before the ẗ.
     *
     * All occurrences are replaced in place, so text around the matches,
     * including other lines of multi-line input, is preserved.
     *
     * @param text the text to rewrite
     * @return the rewritten text
     */
    public static String rule2A(String text) {
        //2.A
        String result = RULE_2A.matcher(text).replaceAll("$1$2t$3");
        if (!result.equals(text)) {
            trace("2.A", result);
        }
        return result;
    }

    /**
     * Applies rule groups 2, 4, 6, 5, 3 and 7 in that order, then the
     * single-character replacements of {@link #CONVERSIONMAP}.
     *
     * @param text the text to convert; null or empty input is returned as is
     * @return the converted text
     */
    public static String convert(final String text) {
        if (StringUtils.isEmpty(text)) {
            return text;
        }

        String replacementText = text;
        replacementText = ruleGroup2(replacementText);
        replacementText = ruleGroup4(replacementText);
        replacementText = ruleGroup6(replacementText);
        //6 must be executed before 5
        replacementText = ruleGroup5(replacementText);
        replacementText = ruleGroup3(replacementText);
        replacementText = ruleGroup7(replacementText);

        // The map keys are distinct single characters and no value contains a key,
        // so the iteration order of the map does not affect the result.
        for (Map.Entry<String, String> entry : CONVERSIONMAP.entrySet()) {
            replacementText = replacementText.replace(entry.getKey(), entry.getValue());
        }
        return replacementText;
    }

    /**
     * Prints the input and its conversion to standard output.
     *
     * @param s the text to convert and print
     */
    public static void test(String s) {
        System.out.println("--------------\n" + s + " ->\n" + convert(s) + "\n");
    }

    /**
     * Manual smoke test: converts one sample title and prints the result.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // Further sample inputs used during development, by rule group:
        //   Group 2: "al-risalaẗ", "risalaẗ al-kabir", "risalaẗ al-kabiraẗ", "risalaẗ", "risalaẗan",
        //            "Risālaẗ fī al-ʿamal bi-rubʿ al-muqanṭarāt al-šamālīyaẗ"
        //   Group 3: "bi al-tamām̄", "wa al-kamāl", "bi tarīq"
        //   Group 4: "li al-shirbini", "li’l-Shirbīnī", "li-’l-Shirbīnī", "li tajrīd"
        //   Group 5: "aš-šams", "aḏ-ḏams", "fi’l-kitāb", "fi-’l-kitāb"
        //   Group 6: "Adham", "shirbini", "shirazi"
        //   Group 7: "’Abdullāh", "ʿAbdullah", "’Abd allāh", "ʿAli b. ʿAbdullah"
        //   Titles:  "al-Jawharaẗ al-bahiyyaẗ fī maʿrifaẗ al-awqāt al-layliyyaẗ wa-ʾl-nahāriyyaẗ",
        //            "al-Abyāt fī al-Ṭāliʿ wa al-Ġārib wa al-Mutawassiṭ wa al-Watad",
        //            "Al-tuḥfaẗ al-šāhiyyaẗ fī al-āḥkām al-falakiyyaẗ",
        //            "Natījaẗ al-afkār fī aʿmāl al-layl wa-ʾl-nahār",
        //            "al-ʿAqīda as-silālajīya dfsdssdf",
        //            "Muḫtaṣaraẗ fī ṣanʿaẗ baʿḍ al-ālāt al-raṣadiyyaẗ wa-ʾl-ʿamal bi-hā"
        test("Al-Futūḥāt al-Wahbīyaẗ fī Ỳarḥ al-Risālaẗ al-Fatḥīyaẗ fī al-ʿamal bi-al-rubʿ al-mujayyab");
    }

    /**
     * Applies {@code replacement} for as long as {@code rule} matches the whole text.
     * Because '.' does not match line terminators, text containing them usually
     * does not match as a whole and is then returned unchanged.
     */
    private static String rewriteWhileMatching(String text, Pattern rule, String replacement, String label) {
        Matcher matcher = rule.matcher(text);
        while (matcher.matches()) {
            // replaceAll resets the matcher before searching.
            text = matcher.replaceAll(replacement);
            trace(label, text);
            matcher = rule.matcher(text);
        }
        return text;
    }

    /** Prints the rule label and the current text when {@link #DEBUG} is enabled. */
    private static void trace(String label, String text) {
        if (DEBUG) {
            System.out.println(label);
            System.out.println(text);
        }
    }
}