Mercurial > hg > openmind
changeset 88:ad505ef703ed
New implementation of translit-to-romanization rules in RomanizationLoC.
author | Robert Casties <casties@mpiwg-berlin.mpg.de> |
---|---|
date | Fri, 23 Feb 2018 21:43:29 +0100 |
parents | 8005f7011975 |
children | 8adfa8679991 |
files | src/main/java/org/mpi/openmind/repository/utils/NormalizerUtils.java src/main/java/org/mpi/openmind/repository/utils/OldRomanizationLoC.java src/main/java/org/mpi/openmind/repository/utils/RomanizationLoC.java |
diffstat | 3 files changed, 699 insertions(+), 400 deletions(-) [+] |
line wrap: on
line diff
--- a/src/main/java/org/mpi/openmind/repository/utils/NormalizerUtils.java Tue Feb 06 15:14:29 2018 +0100 +++ b/src/main/java/org/mpi/openmind/repository/utils/NormalizerUtils.java Fri Feb 23 21:43:29 2018 +0100 @@ -1,8 +1,17 @@ package org.mpi.openmind.repository.utils; +import java.text.Normalizer; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + public class NormalizerUtils { - + /** + * Returns String normalized for searching arabic or transliterated arabic. + * + * @param w + * @return + */ public static String normalize(String w) { String atn = ArabicTranslitNormalizer.normalize(w); String an = ArabicNormalizer.normalize(atn); @@ -10,7 +19,7 @@ } /** - * Returns String normalized according to arabic transliteration rules. + * Returns String normalized for searching arabic transliteration text. * * @see https://it-dev.mpiwg-berlin.mpg.de/tracs/OpenMind3/wiki/normalize_arabic_translit * @@ -21,7 +30,47 @@ return ArabicTranslitNormalizer.normalize(w); } + /** + * Returns String normalized for searching arabic. + * + * The normalization consists in removing vowels and other diacritic marks. + * + * @param w + * @return + */ public static String normalizeArabic(String w) { return ArabicNormalizer.normalize(w); } + + /** + * Returns String in Unicode normalization (NFC). + * + * @param text + * @return + */ + public static String unicodeNormalize(String text) { + if (!Normalizer.isNormalized(text, Normalizer.Form.NFC)) { + Normalizer.normalize(text, Normalizer.Form.NFC); + } + return text; + } + + private static Pattern old_ayn_pattern = Pattern.compile("(\u2018|\u02BB)"); // ‘|ʻ + private static String new_ayn = "\u02BF"; // ʿ + private static Pattern old_hamza_pattern = Pattern.compile("(\u2019|\u02bc)"); // ’|ʼ + private static String new_hamza = "\u02BE"; // ʾ + + /** + * Normalize transliteration forms for ayn and hamza. 
+ * + * @param text + * @return + */ + public static String aynHamzaNormalizer(String text) { + Matcher match_ayn = old_ayn_pattern.matcher(text); + text = match_ayn.replaceAll(new_ayn); + Matcher match_hamza = old_hamza_pattern.matcher(text); + text = match_hamza.replaceAll(new_hamza); + return text; + } }
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/main/java/org/mpi/openmind/repository/utils/OldRomanizationLoC.java Fri Feb 23 21:43:29 2018 +0100 @@ -0,0 +1,419 @@ +package org.mpi.openmind.repository.utils; + +import java.util.HashMap; +import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.commons.lang.StringUtils; + +/** + * See document: romanized_arabic_into_arabic.doc + * http://www.loc.gov/catdir/cpso/romanization/arabic.pdf + * @author jurzua + * + */ +public class OldRomanizationLoC { + + private static boolean debug = false; + + + private static Map<String, String> CONVERSIONMAP = new HashMap<String, String>(); + // \w = A word character: [a-zA-Z_0-9] + private static String T = "ẗ"; + private static String regex_words = "[\\w|ā|ī|’|ā|š|ṭ|ẗ]+"; + private static String regex_words_empty = "[\\w|ā|ī|’|ā|š]*"; + private static String rule_5_a_init = "at-t|aṯ-ṯ|ad-d|aḏ-ḏ|ar-r|az-z|as-s|aš-š|aṣ-ṣ|aḍ-ḍ|aṭ-ṭ|aẓ-ẓ|al-l|an-n"; + private static String rule_5_a_letters = "t|ṯ|d|ḏ|r|z|s|š|ṣ|ḍ|ṭ|ẓ|l|n"; + private static String begin_space = "(^|.*\\s)"; + private static String begin_space0 = "^|.*\\s"; + + + static{ + + char[] aaa = {'t', 'ṯ', 'd', 'ḏ', 'r', 'z', 's', 'š', 'ṣ', 'ḍ', 'ṭ', 'ẓ', 'l', 'n'}; + + + //rules 1.a to 1.f + CONVERSIONMAP.put("\u1E6F", "\u0074\u0068");//ṯ -> th + CONVERSIONMAP.put("\u1E6E", "\u0054\u0068");//Ṯ -> Th + + CONVERSIONMAP.put("\u1E2B", "\u006B\u0068");//ḫ -> kh + CONVERSIONMAP.put("\u1E2A", "\u004B\u0068");//Ḫ -> Kh + + CONVERSIONMAP.put("\u1E0F", "\u0064\u0068");//ḏ -> dh + CONVERSIONMAP.put("\u1E0E", "\u0044\u0068");//Ḏ -> Dh + + CONVERSIONMAP.put("\u0161", "\u0073\u0068");//š -> sh + CONVERSIONMAP.put("\u0160", "\u0053\u0068");//Š -> Sh + + CONVERSIONMAP.put("\u0121", "\u0067\u0068");//ġ -> gh + CONVERSIONMAP.put("\u0120", "\u0047\u0068");//Ġ -> Gh + + CONVERSIONMAP.put("\u1EF3", "\u00E1");//ỳ -> á + //CONVERSIONMAP.put("\u1EF2", "\u00C1");//Ỳ -> Á + + } + + public 
static char APOSTROPHE = 0x27; + public static String apostrophesNormalization(String text){ + String result = text; + for(Character apostrophe : OldNormalizerUtils.apostrophes){ + result = result.replace(apostrophe, APOSTROPHE); + } + return result; + } + + public static char a = 0x61; + public static String aNormalization(String text){ + String result = text; + for(Character item : OldNormalizerUtils.AList){ + result = result.replace(item, a); + } + return result; + } + + + public static String ruleGroup7(String text){ + + String rule_7_1_allah = "illāh|ullāh|allah|allāh|-Allāh|Allah|ullah|illah"; + + int count = 0; + while(text.matches("(.*)(\\S+)(" + rule_7_1_allah + ")(.*)") && count<10){ + if(debug)System.out.println("ruleGroup7"); + text = text.replaceAll("(.*)(\\S+)(" + rule_7_1_allah + ")(.*)", "$1$2 Allāh$4"); + if(debug) System.out.println(text); + count++; + } + + return text; + + } + + public static String ruleGroup6(String text){ + + String rule_6_consonants = "t|k|d|s|g"; + + if(text.matches("("+regex_words_empty+")("+ rule_6_consonants + ")h("+regex_words_empty+")")){ + text = text.replaceAll("("+regex_words_empty+")("+ rule_6_consonants + ")h("+regex_words_empty+")", "$1$2’h$3"); + } + + return text; + } + + public static String ruleGroup5(String text){ + + //wa-ʾl-nahār + //wa al-nahār + //5A + while(text.matches("(.*)(-ʾl-)(.*)")){ + if(debug)System.out.println("5A(a)"); + text = text.replaceAll("(.*)(-ʾl-)(.*)", "$1 al-$3"); + if(debug) System.out.println(text); + } + + /* + while(text.matches("(.*)(" + begin_space0 + ")(ʾl-)(.*)")){ + if(debug)System.out.println("5A"); + text = text.replaceAll("(.*)(" + begin_space0 + ")(ʾl-)(.*)", "$1$2al-$4"); + }*/ + + + //5.B + text = rule5B(text); + + return text; + } + + public static String rule5B(String text){ + //'t', 'ṯ', 'd', 'ḏ', 'r', 'z', 's', 'š', 'ṣ', 'ḍ', 'ṭ', 'ẓ', 'l', 'n' + + String regex0 = "(t-t|ṯ-ṯ|d-d|ḏ-ḏ|r-r|z-z|s-s|š-š|ṣ-ṣ|ḍ-ḍ|ṭ-ṭ|ẓ-ẓ|l-l|n-n)"; + + String regex = begin_space + "(a|A)" + 
regex0 + "(\\S+)(.*)"; + Pattern pattern = Pattern.compile(regex); + Matcher matcher = pattern.matcher(text); + if(matcher.find()){ + if(debug) System.out.println("5.B"); + String g1 = matcher.group(1); + String g2 = matcher.group(2); + String g3 = matcher.group(3); + String g4 = matcher.group(4); + String g5 = matcher.group(5); + + text = g1 + g2 + "l-" + g3.charAt(0) + g4 + g5; + + if(debug) System.out.println(text); + matcher = pattern.matcher(text); + } + + return text; + } + + + public static String ruleGroup4(String text){ + + String gr_4b = "li al-|li’l-|li-’l-|li-l-"; + //4.B + while(text.matches(begin_space + "(" + gr_4b + ")(.*)")){ + if(debug) System.out.println("4.B"); + text = text.replaceAll(begin_space + "(" + gr_4b + ")(.*)", "$1lil-$3"); + } + + //4.A + while(text.matches(begin_space + "(li )(.*)")){ + if(debug) System.out.println("4.A"); + text = text.replaceAll(begin_space + "(li )(.*)", "$1li-$3"); + } + + return text; + } + + + public static String ruleGroup3(String text){ + + //3.A + while(text.matches(begin_space + "(bi|wa|ka)(\\s+)(al-)(.*)")){ + if(debug) System.out.println("3.A"); + text = text.replaceAll(begin_space + "(bi|wa|ka)(\\s+)(al-)(.*)", "$1$2-$4$5"); + //if(debug) System.out.println(text); + } + + // 3.B + while(text.matches(begin_space + "(bi|wa|ka)(\\s+)(.*)")){ + if(debug)System.out.println("3.B"); + text = text.replaceAll(begin_space + "(bi|wa|ka)(\\s+)(.*)", "$1$2-$4"); + } + + return text; + } + + public static String ruleGroup2(String text){ + + //2.C: al-XXXXẗ -> al-XXXXh + while(text.matches(begin_space + "(al-)(\\S+)ẗ(\\s+|$)(.*)")){ + if(debug) System.out.println("2.C"); + //System.out.println(text.replaceAll(begin_space + "(al-)(" + regex_words + ")ẗ(.*)", "$2$3ẗ")); + text = text.replaceAll(begin_space + "(al-)(\\S+)ẗ(\\s+|$)(.*)", "$1$2$3h$4$5"); + if(debug) System.out.println(text); + } + + //Other XXXXẗ al-XXXXẗ -> XXXXt al-XXXXh + if(text.matches("(" + regex_words + ")(ẗ)(\\s*)(al-)(" + regex_words + ")(ẗ)")){ + 
if(debug) System.out.println("2.Other"); + text = text.replaceAll("(" + regex_words + ")(ẗ)(\\s*)(al-)(" + regex_words + ")(ẗ)", "$1t al-$5h"); + if(debug) System.out.println(text); + } + + //2.D XXXXẗan -> XXXXtan + while(text.matches("(.*)(ẗan)(\\s+|$)(.*)")){ + if(debug) System.out.println("2.D"); + text = text.replaceAll("(.*)(ẗan)(\\s+|$)(.*)", "$1tan$3$4"); + if(debug) System.out.println(text); + } + + //2A + text = rule2A(text); + + //2B + text = rule2B(text); + + return text; + } + + public static String rule2B(String text){ + + String regex = "(.*)(ẗ)(\\s+|(?!al-)\\S*)(.*)"; + Pattern pattern = Pattern.compile(regex); + Matcher matcher = pattern.matcher(text); + int count = 0; + while(matcher.find() && count < 10){ + if(debug) System.out.println("2.B"); + String g1 = matcher.group(1); + String g2 = matcher.group(2); + String g3 = matcher.group(3); + String g4 = matcher.group(4); + text = g1 + "h" + g3 + g4; + if(debug) System.out.println(text); + matcher = pattern.matcher(text); + count++; + } + return text; + } + + public static String rule2A(String text){ + + //2.A + //String regex2A = "(.*)(\\s++)(.*)ẗ(\\s++)(al-)(.*)"; + //String regex2A = "(.*)(\\s++)(?<!(al-))(.*)ẗ(\\s++)(al-)(.*)"; + String regex = begin_space + "((?!al-)\\S+)(ẗ)(\\s+)(al-)(.*)"; + Pattern pattern = Pattern.compile(regex); + Matcher matcher = pattern.matcher(text); + + while(matcher.find()){ + + if(debug) System.out.println("2.A"); + + String g1 = matcher.group(1); + String g2 = matcher.group(2); + String g3 = matcher.group(3); + String g4 = matcher.group(4); + String g5 = matcher.group(5); + String g6 = matcher.group(6); + + + //System.out.println(g1 +" # "+ g2 + " #3 " + g3 + " #4 " + g4 + " #5 " + g5 + " # " + g6); + text = g1 + g2 + "t" + g4 + g5 + g6; + if(debug) System.out.println(text); + + matcher = pattern.matcher(text); + } + + return text; + } + + public static String convert(final String text) { + + if(StringUtils.isEmpty(text)) + return text; + + String replacementText 
= new String(text); + + replacementText = ruleGroup2(replacementText); + replacementText = ruleGroup4(replacementText); + replacementText = ruleGroup6(replacementText); //6 must be executed before 5 + replacementText = ruleGroup5(replacementText); + replacementText = ruleGroup3(replacementText); + replacementText = ruleGroup7(replacementText); + + for (String ar : CONVERSIONMAP.keySet()) { + String lat = CONVERSIONMAP.get(ar); + if (replacementText.contains(ar)) { + replacementText = replacementText.replace(ar, lat); + } + } + + return replacementText; + } + + public static void test(String s){ + System.out.println("--------------\n" + s + " ->\n" + convert(s) + "\n"); + + } + + + public static void main(String[] args){ + + +// test("li’l-Shirbīnī"); +// test("li-'l-Shirbīnī"); +// test("’Abdullāh"); +// test("’Abd allāh"); + + //test("ʿAli b. ʿAbdullah"); + //test("ʿAbdullah"); + //test("Risālaẗ"); + //test("Risālaẗ fī"); + //test("Risālaẗ fī qismaẗ"); + //test("Risālaẗ fī qismaẗ al-handasaẗ al-qabbān bi ṭarīq al-handasaẗ bi ṭarīq wa'l-misāḥaẗ wa'l-ḥisāb bi'l-nisab al-arbaʿ"); + + //test("ʿAli b. ʿAbdullah"); + //test("Yusuf b. ʿAbdullah"); + + + + //test("fī-'l-kitāb"); + + //test("Risālaẗ (Nukat) fīmā yaṣiḥḥu min aḥkām al-nujūm = Kitāb al-taḏākīr (Risālaẗ) fī ibṭāl aḥkām al-nujūm"); + /* + //Rules Group 2 + test("al-risalaẗ"); + test("risalaẗ al-kabir"); + test("risalaẗ al-kabir"); + test("risalaẗ al-kabiraẗ"); // ????? 
+ test("risalaẗ"); + test("risalaẗan"); + test("Risālaẗ fī al-ʿamal bi-rubʿ al-muqanṭarāt al-šamālīyaẗ"); + + //Rules Group 3 + test("bi al-tamām̄"); + test("wa al-kamāl"); + test("bi tarīq"); +*/ + //Group 4 + test("li al-shirbini"); + test("li’l-Shirbīnī"); + test("li-’l-Shirbīnī"); + test("li tajrīd"); +/* + //Group 5 + test("aš-šams"); + test("aḏ-ḏams"); + test("fi’l-kitāb"); + test("fi-’l-kitāb"); + */ + + + //Group 6 + test("Adham"); + test("shirbini"); + test("shirazi"); + + /* + //Group 7 + test("’Abd allāh"); + + */ + + //test("al-Jawharaẗ al-bahiyyaẗ fī maʿrifaẗ al-awqāt al-layliyyaẗ wa-ʾl-nahāriyyaẗ"); + + + //test("al-Abyāt fī al-Ṭāliʿ wa al-Ġārib wa al-Mutawassiṭ wa al-Watad"); + //test("Al-tuḥfaẗ al-šāhiyyaẗ fī al-āḥkām al-falakiyyaẗ"); + + //char ch = 'Á'; + //System.out.println(String.format("%04x", (int) ch)); + //test("Al-Futūḥāt al-Wahbīyaẗ fī Ỳarḥ al-Risālaẗ al-Fatḥīyaẗ fī al-ʿamal bi-al-rubʿ al-mujayyab"); + + //test("wa-ʾl-nahār"); + //test("li-l-ʿIlm"); + //test("al-Jawharaẗ al-bahiyyaẗ fī maʿrifaẗ al-awqāt fī maʿrifaẗ al-awqāt al-layliyyaẗ wa-ʾl-nahāriyyaẗ"); + //test("al-Jawharaẗ al-bahiyyaẗ fī al-maʿrifaẗ al-awqāt al-layliyyaẗ wa-ʾl-nahāriyyaẗ"); + + //String text = "fī maʿrifaẗan al-awqāt al-layliyyaẗ wa-ʾl-nahāriyyaẗ"; + + //test("Natījaẗ al-afkār fī aʿmāl al-layl wa-ʾl-nahār"); + + + //test("al-ʿAqīda as-silālajīya dfsdssdf"); + //test("Muḫtaṣaraẗ fī ṣanʿaẗ baʿḍ al-ālāt al-raṣadiyyaẗ wa-ʾl-ʿamal bi-hā"); + + + + + /* + String text = "Natījaẗ al-afkār fī aʿmāl al-layl wa-ʾl-nahār"; + String regex = begin_space + "((?!al-)\\S+)(ẗ)(\\s+)(al-)(.*)"; + Pattern pattern = Pattern.compile(regex); + Matcher matcher = pattern.matcher(text); + + while(matcher.find()){ + + if(debug) System.out.println("2.A"); + } + */ + /* + //String regex = "(.*)(\\s+)((?!al-)\\S+)(ẗ)(\\s+)(al-)(.*)"; + String regex = "(.*)(ẗ)(\\s+|(?!al-)\\S*)(.*)"; + Pattern pattern = Pattern.compile(regex); + Matcher matcher = pattern.matcher(text); + + while(matcher.find()){ + 
System.out.println(matcher.groupCount()); + + System.out.println(matcher.group(1) + " # " + matcher.group(2) + " # " + matcher.group(3) + " # " + matcher.group(4)); + } + */ + } + +}
--- a/src/main/java/org/mpi/openmind/repository/utils/RomanizationLoC.java Tue Feb 06 15:14:29 2018 +0100 +++ b/src/main/java/org/mpi/openmind/repository/utils/RomanizationLoC.java Fri Feb 23 21:43:29 2018 +0100 @@ -2,417 +2,248 @@ import java.util.HashMap; import java.util.Map; +import java.util.Map.Entry; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.lang.StringUtils; /** - * See document: romanized_arabic_into_arabic.doc + * Convert ISMI transliteration into LOC romanization/transcription. + * + * See document: translit-to-romanization-2.0.doc by Chantal Wahbi * http://www.loc.gov/catdir/cpso/romanization/arabic.pdf - * @author jurzua + * + * @author cwahbi, jurzua, casties * */ public class RomanizationLoC { - private static boolean debug = false; - - - private static Map<String, String> CONVERSIONMAP = new HashMap<String, String>(); - // \w = A word character: [a-zA-Z_0-9] - private static String T = "ẗ"; - private static String regex_words = "[\\w|ā|ī|’|ā|š|ṭ|ẗ]+"; - private static String regex_words_empty = "[\\w|ā|ī|’|ā|š]*"; - private static String rule_5_a_init = "at-t|aṯ-ṯ|ad-d|aḏ-ḏ|ar-r|az-z|as-s|aš-š|aṣ-ṣ|aḍ-ḍ|aṭ-ṭ|aẓ-ẓ|al-l|an-n"; - private static String rule_5_a_letters = "t|ṯ|d|ḏ|r|z|s|š|ṣ|ḍ|ṭ|ẓ|l|n"; - private static String begin_space = "(^|.*\\s)"; - private static String begin_space0 = "^|.*\\s"; - - - static{ - - char[] aaa = {'t', 'ṯ', 'd', 'ḏ', 'r', 'z', 's', 'š', 'ṣ', 'ḍ', 'ṭ', 'ẓ', 'l', 'n'}; - - - //rules 1.a to 1.f - CONVERSIONMAP.put("\u1E6F", "\u0074\u0068");//ṯ -> th - CONVERSIONMAP.put("\u1E6E", "\u0054\u0068");//Ṯ -> Th - - CONVERSIONMAP.put("\u1E2B", "\u006B\u0068");//ḫ -> kh - CONVERSIONMAP.put("\u1E2A", "\u004B\u0068");//Ḫ -> Kh - - CONVERSIONMAP.put("\u1E0F", "\u0064\u0068");//ḏ -> dh - CONVERSIONMAP.put("\u1E0E", "\u0044\u0068");//Ḏ -> Dh - - CONVERSIONMAP.put("\u0161", "\u0073\u0068");//š -> sh - CONVERSIONMAP.put("\u0160", "\u0053\u0068");//Š -> Sh - - CONVERSIONMAP.put("\u0121", 
"\u0067\u0068");//ġ -> gh - CONVERSIONMAP.put("\u0120", "\u0047\u0068");//Ġ -> Gh - - CONVERSIONMAP.put("\u1EF3", "\u00E1");//ỳ -> á - //CONVERSIONMAP.put("\u1EF2", "\u00C1");//Ỳ -> Á + /* + * rule 1 + */ + private static Map<String, String> rule1_map = new HashMap<String, String>(); + static{ + //rules 1.a to 1.f + rule1_map.put("\u1E6F", "\u0074\u0068");//ṯ -> th + rule1_map.put("\u1E6E", "\u0054\u0068");//Ṯ -> Th + + rule1_map.put("\u1E2B", "\u006B\u0068");//ḫ -> kh + rule1_map.put("\u1E2A", "\u004B\u0068");//Ḫ -> Kh + + rule1_map.put("\u1E0F", "\u0064\u0068");//ḏ -> dh + rule1_map.put("\u1E0E", "\u0044\u0068");//Ḏ -> Dh + + rule1_map.put("\u0161", "\u0073\u0068");//š -> sh + rule1_map.put("\u0160", "\u0053\u0068");//Š -> Sh + + rule1_map.put("\u0121", "\u0067\u0068");//ġ -> gh + rule1_map.put("\u0120", "\u0047\u0068");//Ġ -> Gh + + rule1_map.put("\u1EF3", "\u00E1");//ỳ -> á + //CONVERSIONMAP.put("\u1EF2", "\u00C1");//Ỳ -> Á + } + + + /* + * rule 2 + */ + private static Pattern rule2a_pattern = Pattern.compile("\\b((?!al-)\\S+)ẗ(\\s+)(al-)"); + private static Pattern rule2b_pattern = Pattern.compile("(\\S+)ẗ(\\s+|(?!al-)\\S*)"); + private static Pattern rule2c_pattern = Pattern.compile("\\b(al-)(\\S+)ẗ\\b"); + private static Pattern rule2d_pattern = Pattern.compile("(\\S+)ẗan\\b"); + + public static String ruleGroup2(String text) { + + /* + * Rule 2c + * + * al-Xẗ => al-Xh + */ + Matcher matcher_c = rule2c_pattern.matcher(text); + text = matcher_c.replaceAll("$1$2h"); + + /* + * rule 2.d + * + * Xẗan -> Xtan + */ + Matcher matcher_d = rule2d_pattern.matcher(text); + text = matcher_d.replaceAll("$1tan"); + + /* + * rule 2a + * + * [Not beginnig with: al-] Xẗ al-X => Xt al-X + */ + Matcher matcher_a = rule2a_pattern.matcher(text); + text = matcher_a.replaceAll("$1t$2$3"); + + /* + * rule 2b + * + * Xẗ [Not followed by: al-X] => Xh + */ + Matcher matcher_b = rule2b_pattern.matcher(text); + text = matcher_b.replaceAll("$1h$2"); + + return text; + } + + + /* + * rule 
3 + */ + //private static Pattern rule3a_pattern = Pattern.compile(begin_or_space + "(bi|wa|ka)(\\s+)(al-)(\\S+)"); + private static Pattern rule3b_pattern = Pattern.compile("\\b(bi|wa|ka)(\\s+)(\\S+)"); + + public static String ruleGroup3(String text) { - } - - public static char APOSTROPHE = 0x27; - public static String apostrophesNormalization(String text){ - String result = text; - for(Character apostrophe : OldNormalizerUtils.apostrophes){ - result = result.replace(apostrophe, APOSTROPHE); - } - return result; - } - - public static char a = 0x61; - public static String aNormalization(String text){ - String result = text; - for(Character item : OldNormalizerUtils.AList){ - result = result.replace(item, a); - } - return result; - } - - - public static String ruleGroup7(String text){ - - String rule_7_1_allah = "illāh|ullāh|allah|allāh|-Allāh|Allah|ullah|illah"; - - int count = 0; - while(text.matches("(.*)(\\S+)(" + rule_7_1_allah + ")(.*)") && count<10){ - if(debug)System.out.println("ruleGroup7"); - text = text.replaceAll("(.*)(\\S+)(" + rule_7_1_allah + ")(.*)", "$1$2 Allāh$4"); - if(debug) System.out.println(text); - count++; - } - - return text; - - } - - public static String ruleGroup6(String text){ - - String rule_6_consonants = "t|k|d|s|g"; - - if(text.matches("("+regex_words_empty+")("+ rule_6_consonants + ")h("+regex_words_empty+")")){ - text = text.replaceAll("("+regex_words_empty+")("+ rule_6_consonants + ")h("+regex_words_empty+")", "$1$2’h$3"); - } - - return text; - } + /* + * rule 3.A + * + * P al-X; P=[ bi; wa; ka] => P-al-X + */ + /* rule 3a is subsumed by 3b + Matcher matcher_a = rule3a_pattern.matcher(text); + text = matcher_a.replaceAll("$1$2-$4"); + */ + + /* + * rule 3.B + * + * P X; P=[ bi; wa; ka] => P-X + */ + Matcher matcher_b = rule3b_pattern.matcher(text); + text = matcher_b.replaceAll("$1-$3"); + + return text; + } + + + /* + * rule 4 + */ + private static Pattern rule4a_pattern = Pattern.compile("\\b(li )(\\S+)"); + private static 
Pattern rule4b_pattern = Pattern.compile("\\b(li al-|liʾl-|li-ʾl-|li-l-)(\\S+)"); + + public static String ruleGroup4(String text){ + + /* + * rule 4.B + * + * [li al-X; li’l-X; li-’l-X; li-l-X] => lil-X + */ + Matcher matcher_b = rule4b_pattern.matcher(text); + text = matcher_b.replaceAll("lil-$2"); + + /* + * rule 4.A + * + * li X => li-X + */ + Matcher matcher_a = rule4a_pattern.matcher(text); + text = matcher_a.replaceAll("li-$2"); + + return text; + } + + + /* + * rule 5 + */ + private static Pattern rule5a_pattern = Pattern.compile("(-?ʾl-)(\\S+)"); + private static Pattern rule5b_pattern = Pattern.compile("\\b(a|A)(t-(t)|ṯ-(ṯ)|d-(d)|ḏ-(ḏ)|r-(r)|z-(z)|s-(s)|š-(š)|ṣ-(ṣ)|ḍ-(ḍ)|ṭ-(ṭ)|ẓ-(ẓ)|l-(l)|n-(n))(\\S+)"); + + public static String ruleGroup5(String text){ + + /* + * rule 5a + * + * [’l-X; X-’l-X] => al-X + */ + Matcher matcher_a = rule5a_pattern.matcher(text); + text = matcher_a.replaceAll(" al-$2"); + + /* + * rule 5b + * + * aY-YX; Y=Sun letters[t;ṯ;d;ḏ;r;z;s;š;ṣ;ḍ;ṭ;ẓ;l;n] => al-YX + */ + Matcher matcher_b = rule5b_pattern.matcher(text); + // the groups 3-16 will be empty except the real match + text = matcher_b.replaceAll("$1l-$3$4$5$6$7$8$9$10$11$12$13$14$15$16$17"); + + return text; + } + + /* + * rule 6 + * + * λh; λ= [t; k; d; s; g] => λʹh + */ + private static Pattern rule6_pattern = Pattern.compile("(\\S+)(t|k|d|s|g)h(\\S+)"); + //private static Pattern rule6_pattern = Pattern.compile("([\\w|ā|ī|’|ā|š]*)(t|k|d|s|g)h([\\w|ā|ī|’|ā|š]*)"); - public static String ruleGroup5(String text){ - - //wa-ʾl-nahār - //wa al-nahār - //5A - while(text.matches("(.*)(-ʾl-)(.*)")){ - if(debug)System.out.println("5A(a)"); - text = text.replaceAll("(.*)(-ʾl-)(.*)", "$1 al-$3"); - if(debug) System.out.println(text); - } - - /* - while(text.matches("(.*)(" + begin_space0 + ")(ʾl-)(.*)")){ - if(debug)System.out.println("5A"); - text = text.replaceAll("(.*)(" + begin_space0 + ")(ʾl-)(.*)", "$1$2al-$4"); - }*/ - - - //5.B - text = rule5B(text); - - return text; - } - - 
public static String rule5B(String text){ - //'t', 'ṯ', 'd', 'ḏ', 'r', 'z', 's', 'š', 'ṣ', 'ḍ', 'ṭ', 'ẓ', 'l', 'n' - - String regex0 = "(t-t|ṯ-ṯ|d-d|ḏ-ḏ|r-r|z-z|s-s|š-š|ṣ-ṣ|ḍ-ḍ|ṭ-ṭ|ẓ-ẓ|l-l|n-n)"; - - String regex = begin_space + "(a|A)" + regex0 + "(\\S+)(.*)"; - Pattern pattern = Pattern.compile(regex); - Matcher matcher = pattern.matcher(text); - if(matcher.find()){ - if(debug) System.out.println("5.B"); - String g1 = matcher.group(1); - String g2 = matcher.group(2); - String g3 = matcher.group(3); - String g4 = matcher.group(4); - String g5 = matcher.group(5); - - text = g1 + g2 + "l-" + g3.charAt(0) + g4 + g5; - - if(debug) System.out.println(text); - matcher = pattern.matcher(text); - } - - return text; - } - + public static String ruleGroup6(String text) { + Matcher matcher = rule6_pattern.matcher(text); + text = matcher.replaceAll("$1$2ʹh$3"); + return text; + } - public static String ruleGroup4(String text){ - - String gr_4b = "li al-|li’l-|li-’l-|li-l-"; - //4.B - while(text.matches(begin_space + "(" + gr_4b + ")(.*)")){ - if(debug) System.out.println("4.B"); - text = text.replaceAll(begin_space + "(" + gr_4b + ")(.*)", "$1lil-$3"); - } - - //4.A - while(text.matches(begin_space + "(li )(.*)")){ - if(debug) System.out.println("4.A"); - text = text.replaceAll(begin_space + "(li )(.*)", "$1li-$3"); - } - - return text; - } - - - public static String ruleGroup3(String text){ - - //3.A - while(text.matches(begin_space + "(bi|wa|ka)(\\s+)(al-)(.*)")){ - if(debug) System.out.println("3.A"); - text = text.replaceAll(begin_space + "(bi|wa|ka)(\\s+)(al-)(.*)", "$1$2-$4$5"); - //if(debug) System.out.println(text); - } - - // 3.B - while(text.matches(begin_space + "(bi|wa|ka)(\\s+)(.*)")){ - if(debug)System.out.println("3.B"); - text = text.replaceAll(begin_space + "(bi|wa|ka)(\\s+)(.*)", "$1$2-$4"); - } - - return text; - } - - public static String ruleGroup2(String text){ - - //2.C: al-XXXXẗ -> al-XXXXh - while(text.matches(begin_space + 
"(al-)(\\S+)ẗ(\\s+|$)(.*)")){ - if(debug) System.out.println("2.C"); - //System.out.println(text.replaceAll(begin_space + "(al-)(" + regex_words + ")ẗ(.*)", "$2$3ẗ")); - text = text.replaceAll(begin_space + "(al-)(\\S+)ẗ(\\s+|$)(.*)", "$1$2$3h$4$5"); - if(debug) System.out.println(text); - } - - //Other XXXXẗ al-XXXXẗ -> XXXXt al-XXXXh - if(text.matches("(" + regex_words + ")(ẗ)(\\s*)(al-)(" + regex_words + ")(ẗ)")){ - if(debug) System.out.println("2.Other"); - text = text.replaceAll("(" + regex_words + ")(ẗ)(\\s*)(al-)(" + regex_words + ")(ẗ)", "$1t al-$5h"); - if(debug) System.out.println(text); - } - - //2.D XXXXẗan -> XXXXtan - while(text.matches("(.*)(ẗan)(\\s+|$)(.*)")){ - if(debug) System.out.println("2.D"); - text = text.replaceAll("(.*)(ẗan)(\\s+|$)(.*)", "$1tan$3$4"); - if(debug) System.out.println(text); - } - - //2A - text = rule2A(text); + /* + * rule 7 currently unused + * + * X[illāh; ullāh; allah; allāh; - Allāh; Allah; ullah] => X Allāh + * + * [ l; b; bism]illāh => [lillāh; billāh; bismillāh] (stay unchanged) + */ + public static String ruleGroup7(String text){ + + String rule_7_1_allah = "illāh|ullāh|allah|allāh|-Allāh|Allah|ullah|illah"; + + int count = 0; + while(text.matches("(.*)(\\S+)(" + rule_7_1_allah + ")(.*)") && count<10){ + text = text.replaceAll("(.*)(\\S+)(" + rule_7_1_allah + ")(.*)", "$1$2 Allāh$4"); + count++; + } + + return text; + + } + + + /** + * Convert (ISMI-) transliterated arabic text into (LoC romanized) transcribed text. 
+ * + * @param text + * @return + */ + public static String convert(final String text) { + + if (StringUtils.isEmpty(text)) + return text; + + // make sure we have composed unicode + String romanizedText = NormalizerUtils.unicodeNormalize(text); + // make sure we have standard ayn and hamza + romanizedText = NormalizerUtils.aynHamzaNormalizer(romanizedText); + + romanizedText = ruleGroup2(romanizedText); + romanizedText = ruleGroup4(romanizedText); + romanizedText = ruleGroup5(romanizedText); + //romanizedText = ruleGroup6(romanizedText); + romanizedText = ruleGroup3(romanizedText); + // replacementText = ruleGroup7(replacementText); + + // rule 1 + for (Entry<String, String> tr : rule1_map.entrySet()) { + if (romanizedText.contains(tr.getKey())) { + romanizedText = romanizedText.replace(tr.getKey(), tr.getValue()); + } + } + + return romanizedText; + } - //2B - text = rule2B(text); - - return text; - } - - public static String rule2B(String text){ - - String regex = "(.*)(ẗ)(\\s+|(?!al-)\\S*)(.*)"; - Pattern pattern = Pattern.compile(regex); - Matcher matcher = pattern.matcher(text); - int count = 0; - while(matcher.find() && count < 10){ - if(debug) System.out.println("2.B"); - String g1 = matcher.group(1); - String g2 = matcher.group(2); - String g3 = matcher.group(3); - String g4 = matcher.group(4); - text = g1 + "h" + g3 + g4; - if(debug) System.out.println(text); - matcher = pattern.matcher(text); - count++; - } - return text; - } - - public static String rule2A(String text){ - - //2.A - //String regex2A = "(.*)(\\s++)(.*)ẗ(\\s++)(al-)(.*)"; - //String regex2A = "(.*)(\\s++)(?<!(al-))(.*)ẗ(\\s++)(al-)(.*)"; - String regex = begin_space + "((?!al-)\\S+)(ẗ)(\\s+)(al-)(.*)"; - Pattern pattern = Pattern.compile(regex); - Matcher matcher = pattern.matcher(text); - - while(matcher.find()){ - - if(debug) System.out.println("2.A"); - - String g1 = matcher.group(1); - String g2 = matcher.group(2); - String g3 = matcher.group(3); - String g4 = matcher.group(4); - 
String g5 = matcher.group(5); - String g6 = matcher.group(6); - - - //System.out.println(g1 +" # "+ g2 + " #3 " + g3 + " #4 " + g4 + " #5 " + g5 + " # " + g6); - text = g1 + g2 + "t" + g4 + g5 + g6; - if(debug) System.out.println(text); - - matcher = pattern.matcher(text); - } - - return text; - } - - public static String convert(final String text) { - - if(StringUtils.isEmpty(text)) - return text; - - String replacementText = new String(text); - - replacementText = ruleGroup2(replacementText); - replacementText = ruleGroup4(replacementText); - replacementText = ruleGroup6(replacementText); //6 must be executed before 5 - replacementText = ruleGroup5(replacementText); - replacementText = ruleGroup3(replacementText); - replacementText = ruleGroup7(replacementText); - - for (String ar : CONVERSIONMAP.keySet()) { - String lat = CONVERSIONMAP.get(ar); - if (replacementText.contains(ar)) { - replacementText = replacementText.replace(ar, lat); - } - } - - return replacementText; - } - - public static void test(String s){ - System.out.println("--------------\n" + s + " ->\n" + convert(s) + "\n"); - - } - - - public static void main(String[] args){ - - - //test("li’l-Shirbīnī"); - //test("li-'l-Shirbīnī"); - //test("’Abdullāh"); - //test("’Abd allāh"); - - //test("ʿAli b. ʿAbdullah"); - //test("ʿAbdullah"); - //test("Risālaẗ"); - //test("Risālaẗ fī"); - //test("Risālaẗ fī qismaẗ"); - //test("Risālaẗ fī qismaẗ al-handasaẗ al-qabbān bi ṭarīq al-handasaẗ bi ṭarīq wa'l-misāḥaẗ wa'l-ḥisāb bi'l-nisab al-arbaʿ"); - - //test("ʿAli b. ʿAbdullah"); - //test("Yusuf b. ʿAbdullah"); - - - - //test("fī-'l-kitāb"); - - //test("Risālaẗ (Nukat) fīmā yaṣiḥḥu min aḥkām al-nujūm = Kitāb al-taḏākīr (Risālaẗ) fī ibṭāl aḥkām al-nujūm"); - - /* - //Rules Group 2 - test("al-risalaẗ"); - test("risalaẗ al-kabir"); - test("risalaẗ al-kabir"); - test("risalaẗ al-kabiraẗ"); // ????? 
- test("risalaẗ"); - test("risalaẗan"); - test("Risālaẗ fī al-ʿamal bi-rubʿ al-muqanṭarāt al-šamālīyaẗ"); - - //Rules Group 3 - test("bi al-tamām̄"); - test("wa al-kamāl"); - test("bi tarīq"); - - //Group 4 - test("li al-shirbini"); - test("li’l-Shirbīnī"); - test("li-’l-Shirbīnī"); - test("li tajrīd"); - - - //Group 5 - test("aš-šams"); - test("aḏ-ḏams"); - - - - //Group 6 - test("Adham"); - - //Group 7 - test("’Abd allāh"); - - test("fi’l-kitāb"); - test("fi-’l-kitāb"); - */ - - - - - //test("al-Abyāt fī al-Ṭāliʿ wa al-Ġārib wa al-Mutawassiṭ wa al-Watad"); - //test("Al-tuḥfaẗ al-šāhiyyaẗ fī al-āḥkām al-falakiyyaẗ"); - - //char ch = 'Á'; - //System.out.println(String.format("%04x", (int) ch)); - //test("Al-Futūḥāt al-Wahbīyaẗ fī Ỳarḥ al-Risālaẗ al-Fatḥīyaẗ fī al-ʿamal bi-al-rubʿ al-mujayyab"); - - //test("wa-ʾl-nahār"); - //test("li-l-ʿIlm"); - //test("al-Jawharaẗ al-bahiyyaẗ fī maʿrifaẗ al-awqāt fī maʿrifaẗ al-awqāt al-layliyyaẗ wa-ʾl-nahāriyyaẗ"); - //test("al-Jawharaẗ al-bahiyyaẗ fī al-maʿrifaẗ al-awqāt al-layliyyaẗ wa-ʾl-nahāriyyaẗ"); - - //String text = "fī maʿrifaẗan al-awqāt al-layliyyaẗ wa-ʾl-nahāriyyaẗ"; - - //test("Natījaẗ al-afkār fī aʿmāl al-layl wa-ʾl-nahār"); - - - //test("al-ʿAqīda as-silālajīya dfsdssdf"); - test("Muḫtaṣaraẗ fī ṣanʿaẗ baʿḍ al-ālāt al-raṣadiyyaẗ wa-ʾl-ʿamal bi-hā"); - - - - - /* - String text = "Natījaẗ al-afkār fī aʿmāl al-layl wa-ʾl-nahār"; - String regex = begin_space + "((?!al-)\\S+)(ẗ)(\\s+)(al-)(.*)"; - Pattern pattern = Pattern.compile(regex); - Matcher matcher = pattern.matcher(text); - - while(matcher.find()){ - - if(debug) System.out.println("2.A"); - } - */ - /* - //String regex = "(.*)(\\s+)((?!al-)\\S+)(ẗ)(\\s+)(al-)(.*)"; - String regex = "(.*)(ẗ)(\\s+|(?!al-)\\S*)(.*)"; - Pattern pattern = Pattern.compile(regex); - Matcher matcher = pattern.matcher(text); - - while(matcher.find()){ - System.out.println(matcher.groupCount()); - - System.out.println(matcher.group(1) + " # " + matcher.group(2) + " # " + matcher.group(3) 
+ " # " + matcher.group(4)); - } - */ - } - }