// Mercurial > hg > openmind

package org.mpi.openmind.repository.utils;

import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringUtils;

/**
 * Convert ISMI transliteration into LOC romanization/transcription.
 *
 * See document: translit-to-romanization-2.0.doc by Chantal Wahbi
 * http://www.loc.gov/catdir/cpso/romanization/arabic.pdf
 *
 * <p>
 * {@link #convert(String)} is the entry point. It normalizes the input, applies
 * rule groups 2, 4, 5 and 3 (in that order) and finally the plain character
 * replacements of rule 1. Rule groups 6 and 7 are implemented but are not
 * called by {@code convert}.
 *
 * @author cwahbi, jurzua, casties
 *
 */
public class RomanizationLoC {

    /**
     * Word boundary that treats non-ASCII letters as word characters on every
     * Java version.
     *
     * A bare {@code \b} only does so up to JDK 18; from JDK 19 on it is
     * ASCII-only unless UNICODE_CHARACTER_CLASS is set, which would stop
     * rule 2c from matching a word-final "ẗ" and would let the prefix rules
     * match in the middle of words such as "hunāka". The flag is switched on
     * for the boundary only, so that {@code \s} and {@code \S} keep their
     * default meaning in the rest of each pattern.
     */
    private static final String WORD_BOUNDARY = "(?U)\\b(?-U)";

    /*
     * rule 1
     *
     * Plain character replacements (digraphs and alif maqsura).
     */
    private static final Map<String, String> RULE1_MAP = new HashMap<String, String>();
    static {
        //rules 1.a to 1.f
        RULE1_MAP.put("\u1E6F", "\u0074\u0068");//ṯ -> th
        RULE1_MAP.put("\u1E6E", "\u0054\u0068");//Ṯ -> Th

        RULE1_MAP.put("\u1E2B", "\u006B\u0068");//ḫ -> kh
        RULE1_MAP.put("\u1E2A", "\u004B\u0068");//Ḫ -> Kh

        RULE1_MAP.put("\u1E0F", "\u0064\u0068");//ḏ -> dh
        RULE1_MAP.put("\u1E0E", "\u0044\u0068");//Ḏ -> Dh

        RULE1_MAP.put("\u0161", "\u0073\u0068");//š -> sh
        RULE1_MAP.put("\u0160", "\u0053\u0068");//Š -> Sh

        RULE1_MAP.put("\u0121", "\u0067\u0068");//ġ -> gh
        RULE1_MAP.put("\u0120", "\u0047\u0068");//Ġ -> Gh

        RULE1_MAP.put("\u1EF3", "\u00E1");//ỳ -> á
        // the upper case variant (Ỳ -> Á) is not mapped
    }


    /*
     * rule 2
     */
    private static final Pattern RULE2A_PATTERN =
            Pattern.compile(WORD_BOUNDARY + "((?!al-)\\S+)ẗ(\\s+)(al-)");
    private static final Pattern RULE2B_PATTERN =
            Pattern.compile("(\\S+)ẗ(\\s+|(?!al-)\\S*)");
    private static final Pattern RULE2C_PATTERN =
            Pattern.compile(WORD_BOUNDARY + "(al-)(\\S+)ẗ" + WORD_BOUNDARY);
    private static final Pattern RULE2D_PATTERN =
            Pattern.compile("(\\S+)ẗan" + WORD_BOUNDARY);

    /**
     * Apply rule group 2: resolve "ẗ" into "h" or "t" depending on its context.
     *
     * The sub-rules run in the order 2c, 2d, 2a, 2b. Rule 2b has to run last
     * because its pattern also matches the inputs of the other three rules.
     *
     * @param text text to convert; must not be null
     * @return the text with the rule 2 replacements applied
     */
    public static String ruleGroup2(String text) {

        /*
         * rule 2c
         *
         * al-Xẗ => al-Xh
         */
        text = RULE2C_PATTERN.matcher(text).replaceAll("$1$2h");

        /*
         * rule 2d
         *
         * Xẗan => Xtan
         */
        text = RULE2D_PATTERN.matcher(text).replaceAll("$1tan");

        /*
         * rule 2a
         *
         * [Not beginning with: al-] Xẗ al-X => Xt al-X
         */
        text = RULE2A_PATTERN.matcher(text).replaceAll("$1t$2$3");

        /*
         * rule 2b
         *
         * Xẗ [Not followed by: al-X] => Xh
         */
        text = RULE2B_PATTERN.matcher(text).replaceAll("$1h$2");

        return text;
    }


    /*
     * rule 3
     *
     * Rule 3a (P al-X => P-al-X) has no pattern of its own because it is
     * subsumed by rule 3b.
     */
    private static final Pattern RULE3B_PATTERN =
            Pattern.compile(WORD_BOUNDARY + "(bi|wa|ka)(\\s+)(\\S+)");

    /**
     * Apply rule group 3: attach the prefixes bi, wa and ka to the following
     * word with a hyphen.
     *
     * @param text text to convert; must not be null
     * @return the text with the rule 3 replacements applied
     */
    public static String ruleGroup3(String text) {

        /*
         * rule 3b (covers rule 3a as well)
         *
         * P X; P=[ bi; wa; ka] => P-X
         *
         * The whitespace between prefix and word (group 2) is dropped.
         */
        return RULE3B_PATTERN.matcher(text).replaceAll("$1-$3");
    }


    /*
     * rule 4
     */
    private static final Pattern RULE4A_PATTERN =
            Pattern.compile(WORD_BOUNDARY + "(li )(\\S+)");
    private static final Pattern RULE4B_PATTERN =
            Pattern.compile(WORD_BOUNDARY + "(li al-|liʾl-|li-ʾl-|li-l-)(\\S+)");

    /**
     * Apply rule group 4: normalize the prefix "li".
     *
     * Rule 4b runs before rule 4a; otherwise 4a would turn "li al-X" into
     * "li-al-X" and 4b could not match it any more.
     *
     * @param text text to convert; must not be null
     * @return the text with the rule 4 replacements applied
     */
    public static String ruleGroup4(String text) {

        /*
         * rule 4b
         *
         * [li al-X; liʾl-X; li-ʾl-X; li-l-X] => lil-X
         */
        text = RULE4B_PATTERN.matcher(text).replaceAll("lil-$2");

        /*
         * rule 4a
         *
         * li X => li-X
         */
        text = RULE4A_PATTERN.matcher(text).replaceAll("li-$2");

        return text;
    }


    /*
     * rule 5
     */
    private static final Pattern RULE5A_PATTERN = Pattern.compile("(-?ʾl-)(\\S+)");
    private static final Pattern RULE5B_PATTERN = Pattern.compile(WORD_BOUNDARY
            + "(a|A)(t-(t)|ṯ-(ṯ)|d-(d)|ḏ-(ḏ)|r-(r)|z-(z)|s-(s)|š-(š)|ṣ-(ṣ)|ḍ-(ḍ)|ṭ-(ṭ)|ẓ-(ẓ)|l-(l)|n-(n))(\\S+)");

    /**
     * Apply rule group 5: write the article as "al-" regardless of elision or
     * assimilation to a sun letter.
     *
     * @param text text to convert; must not be null
     * @return the text with the rule 5 replacements applied
     */
    public static String ruleGroup5(String text) {

        /*
         * rule 5a
         *
         * [ʾl-X; X-ʾl-X] => al-X
         *
         * NOTE(review): the replacement always starts with a space, so input
         * in which "ʾl-" already follows a space, or starts the text, ends up
         * with an extra space. Kept as is; confirm whether this is intended.
         */
        text = RULE5A_PATTERN.matcher(text).replaceAll(" al-$2");

        /*
         * rule 5b
         *
         * aY-YX; Y=Sun letters[t;ṯ;d;ḏ;r;z;s;š;ṣ;ḍ;ṭ;ẓ;l;n] => al-YX
         *
         * Groups 3 to 16 hold the sun letter; only the alternative that
         * matched is non-empty, so concatenating all of them yields that one
         * letter. Group 17 is the rest of the word.
         */
        text = RULE5B_PATTERN.matcher(text)
                .replaceAll("$1l-$3$4$5$6$7$8$9$10$11$12$13$14$15$16$17");

        return text;
    }


    /*
     * rule 6 currently unused
     *
     * λh; λ= [t; k; d; s; g] => λʹh
     */
    private static final Pattern RULE6_PATTERN = Pattern.compile("(\\S+)(t|k|d|s|g)h(\\S+)");

    /**
     * Apply rule 6: insert a prime between t, k, d, s or g and a following h
     * inside a word. Not called by {@link #convert(String)}.
     *
     * @param text text to convert; must not be null
     * @return the text with the rule 6 replacement applied
     */
    public static String ruleGroup6(String text) {
        return RULE6_PATTERN.matcher(text).replaceAll("$1$2ʹh$3");
    }


    /*
     * rule 7 currently unused
     *
     * X[illāh; ullāh; allah; allāh; - Allāh; Allah; ullah] => X Allāh
     *
     * [ l; b; bism]illāh => [lillāh; billāh; bismillāh] (stay unchanged)
     */
    private static final String RULE7_ALLAH = "illāh|ullāh|allah|allāh|-Allāh|Allah|ullah|illah";
    private static final Pattern RULE7_PATTERN =
            Pattern.compile("(.*)(\\S+)(" + RULE7_ALLAH + ")(.*)");
    /** Upper bound for the number of rewrite passes in rule 7. */
    private static final int RULE7_MAX_PASSES = 10;

    /**
     * Apply rule 7: split a name ending in a form of "Allāh" into
     * "X Allāh". Not called by {@link #convert(String)}.
     *
     * The pattern has to match the whole text and rewrites one occurrence per
     * pass, so the replacement is repeated until nothing matches any more or
     * the pass limit is reached. Because {@code .} does not match line
     * terminators, text containing them is returned unchanged.
     *
     * NOTE(review): the exception for lillāh, billāh and bismillāh mentioned
     * in the rule description is not implemented here.
     *
     * @param text text to convert; must not be null
     * @return the text with the rule 7 replacement applied
     */
    public static String ruleGroup7(String text) {
        Matcher matcher = RULE7_PATTERN.matcher(text);
        int count = 0;
        while (count < RULE7_MAX_PASSES && matcher.matches()) {
            text = matcher.replaceAll("$1$2 Allāh$4");
            matcher.reset(text);
            count++;
        }
        return text;
    }


    /**
     * Convert (ISMI-) transliterated arabic text into (LoC romanized) transcribed text.
     *
     * @param text the transliterated text; may be null or empty
     * @return the romanized text, or {@code text} itself if it is null or empty
     */
    public static String convert(final String text) {

        if (StringUtils.isEmpty(text)) {
            return text;
        }

        // make sure we have composed unicode
        String romanizedText = NormalizerUtils.unicodeNormalize(text);
        // make sure we have standard ayn and hamza
        romanizedText = NormalizerUtils.aynHamzaNormalizer(romanizedText);

        // the context rules work on the transliteration characters, so they
        // run before the character replacements of rule 1
        romanizedText = ruleGroup2(romanizedText);
        romanizedText = ruleGroup4(romanizedText);
        romanizedText = ruleGroup5(romanizedText);
        // rule group 6 is disabled
        romanizedText = ruleGroup3(romanizedText);
        // rule group 7 is disabled

        // rule 1
        for (Entry<String, String> tr : RULE1_MAP.entrySet()) {
            romanizedText = romanizedText.replace(tr.getKey(), tr.getValue());
        }

        return romanizedText;
    }

}
// author	Robert Casties <casties@mpiwg-berlin.mpg.de>
// date	Mon, 26 Feb 2018 14:39:49 +0100
// parents	ad505ef703ed
// children	2c01cdc9b34a