view src/main/java/org/mpi/openmind/repository/utils/RomanizationLoC.java @ 89:8adfa8679991

new implementation of translit-to-romanization rules in RomanizationLoC with test(!).
author Robert Casties <casties@mpiwg-berlin.mpg.de>
date Mon, 26 Feb 2018 14:39:49 +0100
parents ad505ef703ed
children 2c01cdc9b34a
line wrap: on
line source

package org.mpi.openmind.repository.utils;

import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringUtils;

/**
 * Convert ISMI transliteration into LOC romanization/transcription.
 * 
 * See document: translit-to-romanization-2.0.doc by Chantal Wahbi
 * http://www.loc.gov/catdir/cpso/romanization/arabic.pdf
 * 
 * @author cwahbi, jurzua, casties
 *
 */
public class RomanizationLoC {

    /**
     * Compiles a rule pattern with Unicode-aware predefined character classes.
     * 
     * The rules put word boundaries next to transliteration letters outside ASCII
     * (ẗ, ā, ʾ, ...). Since JDK 19 a plain "\b" only recognises ASCII word
     * characters, so without this flag e.g. the "bi" at the end of "kitābi" would
     * be taken for the preposition "bi". With the flag "\b", "\s" and "\S" follow
     * the Unicode definitions on every JDK.
     * 
     * @param regex the regular expression of a rule
     * @return the compiled pattern
     */
    private static Pattern compile(String regex) {
        return Pattern.compile(regex, Pattern.UNICODE_CHARACTER_CLASS);
    }

    /*
     * rule 1
     * 
     * single transliteration letters that are written as digraphs (or as another
     * letter) in the romanization
     */
    private static final Map<String, String> RULE1_MAP = new HashMap<String, String>();
    static {
        // rules 1.a to 1.f
        RULE1_MAP.put("\u1E6F", "\u0074\u0068"); // ṯ -> th
        RULE1_MAP.put("\u1E6E", "\u0054\u0068"); // Ṯ -> Th

        RULE1_MAP.put("\u1E2B", "\u006B\u0068"); // ḫ -> kh
        RULE1_MAP.put("\u1E2A", "\u004B\u0068"); // Ḫ -> Kh

        RULE1_MAP.put("\u1E0F", "\u0064\u0068"); // ḏ -> dh
        RULE1_MAP.put("\u1E0E", "\u0044\u0068"); // Ḏ -> Dh

        RULE1_MAP.put("\u0161", "\u0073\u0068"); // š -> sh
        RULE1_MAP.put("\u0160", "\u0053\u0068"); // Š -> Sh

        RULE1_MAP.put("\u0121", "\u0067\u0068"); // ġ -> gh
        RULE1_MAP.put("\u0120", "\u0047\u0068"); // Ġ -> Gh

        RULE1_MAP.put("\u1EF3", "\u00E1"); // ỳ -> á
        // NOTE(review): the upper-case counterpart (U+1EF2 Ỳ -> U+00C1 Á) was
        // disabled in the original code and is still not mapped - confirm.
    }


    /*
     * rule 2
     */
    private static final Pattern RULE2A_PATTERN = compile("\\b((?!al-)\\S+)ẗ(\\s+)(al-)");
    private static final Pattern RULE2B_PATTERN = compile("(\\S+)ẗ(\\s+|(?!al-)\\S*)");
    private static final Pattern RULE2C_PATTERN = compile("\\b(al-)(\\S+)ẗ\\b");
    private static final Pattern RULE2D_PATTERN = compile("(\\S+)ẗan\\b");

    /**
     * Applies rule group 2: replaces the letter ẗ by "t" or "h" depending on its context.
     * 
     * The sub-rules run in the order 2c, 2d, 2a, 2b. Rule 2b also matches an ẗ that
     * is followed by whitespace and "al-", so the more specific rules have to
     * consume their cases before it runs.
     * 
     * @param text transliterated text
     * @return the text with rule group 2 applied
     */
    public static String ruleGroup2(String text) {

        /*
         * rule 2c
         * 
         * al-Xẗ => al-Xh
         */
        text = RULE2C_PATTERN.matcher(text).replaceAll("$1$2h");

        /*
         * rule 2d
         * 
         * Xẗan => Xtan
         */
        text = RULE2D_PATTERN.matcher(text).replaceAll("$1tan");

        /*
         * rule 2a
         * 
         * [Not beginning with: al-] Xẗ al-X => Xt al-X
         */
        text = RULE2A_PATTERN.matcher(text).replaceAll("$1t$2$3");

        /*
         * rule 2b
         * 
         * Xẗ [Not followed by: al-X] => Xh
         */
        text = RULE2B_PATTERN.matcher(text).replaceAll("$1h$2");

        return text;
    }


    /*
     * rule 3
     * 
     * rule 3a (P al-X => P-al-X) is subsumed by rule 3b and has no pattern of its own
     */
    private static final Pattern RULE3B_PATTERN = compile("\\b(bi|wa|ka)(\\s+)(\\S+)");

    /**
     * Applies rule group 3: joins the prefixes bi, wa and ka to the following word.
     * 
     * @param text transliterated text
     * @return the text with rule group 3 applied
     */
    public static String ruleGroup3(String text) {

        /*
         * rule 3b (covers rule 3a)
         * 
         * P X; P=[ bi; wa; ka] => P-X
         */
        return RULE3B_PATTERN.matcher(text).replaceAll("$1-$3");
    }


    /*
     * rule 4
     */
    private static final Pattern RULE4A_PATTERN = compile("\\b(li )(\\S+)");
    private static final Pattern RULE4B_PATTERN = compile("\\b(li al-|liʾl-|li-ʾl-|li-l-)(\\S+)");

    /**
     * Applies rule group 4: joins the prefix li to the following word or article.
     * 
     * Rule 4b runs first because rule 4a would otherwise turn "li al-X" into
     * "li-al-X" before it can be contracted to "lil-X".
     * 
     * @param text transliterated text
     * @return the text with rule group 4 applied
     */
    public static String ruleGroup4(String text) {

        /*
         * rule 4b
         * 
         * [li al-X; liʾl-X; li-ʾl-X; li-l-X] => lil-X
         */
        text = RULE4B_PATTERN.matcher(text).replaceAll("lil-$2");

        /*
         * rule 4a
         * 
         * li X => li-X
         */
        text = RULE4A_PATTERN.matcher(text).replaceAll("li-$2");

        return text;
    }


    /*
     * rule 5
     */
    // article at the start of the text or after whitespace (a stray leading dash is dropped)
    private static final Pattern RULE5A_FREE_PATTERN = compile("(^|\\s)-?ʾl-(?=\\S)");
    // article attached to the preceding word, with or without a dash
    private static final Pattern RULE5A_ATTACHED_PATTERN = compile("-?ʾl-(?=\\S)");
    // group 2 is the assimilated sun letter, which has to be repeated after the dash
    private static final Pattern RULE5B_PATTERN = compile("\\b(a|A)(t|ṯ|d|ḏ|r|z|s|š|ṣ|ḍ|ṭ|ẓ|l|n)-\\2(\\S+)");

    /**
     * Applies rule group 5: restores the full article "al-".
     * 
     * @param text transliterated text
     * @return the text with rule group 5 applied
     */
    public static String ruleGroup5(String text) {

        /*
         * rule 5a
         * 
         * [ʾl-X; X-ʾl-X] => al-X
         * 
         * An article that already stands alone keeps the whitespace in front of it;
         * a separating blank is only inserted where the article was attached to the
         * preceding word. (Inserting the blank unconditionally produced a leading
         * blank at the start of the text and a double blank after a space.)
         */
        text = RULE5A_FREE_PATTERN.matcher(text).replaceAll("$1al-");
        text = RULE5A_ATTACHED_PATTERN.matcher(text).replaceAll(" al-");

        /*
         * rule 5b
         * 
         * aY-YX; Y=Sun letters[t;ṯ;d;ḏ;r;z;s;š;ṣ;ḍ;ṭ;ẓ;l;n] => al-YX
         */
        text = RULE5B_PATTERN.matcher(text).replaceAll("$1l-$2$3");

        return text;
    }


    /*
     * rule 6 currently unused
     * 
     * λh; λ= [t; k; d; s; g] => λʹh
     */
    private static final Pattern RULE6_PATTERN = compile("(\\S+)(t|k|d|s|g)h(\\S+)");

    /**
     * Applies rule 6: inserts a prime between t, k, d, s or g and a following h
     * inside a word. Not called by {@link #convert(String)}.
     * 
     * @param text transliterated text
     * @return the text with rule 6 applied
     */
    public static String ruleGroup6(String text) {
        return RULE6_PATTERN.matcher(text).replaceAll("$1$2ʹh$3");
    }


    /*
     * rule 7 currently unused
     * 
     * X[illāh; ullāh; allah; allāh; - Allāh; Allah; ullah] => X Allāh
     * 
     * [ l; b; bism]illāh => [lillāh; billāh; bismillāh] (stay unchanged)
     */
    private static final String RULE7_ALLAH = "illāh|ullāh|allah|allāh|-Allāh|Allah|ullah|illah";
    private static final Pattern RULE7_PATTERN = compile("(.*)(\\S+)(" + RULE7_ALLAH + ")(.*)");
    private static final int RULE7_MAX_PASSES = 10;

    /**
     * Applies rule 7: separates a name ending in one of the "allah" forms into
     * "X Allāh". Not called by {@link #convert(String)}.
     * 
     * The pattern has to match the whole text, and its greedy prefix makes every
     * pass rewrite the last remaining occurrence only, so the replacement is
     * repeated, at most {@value #RULE7_MAX_PASSES} times. Because "." does not
     * match line terminators, a text containing line breaks is returned unchanged.
     * 
     * NOTE(review): the exceptions named in the rule (lillāh, billāh, bismillāh)
     * are not implemented; "bismillāh" is rewritten to "bism Allāh".
     * 
     * @param text transliterated text
     * @return the text with rule 7 applied
     */
    public static String ruleGroup7(String text) {
        Matcher matcher = RULE7_PATTERN.matcher(text);
        for (int pass = 0; pass < RULE7_MAX_PASSES && matcher.matches(); pass++) {
            text = matcher.replaceAll("$1$2 Allāh$4");
            matcher = RULE7_PATTERN.matcher(text);
        }
        return text;
    }


    /**
     * Convert (ISMI-) transliterated arabic text into (LoC romanized) transcribed text.
     * 
     * Applies rule groups 2, 4, 5 and 3 (in this order) and finally the letter
     * substitutions of rule 1. Rule groups 6 and 7 are currently not applied.
     * 
     * @param text the transliterated text
     * @return the romanized text; a null or empty text is returned as it is
     */
    public static String convert(final String text) {

        if (StringUtils.isEmpty(text)) {
            return text;
        }

        // make sure we have composed unicode
        String romanizedText = NormalizerUtils.unicodeNormalize(text);
        // make sure we have standard ayn and hamza
        romanizedText = NormalizerUtils.aynHamzaNormalizer(romanizedText);

        romanizedText = ruleGroup2(romanizedText);
        romanizedText = ruleGroup4(romanizedText);
        romanizedText = ruleGroup5(romanizedText);
        // rule 3 comes after rule 5 so that a prefix detached by rule 5a
        // (e.g. "bi-ʾl-X" => "bi al-X") is joined again ("bi-al-X")
        romanizedText = ruleGroup3(romanizedText);

        // rule 1 comes last: the patterns above match the single transliteration
        // letters (e.g. the sun letters of rule 5b), not their digraphs
        for (Entry<String, String> tr : RULE1_MAP.entrySet()) {
            romanizedText = romanizedText.replace(tr.getKey(), tr.getValue());
        }

        return romanizedText;
    }

}