// Mercurial > hg > openmind

package org.mpi.openmind.repository.utils;

import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringUtils;

/**
 * Rewrites romanized Arabic text by applying the rule groups 2 to 7 and the
 * character substitutions 1.a to 1.f implemented in this class.
 *
 * See document: romanized_arabic_into_arabic.doc
 * http://www.loc.gov/catdir/cpso/romanization/arabic.pdf
 *
 * <p>Each rule is a precompiled pattern that is applied to every occurrence in a
 * single pass. A "token" below means a run of non-whitespace characters that
 * starts at the beginning of the text or right after a whitespace character.
 *
 * @author jurzua
 *
 */
public class OldRomanizationLoC {

	/** When true, every rule that changed the text prints its label and the new text. */
	private static boolean debug = false;

	/** Single-character substitutions (rules 1.a to 1.f), applied last by {@link #convert(String)}. */
	private static final Map<String, String> CONVERSIONMAP = new HashMap<String, String>();

	// \w = A word character: [a-zA-Z_0-9]
	// Inside a character class '|' is a literal, so both classes also accept '|'.
	private static final String REGEX_WORDS = "[\\w|ā|ī|’|ā|š|ṭ|ẗ]+";
	private static final String REGEX_WORDS_EMPTY = "[\\w|ā|ī|’|ā|š]*";

	/** Letters whose doubled form "aX-X" is rewritten to "al-X" by rule 5.B. */
	private static final String RULE_5_LETTERS = "t|ṯ|d|ḏ|r|z|s|š|ṣ|ḍ|ṭ|ẓ|l|n";

	/** Zero-width: start of the text or preceded by a whitespace character. */
	private static final String TOKEN_START = "(?<!\\S)";

	/** Zero-width: followed by a whitespace character or the end of the text. */
	private static final String TOKEN_END = "(?=\\s|\\z)";

	/** Sequence rewritten to " al-" by rule 5.A. */
	private static final String ELIDED_ARTICLE = "-ʾl-";

	private static final Pattern RULE_2C = Pattern.compile(TOKEN_START + "(al-\\S+)ẗ" + TOKEN_END);
	private static final Pattern RULE_2_OTHER =
			Pattern.compile("(" + REGEX_WORDS + ")(ẗ)(\\s*)(al-)(" + REGEX_WORDS + ")(ẗ)");
	private static final Pattern RULE_2D = Pattern.compile("ẗan" + TOKEN_END);
	private static final Pattern RULE_2A = Pattern.compile(TOKEN_START + "((?!al-)\\S+)ẗ(?=\\s+al-)");
	private static final Pattern RULE_2B = Pattern.compile("ẗ(?!al-)");
	private static final Pattern RULE_3A = Pattern.compile(TOKEN_START + "(bi|wa|ka)\\s+(?=al-)");
	private static final Pattern RULE_3B = Pattern.compile(TOKEN_START + "(bi|wa|ka)\\s+");
	private static final Pattern RULE_4B = Pattern.compile(TOKEN_START + "(?:li al-|li’l-|li-’l-|li-l-)");
	private static final Pattern RULE_4A = Pattern.compile(TOKEN_START + "li ");
	private static final Pattern RULE_5B =
			Pattern.compile(TOKEN_START + "(a|A)(" + RULE_5_LETTERS + ")-\\2(?=\\S)");
	private static final Pattern RULE_6 =
			Pattern.compile("(" + REGEX_WORDS_EMPTY + ")(t|k|d|s|g)h(" + REGEX_WORDS_EMPTY + ")");
	private static final Pattern RULE_7 =
			Pattern.compile("(?<=\\S)(?:illāh|ullāh|allah|allāh|-Allāh|Allah|ullah|illah)");


	static{

		//rules 1.a to 1.f
		CONVERSIONMAP.put("\u1E6F", "\u0074\u0068");//ṯ -> th
		CONVERSIONMAP.put("\u1E6E", "\u0054\u0068");//Ṯ -> Th

		CONVERSIONMAP.put("\u1E2B", "\u006B\u0068");//ḫ -> kh
		CONVERSIONMAP.put("\u1E2A", "\u004B\u0068");//Ḫ -> Kh

		CONVERSIONMAP.put("\u1E0F", "\u0064\u0068");//ḏ -> dh
		CONVERSIONMAP.put("\u1E0E", "\u0044\u0068");//Ḏ -> Dh

		CONVERSIONMAP.put("\u0161", "\u0073\u0068");//š -> sh
		CONVERSIONMAP.put("\u0160", "\u0053\u0068");//Š -> Sh

		CONVERSIONMAP.put("\u0121", "\u0067\u0068");//ġ -> gh
		CONVERSIONMAP.put("\u0120", "\u0047\u0068");//Ġ -> Gh

		CONVERSIONMAP.put("\u1EF3", "\u00E1");//ỳ -> á
		//CONVERSIONMAP.put("\u1EF2", "\u00C1");//Ỳ -> Á

	}

	/**
	 * Replaces every occurrence of <code>pattern</code> in <code>text</code> in one pass.
	 *
	 * @param rule label printed in debug mode when the text changed
	 * @param pattern precompiled rule pattern
	 * @param replacement replacement in {@link Matcher#replaceAll(String)} syntax
	 * @param text text to rewrite
	 * @return the rewritten text
	 */
	private static String replaceEvery(String rule, Pattern pattern, String replacement, String text){
		String result = pattern.matcher(text).replaceAll(replacement);
		traceChange(rule, text, result);
		return result;
	}

	/**
	 * Applies <code>replacement</code> only when <code>pattern</code> matches the
	 * complete text; otherwise the text is returned untouched.
	 */
	private static String replaceWhenWholeTextMatches(String rule, Pattern pattern, String replacement, String text){
		Matcher matcher = pattern.matcher(text);
		if(!matcher.matches()){
			return text;
		}
		// replaceAll resets the matcher, exactly like String.replaceAll on the same regex.
		String result = matcher.replaceAll(replacement);
		traceChange(rule, text, result);
		return result;
	}

	/** Prints the rule label and the new text when debugging is on and the rule changed something. */
	private static void traceChange(String rule, String before, String after){
		if(debug && !after.equals(before)){
			System.out.println(rule);
			System.out.println(after);
		}
	}

	// Kept non-final so that existing code assigning these fields keeps compiling.
	public static char APOSTROPHE = 0x27;

	/**
	 * Replaces every character listed in <code>OldNormalizerUtils.apostrophes</code>
	 * by {@link #APOSTROPHE}.
	 *
	 * @param text text to normalize
	 * @return the normalized text
	 */
	public static String apostrophesNormalization(String text){
		String result = text;
		for(Character apostrophe : OldNormalizerUtils.apostrophes){
			result = result.replace(apostrophe, APOSTROPHE);
		}
		return result;
	}

	public static char a = 0x61;

	/**
	 * Replaces every character listed in <code>OldNormalizerUtils.AList</code>
	 * by {@link #a}.
	 *
	 * @param text text to normalize
	 * @return the normalized text
	 */
	public static String aNormalization(String text){
		String result = text;
		for(Character item : OldNormalizerUtils.AList){
			result = result.replace(item, a);
		}
		return result;
	}


	/**
	 * Rule group 7: detaches a suffixed form of "Allāh" from the preceding
	 * characters, e.g. "ʿAbdullah" becomes "ʿAbd Allāh".
	 *
	 * <p>Only forms directly preceded by a non-whitespace character are rewritten.
	 * All occurrences are handled; there is no upper bound on their number.
	 *
	 * @param text text to rewrite
	 * @return the rewritten text
	 */
	public static String ruleGroup7(String text){
		return replaceEvery("ruleGroup7", RULE_7, " Allāh", text);
	}

	/**
	 * Rule group 6: when the complete text is one word made of the characters of
	 * the word class, inserts "’" between the last t, k, d, s or g that is directly
	 * followed by "h" and that "h", e.g. "Adham" becomes "Ad’ham".
	 *
	 * <p>NOTE(review): texts containing whitespace never match, so within
	 * {@link #convert(String)} this rule only fires for one-word input; confirm
	 * that this is intended.
	 *
	 * @param text text to rewrite
	 * @return the rewritten text, or <code>text</code> itself when it does not match
	 */
	public static String ruleGroup6(String text){
		return replaceWhenWholeTextMatches("ruleGroup6", RULE_6, "$1$2’h$3", text);
	}

	/**
	 * Rule group 5: rewrites "-ʾl-" to " al-" (5.A, e.g. "wa-ʾl-nahār" becomes
	 * "wa al-nahār") and then applies {@link #rule5B(String)}.
	 *
	 * @param text text to rewrite
	 * @return the rewritten text
	 */
	public static String ruleGroup5(String text){

		//5A: occurrences are replaced from the last one to the first one.
		int at = text.lastIndexOf(ELIDED_ARTICLE);
		while(at >= 0){
			text = text.substring(0, at) + " al-" + text.substring(at + ELIDED_ARTICLE.length());
			if(debug){
				System.out.println("5A(a)");
				System.out.println(text);
			}
			at = text.lastIndexOf(ELIDED_ARTICLE);
		}

		//5.B
		return rule5B(text);
	}

	/**
	 * Rule 5.B: rewrites a token starting with "aX-X" or "AX-X", where X is one of
	 * t, ṯ, d, ḏ, r, z, s, š, ṣ, ḍ, ṭ, ẓ, l, n and at least one more non-whitespace
	 * character follows, to "al-X" or "Al-X", e.g. "aš-šams" becomes "al-šams".
	 *
	 * <p>Every such token is rewritten, and text around the tokens is preserved.
	 *
	 * @param text text to rewrite
	 * @return the rewritten text
	 */
	public static String rule5B(String text){
		return replaceEvery("5.B", RULE_5B, "$1l-$2", text);
	}


	/**
	 * Rule group 4: joins the particle "li" to the following word.
	 * <ul>
	 * <li>4.B: a token starting with "li al-", "li’l-", "li-’l-" or "li-l-" gets
	 * that prefix replaced by "lil-", e.g. "li al-shirbini" becomes "lil-shirbini".</li>
	 * <li>4.A: a token "li" followed by a space character is joined with a hyphen,
	 * e.g. "li tajrīd" becomes "li-tajrīd".</li>
	 * </ul>
	 *
	 * @param text text to rewrite
	 * @return the rewritten text
	 */
	public static String ruleGroup4(String text){

		//4.B must run first: 4.A would otherwise turn "li al-" into "li-al-".
		text = replaceEvery("4.B", RULE_4B, "lil-", text);

		//4.A
		text = replaceEvery("4.A", RULE_4A, "li-", text);

		return text;
	}


	/**
	 * Rule group 3: joins the tokens "bi", "wa" and "ka" to the following text by
	 * replacing the whitespace after them with a hyphen, e.g. "bi al-tamām" becomes
	 * "bi-al-tamām" (3.A) and "bi tarīq" becomes "bi-tarīq" (3.B).
	 *
	 * @param text text to rewrite
	 * @return the rewritten text
	 */
	public static String ruleGroup3(String text){

		//3.A
		text = replaceEvery("3.A", RULE_3A, "$1-", text);

		// 3.B
		text = replaceEvery("3.B", RULE_3B, "$1-", text);

		return text;
	}

	/**
	 * Rule group 2: resolves the character "ẗ".
	 * <ul>
	 * <li>2.C: a token "al-XXXXẗ" becomes "al-XXXXh".</li>
	 * <li>Other: a text that is exactly "XXXXẗ", optional whitespace, "al-XXXXẗ"
	 * becomes "XXXXt al-XXXXh".</li>
	 * <li>2.D: "ẗan" at the end of a token becomes "tan".</li>
	 * <li>2.A: see {@link #rule2A(String)}.</li>
	 * <li>2.B: see {@link #rule2B(String)}.</li>
	 * </ul>
	 *
	 * @param text text to rewrite
	 * @return the rewritten text
	 */
	public static String ruleGroup2(String text){

		//2.C: al-XXXXẗ -> al-XXXXh
		text = replaceEvery("2.C", RULE_2C, "$1h", text);

		//Other XXXXẗ al-XXXXẗ -> XXXXt al-XXXXh
		text = replaceWhenWholeTextMatches("2.Other", RULE_2_OTHER, "$1t al-$5h", text);

		//2.D XXXXẗan -> XXXXtan
		text = replaceEvery("2.D", RULE_2D, "tan", text);

		//2A
		text = rule2A(text);

		//2B
		text = rule2B(text);

		return text;
	}

	/**
	 * Rule 2.B: replaces every "ẗ" that is not directly followed by "al-" with "h",
	 * e.g. "risalaẗ" becomes "risalah".
	 *
	 * <p>All occurrences are handled; there is no upper bound on their number and
	 * the text around them is preserved.
	 *
	 * @param text text to rewrite
	 * @return the rewritten text
	 */
	public static String rule2B(String text){
		return replaceEvery("2.B", RULE_2B, "h", text);
	}

	/**
	 * Rule 2.A: a token that does not start with "al-", ends with "ẗ" (with at
	 * least one character before it) and is followed by whitespace and "al-" gets
	 * its final "ẗ" replaced by "t", e.g. "risalaẗ al-kabir" becomes
	 * "risalat al-kabir".
	 *
	 * <p>Every such token is rewritten, and text around the tokens is preserved.
	 *
	 * @param text text to rewrite
	 * @return the rewritten text
	 */
	public static String rule2A(String text){
		return replaceEvery("2.A", RULE_2A, "$1t", text);
	}

	/**
	 * Applies rule groups 2, 4, 6, 5, 3 and 7 in that order and finally the
	 * character substitutions of rules 1.a to 1.f.
	 *
	 * @param text text to convert
	 * @return the converted text; <code>text</code> itself when
	 *         <code>StringUtils.isEmpty(text)</code> holds
	 */
	public static String convert(final String text) {

		if(StringUtils.isEmpty(text))
			return text;

		String replacementText = text;

		replacementText = ruleGroup2(replacementText);
		replacementText = ruleGroup4(replacementText);
		replacementText = ruleGroup6(replacementText); //6 must be executed before 5
		replacementText = ruleGroup5(replacementText);
		replacementText = ruleGroup3(replacementText);
		replacementText = ruleGroup7(replacementText);

		// The keys are distinct single characters and no value contains a key,
		// so the iteration order of the map does not influence the result.
		for (Map.Entry<String, String> conversion : CONVERSIONMAP.entrySet()) {
			replacementText = replacementText.replace(conversion.getKey(), conversion.getValue());
		}

		return replacementText;
	}

	/**
	 * Prints <code>s</code> and its conversion to standard output.
	 *
	 * @param s text to convert
	 */
	public static void test(String s){
		System.out.println("--------------\n" + s + " ->\n" + convert(s) + "\n");

	}


	/**
	 * Manual check: prints the conversion of one sample title.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args){

		// Further sample inputs used during development, by rule group:
		//   2: "al-risalaẗ", "risalaẗ al-kabir", "risalaẗ    al-kabir", "risalaẗ al-kabiraẗ",
		//      "risalaẗ", "risalaẗan", "Risālaẗ fī al-ʿamal bi-rubʿ al-muqanṭarāt al-šamālīyaẗ"
		//   3: "bi al-tamām̄", "wa al-kamāl", "bi tarīq"
		//   4: "li al-shirbini", "li’l-Shirbīnī", "li-’l-Shirbīnī", "li tajrīd", "li-l-ʿIlm"
		//   5: "aš-šams", "aḏ-ḏams", "fi’l-kitāb", "fi-’l-kitāb", "wa-ʾl-nahār",
		//      "al-ʿAqīda as-silālajīya dfsdssdf"
		//   6: "Adham", "shirbini", "shirazi"
		//   7: "’Abdullāh", "ʿAbdullah", "’Abd allāh", "ʿAli b. ʿAbdullah", "Yusuf b. ʿAbdullah"
		//   mixed: "Natījaẗ al-afkār fī aʿmāl al-layl wa-ʾl-nahār",
		//      "al-Jawharaẗ al-bahiyyaẗ fī maʿrifaẗ al-awqāt al-layliyyaẗ wa-ʾl-nahāriyyaẗ",
		//      "al-Abyāt fī al-Ṭāliʿ wa al-Ġārib wa al-Mutawassiṭ wa al-Watad",
		//      "Al-tuḥfaẗ al-šāhiyyaẗ fī al-āḥkām al-falakiyyaẗ",
		//      "Muḫtaṣaraẗ fī ṣanʿaẗ baʿḍ al-ālāt al-raṣadiyyaẗ wa-ʾl-ʿamal bi-hā",
		//      "Risālaẗ (Nukat) fīmā yaṣiḥḥu min aḥkām al-nujūm = Kitāb al-taḏākīr (Risālaẗ) fī ibṭāl aḥkām al-nujūm"

		test("Al-Futūḥāt al-Wahbīyaẗ fī Ỳarḥ al-Risālaẗ al-Fatḥīyaẗ fī al-ʿamal bi-al-rubʿ al-mujayyab");
	}

}
// author	Robert Casties <casties@mpiwg-berlin.mpg.de>
// date	Mon, 26 Feb 2018 14:39:49 +0100
// parents	ad505ef703ed
// children