changeset 88:ad505ef703ed

new implementation of translit-to-romanization rules in RomanizationLoc.
author Robert Casties <casties@mpiwg-berlin.mpg.de>
date Fri, 23 Feb 2018 21:43:29 +0100
parents 8005f7011975
children 8adfa8679991
files src/main/java/org/mpi/openmind/repository/utils/NormalizerUtils.java src/main/java/org/mpi/openmind/repository/utils/OldRomanizationLoC.java src/main/java/org/mpi/openmind/repository/utils/RomanizationLoC.java
diffstat 3 files changed, 699 insertions(+), 400 deletions(-) [+]
line wrap: on
line diff
--- a/src/main/java/org/mpi/openmind/repository/utils/NormalizerUtils.java	Tue Feb 06 15:14:29 2018 +0100
+++ b/src/main/java/org/mpi/openmind/repository/utils/NormalizerUtils.java	Fri Feb 23 21:43:29 2018 +0100
@@ -1,8 +1,17 @@
 package org.mpi.openmind.repository.utils;
 
+import java.text.Normalizer;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
 public class NormalizerUtils {
-
 	
+	/**
+	 * Returns String normalized for searching arabic or transliterated arabic.
+	 * 
+	 * @param w
+	 * @return
+	 */
 	public static String normalize(String w) {
 		String atn = ArabicTranslitNormalizer.normalize(w);
 		String an = ArabicNormalizer.normalize(atn);
@@ -10,7 +19,7 @@
 	}
 	
     /**
-     * Returns String normalized according to arabic transliteration rules.
+     * Returns String normalized for searching arabic transliteration text.
      * 
      * @see https://it-dev.mpiwg-berlin.mpg.de/tracs/OpenMind3/wiki/normalize_arabic_translit
      * 
@@ -21,7 +30,47 @@
 	    return ArabicTranslitNormalizer.normalize(w);		
 	}
 	
+	/**
+	 * Returns String normalized for searching arabic.
+	 * 
+	 * The normalization consists in removing vowels and other diacritic marks.
+	 * 
+	 * @param w
+	 * @return
+	 */
 	public static String normalizeArabic(String w) {
 		return ArabicNormalizer.normalize(w);
 	}
+	
+	/**
+	 * Returns the given String in Unicode Normalization Form C (NFC, composed).
+	 * 
+	 * @param text the text to normalize; must not be null
+	 * @return text itself if it is already in NFC, otherwise a new NFC-normalized String
+	 */
+	public static String unicodeNormalize(String text) {
+	    if (!Normalizer.isNormalized(text, Normalizer.Form.NFC)) {
+	        text = Normalizer.normalize(text, Normalizer.Form.NFC); // normalize() returns a new String
+	    }
+	    return text;
+	}
+	
+	private static Pattern old_ayn_pattern = Pattern.compile("(\u2018|\u02BB)"); // ‘|ʻ
+	private static String new_ayn = "\u02BF"; // ʿ
+    private static Pattern old_hamza_pattern = Pattern.compile("(\u2019|\u02bc)"); // ’|ʼ
+    private static String new_hamza = "\u02BE"; // ʾ
+	
+	/**
+	 * Replaces variant ayn and hamza characters in text with the standard forms.
+	 * Every match of old_ayn_pattern becomes new_ayn, then every match of old_hamza_pattern becomes new_hamza.
+	 * @param text the transliterated text to clean up
+	 * @return the text with all matches of both patterns replaced
+	 */
+	public static String aynHamzaNormalizer(String text) {
+	    Matcher match_ayn = old_ayn_pattern.matcher(text);
+	    text = match_ayn.replaceAll(new_ayn);	    
+        Matcher match_hamza = old_hamza_pattern.matcher(text);
+        text = match_hamza.replaceAll(new_hamza);      
+	    return text;
+	}
 }
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/main/java/org/mpi/openmind/repository/utils/OldRomanizationLoC.java	Fri Feb 23 21:43:29 2018 +0100
@@ -0,0 +1,419 @@
+package org.mpi.openmind.repository.utils;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.lang.StringUtils;
+
+/**
+ * See document: romanized_arabic_into_arabic.doc
+ * http://www.loc.gov/catdir/cpso/romanization/arabic.pdf
+ * @author jurzua
+ *
+ */
+public class OldRomanizationLoC {
+
+	private static boolean debug = false;
+	
+	
+	private static Map<String, String> CONVERSIONMAP = new HashMap<String, String>();
+	// \w = A word character: [a-zA-Z_0-9]
+	private static String T = "ẗ";
+	private static String regex_words = "[\\w|ā|ī|’|ā|š|ṭ|ẗ]+";
+	private static String regex_words_empty = "[\\w|ā|ī|’|ā|š]*";
+	private static String rule_5_a_init = "at-t|aṯ-ṯ|ad-d|aḏ-ḏ|ar-r|az-z|as-s|aš-š|aṣ-ṣ|aḍ-ḍ|aṭ-ṭ|aẓ-ẓ|al-l|an-n";
+	private static String rule_5_a_letters = "t|ṯ|d|ḏ|r|z|s|š|ṣ|ḍ|ṭ|ẓ|l|n";
+	private static String begin_space = "(^|.*\\s)";
+	private static String begin_space0 = "^|.*\\s";
+	
+	
+	static{
+		
+		char[] aaa = {'t', 'ṯ', 'd', 'ḏ', 'r', 'z', 's', 'š', 'ṣ', 'ḍ', 'ṭ', 'ẓ', 'l', 'n'};
+		
+		
+		//rules 1.a to 1.f
+		CONVERSIONMAP.put("\u1E6F", "\u0074\u0068");//ṯ -> th
+		CONVERSIONMAP.put("\u1E6E", "\u0054\u0068");//Ṯ -> Th
+		
+		CONVERSIONMAP.put("\u1E2B", "\u006B\u0068");//ḫ -> kh
+		CONVERSIONMAP.put("\u1E2A", "\u004B\u0068");//Ḫ -> Kh
+		
+		CONVERSIONMAP.put("\u1E0F", "\u0064\u0068");//ḏ -> dh
+		CONVERSIONMAP.put("\u1E0E", "\u0044\u0068");//Ḏ -> Dh
+		
+		CONVERSIONMAP.put("\u0161", "\u0073\u0068");//š -> sh
+		CONVERSIONMAP.put("\u0160", "\u0053\u0068");//Š -> Sh
+		
+		CONVERSIONMAP.put("\u0121", "\u0067\u0068");//ġ -> gh
+		CONVERSIONMAP.put("\u0120", "\u0047\u0068");//Ġ -> Gh
+		
+		CONVERSIONMAP.put("\u1EF3", "\u00E1");//ỳ -> á
+		//CONVERSIONMAP.put("\u1EF2", "\u00C1");//Ỳ -> Á
+
+	}
+	
+	public static char APOSTROPHE = 0x27;
+	public static String apostrophesNormalization(String text){
+		String result = text;
+		for(Character apostrophe : OldNormalizerUtils.apostrophes){
+			result = result.replace(apostrophe, APOSTROPHE);
+		}
+		return result;
+	}
+	
+	public static char a = 0x61;
+	public static String aNormalization(String text){
+		String result = text;
+		for(Character item : OldNormalizerUtils.AList){
+			result = result.replace(item, a);
+		}
+		return result;
+	}
+	
+	
+	public static String ruleGroup7(String text){
+		
+		String rule_7_1_allah = "illāh|ullāh|allah|allāh|-Allāh|Allah|ullah|illah";
+		
+		int count = 0;
+		while(text.matches("(.*)(\\S+)(" + rule_7_1_allah + ")(.*)") && count<10){
+			if(debug)System.out.println("ruleGroup7");
+			text = text.replaceAll("(.*)(\\S+)(" + rule_7_1_allah + ")(.*)", "$1$2 Allāh$4");
+			if(debug) System.out.println(text);
+			count++;
+		}
+		
+		return text;
+		
+	}
+	
+	public static String ruleGroup6(String text){
+		
+		String rule_6_consonants = "t|k|d|s|g";
+		
+		if(text.matches("("+regex_words_empty+")("+ rule_6_consonants + ")h("+regex_words_empty+")")){
+			text = text.replaceAll("("+regex_words_empty+")("+ rule_6_consonants + ")h("+regex_words_empty+")", "$1$2’h$3");
+		}
+		
+		return text;
+	}
+	
+	public static String ruleGroup5(String text){
+		
+		//wa-ʾl-nahār
+		//wa al-nahār
+		//5A
+		while(text.matches("(.*)(-ʾl-)(.*)")){
+			if(debug)System.out.println("5A(a)");
+			text = text.replaceAll("(.*)(-ʾl-)(.*)", "$1 al-$3");
+			if(debug) System.out.println(text);
+		}
+		
+		/*
+		while(text.matches("(.*)(" + begin_space0 + ")(ʾl-)(.*)")){
+			if(debug)System.out.println("5A");
+			text = text.replaceAll("(.*)(" + begin_space0 + ")(ʾl-)(.*)", "$1$2al-$4");
+		}*/
+		
+		
+		//5.B
+		text = rule5B(text);
+		
+		return text;
+	}
+	
+	public static String rule5B(String text){
+		//'t', 'ṯ', 'd', 'ḏ', 'r', 'z', 's', 'š', 'ṣ', 'ḍ', 'ṭ', 'ẓ', 'l', 'n'
+		
+		String regex0 = "(t-t|ṯ-ṯ|d-d|ḏ-ḏ|r-r|z-z|s-s|š-š|ṣ-ṣ|ḍ-ḍ|ṭ-ṭ|ẓ-ẓ|l-l|n-n)";
+		
+		String regex = begin_space + "(a|A)" + regex0 + "(\\S+)(.*)";
+		Pattern pattern = Pattern.compile(regex); 
+		Matcher matcher = pattern.matcher(text);
+		if(matcher.find()){
+			if(debug) System.out.println("5.B");
+			String g1 = matcher.group(1);
+			String g2 = matcher.group(2);
+			String g3 = matcher.group(3);
+			String g4 = matcher.group(4);
+			String g5 = matcher.group(5);
+			
+			text = g1 + g2 + "l-" + g3.charAt(0) + g4 + g5;
+			
+			if(debug) System.out.println(text);
+			matcher = pattern.matcher(text);
+		}
+		
+		return text;
+	}
+	
+	
+	public static String ruleGroup4(String text){
+		
+		String gr_4b = "li al-|li’l-|li-’l-|li-l-";
+		//4.B
+		while(text.matches(begin_space + "(" + gr_4b + ")(.*)")){
+			if(debug) System.out.println("4.B");
+			text = text.replaceAll(begin_space + "(" + gr_4b + ")(.*)", "$1lil-$3");
+		}
+		
+		//4.A
+		while(text.matches(begin_space + "(li )(.*)")){
+			if(debug) System.out.println("4.A");
+			text = text.replaceAll(begin_space + "(li )(.*)", "$1li-$3");
+		}
+		
+		return text;
+	}
+	
+	
+	public static String ruleGroup3(String text){
+		
+		//3.A
+		while(text.matches(begin_space + "(bi|wa|ka)(\\s+)(al-)(.*)")){
+			if(debug) System.out.println("3.A");
+			text = text.replaceAll(begin_space + "(bi|wa|ka)(\\s+)(al-)(.*)", "$1$2-$4$5");
+			//if(debug) System.out.println(text);
+		} 
+		
+		// 3.B
+		while(text.matches(begin_space + "(bi|wa|ka)(\\s+)(.*)")){
+			if(debug)System.out.println("3.B");
+			text = text.replaceAll(begin_space + "(bi|wa|ka)(\\s+)(.*)", "$1$2-$4");
+		} 
+		
+		return text;
+	}
+	
+	public static String ruleGroup2(String text){
+		
+		//2.C: al-XXXXẗ -> al-XXXXh
+		while(text.matches(begin_space + "(al-)(\\S+)ẗ(\\s+|$)(.*)")){
+			if(debug) System.out.println("2.C");
+			//System.out.println(text.replaceAll(begin_space + "(al-)(" + regex_words + ")ẗ(.*)", "$2$3ẗ"));
+			text = text.replaceAll(begin_space + "(al-)(\\S+)ẗ(\\s+|$)(.*)", "$1$2$3h$4$5");
+			if(debug) System.out.println(text);
+		} 
+		
+		//Other XXXXẗ al-XXXXẗ -> XXXXt al-XXXXh
+		if(text.matches("(" + regex_words + ")(ẗ)(\\s*)(al-)(" + regex_words + ")(ẗ)")){
+			if(debug) System.out.println("2.Other");
+			text = text.replaceAll("(" + regex_words + ")(ẗ)(\\s*)(al-)(" + regex_words + ")(ẗ)", "$1t al-$5h");
+			if(debug) System.out.println(text);
+		}
+		
+		//2.D XXXXẗan -> XXXXtan
+		while(text.matches("(.*)(ẗan)(\\s+|$)(.*)")){
+			if(debug) System.out.println("2.D");
+			text = text.replaceAll("(.*)(ẗan)(\\s+|$)(.*)", "$1tan$3$4");	
+			if(debug) System.out.println(text);
+		}
+		
+		//2A
+		text = rule2A(text);
+		
+		//2B
+		text = rule2B(text);
+		
+		return text;
+	}
+	
+	public static String rule2B(String text){
+		
+		String regex = "(.*)(ẗ)(\\s+|(?!al-)\\S*)(.*)";
+		Pattern pattern = Pattern.compile(regex);
+		Matcher matcher = pattern.matcher(text);
+		int count = 0;
+		while(matcher.find() && count < 10){
+			if(debug) System.out.println("2.B");
+			String g1 = matcher.group(1);
+			String g2 = matcher.group(2);
+			String g3 = matcher.group(3);
+			String g4 = matcher.group(4);
+			text = g1 + "h" + g3 + g4;
+			if(debug) System.out.println(text);
+			matcher = pattern.matcher(text);
+			count++;
+		}
+		return text;
+	}
+	
+	public static String rule2A(String text){
+		
+		//2.A
+		//String regex2A = "(.*)(\\s++)(.*)ẗ(\\s++)(al-)(.*)";
+		//String regex2A = "(.*)(\\s++)(?<!(al-))(.*)ẗ(\\s++)(al-)(.*)";
+		String regex = begin_space + "((?!al-)\\S+)(ẗ)(\\s+)(al-)(.*)";
+		Pattern pattern = Pattern.compile(regex);
+		Matcher matcher = pattern.matcher(text);
+				
+		while(matcher.find()){
+			
+			if(debug) System.out.println("2.A");
+			
+			String g1 = matcher.group(1);
+			String g2 = matcher.group(2);
+			String g3 = matcher.group(3);
+			String g4 = matcher.group(4);
+			String g5 = matcher.group(5);
+			String g6 = matcher.group(6);
+			
+			
+			//System.out.println(g1 +" # "+ g2 + " #3 " + g3 + " #4 " + g4 + " #5 " + g5 + " # " + g6);
+			text = g1 + g2 + "t" + g4 + g5 + g6;
+			if(debug) System.out.println(text);
+			
+			matcher = pattern.matcher(text);
+		}
+		
+		return text;
+	}
+	
+	public static String convert(final String text) {
+		
+		if(StringUtils.isEmpty(text))
+			return text;
+		
+		String replacementText = new String(text);
+		
+		replacementText = ruleGroup2(replacementText);
+		replacementText = ruleGroup4(replacementText);
+		replacementText = ruleGroup6(replacementText); //6 must be executed before 5
+		replacementText = ruleGroup5(replacementText);
+		replacementText = ruleGroup3(replacementText);
+		replacementText = ruleGroup7(replacementText);
+		
+		for (String ar : CONVERSIONMAP.keySet()) {
+			String lat = CONVERSIONMAP.get(ar);
+			if (replacementText.contains(ar)) {
+				replacementText = replacementText.replace(ar, lat);
+			}			
+		}
+		
+		return replacementText;
+	}
+	
+	public static void test(String s){
+		System.out.println("--------------\n" + s + " ->\n" + convert(s) + "\n");
+		
+	}
+	
+
+	public static void main(String[] args){
+		
+		
+//		test("li’l-Shirbīnī");
+//		test("li-'l-Shirbīnī");
+//		test("’Abdullāh");
+//		test("’Abd allāh");
+		
+		//test("ʿAli b. ʿAbdullah");
+		//test("ʿAbdullah");
+		//test("Risālaẗ");
+		//test("Risālaẗ fī");
+		//test("Risālaẗ fī qismaẗ");
+		//test("Risālaẗ fī qismaẗ al-handasaẗ al-qabbān bi ṭarīq al-handasaẗ bi ṭarīq wa'l-misāḥaẗ wa'l-ḥisāb bi'l-nisab al-arbaʿ");
+		
+		//test("ʿAli b. ʿAbdullah");
+		//test("Yusuf b. ʿAbdullah");
+		
+		
+		
+		//test("fī-'l-kitāb");
+		
+		//test("Risālaẗ (Nukat) fīmā yaṣiḥḥu min aḥkām al-nujūm = Kitāb al-taḏākīr (Risālaẗ) fī ibṭāl aḥkām al-nujūm");
+		/*
+		//Rules Group 2
+		test("al-risalaẗ");
+		test("risalaẗ al-kabir");
+		test("risalaẗ    al-kabir");
+		test("risalaẗ al-kabiraẗ"); // ?????
+		test("risalaẗ");
+		test("risalaẗan");
+		test("Risālaẗ fī al-ʿamal bi-rubʿ al-muqanṭarāt al-šamālīyaẗ");
+		
+		//Rules Group 3
+		test("bi al-tamām̄");
+		test("wa al-kamāl");
+		test("bi tarīq");
+*/
+		//Group 4
+		test("li al-shirbini");
+		test("li’l-Shirbīnī");
+		test("li-’l-Shirbīnī");
+		test("li tajrīd");
+/*		
+		//Group 5
+		test("aš-šams");
+		test("aḏ-ḏams");
+        test("fi’l-kitāb");
+        test("fi-’l-kitāb");
+		*/
+		
+		
+		//Group 6
+		test("Adham");
+        test("shirbini");
+        test("shirazi");
+
+		/*
+		//Group 7
+		test("’Abd allāh");
+		
+		*/
+		
+		//test("al-Jawharaẗ al-bahiyyaẗ fī maʿrifaẗ al-awqāt al-layliyyaẗ wa-ʾl-nahāriyyaẗ");
+		
+		
+		//test("al-Abyāt fī al-Ṭāliʿ wa al-Ġārib wa al-Mutawassiṭ wa al-Watad");
+		//test("Al-tuḥfaẗ al-šāhiyyaẗ fī al-āḥkām al-falakiyyaẗ");
+		
+		//char ch = 'Á';
+		//System.out.println(String.format("%04x", (int) ch));
+		//test("Al-Futūḥāt al-Wahbīyaẗ fī Ỳarḥ al-Risālaẗ al-Fatḥīyaẗ fī al-ʿamal bi-al-rubʿ al-mujayyab");
+	
+		//test("wa-ʾl-nahār");
+		//test("li-l-ʿIlm");
+		//test("al-Jawharaẗ al-bahiyyaẗ fī maʿrifaẗ al-awqāt fī maʿrifaẗ al-awqāt al-layliyyaẗ wa-ʾl-nahāriyyaẗ");
+		//test("al-Jawharaẗ al-bahiyyaẗ fī al-maʿrifaẗ al-awqāt al-layliyyaẗ wa-ʾl-nahāriyyaẗ");
+		
+		//String text = "fī maʿrifaẗan al-awqāt al-layliyyaẗ wa-ʾl-nahāriyyaẗ";
+		
+		//test("Natījaẗ al-afkār fī aʿmāl al-layl wa-ʾl-nahār");
+		
+		
+		//test("al-ʿAqīda as-silālajīya dfsdssdf");
+		//test("Muḫtaṣaraẗ fī ṣanʿaẗ baʿḍ al-ālāt al-raṣadiyyaẗ wa-ʾl-ʿamal bi-hā");
+		
+		
+		
+		
+		/*
+		String text = "Natījaẗ al-afkār fī aʿmāl al-layl wa-ʾl-nahār";
+		String regex =  begin_space + "((?!al-)\\S+)(ẗ)(\\s+)(al-)(.*)";
+		Pattern pattern = Pattern.compile(regex);
+		Matcher matcher = pattern.matcher(text);
+				
+		while(matcher.find()){
+			
+			if(debug) System.out.println("2.A");
+		}
+		*/
+		/*
+		//String regex = "(.*)(\\s+)((?!al-)\\S+)(ẗ)(\\s+)(al-)(.*)";
+		String regex = "(.*)(ẗ)(\\s+|(?!al-)\\S*)(.*)";
+		Pattern pattern = Pattern.compile(regex);
+		Matcher matcher = pattern.matcher(text);
+		
+		while(matcher.find()){
+			System.out.println(matcher.groupCount());
+			
+			System.out.println(matcher.group(1) + " # " + matcher.group(2) + " # " + matcher.group(3) + " # " + matcher.group(4));
+		}
+		*/
+	}
+	
+}
--- a/src/main/java/org/mpi/openmind/repository/utils/RomanizationLoC.java	Tue Feb 06 15:14:29 2018 +0100
+++ b/src/main/java/org/mpi/openmind/repository/utils/RomanizationLoC.java	Fri Feb 23 21:43:29 2018 +0100
@@ -2,417 +2,248 @@
 
 import java.util.HashMap;
 import java.util.Map;
+import java.util.Map.Entry;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
 import org.apache.commons.lang.StringUtils;
 
 /**
- * See document: romanized_arabic_into_arabic.doc
+ * Convert ISMI transliteration into LOC romanization/transcription.
+ * 
+ * See document: translit-to-romanization-2.0.doc by Chantal Wahbi
  * http://www.loc.gov/catdir/cpso/romanization/arabic.pdf
- * @author jurzua
+ * 
+ * @author cwahbi, jurzua, casties
  *
  */
 public class RomanizationLoC {
 
-	private static boolean debug = false;
-	
-	
-	private static Map<String, String> CONVERSIONMAP = new HashMap<String, String>();
-	// \w = A word character: [a-zA-Z_0-9]
-	private static String T = "ẗ";
-	private static String regex_words = "[\\w|ā|ī|’|ā|š|ṭ|ẗ]+";
-	private static String regex_words_empty = "[\\w|ā|ī|’|ā|š]*";
-	private static String rule_5_a_init = "at-t|aṯ-ṯ|ad-d|aḏ-ḏ|ar-r|az-z|as-s|aš-š|aṣ-ṣ|aḍ-ḍ|aṭ-ṭ|aẓ-ẓ|al-l|an-n";
-	private static String rule_5_a_letters = "t|ṯ|d|ḏ|r|z|s|š|ṣ|ḍ|ṭ|ẓ|l|n";
-	private static String begin_space = "(^|.*\\s)";
-	private static String begin_space0 = "^|.*\\s";
-	
-	
-	static{
-		
-		char[] aaa = {'t', 'ṯ', 'd', 'ḏ', 'r', 'z', 's', 'š', 'ṣ', 'ḍ', 'ṭ', 'ẓ', 'l', 'n'};
-		
-		
-		//rules 1.a to 1.f
-		CONVERSIONMAP.put("\u1E6F", "\u0074\u0068");//ṯ -> th
-		CONVERSIONMAP.put("\u1E6E", "\u0054\u0068");//Ṯ -> Th
-		
-		CONVERSIONMAP.put("\u1E2B", "\u006B\u0068");//ḫ -> kh
-		CONVERSIONMAP.put("\u1E2A", "\u004B\u0068");//Ḫ -> Kh
-		
-		CONVERSIONMAP.put("\u1E0F", "\u0064\u0068");//ḏ -> dh
-		CONVERSIONMAP.put("\u1E0E", "\u0044\u0068");//Ḏ -> Dh
-		
-		CONVERSIONMAP.put("\u0161", "\u0073\u0068");//š -> sh
-		CONVERSIONMAP.put("\u0160", "\u0053\u0068");//Š -> Sh
-		
-		CONVERSIONMAP.put("\u0121", "\u0067\u0068");//ġ -> gh
-		CONVERSIONMAP.put("\u0120", "\u0047\u0068");//Ġ -> Gh
-		
-		CONVERSIONMAP.put("\u1EF3", "\u00E1");//ỳ -> á
-		//CONVERSIONMAP.put("\u1EF2", "\u00C1");//Ỳ -> Á
+	/*
+	 * rule 1
+	 */
+    private static Map<String, String> rule1_map = new HashMap<String, String>();
+    static{
+        //rules 1.a to 1.f
+        rule1_map.put("\u1E6F", "\u0074\u0068");//ṯ -> th
+        rule1_map.put("\u1E6E", "\u0054\u0068");//Ṯ -> Th
+        
+        rule1_map.put("\u1E2B", "\u006B\u0068");//ḫ -> kh
+        rule1_map.put("\u1E2A", "\u004B\u0068");//Ḫ -> Kh
+        
+        rule1_map.put("\u1E0F", "\u0064\u0068");//ḏ -> dh
+        rule1_map.put("\u1E0E", "\u0044\u0068");//Ḏ -> Dh
+        
+        rule1_map.put("\u0161", "\u0073\u0068");//š -> sh
+        rule1_map.put("\u0160", "\u0053\u0068");//Š -> Sh
+        
+        rule1_map.put("\u0121", "\u0067\u0068");//ġ -> gh
+        rule1_map.put("\u0120", "\u0047\u0068");//Ġ -> Gh
+        
+        rule1_map.put("\u1EF3", "\u00E1");//ỳ -> á
+        //CONVERSIONMAP.put("\u1EF2", "\u00C1");//Ỳ -> Á
+    }
+    
+
+	/*
+	 * rule 2
+	 */
+    private static Pattern rule2a_pattern = Pattern.compile("\\b((?!al-)\\S+)ẗ(\\s+)(al-)");
+    private static Pattern rule2b_pattern = Pattern.compile("(\\S+)ẗ(\\s+|(?!al-)\\S*)");
+    private static Pattern rule2c_pattern = Pattern.compile("\\b(al-)(\\S+)ẗ\\b");
+    private static Pattern rule2d_pattern = Pattern.compile("(\\S+)ẗan\\b");
+    
+    public static String ruleGroup2(String text) {
+        // Rewrites the letter ẗ; the rules are applied in the order 2c, 2d, 2a, 2b (not in rule-number order).
+        /*
+         * rule 2c
+         * 
+         * al-Xẗ => al-Xh
+         */
+        Matcher matcher_c = rule2c_pattern.matcher(text);
+        text = matcher_c.replaceAll("$1$2h");
+
+        /*
+         * rule 2d
+         * 
+         * Xẗan => Xtan
+         */
+        Matcher matcher_d = rule2d_pattern.matcher(text);
+        text = matcher_d.replaceAll("$1tan");
+
+        /*
+         * rule 2a
+         * 
+         * [Not beginning with: al-] Xẗ al-X => Xt al-X
+         */
+        Matcher matcher_a = rule2a_pattern.matcher(text);        
+        text = matcher_a.replaceAll("$1t$2$3");
+
+        /*
+         * rule 2b
+         * 
+         * Xẗ [Not followed by: al-X] => Xh
+         */
+        Matcher matcher_b = rule2b_pattern.matcher(text);
+        text = matcher_b.replaceAll("$1h$2");
+
+        return text;
+    }
+
+
+    /*
+     * rule 3
+     */
+    //private static Pattern rule3a_pattern = Pattern.compile(begin_or_space + "(bi|wa|ka)(\\s+)(al-)(\\S+)");
+    private static Pattern rule3b_pattern = Pattern.compile("\\b(bi|wa|ka)(\\s+)(\\S+)");
+
+    public static String ruleGroup3(String text) {
 
-	}
-	
-	public static char APOSTROPHE = 0x27;
-	public static String apostrophesNormalization(String text){
-		String result = text;
-		for(Character apostrophe : OldNormalizerUtils.apostrophes){
-			result = result.replace(apostrophe, APOSTROPHE);
-		}
-		return result;
-	}
-	
-	public static char a = 0x61;
-	public static String aNormalization(String text){
-		String result = text;
-		for(Character item : OldNormalizerUtils.AList){
-			result = result.replace(item, a);
-		}
-		return result;
-	}
-	
-	
-	public static String ruleGroup7(String text){
-		
-		String rule_7_1_allah = "illāh|ullāh|allah|allāh|-Allāh|Allah|ullah|illah";
-		
-		int count = 0;
-		while(text.matches("(.*)(\\S+)(" + rule_7_1_allah + ")(.*)") && count<10){
-			if(debug)System.out.println("ruleGroup7");
-			text = text.replaceAll("(.*)(\\S+)(" + rule_7_1_allah + ")(.*)", "$1$2 Allāh$4");
-			if(debug) System.out.println(text);
-			count++;
-		}
-		
-		return text;
-		
-	}
-	
-	public static String ruleGroup6(String text){
-		
-		String rule_6_consonants = "t|k|d|s|g";
-		
-		if(text.matches("("+regex_words_empty+")("+ rule_6_consonants + ")h("+regex_words_empty+")")){
-			text = text.replaceAll("("+regex_words_empty+")("+ rule_6_consonants + ")h("+regex_words_empty+")", "$1$2’h$3");
-		}
-		
-		return text;
-	}
+        /*
+         * rule 3.A
+         * 
+         * P al-X; P=[ bi; wa; ka] => P-al-X
+         */
+        /* rule 3a is subsumed by 3b
+        Matcher matcher_a = rule3a_pattern.matcher(text);
+        text = matcher_a.replaceAll("$1$2-$4");
+        */
+
+        /*
+         * rule 3.B
+         * 
+         * P X; P=[ bi; wa; ka] => P-X
+         */
+        Matcher matcher_b = rule3b_pattern.matcher(text);
+        text = matcher_b.replaceAll("$1-$3");
+
+        return text;
+    }    
+
+    
+    /*
+     * rule 4
+     */
+    private static Pattern rule4a_pattern = Pattern.compile("\\b(li )(\\S+)");
+    private static Pattern rule4b_pattern = Pattern.compile("\\b(li al-|liʾl-|li-ʾl-|li-l-)(\\S+)");
+ 
+    public static String ruleGroup4(String text){
+        
+        /*
+         * rule 4.B
+         * (applied before 4.A because rule4a_pattern would also match "li al-X")
+         * [li al-X; liʾl-X; li-ʾl-X; li-l-X] => lil-X
+         */
+        Matcher matcher_b = rule4b_pattern.matcher(text);
+        text = matcher_b.replaceAll("lil-$2");
+        
+        /*
+         * rule 4.A
+         * 
+         * li X => li-X
+         */
+        Matcher matcher_a = rule4a_pattern.matcher(text);
+        text = matcher_a.replaceAll("li-$2");
+        
+        return text;
+    }
+    
+
+    /*
+     * rule 5
+     */
+    private static Pattern rule5a_pattern = Pattern.compile("(-?ʾl-)(\\S+)");
+    private static Pattern rule5b_pattern = Pattern.compile("\\b(a|A)(t-(t)|ṯ-(ṯ)|d-(d)|ḏ-(ḏ)|r-(r)|z-(z)|s-(s)|š-(š)|ṣ-(ṣ)|ḍ-(ḍ)|ṭ-(ṭ)|ẓ-(ẓ)|l-(l)|n-(n))(\\S+)");
+
+    public static String ruleGroup5(String text){
+        
+        /*
+         * rule 5a
+         * (the replacement always starts with a space, also when there was no leading hyphen)
+         * [ʾl-X; X-ʾl-X] => " al-X"
+         */
+        Matcher matcher_a = rule5a_pattern.matcher(text);
+        text = matcher_a.replaceAll(" al-$2");
+        
+        /*
+         * rule 5b
+         * 
+         * aY-YX; Y=Sun letters[t;ṯ;d;ḏ;r;z;s;š;ṣ;ḍ;ṭ;ẓ;l;n] => al-YX
+         */
+        Matcher matcher_b = rule5b_pattern.matcher(text);
+        // groups 3-16 hold one sun letter each and are empty except for the alternative that matched; group 17 is the rest of the word
+        text = matcher_b.replaceAll("$1l-$3$4$5$6$7$8$9$10$11$12$13$14$15$16$17");
+       
+        return text;
+    }
+    
+    /*
+	 * rule 6
+	 * 
+	 * λh; λ= [t; k; d; s; g] => λʹh
+	 */
+	private static Pattern rule6_pattern = Pattern.compile("(\\S+)(t|k|d|s|g)h(\\S+)");
+	//private static Pattern rule6_pattern = Pattern.compile("([\\w|ā|ī|’|ā|š]*)(t|k|d|s|g)h([\\w|ā|ī|’|ā|š]*)");
 	
-	public static String ruleGroup5(String text){
-		
-		//wa-ʾl-nahār
-		//wa al-nahār
-		//5A
-		while(text.matches("(.*)(-ʾl-)(.*)")){
-			if(debug)System.out.println("5A(a)");
-			text = text.replaceAll("(.*)(-ʾl-)(.*)", "$1 al-$3");
-			if(debug) System.out.println(text);
-		}
-		
-		/*
-		while(text.matches("(.*)(" + begin_space0 + ")(ʾl-)(.*)")){
-			if(debug)System.out.println("5A");
-			text = text.replaceAll("(.*)(" + begin_space0 + ")(ʾl-)(.*)", "$1$2al-$4");
-		}*/
-		
-		
-		//5.B
-		text = rule5B(text);
-		
-		return text;
-	}
-	
-	public static String rule5B(String text){
-		//'t', 'ṯ', 'd', 'ḏ', 'r', 'z', 's', 'š', 'ṣ', 'ḍ', 'ṭ', 'ẓ', 'l', 'n'
-		
-		String regex0 = "(t-t|ṯ-ṯ|d-d|ḏ-ḏ|r-r|z-z|s-s|š-š|ṣ-ṣ|ḍ-ḍ|ṭ-ṭ|ẓ-ẓ|l-l|n-n)";
-		
-		String regex = begin_space + "(a|A)" + regex0 + "(\\S+)(.*)";
-		Pattern pattern = Pattern.compile(regex); 
-		Matcher matcher = pattern.matcher(text);
-		if(matcher.find()){
-			if(debug) System.out.println("5.B");
-			String g1 = matcher.group(1);
-			String g2 = matcher.group(2);
-			String g3 = matcher.group(3);
-			String g4 = matcher.group(4);
-			String g5 = matcher.group(5);
-			
-			text = g1 + g2 + "l-" + g3.charAt(0) + g4 + g5;
-			
-			if(debug) System.out.println(text);
-			matcher = pattern.matcher(text);
-		}
-		
-		return text;
-	}
-	
+    public static String ruleGroup6(String text) {
+        Matcher matcher = rule6_pattern.matcher(text);
+        text = matcher.replaceAll("$1$2ʹh$3");
+        return text;
+    }	
 	
-	public static String ruleGroup4(String text){
-		
-		String gr_4b = "li al-|li’l-|li-’l-|li-l-";
-		//4.B
-		while(text.matches(begin_space + "(" + gr_4b + ")(.*)")){
-			if(debug) System.out.println("4.B");
-			text = text.replaceAll(begin_space + "(" + gr_4b + ")(.*)", "$1lil-$3");
-		}
-		
-		//4.A
-		while(text.matches(begin_space + "(li )(.*)")){
-			if(debug) System.out.println("4.A");
-			text = text.replaceAll(begin_space + "(li )(.*)", "$1li-$3");
-		}
-		
-		return text;
-	}
-	
-	
-	public static String ruleGroup3(String text){
-		
-		//3.A
-		while(text.matches(begin_space + "(bi|wa|ka)(\\s+)(al-)(.*)")){
-			if(debug) System.out.println("3.A");
-			text = text.replaceAll(begin_space + "(bi|wa|ka)(\\s+)(al-)(.*)", "$1$2-$4$5");
-			//if(debug) System.out.println(text);
-		} 
-		
-		// 3.B
-		while(text.matches(begin_space + "(bi|wa|ka)(\\s+)(.*)")){
-			if(debug)System.out.println("3.B");
-			text = text.replaceAll(begin_space + "(bi|wa|ka)(\\s+)(.*)", "$1$2-$4");
-		} 
-		
-		return text;
-	}
-	
-	public static String ruleGroup2(String text){
-		
-		//2.C: al-XXXXẗ -> al-XXXXh
-		while(text.matches(begin_space + "(al-)(\\S+)ẗ(\\s+|$)(.*)")){
-			if(debug) System.out.println("2.C");
-			//System.out.println(text.replaceAll(begin_space + "(al-)(" + regex_words + ")ẗ(.*)", "$2$3ẗ"));
-			text = text.replaceAll(begin_space + "(al-)(\\S+)ẗ(\\s+|$)(.*)", "$1$2$3h$4$5");
-			if(debug) System.out.println(text);
-		} 
-		
-		//Other XXXXẗ al-XXXXẗ -> XXXXt al-XXXXh
-		if(text.matches("(" + regex_words + ")(ẗ)(\\s*)(al-)(" + regex_words + ")(ẗ)")){
-			if(debug) System.out.println("2.Other");
-			text = text.replaceAll("(" + regex_words + ")(ẗ)(\\s*)(al-)(" + regex_words + ")(ẗ)", "$1t al-$5h");
-			if(debug) System.out.println(text);
-		}
-		
-		//2.D XXXXẗan -> XXXXtan
-		while(text.matches("(.*)(ẗan)(\\s+|$)(.*)")){
-			if(debug) System.out.println("2.D");
-			text = text.replaceAll("(.*)(ẗan)(\\s+|$)(.*)", "$1tan$3$4");	
-			if(debug) System.out.println(text);
-		}
-		
-		//2A
-		text = rule2A(text);
+    /*
+     * rule 7, currently unused (the call in convert() is commented out)
+     * Repeats, at most 10 times, while the whole text matches the pattern built below.
+     * X[illāh; ullāh; allah; allāh; -Allāh; Allah; ullah; illah] => X Allāh
+     * NOTE(review): the exception on the next line is not implemented by the code below; e.g. lillāh matches too.
+     * [ l; b; bism]illāh => [lillāh; billāh; bismillāh] (stay unchanged)
+     */
+    public static String ruleGroup7(String text){
+        
+        String rule_7_1_allah = "illāh|ullāh|allah|allāh|-Allāh|Allah|ullah|illah";
+        
+        int count = 0;
+        while(text.matches("(.*)(\\S+)(" + rule_7_1_allah + ")(.*)") && count<10){
+            text = text.replaceAll("(.*)(\\S+)(" + rule_7_1_allah + ")(.*)", "$1$2 Allāh$4");
+            count++;
+        }
+        
+        return text;
+        
+    }
+    
+
+    /**
+     * Convert (ISMI-) transliterated arabic text into (LoC romanized) transcribed text.
+     * Applies rule groups 2, 4, 5 and 3 (in that order) and finally the rule 1 character map.
+     * @param text the transliterated text; if StringUtils.isEmpty(text) it is returned unchanged
+     * @return the romanized text
+     */
+    public static String convert(final String text) {
+
+        if (StringUtils.isEmpty(text))
+            return text;
+
+        // intended to bring the text into composed Unicode form (NFC) so precomposed letters match
+        String romanizedText = NormalizerUtils.unicodeNormalize(text);
+        // make sure we have standard ayn and hamza
+        romanizedText = NormalizerUtils.aynHamzaNormalizer(romanizedText);
+
+        romanizedText = ruleGroup2(romanizedText);
+        romanizedText = ruleGroup4(romanizedText);
+        romanizedText = ruleGroup5(romanizedText);
+        // rule group 6 disabled: romanizedText = ruleGroup6(romanizedText);
+        romanizedText = ruleGroup3(romanizedText);
+        // rule group 7 disabled: romanizedText = ruleGroup7(romanizedText);
+
+        // rule 1, applied last: rule5b_pattern matches on ṯ, ḏ and š, which this loop replaces with digraphs
+        for (Entry<String, String> tr : rule1_map.entrySet()) {
+            if (romanizedText.contains(tr.getKey())) {
+                romanizedText = romanizedText.replace(tr.getKey(), tr.getValue());
+            }
+        }
+
+        return romanizedText;
+    }
 		
-		//2B
-		text = rule2B(text);
-		
-		return text;
-	}
-	
-	public static String rule2B(String text){
-		
-		String regex = "(.*)(ẗ)(\\s+|(?!al-)\\S*)(.*)";
-		Pattern pattern = Pattern.compile(regex);
-		Matcher matcher = pattern.matcher(text);
-		int count = 0;
-		while(matcher.find() && count < 10){
-			if(debug) System.out.println("2.B");
-			String g1 = matcher.group(1);
-			String g2 = matcher.group(2);
-			String g3 = matcher.group(3);
-			String g4 = matcher.group(4);
-			text = g1 + "h" + g3 + g4;
-			if(debug) System.out.println(text);
-			matcher = pattern.matcher(text);
-			count++;
-		}
-		return text;
-	}
-	
-	public static String rule2A(String text){
-		
-		//2.A
-		//String regex2A = "(.*)(\\s++)(.*)ẗ(\\s++)(al-)(.*)";
-		//String regex2A = "(.*)(\\s++)(?<!(al-))(.*)ẗ(\\s++)(al-)(.*)";
-		String regex = begin_space + "((?!al-)\\S+)(ẗ)(\\s+)(al-)(.*)";
-		Pattern pattern = Pattern.compile(regex);
-		Matcher matcher = pattern.matcher(text);
-				
-		while(matcher.find()){
-			
-			if(debug) System.out.println("2.A");
-			
-			String g1 = matcher.group(1);
-			String g2 = matcher.group(2);
-			String g3 = matcher.group(3);
-			String g4 = matcher.group(4);
-			String g5 = matcher.group(5);
-			String g6 = matcher.group(6);
-			
-			
-			//System.out.println(g1 +" # "+ g2 + " #3 " + g3 + " #4 " + g4 + " #5 " + g5 + " # " + g6);
-			text = g1 + g2 + "t" + g4 + g5 + g6;
-			if(debug) System.out.println(text);
-			
-			matcher = pattern.matcher(text);
-		}
-		
-		return text;
-	}
-	
-	public static String convert(final String text) {
-		
-		if(StringUtils.isEmpty(text))
-			return text;
-		
-		String replacementText = new String(text);
-		
-		replacementText = ruleGroup2(replacementText);
-		replacementText = ruleGroup4(replacementText);
-		replacementText = ruleGroup6(replacementText); //6 must be executed before 5
-		replacementText = ruleGroup5(replacementText);
-		replacementText = ruleGroup3(replacementText);
-		replacementText = ruleGroup7(replacementText);
-		
-		for (String ar : CONVERSIONMAP.keySet()) {
-			String lat = CONVERSIONMAP.get(ar);
-			if (replacementText.contains(ar)) {
-				replacementText = replacementText.replace(ar, lat);
-			}			
-		}
-		
-		return replacementText;
-	}
-	
-	public static void test(String s){
-		System.out.println("--------------\n" + s + " ->\n" + convert(s) + "\n");
-		
-	}
-	
-
-	public static void main(String[] args){
-		
-		
-		//test("li’l-Shirbīnī");
-		//test("li-'l-Shirbīnī");
-		//test("’Abdullāh");
-		//test("’Abd allāh");
-		
-		//test("ʿAli b. ʿAbdullah");
-		//test("ʿAbdullah");
-		//test("Risālaẗ");
-		//test("Risālaẗ fī");
-		//test("Risālaẗ fī qismaẗ");
-		//test("Risālaẗ fī qismaẗ al-handasaẗ al-qabbān bi ṭarīq al-handasaẗ bi ṭarīq wa'l-misāḥaẗ wa'l-ḥisāb bi'l-nisab al-arbaʿ");
-		
-		//test("ʿAli b. ʿAbdullah");
-		//test("Yusuf b. ʿAbdullah");
-		
-		
-		
-		//test("fī-'l-kitāb");
-		
-		//test("Risālaẗ (Nukat) fīmā yaṣiḥḥu min aḥkām al-nujūm = Kitāb al-taḏākīr (Risālaẗ) fī ibṭāl aḥkām al-nujūm");
-		
-		/*
-		//Rules Group 2
-		test("al-risalaẗ");
-		test("risalaẗ al-kabir");
-		test("risalaẗ    al-kabir");
-		test("risalaẗ al-kabiraẗ"); // ?????
-		test("risalaẗ");
-		test("risalaẗan");
-		test("Risālaẗ fī al-ʿamal bi-rubʿ al-muqanṭarāt al-šamālīyaẗ");
-				
-		//Rules Group 3
-		test("bi al-tamām̄");
-		test("wa al-kamāl");
-		test("bi tarīq");
-				
-		//Group 4
-		test("li al-shirbini");
-		test("li’l-Shirbīnī");
-		test("li-’l-Shirbīnī");
-		test("li tajrīd");
-		
-		
-		//Group 5
-		test("aš-šams");
-		test("aḏ-ḏams");
-		
-		
-		
-		//Group 6
-		test("Adham");
-		
-		//Group 7
-		test("’Abd allāh");
-		
-		test("fi’l-kitāb");
-		test("fi-’l-kitāb");
-		*/
-		
-		
-		
-		
-		//test("al-Abyāt fī al-Ṭāliʿ wa al-Ġārib wa al-Mutawassiṭ wa al-Watad");
-		//test("Al-tuḥfaẗ al-šāhiyyaẗ fī al-āḥkām al-falakiyyaẗ");
-		
-		//char ch = 'Á';
-		//System.out.println(String.format("%04x", (int) ch));
-		//test("Al-Futūḥāt al-Wahbīyaẗ fī Ỳarḥ al-Risālaẗ al-Fatḥīyaẗ fī al-ʿamal bi-al-rubʿ al-mujayyab");
-	
-		//test("wa-ʾl-nahār");
-		//test("li-l-ʿIlm");
-		//test("al-Jawharaẗ al-bahiyyaẗ fī maʿrifaẗ al-awqāt fī maʿrifaẗ al-awqāt al-layliyyaẗ wa-ʾl-nahāriyyaẗ");
-		//test("al-Jawharaẗ al-bahiyyaẗ fī al-maʿrifaẗ al-awqāt al-layliyyaẗ wa-ʾl-nahāriyyaẗ");
-		
-		//String text = "fī maʿrifaẗan al-awqāt al-layliyyaẗ wa-ʾl-nahāriyyaẗ";
-		
-		//test("Natījaẗ al-afkār fī aʿmāl al-layl wa-ʾl-nahār");
-		
-		
-		//test("al-ʿAqīda as-silālajīya dfsdssdf");
-		test("Muḫtaṣaraẗ fī ṣanʿaẗ baʿḍ al-ālāt al-raṣadiyyaẗ wa-ʾl-ʿamal bi-hā");
-		
-		
-		
-		
-		/*
-		String text = "Natījaẗ al-afkār fī aʿmāl al-layl wa-ʾl-nahār";
-		String regex =  begin_space + "((?!al-)\\S+)(ẗ)(\\s+)(al-)(.*)";
-		Pattern pattern = Pattern.compile(regex);
-		Matcher matcher = pattern.matcher(text);
-				
-		while(matcher.find()){
-			
-			if(debug) System.out.println("2.A");
-		}
-		*/
-		/*
-		//String regex = "(.*)(\\s+)((?!al-)\\S+)(ẗ)(\\s+)(al-)(.*)";
-		String regex = "(.*)(ẗ)(\\s+|(?!al-)\\S*)(.*)";
-		Pattern pattern = Pattern.compile(regex);
-		Matcher matcher = pattern.matcher(text);
-		
-		while(matcher.find()){
-			System.out.println(matcher.groupCount());
-			
-			System.out.println(matcher.group(1) + " # " + matcher.group(2) + " # " + matcher.group(3) + " # " + matcher.group(4));
-		}
-		*/
-	}
-	
 }