view src/main/java/org/mpi/openmind/repository/utils/TransliterationUtil.java @ 89:8adfa8679991

new implementation of translit-to-romanization rules in RomanizationLoc with test(!).
author Robert Casties <casties@mpiwg-berlin.mpg.de>
date Mon, 26 Feb 2018 14:39:49 +0100
parents 615d27dce9b3
children
line wrap: on
line source

package org.mpi.openmind.repository.utils;

import java.io.UnsupportedEncodingException;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;

import org.apache.log4j.Logger;

/**
 * Converts Arabic-script text into a Latin-script transliteration.
 *
 * <p>The conversion runs in three stages (see {@link #getTransliteration(String)}):
 * <ol>
 * <li>shadda (U+0651) is resolved by doubling the letter it belongs to,</li>
 * <li>multi-character rules are applied in registration order,</li>
 * <li>remaining characters are mapped one by one.</li>
 * </ol>
 *
 * <p>The rule tables are filled once in the static initialiser and only read
 * afterwards.
 */
public class TransliterationUtil {

	/** Single-character rules: Arabic letter or mark to one Latin character. */
	private static final Map<Character, Character> CHARMAP = new HashMap<Character, Character>();

	/**
	 * Multi-character rules. Iteration order of this map is the order in which
	 * the rules are applied, so a rule must be registered before any shorter
	 * rule whose source text it contains.
	 */
	private static final Map<String, String> CONVERSIONMAP = new LinkedHashMap<String, String>();

	/** Letter, short vowel or tanwin (U+064B..U+0650), shadda. */
	private static final Pattern CONSONANT_VOWEL_SHADDA = Pattern
			.compile("(.)([\u064B-\u0650])\u0651");

	/** Short vowel or tanwin (U+064B..U+0650), letter, shadda. */
	private static final Pattern VOWEL_CONSONANT_SHADDA = Pattern
			.compile("([\u064B-\u0650])(.)\u0651");

	/** Arabic letter (U+0621..U+064A) directly followed by shadda. */
	private static final Pattern LETTER_SHADDA = Pattern
			.compile("([\u0621-\u064A])\u0651");

	static {
		// ---- single characters: Arabic -> Latin ----
		addToCharMap('\u0623', 'a'); // alef with hamza above
		addToCharMap('\u0627', '\u0101'); // alef
		addToCharMap('\u0622', '\u0101'); // alef with madda above
		addToCharMap('\u0625', 'i'); // alef with hamza below
		addToCharMap('\u0628', 'b'); // beh
		addToCharMap('\u062A', 't'); // teh
		addToCharMap('\u062B', '\u1E6F'); // theh
		addToCharMap('\u062C', 'j'); // jeem
		addToCharMap('\u062D', '\u1E25'); // hah
		addToCharMap('\u062E', '\u1E2B'); // khah
		addToCharMap('\u062F', 'd'); // dal
		addToCharMap('\u0630', '\u1E0F'); // thal
		addToCharMap('\u0631', 'r'); // reh
		addToCharMap('\u0632', 'z'); // zain
		addToCharMap('\u0633', 's'); // seen
		addToCharMap('\u0634', '\u0161'); // sheen
		addToCharMap('\u0635', '\u1E63'); // sad
		addToCharMap('\u0636', '\u1E0D'); // dad
		addToCharMap('\u0637', '\u1E6D'); // tah
		addToCharMap('\u0638', '\u1E93'); // zah
		addToCharMap('\u0639', '\u02BF'); // ain
		addToCharMap('\u063A', '\u0121'); // ghain
		addToCharMap('\u0641', 'f'); // feh
		addToCharMap('\u0642', 'q'); // qaf
		addToCharMap('\u0643', 'k'); // kaf
		addToCharMap('\u0644', 'l'); // lam
		addToCharMap('\u0645', 'm'); // meem
		addToCharMap('\u0646', 'n'); // noon
		addToCharMap('\u0647', 'h'); // heh
		addToCharMap('\u0648', 'w'); // waw
		addToCharMap('\u064A', 'y'); // yeh
		addToCharMap('\u0621', '\u02BE'); // hamza
		addToCharMap('\u0626', '\u02BE'); // yeh with hamza above

		// Teh marbuta used to map to plain 'h'; changed to U+1E97 on
		// Chantal's request.
		addToCharMap('\u0629', '\u1E97');

		addToCharMap('\u064E', 'a'); // fatha
		addToCharMap('\u0650', 'i'); // kasra
		addToCharMap('\u064F', 'u'); // damma

		// Chantal's recommendations:
		addToCharMap('\u0649', '\u1EF3'); // alef maksura
		addToCharMap('\u0624', '\u02BE'); // waw with hamza above
		addToCharMap('\u0670', '\u0101'); // superscript alef

		// ---- multi-character rules, in application order ----

		// Teh marbuta before the article becomes 't'. This must be registered
		// before the plain article rule below, which would otherwise consume
		// the " al" part and leave this rule without anything to match.
		addToStringList("\u0629\u0020\u0627\u0644", "t al-");
		// Word-initial article (space, alef, lam).
		addToStringList("\u0020\u0627\u0644", " al-");

		// Long vowels written with an explicit sukun (Chantal).
		addToStringList("\u0623\u064F\u0648\u0652", "\u016B");
		addToStringList("\u064F\u0648\u0652", "\u016B");
		addToStringList("\u0650\u064A\u0652", "\u012B");

		addToStringList("\u064F\u0648", "\u016B");
		// Kasra + yeh without sukun is intentionally NOT mapped to U+012B:
		// it is not in the transliteration table (2014.04).
		addToStringList("\u064E\u064A", "ay");
		addToStringList("\u064E\u0648", "aw");
		addToStringList("\u0652\u064A", "y");
		addToStringList("\u0652\u0648", "w");

		addToStringList("\u0623\u064E", "a"); // hamza-alef + fatha -> a
		addToStringList("\u0625\u0650", "i"); // alef hamza below + kasra -> i
		addToStringList("\u064E\u0649", "\u1EF3"); // fatha + alef maksura
		addToStringList("\u0623\u064F", "u"); // hamza-alef + damma -> u
		addToStringList("\u064E\u0627", "\u0101"); // fatha + alef -> a-macron
		addToStringList("\u064B", "an"); // fathatan
		addToStringList("\u064D", "in"); // kasratan
		addToStringList("\u064C", "un"); // dammatan
		// Sukun carries no sound; it is dropped once the rules that depend
		// on it have run.
		addToStringList("\u0652", "");
	}

	/** Registers a single-character rule. */
	private static void addToCharMap(char arabCh, char latCh) {
		CHARMAP.put(arabCh, latCh);
	}

	/**
	 * Registers a multi-character rule. Rules are applied in the order in
	 * which they are first registered.
	 */
	private static void addToStringList(String arabStr, String latStr) {
		CONVERSIONMAP.put(arabStr, latStr);
	}

	/**
	 * Transliterates Arabic-script text into Latin characters.
	 *
	 * <p>Characters for which no rule exists are copied through unchanged.
	 *
	 * @param text
	 *            the text to transliterate; may be {@code null}
	 * @return the transliterated text, or {@code null} if {@code text} is
	 *         {@code null}
	 */
	public static String getTransliteration(final String text) {
		if (text == null) {
			return null;
		}

		// Stage 1: resolve shadda while the text is still entirely Arabic.
		String working = duplication(text);

		// Stage 2: multi-character rules, in registration order.
		for (Map.Entry<String, String> rule : CONVERSIONMAP.entrySet()) {
			working = working.replace(rule.getKey(), rule.getValue());
		}

		// Stage 3: single characters. No Latin replacement is itself a key of
		// CHARMAP, so one pass gives the same result as replacing key by key.
		StringBuilder out = new StringBuilder(working.length());
		for (int i = 0; i < working.length(); i++) {
			char ch = working.charAt(i);
			Character latin = CHARMAP.get(ch);
			out.append(latin != null ? latin.charValue() : ch);
		}
		return out.toString();
	}

	/**
	 * Replaces every shadda (U+0651) by a second copy of the letter it marks.
	 *
	 * <p>Three layouts are recognised, in this order:
	 * <ul>
	 * <li>letter, vowel mark, shadda: the letter is doubled and the vowel
	 * mark kept after it;</li>
	 * <li>vowel mark, letter, shadda: the letter is doubled and the vowel
	 * mark kept in front of it;</li>
	 * <li>Arabic letter, shadda, with no adjacent vowel mark handled above:
	 * the letter is doubled.</li>
	 * </ul>
	 * The vowel mark is re-emitted through a back-reference, so the mark that
	 * was matched is always the mark that is written.
	 *
	 * @param text
	 *            the text to process, not {@code null}
	 * @return the text with the recognised shadda sequences expanded
	 */
	private static String duplication(String text) {
		String expanded = CONSONANT_VOWEL_SHADDA.matcher(text).replaceAll("$1$1$2");
		expanded = VOWEL_CONSONANT_SHADDA.matcher(expanded).replaceAll("$1$2$2");
		return LETTER_SHADDA.matcher(expanded).replaceAll("$1$1");
	}

	/**
	 * Prints the four-digit lower-case hexadecimal value of every UTF-16 code
	 * unit of {@code s} to standard output, one per line.
	 *
	 * @param s
	 *            the string to dump, not {@code null}
	 */
	public static void printHexCharacters(String s) {
		for (char ch : s.toCharArray()) {
			System.out.println(String.format("%04x", (int) ch));
		}
	}

	/**
	 * Debug helper: dumps {@code term}, transliterates it, then prints and
	 * dumps the result.
	 *
	 * @param term
	 *            the text to transliterate, not {@code null}
	 * @return the transliteration of {@code term}
	 */
	private static String test(String term) {
		printHexCharacters(term);
		String s = getTransliteration(term);
		System.out.println(s);
		printHexCharacters(s);
		return s;
	}

	/**
	 * Returns a copy of {@code str} in which the character at
	 * {@code position} is replaced by {@code ch}.
	 *
	 * @param position
	 *            zero-based index of the character to replace
	 * @param ch
	 *            the replacement character
	 * @param str
	 *            the source string, not {@code null}
	 * @return the modified copy
	 * @throws ArrayIndexOutOfBoundsException
	 *             if {@code position} is not a valid index of {@code str}
	 */
	public static String changeCharInPosition(int position, char ch, String str) {
		char[] charArray = str.toCharArray();
		charArray[position] = ch;
		return new String(charArray);
	}

	/**
	 * Manual check: transliterates each command-line argument and prints the
	 * input code units, the result and the result code units.
	 *
	 * @param args
	 *            the texts to transliterate
	 * @throws UnsupportedEncodingException
	 *             declared for signature compatibility; not thrown by this
	 *             implementation
	 */
	public static void main(String[] args) throws UnsupportedEncodingException {
		for (String arg : args) {
			test(arg);
		}
	}
}