Mercurial > hg > openmind
view src/main/java/org/mpi/openmind/repository/utils/TransliterationUtil.java @ 89:8adfa8679991
new implementation of translit-to-romanization rules in RomanizationLoc with test(!).
author | Robert Casties <casties@mpiwg-berlin.mpg.de> |
---|---|
date | Mon, 26 Feb 2018 14:39:49 +0100 |
parents | 615d27dce9b3 |
children |
line wrap: on
line source
package org.mpi.openmind.repository.utils; import java.io.UnsupportedEncodingException; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import org.apache.log4j.Logger; public class TransliterationUtil { private static Logger logger = Logger.getLogger(TransliterationUtil.class); private static Map<Character, Character> CHARMAP = new HashMap<Character, Character>(); private static Map<String, String> CONVERSIONMAP = new HashMap<String, String>(); private static List<String> CONVERSIONLIST = new ArrayList<String>(); static { // Arabic / Latin addToCharMap('\u0623', '\u0061'); addToCharMap('\u0627', '\u0101'); addToCharMap('\u0622', '\u0101'); addToCharMap('\u0625', '\u0069'); addToCharMap('\u0628', '\u0062'); addToCharMap('\u062A', '\u0074'); addToCharMap('\u062B', '\u1E6F'); addToCharMap('\u062C', '\u006A'); addToCharMap('\u062D', '\u1E25'); addToCharMap('\u062E', '\u1E2B'); addToCharMap('\u062F', '\u0064'); addToCharMap('\u0630', '\u1E0F'); addToCharMap('\u0631', '\u0072'); addToCharMap('\u0632', '\u007A'); addToCharMap('\u0633', '\u0073'); addToCharMap('\u0634', '\u0161'); addToCharMap('\u0635', '\u1E63'); addToCharMap('\u0636', '\u1E0D'); addToCharMap('\u0637', '\u1E6D'); addToCharMap('\u0638', '\u1E93'); addToCharMap('\u0639', '\u02BF'); addToCharMap('\u063A', '\u0121'); addToCharMap('\u0641', '\u0066'); addToCharMap('\u0642', '\u0071'); addToCharMap('\u0643', '\u006B'); addToCharMap('\u0644', '\u006C'); addToCharMap('\u0645', '\u006D'); addToCharMap('\u0646', '\u006E'); addToCharMap('\u0647', '\u0068'); addToCharMap('\u0648', '\u0077'); addToCharMap('\u064A', '\u0079'); addToCharMap('\u0621', '\u02BE'); addToCharMap('\u0626', '\u02BE'); //Chantal said replace this //addToCharMap('\u0629', '\u0068'); addToCharMap('\u0629', '\u1E97'); addToCharMap('\u064E', '\u0061'); addToCharMap('\u0650', '\u0069'); // addToCharMap('\u0652',''); addToCharMap('\u064F', '\u0075'); //chantal start addToStringList("\u0020\u0627\u0644", "\u0020\u0061\u006C\u002D"); addToStringList("\u0629\u0020\u0627\u0644","\u0074\u0020\u0061\u006C\u002D"); addToStringList("\u0623\u064F\u0648\u0652", "\u016B"); addToStringList("\u064F\u0648\u0652", "\u016B"); addToStringList("\u0650\u064A\u0652", "\u012B"); //chantal end addToStringList("\u064F\u0648", "\u016B"); //addToStringList("\u0650\u064A", "\u012B"); no in the table table 2014.04 addToStringList("\u064E\u064A", "\u0061\u0079"); addToStringList("\u064E\u0648", "\u0061\u0077"); addToStringList("\u0652\u064A", "\u0079"); addToStringList("\u0652\u0648", "\u0077"); addToStringList("\u0020\u0627\u0644", "\u0020\u0061\u006C\u002D"); addToStringList("\u0629\u0020\u0627\u0644", "\u0074\u0020\u0061\u006C\u002D"); /* * addToCharMap('\u1E0d','\u0064');//ḍ -> d * addToCharMap('\u1E25','\u0068');//ḥ -> h * addToCharMap('\u012B','\u0069');//ī -> i * addToCharMap('\u1E63','\u0073');//ṣ -> s * addToCharMap('\u1E6D','\u0074');//ṭ -> t * addToCharMap('\u016b','\u0075');//ū -> u * addToCharMap('\u0101','\u0061');//ā -> a */ // Chantal Recommendations: addToCharMap('\u0649', '\u1EF3');// ى->ỳ addToCharMap('\u0624', '\u02BE');// ؤ->ʾ addToCharMap('\u0670', '\u0101'); // َٰ -> ā addToStringList("\u0623\u064E", "\u0061"); addToStringList("\u0625\u0650", "\u0069"); addToStringList("\u064E\u0649", "\u1EF3"); addToStringList("\u0623\u064F", "\u0075"); // أُ -> i addToStringList("\u064E\u0627", "\u0101"); // َا -> ā addToStringList("\u064B", "\u0061\u006E"); // ًَٰ-> an addToStringList("\u064D", "\u0069\u006E"); // ٍَٰ-> in addToStringList("\u064C", "\u0075\u006E"); // ٍَٰ-> un addToStringList("\u0652", ""); } private static String stringToUnicode(String s){ StringBuilder sb = new StringBuilder(); for(char ch : s.toCharArray()){ sb.append(toUnicode(ch)); } return sb.toString(); } private static String toUnicode(char ch) { return String.format("\\u%04x", (int) ch); } private static void addToCharMap(char arabCh, char latCh) { CHARMAP.put(arabCh, latCh); } private static void addToStringList(String arabStr, String latStr) { CONVERSIONMAP.put(arabStr, latStr); CONVERSIONLIST.add(arabStr); } public static String getTransliteration(final String text) { String replacementText = new String(text); replacementText = duplication(replacementText); /* * for (int i = 0; i < replacementText.length(); i++) { * replacementText.charAt(i); replacementText.codePointAt(i); } */ //System.out.println("^^^^^^^^^^^^^^^^"); for (String ar : CONVERSIONLIST) { String lat = CONVERSIONMAP.get(ar); /*System.out.println( stringToUnicode(ar) + "\n" + stringToUnicode(lat)); */ if (replacementText.contains(ar)) { //System.out.println("ok"); replacementText = replacementText.replace(ar, lat); } //System.out.println("--------------"); } //System.out.println("^^^^^^^^^^^"); for (char ar : CHARMAP.keySet()) { char lat = CHARMAP.get(ar); if (replacementText.indexOf(ar) > -1) { replacementText = replacementText.replace(ar, lat); } } //logger.debug("Transliteration from: " + text + ", to: " // + replacementText); return replacementText; } private static final Map<String, String> duplicationConsonantVowelMap; private static final Map<String, String> duplicationVowelConsonantMap; static { duplicationConsonantVowelMap = new HashMap<String, String>(); duplicationConsonantVowelMap.put("(.)\u0650\u0651", "\u0650"); duplicationConsonantVowelMap.put("(.)\u064B\u0651", "\u064B"); duplicationConsonantVowelMap.put("(.)\u064C\u0651", "\u064B"); duplicationConsonantVowelMap.put("(.)\u064D\u0651", "\u064D"); duplicationConsonantVowelMap.put("(.)\u064E\u0651", "\u064E"); duplicationConsonantVowelMap.put("(.)\u064F\u0651", "\u064F"); duplicationVowelConsonantMap = new HashMap<String, String>(); duplicationVowelConsonantMap.put("\u0650(.)\u0651", "\u0650"); duplicationVowelConsonantMap.put("\u064B(.)\u0651", "\u064B"); duplicationVowelConsonantMap.put("\u064C(.)\u0651", "\u064B"); duplicationVowelConsonantMap.put("\u064D(.)\u0651", "\u064D"); duplicationVowelConsonantMap.put("\u064E(.)\u0651", "\u064E"); duplicationVowelConsonantMap.put("\u064F(.)\u0651", "\u064F"); } private static String duplication(String text) { // for(String duplicationTerm : duplicationMap){ // text = text.replaceAll("(.)" + duplicationTerm, "$1$1"); // } // text = text.replaceAll("(.)(.)\u0651", "$1$1$2"); for (String key : duplicationConsonantVowelMap.keySet()) { text = text.replaceAll(key, "$1$1" + duplicationConsonantVowelMap.get(key)); } for (String key : duplicationVowelConsonantMap.keySet()) { text = text.replaceAll(key, duplicationVowelConsonantMap.get(key) + "$1$1"); } return text; } public static void printHexCharacters(String s) { for (char ch : s.toCharArray()) { String hex = String.format("%04x", (int) ch); System.out.println(hex); } } private static String test(String term) { TransliterationUtil.printHexCharacters(term); String s = TransliterationUtil.getTransliteration(term); System.out.println(s); TransliterationUtil.printHexCharacters(s); return s; } public static String changeCharInPosition(int position, char ch, String str) { char[] charArray = str.toCharArray(); charArray[position] = ch; return new String(charArray); } /** * @param args * @throws UnsupportedEncodingException */ public static void main(String[] args) throws UnsupportedEncodingException { // TransliterationUtil.getTransliteration("ṣīṭūāḥḍ"); // System.out.println(TransliterationUtil.getTransliteration("رسالة اسطرلاب")); // System.out.println(TransliterationUtil.getTransliteration("\u0101"+"رسالة اسطرلاب")); // System.out.println(TransliterationUtil.getTransliteration("أُصُوْل")); // TransliterationUtil.printHexCharacters("أُصُوْل"); // System.out.println(TransliterationUtil.duplication("abcd11")); // System.out.println(TransliterationUtil.getTransliteration("حُجَّة")); // TransliterationUtil.printHexCharacters("حُجَّة"); // System.out.println(TransliterationUtil.getTransliteration("مُحَمَّد")); // TransliterationUtil.printHexCharacters("مُحَمَّد"); // TransliterationUtil.test("مُسَمَّة"); // TransliterationUtil.test("حُبّ"); // TransliterationUtil.test("حُبٌّ"); // TransliterationUtil.test("الرَشِيْد"); /* * String s = TransliterationUtil.test("مَكْتُوب"); * TransliterationUtil.printHexCharacters(s); * System.out.println("******"); * TransliterationUtil.printHexCharacters("makْtūb"); */ //TransliterationUtil.test("أُوْلَى"); //TransliterationUtil.test("قِيَامَة"); //TransliterationUtil.printHexCharacters("قِيَامَة"); System.out.println("\u7831"); } }