Mercurial > hg > openmind
view src/main/java/org/mpi/openmind/repository/utils/ArabicTranslitNormalizer.java @ 89:8adfa8679991
new implementation of translit-to-romanization rules in RomanizationLoc with test(!).
author | Robert Casties <casties@mpiwg-berlin.mpg.de> |
---|---|
date | Mon, 26 Feb 2018 14:39:49 +0100 |
parents | aeb29e362a67 |
children | e8cee8cf2f52 |
line wrap: on
line source
package org.mpi.openmind.repository.utils;

import java.text.Normalizer;
import java.text.Normalizer.Form;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringUtils;

/**
 * Normalizes Arabic transliterations by applying a fixed sequence of
 * replacement rules (apostrophes, two-letter combinations, word parts,
 * letters with diacritics, remaining diacritics, single letters).
 *
 * @author casties
 *
 * @see <a href="https://it-dev.mpiwg-berlin.mpg.de/tracs/OpenMind3/wiki/normalize_arabic_translit">normalize_arabic_translit (OpenMind3 wiki)</a>
 */
public class ArabicTranslitNormalizer {

    /** Rules removing all apostrophe-like characters. */
    protected static List<ReplacementPattern> apostrophePatterns = new ArrayList<ReplacementPattern>();
    static {
        // remove ', ` (U+0060), ʾ (U+02BE), ʿ (U+02BF), ‘ (U+2018), ’ (U+2019)
        apostrophePatterns.add(new ReplacementPattern("",
                Pattern.compile("'|\u0060|\u02BE|\u02BF|\u2018|\u2019")));
    }

    /** Rules collapsing two-letter combinations into a single letter. */
    protected static List<ReplacementPattern> twoletterPatterns = new ArrayList<ReplacementPattern>();
    static {
        twoletterPatterns.add(new ReplacementPattern("j", Pattern.compile("ch")));
        twoletterPatterns.add(new ReplacementPattern("j", Pattern.compile("dj")));
        twoletterPatterns.add(new ReplacementPattern("t", Pattern.compile("th")));
        twoletterPatterns.add(new ReplacementPattern("h", Pattern.compile("kh")));
        twoletterPatterns.add(new ReplacementPattern("d", Pattern.compile("dh")));
        twoletterPatterns.add(new ReplacementPattern("s", Pattern.compile("sh")));
        twoletterPatterns.add(new ReplacementPattern("g", Pattern.compile("gh")));
    }

    /** Rules rewriting word endings and the "abd " prefix. */
    protected static List<ReplacementPattern> wordpartPatterns = new ArrayList<ReplacementPattern>();
    static {
        // "aẗ" (a + U+1E97), "at" and "ah" at the end of a word -> "a".
        // UNICODE_CHARACTER_CLASS makes \b treat non-ASCII letters (ẗ, ā, ī, ...)
        // as word characters on every JDK. Without the flag, newer JDKs (19+)
        // only recognise ASCII word characters for \b, so "at" followed by a
        // non-ASCII letter would wrongly count as a word end and "aẗ" at the
        // end of a word would never match.
        wordpartPatterns.add(new ReplacementPattern("a",
                Pattern.compile("a\u1E97\\b|at\\b|ah\\b", Pattern.UNICODE_CHARACTER_CLASS)));
        // "abd" + space -> "abd" (the leading apostrophe has already been removed)
        wordpartPatterns.add(new ReplacementPattern("abd", Pattern.compile("abd ")));
    }

    /** Rules for precomposed letters that must not simply lose their diacritic. */
    protected static List<ReplacementPattern> letterdiacritPatterns = new ArrayList<ReplacementPattern>();
    static {
        // ỳ (U+1EF3) -> a
        letterdiacritPatterns.add(new ReplacementPattern("a", Pattern.compile("\u1EF3")));
    }

    /** Single-letter rules applied after all diacritics have been removed. */
    protected static List<ReplacementPattern> letterPatterns = new ArrayList<ReplacementPattern>();
    static {
        letterPatterns.add(new ReplacementPattern("j", Pattern.compile("g|c")));
    }

    /** Matches runs of characters from the Combining Diacritical Marks block. */
    protected static Pattern diacriticsPattern = Pattern.compile("\\p{InCombiningDiacriticalMarks}+");

    /**
     * Returns String normalized according to arabic transliteration rules.
     *
     * The steps are applied in this order: lowercase, compose to NFC, remove
     * apostrophes, replace two-letter combinations, replace word parts, replace
     * letters with diacritics, decompose to NFD and remove combining diacritical
     * marks, replace single letters.
     *
     * @see <a href="https://it-dev.mpiwg-berlin.mpg.de/tracs/OpenMind3/wiki/normalize_arabic_translit">normalize_arabic_translit (OpenMind3 wiki)</a>
     *
     * @param text the text to normalize; may be null or empty
     * @return the normalized text, or {@code text} itself if it is null or empty
     */
    public static String normalize(String text) {
        if (StringUtils.isEmpty(text)) {
            return text;
        }

        // Locale.ROOT keeps lowercasing independent of the JVM default locale
        // (e.g. a Turkish default locale would turn "I" into dotless "ı").
        String result = text.toLowerCase(Locale.ROOT);

        // The ẗ and ỳ rules are written against precomposed characters, so
        // compose first; otherwise decomposed input (base letter + combining
        // mark) would bypass them and only lose its diacritic further down.
        result = Normalizer.normalize(result, Form.NFC);

        result = applyAll(apostrophePatterns, result);
        result = applyAll(twoletterPatterns, result);
        result = applyAll(wordpartPatterns, result);
        result = applyAll(letterdiacritPatterns, result);

        // remove diacritics by de-composing and removing diacritical marks
        result = Normalizer.normalize(result, Form.NFD);
        result = diacriticsPattern.matcher(result).replaceAll("");

        // single letters last, so that letters freed of their diacritics are covered too
        result = applyAll(letterPatterns, result);

        return result;
    }

    /**
     * Applies every rule of the given list, in list order, to the text.
     *
     * @param patterns the replacement rules to apply
     * @param text the text to transform
     * @return the text after all replacements
     */
    private static String applyAll(List<ReplacementPattern> patterns, String text) {
        String result = text;
        for (ReplacementPattern entry : patterns) {
            result = entry.getPattern().matcher(result).replaceAll(entry.getReplacement());
        }
        return result;
    }
}