Mercurial > hg > openmind
view src/main/java/org/mpi/openmind/repository/utils/ArabicNormalizer.java @ 89:8adfa8679991
new implementation of translit-to-romanization rules in RomanizationLoc with test(!).
author | Robert Casties <casties@mpiwg-berlin.mpg.de> |
---|---|
date | Mon, 26 Feb 2018 14:39:49 +0100 |
parents | aeb29e362a67 |
children |
line wrap: on
line source
package org.mpi.openmind.repository.utils;

import java.text.Normalizer;
import java.text.Normalizer.Form;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringUtils;

/**
 * Static helper that strips diacritical marks from (Arabic) text.
 *
 * @author casties
 */
public class ArabicNormalizer {

    /** Matches one or more consecutive characters of the Unicode mark category (M). */
    protected static Pattern markPattern = Pattern.compile("\\p{M}+");

    /**
     * Returns the given text with all Unicode mark characters removed.
     *
     * <p>The text is first converted to its compatibility-decomposed form
     * (NFKD), so that marks attached to base characters become separate
     * code points. Every run of mark characters is then deleted. The result
     * is not re-composed afterwards, so it stays in decomposed form.</p>
     *
     * @param text the text to normalize
     * @return the text without mark characters; the argument itself is
     *         returned unchanged when {@code StringUtils.isEmpty(text)}
     *         reports it as empty
     */
    public static String normalize(String text) {
        if (!StringUtils.isEmpty(text)) {
            // split base characters from their diacritics ...
            final String decomposed = Normalizer.normalize(text, Form.NFKD);
            // ... and drop the diacritics (this removes the vowel marks)
            return markPattern.matcher(decomposed).replaceAll("");
        }
        // nothing to normalize
        return text;
    }
}