view src/main/java/org/mpi/openmind/repository/utils/ArabicNormalizer.java @ 89:8adfa8679991

new implementation of translit-to-romanization rules in RomanizationLoc with test(!).
author Robert Casties <casties@mpiwg-berlin.mpg.de>
date Mon, 26 Feb 2018 14:39:49 +0100
parents aeb29e362a67
children
line wrap: on
line source

package org.mpi.openmind.repository.utils;

import java.text.Normalizer;
import java.text.Normalizer.Form;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringUtils;

/**
 * @author casties
 * 
 */
public class ArabicNormalizer {

	/** match all of Unicode mark category */
    protected static Pattern markPattern = Pattern.compile("\\p{M}+");
    
    /**
     * Returns String of normalized arabic.
     * 
     * Normalization means de-vowelisation using Unicode tables.
     * Removes all Unicode mark characters from decomposed form.
     * 
     * @param text
     * @return
     */
    public static String normalize(String text) {
        if (StringUtils.isEmpty(text)) {
            return text;
        }
        
        // remove vowels by de-composing and removing diacritical marks
        text = Normalizer.normalize(text, Form.NFKD);
        text = markPattern.matcher(text).replaceAll("");
        
        return text;
    }
}