view src/main/java/org/mpi/openmind/repository/utils/NormalizerUtils.java @ 89:8adfa8679991

new implementation of translit-to-romanization rules in RomanizationLoc with test(!).
author Robert Casties <casties@mpiwg-berlin.mpg.de>
date Mon, 26 Feb 2018 14:39:49 +0100
parents ad505ef703ed
children
line wrap: on
line source

package org.mpi.openmind.repository.utils;

import java.text.Normalizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Static helpers for normalizing Arabic and transliterated Arabic text.
 */
public class NormalizerUtils {

	/** Matches the legacy ayn characters U+2018 and U+02BB. */
	private static final Pattern OLD_AYN_PATTERN = Pattern.compile("[\u2018\u02BB]"); // ‘ ʻ

	/** Replacement for every legacy ayn character: U+02BF. */
	private static final String NEW_AYN = "\u02BF"; // ʿ

	/** Matches the legacy hamza characters U+2019 and U+02BC. */
	private static final Pattern OLD_HAMZA_PATTERN = Pattern.compile("[\u2019\u02BC]"); // ’ ʼ

	/** Replacement for every legacy hamza character: U+02BE. */
	private static final String NEW_HAMZA = "\u02BE"; // ʾ

	/**
	 * Returns String normalized for searching arabic or transliterated arabic.
	 *
	 * The input is first passed through {@code ArabicTranslitNormalizer.normalize}
	 * and the result is then passed through {@code ArabicNormalizer.normalize}.
	 *
	 * @param w the text to normalize
	 * @return the text after both normalization steps
	 */
	public static String normalize(String w) {
		String translitNormalized = ArabicTranslitNormalizer.normalize(w);
		return ArabicNormalizer.normalize(translitNormalized);
	}

	/**
	 * Returns String normalized for searching arabic transliteration text.
	 *
	 * Delegates to {@code ArabicTranslitNormalizer.normalize}.
	 *
	 * @see <a href="https://it-dev.mpiwg-berlin.mpg.de/tracs/OpenMind3/wiki/normalize_arabic_translit">normalize_arabic_translit</a>
	 *
	 * @param w the transliterated text to normalize
	 * @return the normalized text
	 */
	public static String normalizeArabicTranslit(String w) {
		return ArabicTranslitNormalizer.normalize(w);
	}

	/**
	 * Returns String normalized for searching arabic.
	 *
	 * The normalization consists in removing vowels and other diacritic marks.
	 * Delegates to {@code ArabicNormalizer.normalize}.
	 *
	 * @param w the arabic text to normalize
	 * @return the normalized text
	 */
	public static String normalizeArabic(String w) {
		return ArabicNormalizer.normalize(w);
	}

	/**
	 * Returns String in Unicode normalization (NFC).
	 *
	 * @param text the text to normalize; may be {@code null}
	 * @return {@code text} itself when it is already in NFC, otherwise its NFC
	 *         form; {@code null} when {@code text} is {@code null}
	 */
	public static String unicodeNormalize(String text) {
		// java.text.Normalizer rejects null with a NullPointerException.
		if (text == null) {
			return null;
		}
		if (Normalizer.isNormalized(text, Normalizer.Form.NFC)) {
			return text;
		}
		return Normalizer.normalize(text, Normalizer.Form.NFC);
	}

	/**
	 * Normalize transliteration forms for ayn and hamza.
	 *
	 * Every U+2018 or U+02BB becomes U+02BF (ayn) and every U+2019 or U+02BC
	 * becomes U+02BE (hamza). All other characters are left unchanged.
	 *
	 * @param text the transliterated text; may be {@code null}
	 * @return the text with unified ayn and hamza characters; {@code null} when
	 *         {@code text} is {@code null}
	 */
	public static String aynHamzaNormalizer(String text) {
		// Pattern.matcher rejects null with a NullPointerException.
		if (text == null) {
			return null;
		}
		String aynUnified = OLD_AYN_PATTERN.matcher(text).replaceAll(NEW_AYN);
		return OLD_HAMZA_PATTERN.matcher(aynUnified).replaceAll(NEW_HAMZA);
	}
}