view src/main/java/org/mpi/openmind/repository/utils/ArabicNormalizerUtils.java @ 71:aeb29e362a67

New ArabicNormalizer. NormalizerUtils.normalize() now does both translit and arabic normalization. 108: arabic normalization is not applied Task-Url: https://it-dev.mpiwg-berlin.mpg.de/tracs/ismi/ticket/108
author casties
date Thu, 02 Feb 2017 17:58:52 +0100
parents 615d27dce9b3
children
line wrap: on
line source

package org.mpi.openmind.repository.utils;

import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.lang.StringUtils;

/**
 * Static helpers for a coarse normalization of Arabic-script strings.
 *
 * <p>{@link #normalize(String)} collapses groups of similar letters into a
 * single class token (the digits "1" to "8") and removes quotation marks
 * and apostrophe-like characters, so that spelling variants of a word end
 * up with the same normalized form.
 */
public class ArabicNormalizerUtils {

	/**
	 * Code points that are meant to be ignored: Arabic vowel marks and
	 * related signs (U+064B-U+0652, U+0670, U+0671, U+06E4), a range of
	 * private-use code points (U+E818-U+E836) and Arabic presentation
	 * forms (U+FB50, U+FB51, U+FC5E-U+FC62, U+FE70-U+FE7E).
	 *
	 * <p>NOTE(review): nothing in this class reads this list;
	 * {@link #normalize(String)} does not strip these characters.
	 * Presumably it is consumed by other classes - confirm before changing.
	 * U+06E4 is listed twice, which looks like a typo for another code
	 * point - confirm.
	 */
	public static List<Character> ignoreList;

	static {
		Character[] array = {
				0x064B, 0x064C, 0x064D, 0x064E,
				0x064F, 0x0650, 0x0651, 0x0652,
				0x0670, 0x0671, 0x06E4, 0x06E4,
				0xE818, 0xE820, 0xE821, 0xE822,
				0xE823, 0xE824, 0xE825, 0xE826,
				0xE827, 0xE828, 0xE829, 0xE82A,
				0xE82B, 0xE82C, 0xE82D, 0xE832,
				0xE833, 0xE834, 0xE835, 0xE836,
				0xFB50, 0xFB51, 0xFC5E, 0xFC5F,
				0xFC60, 0xFC61, 0xFC62, 0xFE70,
				0xFE72, 0xFE74, 0xFE76, 0xFE78,
				0xFE7A, 0xFE7C, 0xFE7E
		};
		// Fixed-size list view backed by the array above.
		ignoreList = Arrays.asList(array);
	}

	/**
	 * Replacement table used by {@link #normalize(String)}: every character
	 * in a value array is replaced by the map key. The empty key therefore
	 * means "remove the character".
	 *
	 * <p>Several arrays repeat a code point; the repeats have no effect on
	 * the result of {@link #normalize(String)}.
	 */
	public static Map<String, Character[]> wildCardCharMap = new HashMap<String, Character[]>();

	static {
		// Quotation marks, primes and apostrophe-like modifier letters: removed.
		Character[] apostrophes = {
				0x22, 0x60, 0x2032, 0x2018, 0x2019, 0x201B, 0x27, 0x2BB, 0x2BC, 0x2BD, 0x2CB, 0x2BE, 0x2BF
		};
		wildCardCharMap.put("", apostrophes);

		// Alef variants (with madda, with hamza above/below, plain).
		Character[] array1 = {
				0x0622, 0x0623, 0x0625, 0x0627 };
		wildCardCharMap.put("1", array1);

		// Yeh with hamza, alef maksura, yeh.
		Character[] array2 = {
				0x0626, 0x0649, 0x064A, 0x0649, 0x064A };
		wildCardCharMap.put("2", array2);

		// Waw, waw with hamza.
		Character[] array3 = {
				0x0648, 0x0624, 0x0648 };
		wildCardCharMap.put("3", array3);

		// Peh, beh.
		Character[] array4 = {
				0x067E, 0x0628, 0x0628 };
		wildCardCharMap.put("4", array4);

		// Tcheh, jeem.
		Character[] array5 = {
				0x0686, 0x062C, 0x062C };
		wildCardCharMap.put("5", array5);

		// Jeh, zain.
		Character[] array6 = {
				0x0698, 0x0632, 0x0632 };
		wildCardCharMap.put("6", array6);

		// Veh, feh.
		Character[] array7 = {
				0x06A4, 0x0641, 0x0641 };
		wildCardCharMap.put("7", array7);

		// Kaf, keheh, gaf.
		Character[] array8 = {
				0x0643, 0x06A9, 0x06AF, 0x0643 };
		wildCardCharMap.put("8", array8);
	}

	/**
	 * Normalizes a string using {@link #wildCardCharMap}: each character
	 * listed in the table is replaced by the key of its group, and all other
	 * characters are left untouched. The characters in {@link #ignoreList}
	 * are not removed by this method.
	 *
	 * @param w the string to normalize; may be {@code null} or empty
	 * @return the normalized string, or {@code w} itself when it is
	 *         {@code null} or empty
	 */
	public static String normalize(String w) {
		// Same contract as the former StringUtils.isEmpty(w) check.
		if (w == null || w.length() == 0) {
			return w;
		}

		String result = w;
		for (Map.Entry<String, Character[]> group : wildCardCharMap.entrySet()) {
			String replacement = group.getKey();
			for (Character candidate : group.getValue()) {
				if (candidate == null) {
					// A null table entry cannot match any character.
					continue;
				}
				char ch = candidate.charValue();
				// Only pay for a replace pass (and a new String) when the
				// character actually occurs; the result is the same either way.
				if (result.indexOf(ch) >= 0) {
					result = result.replace(String.valueOf(ch), replacement);
				}
			}
		}
		return result;
	}
}