Mercurial > hg > openmind
view src/main/java/org/mpi/openmind/repository/utils/ArabicNormalizerUtils.java @ 71:aeb29e362a67
New ArabicNormalizer. NormalizerUtils.normalize() now does both translit and arabic normalization.
108: arabic normalization is not applied
Task-Url: https://it-dev.mpiwg-berlin.mpg.de/tracs/ismi/ticket/108
author | casties |
---|---|
date | Thu, 02 Feb 2017 17:58:52 +0100 |
parents | 615d27dce9b3 |
children |
line wrap: on
line source
package org.mpi.openmind.repository.utils;

import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.lang.StringUtils;

/**
 * Static lookup tables and a helper for normalizing Arabic-script text.
 *
 * <p>{@link #normalize(String)} collapses groups of similar characters into a
 * single placeholder string (the keys of {@link #wildCardCharMap}), so that
 * spelling variants of the same word produce the same normalized string.
 */
public class ArabicNormalizerUtils {

    /**
     * Code points listed as "to be ignored".
     *
     * <p>Per the Unicode charts the 0x064B-0x0652 range holds the Arabic
     * tashkil marks and 0xFE70-0xFE7E their presentation forms; the 0xE8xx
     * values lie in the Private Use Area, so their meaning depends on the
     * font/encoding in use (TODO confirm).
     *
     * <p>NOTE(review): this list is NOT consulted by {@link #normalize(String)}
     * in this file; presumably it is read by other classes. 0x06E4 appears
     * twice; kept as-is because the field is public.
     *
     * <p>The list is the fixed-size view returned by {@link Arrays#asList}.
     */
    public static List<Character> ignoreList;

    static {
        Character[] ignored = {
            0x064B, 0x064C, 0x064D, 0x064E, 0x064F, 0x0650, 0x0651, 0x0652,
            0x0670, 0x0671, 0x06E4, 0x06E4,
            0xE818, 0xE820, 0xE821, 0xE822, 0xE823, 0xE824, 0xE825, 0xE826,
            0xE827, 0xE828, 0xE829, 0xE82A, 0xE82B, 0xE82C, 0xE82D,
            0xE832, 0xE833, 0xE834, 0xE835, 0xE836,
            0xFB50, 0xFB51,
            0xFC5E, 0xFC5F, 0xFC60, 0xFC61, 0xFC62,
            0xFE70, 0xFE72, 0xFE74, 0xFE76, 0xFE78, 0xFE7A, 0xFE7C, 0xFE7E
        };
        ignoreList = Arrays.asList(ignored);
    }

    /**
     * Maps a replacement string to the characters that
     * {@link #normalize(String)} rewrites to it.
     *
     * <p>The empty-string key means "delete the character". The keys
     * {@code "1"} .. {@code "8"} are placeholders shared by a group of
     * look-alike letters.
     *
     * <p>NOTE(review): some groups list the same code point more than once
     * (e.g. 0x0649/0x064A under {@code "2"}); the duplicates only cause a
     * redundant replace pass and are kept because the map is public.
     */
    public static Map<String, Character[]> wildCardCharMap = new HashMap<String, Character[]>();

    static {
        // Quote and apostrophe-like marks: removed entirely.
        Character[] apostrophes = {
            0x22, 0x60, 0x2032, 0x2018, 0x2019, 0x201B, 0x27,
            0x2BB, 0x2BC, 0x2BD, 0x2CB, 0x2BE, 0x2BF
        };
        wildCardCharMap.put("", apostrophes);

        // Alef variants (with madda, hamza above, hamza below, plain).
        Character[] alefGroup = { 0x0622, 0x0623, 0x0625, 0x0627 };
        wildCardCharMap.put("1", alefGroup);

        // Yeh variants (yeh with hamza, alef maksura, yeh).
        Character[] yehGroup = { 0x0626, 0x0649, 0x064A, 0x0649, 0x064A };
        wildCardCharMap.put("2", yehGroup);

        // Waw and waw with hamza.
        Character[] wawGroup = { 0x0648, 0x0624, 0x0648 };
        wildCardCharMap.put("3", wawGroup);

        // Peh and beh.
        Character[] behGroup = { 0x067E, 0x0628, 0x0628 };
        wildCardCharMap.put("4", behGroup);

        // Tcheh and jeem.
        Character[] jeemGroup = { 0x0686, 0x062C, 0x062C };
        wildCardCharMap.put("5", jeemGroup);

        // Jeh and zain.
        Character[] zainGroup = { 0x0698, 0x0632, 0x0632 };
        wildCardCharMap.put("6", zainGroup);

        // Veh and feh.
        Character[] fehGroup = { 0x06A4, 0x0641, 0x0641 };
        wildCardCharMap.put("7", fehGroup);

        // Kaf, keheh and gaf.
        Character[] kafGroup = { 0x0643, 0x06A9, 0x06AF, 0x0643 };
        wildCardCharMap.put("8", kafGroup);
    }

    /**
     * Replaces every character listed in {@link #wildCardCharMap} with the
     * key of the group it belongs to.
     *
     * <p>Characters that appear in no group are left untouched; in particular
     * {@link #ignoreList} is not applied here.
     *
     * @param w the text to normalize; may be {@code null} or empty
     * @return {@code w} itself when it is {@code null} or empty, otherwise the
     *         text with all mapped characters replaced
     */
    public static String normalize(String w) {
        if (w == null || w.isEmpty()) {
            return w;
        }

        String normalized = w;
        // Iterate entries directly instead of keySet() + get() per key.
        for (Map.Entry<String, Character[]> group : wildCardCharMap.entrySet()) {
            String replacement = group.getKey();
            for (Character variant : group.getValue()) {
                normalized = normalized.replace(String.valueOf(variant), replacement);
            }
        }
        return normalized;
    }
}