/*
 * Source: src/main/java/org/mpi/openmind/repository/utils/OldNormalizerUtils.java @ 71:aeb29e362a67
 *
 * Commit message: New ArabicNormalizer. NormalizerUtils.normalize() now does both
 * translit and arabic normalization. 108: arabic normalization is not applied
 * Task-Url: https://it-dev.mpiwg-berlin.mpg.de/tracs/ismi/ticket/108
 * Author: casties
 * Date: Thu, 02 Feb 2017 17:58:52 +0100
 */

package org.mpi.openmind.repository.utils;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;

import org.apache.commons.lang.StringUtils;

/**
 * Legacy, table-driven normalizer for romanized (transliterated) text.
 *
 * <p>{@link #old_normalize(String)} folds digraphs and accented Latin letters
 * onto plain lower-case ASCII letters using the two public lookup tables
 * declared here. {@link #normalize(String)} no longer uses those tables; it
 * delegates to {@code ArabicTranslitNormalizer}.
 *
 * <p>The lookup tables are public, mutable and unsynchronized; they are
 * populated once by the static initializers below.
 */
public class OldNormalizerUtils {

	/**
	 * Multi-character substitutions applied first by {@link #old_normalize(String)}.
	 * Each key is the replacement text; each value lists the (lower-case) terms
	 * that are replaced by that key.
	 */
	public static Map<String, List<String>> wildCardStringMap = new HashMap<String, List<String>>();

	static {
		putTerms("T", "\u1E6F", "th"); // ṯ
		putTerms("H", "\u1E2B", "kh"); // ḫ
		putTerms("D", "\u1E0F", "dh"); // ḏ
		putTerms("S", "\u0161", "sh"); // š
		putTerms("G", "\u0121", "gh"); // ġ

		// Endings followed by a space: "aẗ ", "at ", "ah " and "a " all collapse to "A ".
		putTerms("A ", "a\u1E97 ", "at ", "ah ", "a ");

		// ỳ, á, ā, à
		putTerms("A", "\u1EF3", "\u00E1", "\u0101", "\u00E0");

		// An alternative list for "A" (e, ā, â) and a "B" list (b, p) used to be
		// sketched here as commented-out code; neither is registered.
	}

	/**
	 * Single-character substitutions applied second by {@link #old_normalize(String)}.
	 * Each key is the replacement text; each value lists the characters that are
	 * replaced by that key. The empty-string key removes its characters.
	 */
	public static Map<String, Character[]> wildCardCharMap = new HashMap<String, Character[]>();

	// " ` ′ ‘ ’ ‛ ' ʻ ʼ ʽ ˋ ʾ ʿ
	public static Character[] apostrophes = {
			0x22, 0x60, 0x2032, 0x2018, 0x2019, 0x201B, 0x27, 0x2BB, 0x2BC, 0x2BD, 0x2CB, 0x2BE, 0x2BF };

	// In the comments below, "IN" lists the characters contained in the array that follows;
	// "OUT" lists related characters that are not contained in it.
	//IN: Aa Áá  Àà  Ââ  Ǎǎ  Ăă  Ãã  Ảả  Ȧȧ  Ạạ  Ää  Åå  Ḁḁ  Āā  Ąą  
	//OUT: ᶏ  Ⱥⱥ  Ȁȁ  Ấấ  Ầầ  Ẫẫ  Ẩẩ  Ậậ  Ắắ  Ằằ  Ẵẵ  Ẳẳ  Ặặ  Ǻǻ  Ǡǡ  Ǟǟ  Ȁȁ  Ȃȃ
	public static Character[] AList = {
			0x41, 0x61, 0xC1, 0xE1, 0xC0, 0xE0, 0xC2, 0xE2, 0x1CD, 
			0x1CE, 0x102, 0x103, 0xC3, 0xE3, 0x1EA2, 0x1EA3, 0x226, 
			0x227, 0x1EA0, 0x1EA1, 0xC4, 0xE4, 0xC5, 0xE5, 0x1E00, 
			0x1E01, 0x100, 0x101, 0x104, 0x105 };

	/** Characters stripped by {@link #normalizedToCompare(String)}. */
	private static final String[] IGNORED_FOR_COMPARE = { "#", "-", "(", ")", "[", "]", "_" };

	static {
		// Apostrophe-like characters map to the empty string, i.e. they are removed.
		wildCardCharMap.put("", apostrophes);
		wildCardCharMap.put("A", AList);

		//IN: Bb Ḃḃ  Ḅḅ  Ḇḇ  Ɓɓ  ʙ  Bb 
		//OUT: Ƃƃ  ᵬ  ᶀ  ʙ  Bb  ȸ Ƀƀ  
		Character[] BList = {
				0x42, 0x62, 0x1E02, 0x1E03, 0x1E04, 0x1E05, 0x1E06, 
				0x1E07, 0x181, 0x253, 0x299, 0xFF22, 0xFF42
		};
		wildCardCharMap.put("B", BList);

		//Ćć  Ĉĉ  Čč  Ċċ  C̄c̄  Ç(ç problem with this)  Ḉḉ  Ȼȼ  Ƈƈ  ɕ  ᴄ  Cc
		// NOTE: 0x43 appears twice in this array; the duplicate is harmless.
		Character[] CList = {
				0x43, 0x63, 0x106, 0x107, 0x108, 0x109, 0x10C, 0x10D, 
				0x10A, 0x10B, 0x43, 0xC7, 0xE7, 0x1E08, 0x1E09, 0x23B, 
				0x23C, 0x187, 0x188, 0x255, 0x1D04, 0xFF23, 0xFF43
		};
		wildCardCharMap.put("C", CList);

		//IN: Dd Ďď  Ḋḋ  Ḑḑ  Ḍḍ  Ḓḓ  Ḏḏ  Dd  
		//OUT: Đđ  D̦d̦  Ɖɖ  Ɗɗ  Ƌƌ  ᵭ  ᶁ  ᶑ  ȡ  ᴅ
		Character[] DList = {
				0x44, 0x64, 0x10E, 0x10F, 0x1E0A, 0x1E0B, 0x1E10, 
				0x1E11, 0x1E0C, 0x1E0D, 0x1E12, 0x1E13, 0x1E0E, 
				0x1E0F, 0xFF24, 0xFF44
		};
		wildCardCharMap.put("D", DList);

		//IN: Ee Éé  Èè  Êê  Ḙḙ  Ěě  Ĕĕ  Ẽẽ  Ḛḛ  Ẻẻ  Ėė  Ëë  Ēē  Ȩȩ  Ęę  Ȅȅ  Ếế  Ềề  Ễễ  Ểể  Ḝḝ  Ḗḗ  Ḕḕ  Ȇȇ  Ẹẹ  Ệệ ᴇ  Ee  
		//OUT: Ææ  Ǽǽ  Ǣǣ  Œœ ᶒ  Ɇɇ
		Character[] EList = {
				0x45, 0x65, 0xC9, 0xE9, 0xC8, 0xE8, 0xCA, 0xEA, 
				0x1E18, 0x1E19, 0x11A, 0x11B, 0x114, 0x115, 
				0x1EBC, 0x1EBD, 0x1E1A, 0x1E1B, 0x1EBA, 0x1EBB, 
				0x116, 0x117, 0xCB, 0xEB, 0x112, 0x113, 0x228, 
				0x229, 0x118, 0x119, 0x204, 0x205, 0x1EBE, 0x1EBF,
				0x1EC0, 0x1EC1, 0x1EC4, 0x1EC5, 0x1EC2, 0x1EC3,
				0x1E1C, 0x1E1D, 0x1E16, 0x1E17, 0x1E14, 0x1E15, 
				0x206, 0x207, 0x1EB8, 0x1EB9, 0x1EC6, 0x1EC7, 
				0x1D07, 0xFF25, 0xFF45
		};
		wildCardCharMap.put("E", EList);

		//Ii Íí  Ìì  Ĭĭ  Îî  Ǐǐ  Ïï  Ḯḯ  Ĩĩ  Įį  Īī  Ỉỉ  Ȉȉ  Ȋȋ  Ịị  Ḭḭ
		Character[] IList = {
				0x49, 0x69, 0xCD, 0xED, 0xCC, 0xEC, 0x12C, 0x12D, 0xCE, 
				0xEE, 0x1CF, 0x1D0, 0xCF, 0xEF, 0x1E2E, 0x1E2F, 0x128, 
				0x129, 0x12E, 0x12F, 0x12A, 0x12B, 0x1EC8, 0x1EC9, 0x208, 
				0x209, 0x20A, 0x20B, 0x1ECA, 0x1ECB, 0x1E2C, 0x1E2D 
		};
		wildCardCharMap.put("I", IList);

		//IN: Gg Ǵǵ  Ğğ  Ĝĝ  Ǧǧ  Ġġ  Ģģ  Ḡḡ  Ǥǥ  Gg 
		//OUT: Ɠɠ  ᶃ  ɢ 
		Character[] GList = {
				0x47, 0x67, 0x1F4, 0x1F5, 0x11E, 0x11F, 0x11C, 0x11D, 
				0x1E6, 0x1E7, 0x120, 0x121, 0x122, 0x123, 0x1E20, 0x1E21, 
				0x1E4, 0x1E5, 0xFF27, 0xFF47
		};
		wildCardCharMap.put("G", GList);

		//Nn Ńń  Ǹǹ  Ňň  Ññ  Ṅṅ  Ņņ  Ṇṇ  Ṋṋ  Ṉṉ
		Character[] NList = {
				0x4E, 0x6E, 0x143, 0x144, 0x1F8, 0x1F9, 0x147, 0x148, 
				0xD1, 0xF1, 0x1E44, 0x1E45, 0x145, 0x146, 0x1E46, 
				0x1E47, 0x1E4A, 0x1E4B, 0x1E48, 0x1E49
		};
		wildCardCharMap.put("N", NList);

		//H h Ĥ ĥ Ȟ ȟ Ḧ ḧ Ḣ ḣ Ḩ ḩ Ḥ ḥ Ḫ ḫ H ̱ ẖ Ħ ħ Ⱨ ⱨ
		// NOTE: 0x48 appears twice in this array; the duplicate is harmless.
		Character[] HList = {
				0x48, 0x68, 0x124, 0x125, 0x21E, 0x21F, 0x1E26, 0x1E27, 
				0x1E22, 0x1E23, 0x1E28, 0x1E29, 0x1E24, 0x1E25, 0x1E2A, 
				0x1E2B, 0x48, 0x1E96, 0x126, 0x127, 0x2C67, 0x2C68
		};
		wildCardCharMap.put("H", HList);

		//Oo  Óó  Òò  Ŏŏ  Ôô  Ốố  Ồồ  Ỗỗ  Ổổ  Ǒǒ  Öö  Ȫȫ  Őő  Õõ  Ṍṍ  Ṏṏ  Ȭȭ  Ȯȯ  Ȱȱ  Øø  Ǿǿ  Ǫǫ  Ǭǭ  Ōō  Ṓṓ  Ṑṑ  Ỏỏ  Ȍȍ  Ȏȏ  Ơơ  Ớớ  Ờờ  Ỡỡ  Ởở  Ợợ  Ọọ  Ộộ
		Character[] OLIST = {
				0x4F, 0x6F, 0xD3, 0xF3, 0xD2, 0xF2, 0x14E, 0x14F, 0xD4, 
				0xF4, 0x1ED0, 0x1ED1, 0x1ED2, 0x1ED3, 0x1ED6, 0x1ED7, 
				0x1ED4, 0x1ED5, 0x1D1, 0x1D2, 0xD6, 0xF6, 0x22A, 0x22B, 
				0x150, 0x151, 0xD5, 0xF5, 0x1E4C, 0x1E4D, 0x1E4E, 0x1E4F, 
				0x22C, 0x22D, 0x22E, 0x22F, 0x230, 0x231, 0xD8, 0xF8, 0x1FE, 
				0x1FF, 0x1EA, 0x1EB, 0x1EC, 0x1ED, 0x14C, 0x14D, 0x1E52, 
				0x1E53, 0x1E50, 0x1E51, 0x1ECE, 0x1ECF, 0x20C, 0x20D, 
				0x20E, 0x20F, 0x1A0, 0x1A1, 0x1EDA, 0x1EDB, 0x1EDC, 0x1EDD, 
				0x1EE0, 0x1EE1, 0x1EDE, 0x1EDF, 0x1EE2, 0x1EE3, 0x1ECC, 
				0x1ECD, 0x1ED8, 0x1ED9
		};
		wildCardCharMap.put("O", OLIST);

		Character[] RList = {
				0x52, 0x72, 0x154, 0x155, 0x158, 0x159, 0x1E58, 0x1E59, 
				0x156, 0x157, 0x210, 0x211, 0x212, 0x213, 0x1E5A, 0x1E5B, 
				0x1E5C, 0x1E5D, 0x1E5E, 0x1E5F, 0x27C, 0x27E, 0x280, 0xFF32, 0xFF52
		};
		wildCardCharMap.put("R", RList);

		//IN: Ss Śś  Ṥṥ  Ŝŝ  Šš  Ṧṧ  Ṡṡẛ  Şş  Ṣṣ  Ṩṩ  Șș  S̩̩  
		//OUT: ᵴ  ᶊ  ʂ  ȿ  ꜱ  Ss s
		// NOTE: 0x53 appears twice in this array; the duplicate is harmless.
		Character[] SList = {
				0x53, 0x73, 0x15A, 0x15B, 0x1E64, 0x1E65, 0x15C, 0x15D, 
				0x160, 0x161, 0x1E66, 0x1E67, 0x1E60, 0x1E61, 0x15E, 0x15F, 
				0x1E62, 0x1E63, 0x1E68, 0x1E69, 0x218, 0x219, 0x53
		};
		wildCardCharMap.put("S", SList);

		//IN: Tt Ťť  Ṫṫ  Ţţ  Ṭṭ  Țț  Ṱṱ  Ṯṯ Tt
		//OUT: Ŧŧ  Ⱦⱦ  Ƭƭ  Ʈʈ  T̈ẗ  ᵵ  ƫ  ȶ  ᶙ  ᴛ
		Character[] TList = {
				0x54, 0x74, 0x164, 0x165, 0x1E6A, 0x1E6B, 0x162, 0x163, 
				0x1E6C, 0x1E6D, 0x21A, 0x21B, 0x1E70, 0x1E71, 0x1E6E, 
				0x1E6F, 0xFF34, 0xFF54
		};
		wildCardCharMap.put("T", TList);

		//IN: Uu Úú  Ùù  Ŭŭ  Ûû  Ǔǔ  Ůů  Üü  Ǘǘ  Ǜǜ  Ǚǚ  Ǖǖ  Űű  Ũũ  Ṹṹ  Ųų  Ūū  
		//OUT: Ṻṻ  Ủủ  Ȕȕ  Ȗȗ  Ưư  Ứứ  Ừừ  Ữữ  Ửử  Ựự  Ụụ  Ṳṳ  Ṷṷ  Ṵṵ  Ʉʉ  ᵾ  ᶙ  ᴜ  Uu
		Character[] UList = {
				0x55, 0x75, 0xDA, 0xFA, 0xD9, 0xF9, 0x16C, 0x16D, 0xDB, 0xFB, 0x1D3, 
				0x1D4, 0x16E, 0x16F, 0xDC, 0xFC, 0x1D7, 0x1D8, 0x1DB, 0x1DC, 0x1D9, 
				0x1DA, 0x1D5, 0x1D6, 0x170, 0x171, 0x168, 0x169, 0x1E78, 0x1E79, 
				0x172, 0x173, 0x16A, 0x16B
		};
		wildCardCharMap.put("U", UList);

		Character[] VList = {
				0x1E7C, 0x1E7D, 0x1E7E, 0x1E7F, 0x1B2, 
				0x28B, 0x1D20, 0xFF36, 0xFF56
		};
		wildCardCharMap.put("V", VList);

		//IN: Zz Źź  Ẑẑ  Žž  Żż  Ẓẓ  Ẕẕ  Ƶƶ  Ȥȥ  
		//OUT: Ⱬⱬ  ᵶ  ᶎ  ʐ  ʑ  ɀ  ᴢ  Zz
		Character[] ZList = {
				0x5A, 0x7A, 0x179, 0x17A, 0x1E90, 0x1E91, 0x17D, 
				0x17E, 0x17B, 0x17C, 0x1E92, 0x1E93, 0x1E94, 
				0x1E95, 0x1B5, 0x1B6, 0x1D22, 0xFF3A, 0xFF5A
		};
		wildCardCharMap.put("Z", ZList);
	}

	/**
	 * Registers a fresh, mutable list of {@code terms} under {@code key} in
	 * {@link #wildCardStringMap}.
	 *
	 * @param key   replacement text
	 * @param terms terms that are to be replaced by {@code key}
	 */
	private static void putTerms(String key, String... terms) {
		List<String> list = new ArrayList<String>();
		for (String term : terms) {
			list.add(term);
		}
		wildCardStringMap.put(key, list);
	}

	/**
	 * Normalizes {@code w} by delegating to
	 * {@code ArabicTranslitNormalizer.normalize(String)}; the lookup tables of
	 * this class are not consulted.
	 *
	 * @param w text to normalize
	 * @return whatever the delegate returns for {@code w}
	 */
	public static String normalize(String w) {
		return ArabicTranslitNormalizer.normalize(w);
	}

	/**
	 * Table-driven normalization: lower-cases {@code w}, replaces every term of
	 * {@link #wildCardStringMap} and then every character of
	 * {@link #wildCardCharMap} by its key, and lower-cases the result again.
	 * For example {@code "th"} becomes {@code "t"} and apostrophe-like
	 * characters are dropped.
	 *
	 * <p>Case conversion uses {@link Locale#ROOT} so the result does not depend
	 * on the JVM default locale (a Turkish default locale would otherwise turn
	 * {@code 'I'} into dotless {@code 'ı'}, which no table entry covers).
	 *
	 * @param w text to normalize; may be {@code null} or empty
	 * @return the normalized text, or {@code w} itself when it is {@code null} or empty
	 */
	public static String old_normalize(String w) {
		if (StringUtils.isEmpty(w)) {
			return w;
		}

		String result = w.toLowerCase(Locale.ROOT);

		// Multi-character terms first (digraphs such as "th", endings such as "at ").
		for (Map.Entry<String, List<String>> entry : wildCardStringMap.entrySet()) {
			String replacement = entry.getKey();
			for (String term : entry.getValue()) {
				result = result.replace(term, replacement);
			}
		}

		// Then single characters (accented letters, apostrophes).
		for (Map.Entry<String, Character[]> entry : wildCardCharMap.entrySet()) {
			String replacement = entry.getKey();
			for (Character ch : entry.getValue()) {
				result = result.replace(String.valueOf(ch), replacement);
			}
		}

		// The keys are upper-case markers; fold them back to lower case.
		return result.toLowerCase(Locale.ROOT);
	}

	/**
	 * Removes every occurrence of {@code # - ( ) [ ] _} from {@code s1}.
	 *
	 * @param s1 text to strip; may be {@code null}
	 * @return {@code s1} without those characters, or {@code null} when
	 *         {@code s1} is {@code null}
	 */
	public static String normalizedToCompare(String s1) {
		if (s1 == null) {
			return null;
		}
		String result = s1;
		for (String ignored : IGNORED_FOR_COMPARE) {
			result = result.replace(ignored, "");
		}
		return result;
	}

	/**
	 * Manual smoke test: prints the result of {@link #normalize(String)} for "ṯ".
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		String s = OldNormalizerUtils.normalize("ṯ");
		System.out.println(s);
	}
}