Mercurial > hg > openmind
view src/main/java/org/mpi/openmind/repository/utils/NormalizerUtils.java @ 89:8adfa8679991
new implementation of translit-to-romanization rules in RomanizationLoc with test(!).
author | Robert Casties <casties@mpiwg-berlin.mpg.de> |
---|---|
date | Mon, 26 Feb 2018 14:39:49 +0100 |
parents | ad505ef703ed |
children |
line wrap: on
line source
package org.mpi.openmind.repository.utils;

import java.text.Normalizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Static helper methods for normalizing Arabic and transliterated Arabic
 * strings, mainly for search purposes.
 *
 * The search normalizations delegate to {@link ArabicTranslitNormalizer} and
 * {@link ArabicNormalizer}; the Unicode (NFC) and ayn/hamza normalizations are
 * implemented in this class.
 */
public class NormalizerUtils {

    /** Matches the old transliteration characters for ayn: U+2018 and U+02BB. */
    private static final Pattern OLD_AYN_PATTERN = Pattern.compile("(\u2018|\u02BB)");

    /** Replacement character for ayn: U+02BF. */
    private static final String NEW_AYN = "\u02BF";

    /** Matches the old transliteration characters for hamza: U+2019 and U+02BC. */
    private static final Pattern OLD_HAMZA_PATTERN = Pattern.compile("(\u2019|\u02bc)");

    /** Replacement character for hamza: U+02BE. */
    private static final String NEW_HAMZA = "\u02BE";

    /**
     * Returns String normalized for searching arabic or transliterated arabic.
     *
     * The input is first passed through
     * {@link ArabicTranslitNormalizer#normalize} and the result is then passed
     * through {@link ArabicNormalizer#normalize}.
     *
     * @param w the text to normalize (null handling is left to the delegates)
     * @return the text after both normalizations have been applied
     */
    public static String normalize(String w) {
        String translitNormalized = ArabicTranslitNormalizer.normalize(w);
        return ArabicNormalizer.normalize(translitNormalized);
    }

    /**
     * Returns String normalized for searching arabic transliteration text.
     *
     * @see <a href="https://it-dev.mpiwg-berlin.mpg.de/tracs/OpenMind3/wiki/normalize_arabic_translit">normalize_arabic_translit (OpenMind3 wiki)</a>
     *
     * @param w the transliterated text to normalize
     * @return the result of {@link ArabicTranslitNormalizer#normalize}
     */
    public static String normalizeArabicTranslit(String w) {
        return ArabicTranslitNormalizer.normalize(w);
    }

    /**
     * Returns String normalized for searching arabic.
     *
     * The normalization consists in removing vowels and other diacritic marks
     * (performed by {@link ArabicNormalizer}).
     *
     * @param w the arabic text to normalize
     * @return the result of {@link ArabicNormalizer#normalize}
     */
    public static String normalizeArabic(String w) {
        return ArabicNormalizer.normalize(w);
    }

    /**
     * Returns String in Unicode normalization (NFC).
     *
     * @param text the text to normalize; may be null
     * @return the text in NFC form (the same String instance if it already is
     *         in NFC), or null if text is null
     */
    public static String unicodeNormalize(String text) {
        if (text == null) {
            // java.text.Normalizer rejects null input with a NullPointerException
            return null;
        }
        if (Normalizer.isNormalized(text, Normalizer.Form.NFC)) {
            return text;
        }
        return Normalizer.normalize(text, Normalizer.Form.NFC);
    }

    /**
     * Normalize transliteration forms for ayn and hamza.
     *
     * Replaces U+2018 and U+02BB by U+02BF (ayn) and afterwards U+2019 and
     * U+02BC by U+02BE (hamza).
     *
     * @param text the transliterated text; may be null
     * @return the text with the replacements applied, or null if text is null
     */
    public static String aynHamzaNormalizer(String text) {
        if (text == null) {
            // Pattern.matcher(null) would throw a NullPointerException
            return null;
        }
        Matcher aynMatcher = OLD_AYN_PATTERN.matcher(text);
        String withNewAyn = aynMatcher.replaceAll(NEW_AYN);
        Matcher hamzaMatcher = OLD_HAMZA_PATTERN.matcher(withNewAyn);
        return hamzaMatcher.replaceAll(NEW_HAMZA);
    }
}