changeset 15:728549225b02

first version of new normalizer.
author casties
date Tue, 12 May 2015 08:32:23 +0000
parents 034df8d5c923
children c009ce2e60be
files src/main/java/org/mpi/openmind/repository/utils/ArabicTranslitNormalizer.java
diffstat 1 files changed, 22 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/main/java/org/mpi/openmind/repository/utils/ArabicTranslitNormalizer.java	Tue May 12 08:32:23 2015 +0000
@@ -0,0 +1,22 @@
+package org.mpi.openmind.repository.utils;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.regex.Pattern;
+
+/** Replacement table used to normalize Arabic transliterations. */
+public class ArabicTranslitNormalizer {
+    /** Maps each replacement string to the regex matching every spelling it replaces. */
+    protected static Map<String, Pattern> multiRepPat = new HashMap<String, Pattern>();
+    static { // static initializer: fills the shared map once, without needing an instance
+        // a key holds only one Pattern, so spellings sharing a replacement are alternated
+        multiRepPat.put("j", Pattern.compile("ch|dj"));
+        multiRepPat.put("t", Pattern.compile("th"));
+        multiRepPat.put("h", Pattern.compile("kh"));
+        multiRepPat.put("d", Pattern.compile("dh"));
+        multiRepPat.put("s", Pattern.compile("sh"));
+        multiRepPat.put("g", Pattern.compile("gh"));
+        // aẗ\b, at\b, ah\b -> a (only at a word boundary)
+        multiRepPat.put("a", Pattern.compile("a\u1E97\\b|at\\b|ah\\b"));
+    }
+}