changeset 69:bde6212babfd

106: translit normalization should ignore all apostrophe-equivalent chars (', `, ʿ, ʾ, ‘, ’)
Task-Url: https://it-dev.mpiwg-berlin.mpg.de/tracs/ismi/ticket/106
author casties
date Wed, 01 Feb 2017 19:54:12 +0100
parents 677492395dc0
children b5a22b9ab9c6
files src/main/java/org/mpi/openmind/cache/WrapperService.java src/main/java/org/mpi/openmind/repository/utils/ArabicTranslitNormalizer.java src/main/java/org/mpi/openmind/scripts/NormalizeOW.java
diffstat 3 files changed, 17 insertions(+), 29 deletions(-) [+]
line wrap: on
line diff
--- a/src/main/java/org/mpi/openmind/cache/WrapperService.java	Wed Feb 01 12:22:43 2017 +0100
+++ b/src/main/java/org/mpi/openmind/cache/WrapperService.java	Wed Feb 01 19:54:12 2017 +0100
@@ -25,6 +25,7 @@
 import org.mpi.openmind.repository.services.PersistenceService;
 import org.mpi.openmind.repository.services.utils.AttributeFilter;
 import org.mpi.openmind.repository.services.utils.EditIntent;
+import org.mpi.openmind.repository.utils.ArabicTranslitNormalizer;
 import org.mpi.openmind.repository.utils.ImportOM3Util;
 import org.mpi.openmind.repository.utils.NormalizerUtils;
 import org.mpi.openmind.repository.utils.RomanizationLoC;
@@ -151,26 +152,23 @@
         txLog.debug("** END remove entity: user="+user+" entity="+entity.toSmallString());
 	}
 
-	public Map<Entity, Attribute> searchEntityByAttributeFilter0(String term,
-			List<AttributeFilter> filters, int maxResults) {
+	public Map<Entity, Attribute> searchEntityByAttributeFilter(String term, List<AttributeFilter> filters,
+			int maxResults) {
 		Map<Entity, Attribute> map = new HashMap<Entity, Attribute>();
 		List<Long> usedIds = new ArrayList<Long>();
 
 		boolean mustBreak = false;
 		int count = 0;
 		if (StringUtils.isNotEmpty(term)) {
-			String normalizedTerm = NormalizerUtils.normalize(term);
+			// TODO: better normalization
+			String normalizedTerm = ArabicTranslitNormalizer.normalize(term);
 			for (AttributeFilter filter : filters) {
 				if (mustBreak) {
 					break;
 				}
-				for (Attribute att : getAttributesByDefByAttName(
-						filter.getEntObjectClass(), filter.getName(), -1)) {
-					if (!usedIds.contains(att.getSourceId())
-							&& StringUtils.isNotEmpty(att
-									.getNormalizedOwnValue())
-							&& att.getNormalizedOwnValue().contains(
-									normalizedTerm)) {
+				for (Attribute att : getAttributesByDefByAttName(filter.getEntObjectClass(), filter.getName(), -1)) {
+					if (!usedIds.contains(att.getSourceId()) && StringUtils.isNotEmpty(att.getNormalizedOwnValue())
+							&& att.getNormalizedOwnValue().contains(normalizedTerm)) {
 						map.put(getEntityById(att.getSourceId()), att);
 						usedIds.add(att.getSourceId());
 						count++;
--- a/src/main/java/org/mpi/openmind/repository/utils/ArabicTranslitNormalizer.java	Wed Feb 01 12:22:43 2017 +0100
+++ b/src/main/java/org/mpi/openmind/repository/utils/ArabicTranslitNormalizer.java	Wed Feb 01 19:54:12 2017 +0100
@@ -18,7 +18,9 @@
     protected static Map<String, Pattern> apostrophePatterns = new HashMap<String, Pattern>();
     static {
         // `, ʿ, ʾ, ‘, ’ -> '
-        apostrophePatterns.put("'", Pattern.compile("\u0060|\u02BE|\u02BF|\u2018|\u2019"));
+        //apostrophePatterns.put("'", Pattern.compile("\u0060|\u02BE|\u02BF|\u2018|\u2019"));
+    	// remove `, ʿ, ʾ, ‘, ’, '
+        apostrophePatterns.put("", Pattern.compile("'|\u0060|\u02BE|\u02BF|\u2018|\u2019"));
     }    
     
     protected static Map<String, Pattern> twoletterPatterns = new HashMap<String, Pattern>();
@@ -37,7 +39,8 @@
         // aẗ\b, at\b, ah\b -> a
         wordpartPatterns.put("a", Pattern.compile("a\u1E97\\b|at\\b|ah\\b"));
         // 'abd + space -> 'abd
-        wordpartPatterns.put("'abd", Pattern.compile("'abd "));
+        //wordpartPatterns.put("'abd", Pattern.compile("'abd "));
+        wordpartPatterns.put("abd", Pattern.compile("abd "));
     }
 
     protected static Map<String, Pattern> letterdiacritPatterns = new HashMap<String, Pattern>();
--- a/src/main/java/org/mpi/openmind/scripts/NormalizeOW.java	Wed Feb 01 12:22:43 2017 +0100
+++ b/src/main/java/org/mpi/openmind/scripts/NormalizeOW.java	Wed Feb 01 19:54:12 2017 +0100
@@ -66,25 +66,22 @@
 	        String oldNormalizedArabicOW = ows[2];
 	        String normalizedOW = ArabicTranslitNormalizer.normalize(ow);
 	        String normalizedArabicOW = ArabicNormalizerUtils.normalize(ow);
+	        boolean changes = false;
 	        if (normalizedOW != null && !normalizedOW.equals(oldNormalizedOW)) {
 	            System.out.println("normOW changes (#"+cnt+" @"+id+"): old="+oldNormalizedOW+" new="+normalizedOW);
+	            changes = true;
 	        }
 	        if (normalizedArabicOW != null && !normalizedArabicOW.equals(oldNormalizedArabicOW)) {
 	            System.out.println("normArabicOW changes (#"+cnt+" @"+id+"): old="+oldNormalizedArabicOW+" new="+normalizedArabicOW);
+	            changes = true;
 	        }
-	        if (modify) {
+	        if (modify && changes) {
 	            try {
 	                st.setString(1, normalizedOW);
 	                st.setString(2, normalizedArabicOW);
 	                st.setString(3, id.toString());
 	                st.executeUpdate();
 
-	                /* Statement st = conn.createStatement();
-	                st.executeUpdate("UPDATE node SET normalized_own_value='" + normalizedOW + "' WHERE row_id='"+ id +"'");
-				s = "UPDATE node SET normalized_arabic_own_value='" + normalizedArabicOW + "' WHERE row_id='"+ id +"'";
-				//System.out.println(s);
-				st.executeUpdate(s);
-	                 */
 	            } catch (SQLException e) {
 	                System.err.println(s);
 	                e.printStackTrace();
@@ -139,16 +136,6 @@
             System.exit(1);
         }
         System.exit(rc);
-        
-        //NormalizeOW.execute("all");
-		/*
-		String arg = args[0];
-		if(StringUtils.isNotEmpty(arg)){
-			if(arg.equals("ATTRIBUTE") || arg.equals("ENTITY") || arg.equals("all")){
-				NormalizeOW.execute(arg);
-				System.exit(0);
-			}
-		}*/
 	}
 
 }