Mercurial > hg > openmind
changeset 69:bde6212babfd
106: transliteration normalization should ignore all apostrophe-equivalent characters (', `, ʿ, ʾ, ‘, ’)
Task-Url: https://it-dev.mpiwg-berlin.mpg.de/tracs/ismi/ticket/106
author | casties |
---|---|
date | Wed, 01 Feb 2017 19:54:12 +0100 |
parents | 677492395dc0 |
children | b5a22b9ab9c6 |
files | src/main/java/org/mpi/openmind/cache/WrapperService.java src/main/java/org/mpi/openmind/repository/utils/ArabicTranslitNormalizer.java src/main/java/org/mpi/openmind/scripts/NormalizeOW.java |
diffstat | 3 files changed, 17 insertions(+), 29 deletions(-) [+] |
line wrap: on
line diff
--- a/src/main/java/org/mpi/openmind/cache/WrapperService.java Wed Feb 01 12:22:43 2017 +0100 +++ b/src/main/java/org/mpi/openmind/cache/WrapperService.java Wed Feb 01 19:54:12 2017 +0100 @@ -25,6 +25,7 @@ import org.mpi.openmind.repository.services.PersistenceService; import org.mpi.openmind.repository.services.utils.AttributeFilter; import org.mpi.openmind.repository.services.utils.EditIntent; +import org.mpi.openmind.repository.utils.ArabicTranslitNormalizer; import org.mpi.openmind.repository.utils.ImportOM3Util; import org.mpi.openmind.repository.utils.NormalizerUtils; import org.mpi.openmind.repository.utils.RomanizationLoC; @@ -151,26 +152,23 @@ txLog.debug("** END remove entity: user="+user+" entity="+entity.toSmallString()); } - public Map<Entity, Attribute> searchEntityByAttributeFilter0(String term, - List<AttributeFilter> filters, int maxResults) { + public Map<Entity, Attribute> searchEntityByAttributeFilter(String term, List<AttributeFilter> filters, + int maxResults) { Map<Entity, Attribute> map = new HashMap<Entity, Attribute>(); List<Long> usedIds = new ArrayList<Long>(); boolean mustBreak = false; int count = 0; if (StringUtils.isNotEmpty(term)) { - String normalizedTerm = NormalizerUtils.normalize(term); + // TODO: better normalization + String normalizedTerm = ArabicTranslitNormalizer.normalize(term); for (AttributeFilter filter : filters) { if (mustBreak) { break; } - for (Attribute att : getAttributesByDefByAttName( - filter.getEntObjectClass(), filter.getName(), -1)) { - if (!usedIds.contains(att.getSourceId()) - && StringUtils.isNotEmpty(att - .getNormalizedOwnValue()) - && att.getNormalizedOwnValue().contains( - normalizedTerm)) { + for (Attribute att : getAttributesByDefByAttName(filter.getEntObjectClass(), filter.getName(), -1)) { + if (!usedIds.contains(att.getSourceId()) && StringUtils.isNotEmpty(att.getNormalizedOwnValue()) + && att.getNormalizedOwnValue().contains(normalizedTerm)) { map.put(getEntityById(att.getSourceId()), att); 
usedIds.add(att.getSourceId()); count++;
--- a/src/main/java/org/mpi/openmind/repository/utils/ArabicTranslitNormalizer.java Wed Feb 01 12:22:43 2017 +0100 +++ b/src/main/java/org/mpi/openmind/repository/utils/ArabicTranslitNormalizer.java Wed Feb 01 19:54:12 2017 +0100 @@ -18,7 +18,9 @@ protected static Map<String, Pattern> apostrophePatterns = new HashMap<String, Pattern>(); static { // `, ʿ, ʾ, ‘, ’ -> ' - apostrophePatterns.put("'", Pattern.compile("\u0060|\u02BE|\u02BF|\u2018|\u2019")); + //apostrophePatterns.put("'", Pattern.compile("\u0060|\u02BE|\u02BF|\u2018|\u2019")); + // remove `, ʿ, ʾ, ‘, ’, ' + apostrophePatterns.put("", Pattern.compile("'|\u0060|\u02BE|\u02BF|\u2018|\u2019")); } protected static Map<String, Pattern> twoletterPatterns = new HashMap<String, Pattern>(); @@ -37,7 +39,8 @@ // aẗ\b, at\b, ah\b -> a wordpartPatterns.put("a", Pattern.compile("a\u1E97\\b|at\\b|ah\\b")); // 'abd + space -> 'abd - wordpartPatterns.put("'abd", Pattern.compile("'abd ")); + //wordpartPatterns.put("'abd", Pattern.compile("'abd ")); + wordpartPatterns.put("abd", Pattern.compile("abd ")); } protected static Map<String, Pattern> letterdiacritPatterns = new HashMap<String, Pattern>();
--- a/src/main/java/org/mpi/openmind/scripts/NormalizeOW.java Wed Feb 01 12:22:43 2017 +0100 +++ b/src/main/java/org/mpi/openmind/scripts/NormalizeOW.java Wed Feb 01 19:54:12 2017 +0100 @@ -66,25 +66,22 @@ String oldNormalizedArabicOW = ows[2]; String normalizedOW = ArabicTranslitNormalizer.normalize(ow); String normalizedArabicOW = ArabicNormalizerUtils.normalize(ow); + boolean changes = false; if (normalizedOW != null && !normalizedOW.equals(oldNormalizedOW)) { System.out.println("normOW changes (#"+cnt+" @"+id+"): old="+oldNormalizedOW+" new="+normalizedOW); + changes = true; } if (normalizedArabicOW != null && !normalizedArabicOW.equals(oldNormalizedArabicOW)) { System.out.println("normArabicOW changes (#"+cnt+" @"+id+"): old="+oldNormalizedArabicOW+" new="+normalizedArabicOW); + changes = true; } - if (modify) { + if (modify && changes) { try { st.setString(1, normalizedOW); st.setString(2, normalizedArabicOW); st.setString(3, id.toString()); st.executeUpdate(); - /* Statement st = conn.createStatement(); - st.executeUpdate("UPDATE node SET normalized_own_value='" + normalizedOW + "' WHERE row_id='"+ id +"'"); - s = "UPDATE node SET normalized_arabic_own_value='" + normalizedArabicOW + "' WHERE row_id='"+ id +"'"; - //System.out.println(s); - st.executeUpdate(s); - */ } catch (SQLException e) { System.err.println(s); e.printStackTrace(); @@ -139,16 +136,6 @@ System.exit(1); } System.exit(rc); - - //NormalizeOW.execute("all"); - /* - String arg = args[0]; - if(StringUtils.isNotEmpty(arg)){ - if(arg.equals("ATTRIBUTE") || arg.equals("ENTITY") || arg.equals("all")){ - NormalizeOW.execute(arg); - System.exit(0); - } - }*/ } }