diff src/de/mpiwg/anteater/events/processors/PermitOrApplicantEventProcessor.java @ 1:7a4341c9f2e5

checking permit numbers for similarity if no direct match
author jdamerow
date Fri, 05 Oct 2012 18:52:14 -0700
parents 036535fcd179
children
line wrap: on
line diff
--- a/src/de/mpiwg/anteater/events/processors/PermitOrApplicantEventProcessor.java	Fri Sep 14 10:30:43 2012 +0200
+++ b/src/de/mpiwg/anteater/events/processors/PermitOrApplicantEventProcessor.java	Fri Oct 05 18:52:14 2012 -0700
@@ -5,6 +5,8 @@
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
+import uk.ac.shef.wit.simmetrics.similaritymetrics.JaroWinkler;
+
 import de.mpiwg.anteater.events.Applicant;
 import de.mpiwg.anteater.events.ResearchEvent;
 import de.mpiwg.anteater.ml.PlaceClasses;
@@ -128,6 +130,7 @@
 
 				IfStatement: if (matcher.find()) {
 					String numberInPara = matcher.group(1);
+					// check if the found number exists in one of the events
 					for (ResearchEvent ev : newEvents) {
 						if (numberInPara.equals(ev.getApplicationOrPermitNo()))
 						{
@@ -135,6 +138,25 @@
 							break IfStatement;
 						}
 					}
+					// if no event has exactly this permit number,
+					// check whether one number is contained in the other (e.g. part of it was clipped)
+					for (ResearchEvent ev : newEvents) {
+						if (ev.getApplicationOrPermitNo().contains(numberInPara) || numberInPara.contains(ev.getApplicationOrPermitNo())) {
+							eventForPara = ev;
+							break IfStatement;
+						}
+					}
+					// if there is still no event found,
+					// check for swapped digits using Jaro-Winkler similarity
+					for (ResearchEvent ev : newEvents) {
+						JaroWinkler winkler = new JaroWinkler();
+						double sim = winkler.getSimilarity(numberInPara, ev.getApplicationOrPermitNo());
+						if (sim > 0.85)
+						{
+							eventForPara = ev;
+							break IfStatement;
+						}
+					}
 					eventForPara = null;
 					continue;
 				}