comparison src/de/mpiwg/anteater/events/processors/PermitOrApplicantEventProcessor.java @ 1:7a4341c9f2e5

checking permit numbers for similarity if no direct match
author jdamerow
date Fri, 05 Oct 2012 18:52:14 -0700
parents 036535fcd179
children
comparison
equal deleted inserted replaced
0:036535fcd179 1:7a4341c9f2e5
2 2
3 import java.util.ArrayList; 3 import java.util.ArrayList;
4 import java.util.List; 4 import java.util.List;
5 import java.util.regex.Matcher; 5 import java.util.regex.Matcher;
6 import java.util.regex.Pattern; 6 import java.util.regex.Pattern;
7
8 import uk.ac.shef.wit.simmetrics.similaritymetrics.JaroWinkler;
7 9
8 import de.mpiwg.anteater.events.Applicant; 10 import de.mpiwg.anteater.events.Applicant;
9 import de.mpiwg.anteater.events.ResearchEvent; 11 import de.mpiwg.anteater.events.ResearchEvent;
10 import de.mpiwg.anteater.ml.PlaceClasses; 12 import de.mpiwg.anteater.ml.PlaceClasses;
11 import de.mpiwg.anteater.results.ApplicantResult; 13 import de.mpiwg.anteater.results.ApplicantResult;
126 Pattern pattern = Pattern.compile(applicationNrPattern); 128 Pattern pattern = Pattern.compile(applicationNrPattern);
127 Matcher matcher = pattern.matcher(para.getParagraphText()); 129 Matcher matcher = pattern.matcher(para.getParagraphText());
128 130
129 IfStatement: if (matcher.find()) { 131 IfStatement: if (matcher.find()) {
130 String numberInPara = matcher.group(1); 132 String numberInPara = matcher.group(1);
133 // check if the found number exists in the events
131 for (ResearchEvent ev : newEvents) { 134 for (ResearchEvent ev : newEvents) {
132 if (numberInPara.equals(ev.getApplicationOrPermitNo())) 135 if (numberInPara.equals(ev.getApplicationOrPermitNo()))
136 {
137 eventForPara = ev;
138 break IfStatement;
139 }
140 }
141 // if no event has exactly this permit number,
142 // check whether one number is just a clipped (partial) form of the other
143 for (ResearchEvent ev : newEvents) {
144 if (ev.getApplicationOrPermitNo().contains(numberInPara) || numberInPara.contains(ev.getApplicationOrPermitNo())) {
145 eventForPara = ev;
146 break IfStatement;
147 }
148 }
149 // if there is still no event found
150 // check for switched numbers
151 for (ResearchEvent ev : newEvents) {
152 JaroWinkler winkler = new JaroWinkler();
153 double sim = winkler.getSimilarity(numberInPara, ev.getApplicationOrPermitNo());
154 if (sim > 0.85)
133 { 155 {
134 eventForPara = ev; 156 eventForPara = ev;
135 break IfStatement; 157 break IfStatement;
136 } 158 }
137 } 159 }