annotate src/de/mpiwg/anteater/events/processors/PermitOrApplicantEventProcessor.java @ 1:7a4341c9f2e5

checking permit numbers for similarity if no direct match
author jdamerow
date Fri, 05 Oct 2012 18:52:14 -0700
parents 036535fcd179
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
036535fcd179 anteater
jdamerow
parents:
diff changeset
1 package de.mpiwg.anteater.events.processors;
036535fcd179 anteater
jdamerow
parents:
diff changeset
2
036535fcd179 anteater
jdamerow
parents:
diff changeset
3 import java.util.ArrayList;
036535fcd179 anteater
jdamerow
parents:
diff changeset
4 import java.util.List;
036535fcd179 anteater
jdamerow
parents:
diff changeset
5 import java.util.regex.Matcher;
036535fcd179 anteater
jdamerow
parents:
diff changeset
6 import java.util.regex.Pattern;
036535fcd179 anteater
jdamerow
parents:
diff changeset
7
1
7a4341c9f2e5 checking permit numbers for similarity if no direct match
jdamerow
parents: 0
diff changeset
8 import uk.ac.shef.wit.simmetrics.similaritymetrics.JaroWinkler;
7a4341c9f2e5 checking permit numbers for similarity if no direct match
jdamerow
parents: 0
diff changeset
9
0
036535fcd179 anteater
jdamerow
parents:
diff changeset
10 import de.mpiwg.anteater.events.Applicant;
036535fcd179 anteater
jdamerow
parents:
diff changeset
11 import de.mpiwg.anteater.events.ResearchEvent;
036535fcd179 anteater
jdamerow
parents:
diff changeset
12 import de.mpiwg.anteater.ml.PlaceClasses;
036535fcd179 anteater
jdamerow
parents:
diff changeset
13 import de.mpiwg.anteater.results.ApplicantResult;
036535fcd179 anteater
jdamerow
parents:
diff changeset
14 import de.mpiwg.anteater.results.LocationResult;
036535fcd179 anteater
jdamerow
parents:
diff changeset
15 import de.mpiwg.anteater.results.ResultsCarrier;
036535fcd179 anteater
jdamerow
parents:
diff changeset
16 import de.mpiwg.anteater.results.SpeciesScientificResult;
036535fcd179 anteater
jdamerow
parents:
diff changeset
17 import de.mpiwg.anteater.text.Paragraph;
036535fcd179 anteater
jdamerow
parents:
diff changeset
18 import de.mpiwg.anteater.text.TextInformation;
036535fcd179 anteater
jdamerow
parents:
diff changeset
19 import de.mpiwg.anteater.text.TextPart;
036535fcd179 anteater
jdamerow
parents:
diff changeset
20
036535fcd179 anteater
jdamerow
parents:
diff changeset
21 public class PermitOrApplicantEventProcessor extends AEventProcessor {
036535fcd179 anteater
jdamerow
parents:
diff changeset
22
036535fcd179 anteater
jdamerow
parents:
diff changeset
23 private String applicationNrPattern = "No. ([A-Z0-9\\-]{2,})";
036535fcd179 anteater
jdamerow
parents:
diff changeset
24
036535fcd179 anteater
jdamerow
parents:
diff changeset
25 @Override
036535fcd179 anteater
jdamerow
parents:
diff changeset
26 public void processEvents(List<ResearchEvent> events, ResultsCarrier carrier) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
27 TextInformation info = carrier.getTextInfo();
036535fcd179 anteater
jdamerow
parents:
diff changeset
28
036535fcd179 anteater
jdamerow
parents:
diff changeset
29 // find how many distinct applicants there are
036535fcd179 anteater
jdamerow
parents:
diff changeset
30 List<ApplicantResult> distinctApplicants = getDistinctApplicants(carrier);
036535fcd179 anteater
jdamerow
parents:
diff changeset
31
036535fcd179 anteater
jdamerow
parents:
diff changeset
32 if (distinctApplicants.size() == 1) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
33 return;
036535fcd179 anteater
jdamerow
parents:
diff changeset
34 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
35
036535fcd179 anteater
jdamerow
parents:
diff changeset
36 // if there are several applicants start new event with each applicant
036535fcd179 anteater
jdamerow
parents:
diff changeset
37 List<ApplicantResult> appsInSummary = new ArrayList<ApplicantResult>();
036535fcd179 anteater
jdamerow
parents:
diff changeset
38 List<ApplicantResult> appsInSuppleInf = new ArrayList<ApplicantResult>();
036535fcd179 anteater
jdamerow
parents:
diff changeset
39
036535fcd179 anteater
jdamerow
parents:
diff changeset
40 sortByTextType(distinctApplicants, appsInSummary, appsInSuppleInf);
036535fcd179 anteater
jdamerow
parents:
diff changeset
41
036535fcd179 anteater
jdamerow
parents:
diff changeset
42 if (appsInSummary.isEmpty()) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
43 return;
036535fcd179 anteater
jdamerow
parents:
diff changeset
44 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
45
036535fcd179 anteater
jdamerow
parents:
diff changeset
46 List<ResearchEvent> newEvents = new ArrayList<ResearchEvent>();
036535fcd179 anteater
jdamerow
parents:
diff changeset
47
036535fcd179 anteater
jdamerow
parents:
diff changeset
48 List<SpeciesScientificResult> speciesInSummary = new ArrayList<SpeciesScientificResult>();
036535fcd179 anteater
jdamerow
parents:
diff changeset
49 List<SpeciesScientificResult> speciesInSuppleInf = new ArrayList<SpeciesScientificResult>();
036535fcd179 anteater
jdamerow
parents:
diff changeset
50 sortSpeciesByTextType(carrier.getSpeciesResults(), speciesInSummary, speciesInSuppleInf);
036535fcd179 anteater
jdamerow
parents:
diff changeset
51
036535fcd179 anteater
jdamerow
parents:
diff changeset
52 List<LocationResult> locationsInSummary = new ArrayList<LocationResult>();
036535fcd179 anteater
jdamerow
parents:
diff changeset
53 List<LocationResult> locationsInSuppleInf = new ArrayList<LocationResult>();
036535fcd179 anteater
jdamerow
parents:
diff changeset
54 sortLocsByTextType(carrier.getLocationResults(), locationsInSummary, locationsInSuppleInf);
036535fcd179 anteater
jdamerow
parents:
diff changeset
55
036535fcd179 anteater
jdamerow
parents:
diff changeset
56
036535fcd179 anteater
jdamerow
parents:
diff changeset
57 ResearchEvent event = null;
036535fcd179 anteater
jdamerow
parents:
diff changeset
58 for (ApplicantResult appResult : appsInSummary) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
59 TextPart text = info.getSummaries().get(
036535fcd179 anteater
jdamerow
parents:
diff changeset
60 appResult.getResult().getTextIdx());
036535fcd179 anteater
jdamerow
parents:
diff changeset
61
036535fcd179 anteater
jdamerow
parents:
diff changeset
62 int startSearchForNumberAt = appResult.getFinding().getStart()
036535fcd179 anteater
jdamerow
parents:
diff changeset
63 + appResult.getFinding().getLength();
036535fcd179 anteater
jdamerow
parents:
diff changeset
64
036535fcd179 anteater
jdamerow
parents:
diff changeset
65 int endSearchForNumber = text.getText().length();
036535fcd179 anteater
jdamerow
parents:
diff changeset
66 if (appsInSummary.indexOf(appResult) < appsInSummary.size() - 1)
036535fcd179 anteater
jdamerow
parents:
diff changeset
67 endSearchForNumber = appsInSummary
036535fcd179 anteater
jdamerow
parents:
diff changeset
68 .get(appsInSummary.indexOf(appResult) + 1).getFinding()
036535fcd179 anteater
jdamerow
parents:
diff changeset
69 .getStart();
036535fcd179 anteater
jdamerow
parents:
diff changeset
70
036535fcd179 anteater
jdamerow
parents:
diff changeset
71 String textAsString = text.getText();
036535fcd179 anteater
jdamerow
parents:
diff changeset
72 String textAfterApplicant = textAsString.substring(
036535fcd179 anteater
jdamerow
parents:
diff changeset
73 startSearchForNumberAt, endSearchForNumber);
036535fcd179 anteater
jdamerow
parents:
diff changeset
74
036535fcd179 anteater
jdamerow
parents:
diff changeset
75 // check if there is an application number
036535fcd179 anteater
jdamerow
parents:
diff changeset
76 Pattern pattern = Pattern.compile(applicationNrPattern);
036535fcd179 anteater
jdamerow
parents:
diff changeset
77 Matcher matcher = pattern.matcher(textAfterApplicant);
036535fcd179 anteater
jdamerow
parents:
diff changeset
78
036535fcd179 anteater
jdamerow
parents:
diff changeset
79 List<LocationResult> locationsForApp = new ArrayList<LocationResult>();
036535fcd179 anteater
jdamerow
parents:
diff changeset
80 // find all locations between current applicant and next one
036535fcd179 anteater
jdamerow
parents:
diff changeset
81 for (LocationResult locationResult : locationsInSummary) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
82 if (locationResult.getFinding().getStart() >= startSearchForNumberAt
036535fcd179 anteater
jdamerow
parents:
diff changeset
83 && locationResult.getFinding().getStart() < endSearchForNumber)
036535fcd179 anteater
jdamerow
parents:
diff changeset
84 locationsForApp.add(locationResult);
036535fcd179 anteater
jdamerow
parents:
diff changeset
85 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
86
036535fcd179 anteater
jdamerow
parents:
diff changeset
87 List<SpeciesScientificResult> speciesForApp = new ArrayList<SpeciesScientificResult>();
036535fcd179 anteater
jdamerow
parents:
diff changeset
88 // find all species between current applicant and next one
036535fcd179 anteater
jdamerow
parents:
diff changeset
89 for (SpeciesScientificResult speciesResult : speciesInSummary) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
90 if (speciesResult.getFinding().getStart() > startSearchForNumberAt
036535fcd179 anteater
jdamerow
parents:
diff changeset
91 && speciesResult.getFinding().getStart() < endSearchForNumber)
036535fcd179 anteater
jdamerow
parents:
diff changeset
92 speciesForApp.add(speciesResult);
036535fcd179 anteater
jdamerow
parents:
diff changeset
93 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
94
036535fcd179 anteater
jdamerow
parents:
diff changeset
95 if (event != null) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
96 List<Applicant> applicantsInEvent = event.getApplicants();
036535fcd179 anteater
jdamerow
parents:
diff changeset
97 List<Applicant> applicantsWithoutLoc = new ArrayList<Applicant>();
036535fcd179 anteater
jdamerow
parents:
diff changeset
98
036535fcd179 anteater
jdamerow
parents:
diff changeset
99 Applicant newApplicant = createApplicant(appResult);
036535fcd179 anteater
jdamerow
parents:
diff changeset
100 event.getApplicants().add(newApplicant);
036535fcd179 anteater
jdamerow
parents:
diff changeset
101
036535fcd179 anteater
jdamerow
parents:
diff changeset
102 for (Applicant applicantInEvent : applicantsInEvent)
036535fcd179 anteater
jdamerow
parents:
diff changeset
103 if (applicantInEvent.getApplicantInstitution().isEmpty()
036535fcd179 anteater
jdamerow
parents:
diff changeset
104 && applicantInEvent.getLocation().isEmpty())
036535fcd179 anteater
jdamerow
parents:
diff changeset
105 applicantsWithoutLoc.add(applicantInEvent);
036535fcd179 anteater
jdamerow
parents:
diff changeset
106
036535fcd179 anteater
jdamerow
parents:
diff changeset
107 setLocations(applicantsWithoutLoc, locationsForApp, event);
036535fcd179 anteater
jdamerow
parents:
diff changeset
108 setSpecies(speciesForApp, event);
036535fcd179 anteater
jdamerow
parents:
diff changeset
109 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
110
036535fcd179 anteater
jdamerow
parents:
diff changeset
111 if (event == null)
036535fcd179 anteater
jdamerow
parents:
diff changeset
112 event = createEvent(appResult, info, locationsForApp,
036535fcd179 anteater
jdamerow
parents:
diff changeset
113 speciesForApp);
036535fcd179 anteater
jdamerow
parents:
diff changeset
114
036535fcd179 anteater
jdamerow
parents:
diff changeset
115 if (matcher.find()) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
116 event.setApplicationOrPermitNo(matcher.group(1));
036535fcd179 anteater
jdamerow
parents:
diff changeset
117 newEvents.add(event);
036535fcd179 anteater
jdamerow
parents:
diff changeset
118 event = null;
036535fcd179 anteater
jdamerow
parents:
diff changeset
119 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
120 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
121
036535fcd179 anteater
jdamerow
parents:
diff changeset
122 ResearchEvent eventForPara = null;
036535fcd179 anteater
jdamerow
parents:
diff changeset
123 for (TextPart text : info.getSupplInfos()) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
124 for (Paragraph para : text.getParagraphs()) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
125 int startOfPara = text.getPositionInTextFromParagraph(para, 0);
036535fcd179 anteater
jdamerow
parents:
diff changeset
126 int endOfPara = startOfPara + para.getParagraphText().length();
036535fcd179 anteater
jdamerow
parents:
diff changeset
127
036535fcd179 anteater
jdamerow
parents:
diff changeset
128 Pattern pattern = Pattern.compile(applicationNrPattern);
036535fcd179 anteater
jdamerow
parents:
diff changeset
129 Matcher matcher = pattern.matcher(para.getParagraphText());
036535fcd179 anteater
jdamerow
parents:
diff changeset
130
036535fcd179 anteater
jdamerow
parents:
diff changeset
131 IfStatement: if (matcher.find()) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
132 String numberInPara = matcher.group(1);
1
7a4341c9f2e5 checking permit numbers for similarity if no direct match
jdamerow
parents: 0
diff changeset
133 // check if found number exisits in events
0
036535fcd179 anteater
jdamerow
parents:
diff changeset
134 for (ResearchEvent ev : newEvents) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
135 if (numberInPara.equals(ev.getApplicationOrPermitNo()))
036535fcd179 anteater
jdamerow
parents:
diff changeset
136 {
036535fcd179 anteater
jdamerow
parents:
diff changeset
137 eventForPara = ev;
036535fcd179 anteater
jdamerow
parents:
diff changeset
138 break IfStatement;
036535fcd179 anteater
jdamerow
parents:
diff changeset
139 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
140 }
1
7a4341c9f2e5 checking permit numbers for similarity if no direct match
jdamerow
parents: 0
diff changeset
141 // if there is no event with permit number
7a4341c9f2e5 checking permit numbers for similarity if no direct match
jdamerow
parents: 0
diff changeset
142 // check if just something was clipped
7a4341c9f2e5 checking permit numbers for similarity if no direct match
jdamerow
parents: 0
diff changeset
143 for (ResearchEvent ev : newEvents) {
7a4341c9f2e5 checking permit numbers for similarity if no direct match
jdamerow
parents: 0
diff changeset
144 if (ev.getApplicationOrPermitNo().contains(numberInPara) || numberInPara.contains(ev.getApplicationOrPermitNo())) {
7a4341c9f2e5 checking permit numbers for similarity if no direct match
jdamerow
parents: 0
diff changeset
145 eventForPara = ev;
7a4341c9f2e5 checking permit numbers for similarity if no direct match
jdamerow
parents: 0
diff changeset
146 break IfStatement;
7a4341c9f2e5 checking permit numbers for similarity if no direct match
jdamerow
parents: 0
diff changeset
147 }
7a4341c9f2e5 checking permit numbers for similarity if no direct match
jdamerow
parents: 0
diff changeset
148 }
7a4341c9f2e5 checking permit numbers for similarity if no direct match
jdamerow
parents: 0
diff changeset
149 // if there is still no event found
7a4341c9f2e5 checking permit numbers for similarity if no direct match
jdamerow
parents: 0
diff changeset
150 // check for switched numbers
7a4341c9f2e5 checking permit numbers for similarity if no direct match
jdamerow
parents: 0
diff changeset
151 for (ResearchEvent ev : newEvents) {
7a4341c9f2e5 checking permit numbers for similarity if no direct match
jdamerow
parents: 0
diff changeset
152 JaroWinkler winkler = new JaroWinkler();
7a4341c9f2e5 checking permit numbers for similarity if no direct match
jdamerow
parents: 0
diff changeset
153 double sim = winkler.getSimilarity(numberInPara, ev.getApplicationOrPermitNo());
7a4341c9f2e5 checking permit numbers for similarity if no direct match
jdamerow
parents: 0
diff changeset
154 if (sim > 0.85)
7a4341c9f2e5 checking permit numbers for similarity if no direct match
jdamerow
parents: 0
diff changeset
155 {
7a4341c9f2e5 checking permit numbers for similarity if no direct match
jdamerow
parents: 0
diff changeset
156 eventForPara = ev;
7a4341c9f2e5 checking permit numbers for similarity if no direct match
jdamerow
parents: 0
diff changeset
157 break IfStatement;
7a4341c9f2e5 checking permit numbers for similarity if no direct match
jdamerow
parents: 0
diff changeset
158 }
7a4341c9f2e5 checking permit numbers for similarity if no direct match
jdamerow
parents: 0
diff changeset
159 }
0
036535fcd179 anteater
jdamerow
parents:
diff changeset
160 eventForPara = null;
036535fcd179 anteater
jdamerow
parents:
diff changeset
161 continue;
036535fcd179 anteater
jdamerow
parents:
diff changeset
162 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
163
036535fcd179 anteater
jdamerow
parents:
diff changeset
164 if (eventForPara == null)
036535fcd179 anteater
jdamerow
parents:
diff changeset
165 continue;
036535fcd179 anteater
jdamerow
parents:
diff changeset
166
036535fcd179 anteater
jdamerow
parents:
diff changeset
167 for (LocationResult loc : locationsInSuppleInf) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
168 if (info.getSupplInfos().indexOf(text) != loc.getResult().getTextIdx()) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
169 continue;
036535fcd179 anteater
jdamerow
parents:
diff changeset
170 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
171
036535fcd179 anteater
jdamerow
parents:
diff changeset
172 if (loc.getFinding().getStart() >= startOfPara && loc.getFinding().getStart() < endOfPara) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
173 if (loc.getPrediction() == PlaceClasses.RESEARCH_LOCATION)
036535fcd179 anteater
jdamerow
parents:
diff changeset
174 eventForPara.getResearchLocations().add(createLocation(loc));
036535fcd179 anteater
jdamerow
parents:
diff changeset
175 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
176 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
177
036535fcd179 anteater
jdamerow
parents:
diff changeset
178 for (SpeciesScientificResult spec : speciesInSuppleInf) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
179 if (info.getSupplInfos().indexOf(text) != spec.getResult().getTextIdx()) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
180 continue;
036535fcd179 anteater
jdamerow
parents:
diff changeset
181 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
182
036535fcd179 anteater
jdamerow
parents:
diff changeset
183 if (spec.getFinding().getStart() >= startOfPara && spec.getFinding().getStart() < endOfPara) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
184 eventForPara.getResearchedSpecies().add(createSpecies(spec));
036535fcd179 anteater
jdamerow
parents:
diff changeset
185 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
186 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
187 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
188 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
189
036535fcd179 anteater
jdamerow
parents:
diff changeset
190 events.addAll(newEvents);
036535fcd179 anteater
jdamerow
parents:
diff changeset
191 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
192
036535fcd179 anteater
jdamerow
parents:
diff changeset
193 @Override
036535fcd179 anteater
jdamerow
parents:
diff changeset
194 public int getRank() {
036535fcd179 anteater
jdamerow
parents:
diff changeset
195 return 10;
036535fcd179 anteater
jdamerow
parents:
diff changeset
196 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
197
036535fcd179 anteater
jdamerow
parents:
diff changeset
198 }