Mercurial > hg > anteater
annotate src/de/mpiwg/anteater/events/processors/PermitOrApplicantEventProcessor.java @ 1:7a4341c9f2e5
checking permit numbers for similarity if no direct match
author | jdamerow |
---|---|
date | Fri, 05 Oct 2012 18:52:14 -0700 |
parents | 036535fcd179 |
children |
rev | line source |
---|---|
0 | 1 package de.mpiwg.anteater.events.processors; |
2 | |
3 import java.util.ArrayList; | |
4 import java.util.List; | |
5 import java.util.regex.Matcher; | |
6 import java.util.regex.Pattern; | |
7 | |
1
7a4341c9f2e5
checking permit numbers for similarity if no direct match
jdamerow
parents:
0
diff
changeset
|
8 import uk.ac.shef.wit.simmetrics.similaritymetrics.JaroWinkler; |
7a4341c9f2e5
checking permit numbers for similarity if no direct match
jdamerow
parents:
0
diff
changeset
|
9 |
0 | 10 import de.mpiwg.anteater.events.Applicant; |
11 import de.mpiwg.anteater.events.ResearchEvent; | |
12 import de.mpiwg.anteater.ml.PlaceClasses; | |
13 import de.mpiwg.anteater.results.ApplicantResult; | |
14 import de.mpiwg.anteater.results.LocationResult; | |
15 import de.mpiwg.anteater.results.ResultsCarrier; | |
16 import de.mpiwg.anteater.results.SpeciesScientificResult; | |
17 import de.mpiwg.anteater.text.Paragraph; | |
18 import de.mpiwg.anteater.text.TextInformation; | |
19 import de.mpiwg.anteater.text.TextPart; | |
20 | |
21 public class PermitOrApplicantEventProcessor extends AEventProcessor { | |
22 | |
23 private String applicationNrPattern = "No. ([A-Z0-9\\-]{2,})"; | |
24 | |
25 @Override | |
26 public void processEvents(List<ResearchEvent> events, ResultsCarrier carrier) { | |
27 TextInformation info = carrier.getTextInfo(); | |
28 | |
29 // find how many distinct applicants there are | |
30 List<ApplicantResult> distinctApplicants = getDistinctApplicants(carrier); | |
31 | |
32 if (distinctApplicants.size() == 1) { | |
33 return; | |
34 } | |
35 | |
36 // if there are several applicants start new event with each applicant | |
37 List<ApplicantResult> appsInSummary = new ArrayList<ApplicantResult>(); | |
38 List<ApplicantResult> appsInSuppleInf = new ArrayList<ApplicantResult>(); | |
39 | |
40 sortByTextType(distinctApplicants, appsInSummary, appsInSuppleInf); | |
41 | |
42 if (appsInSummary.isEmpty()) { | |
43 return; | |
44 } | |
45 | |
46 List<ResearchEvent> newEvents = new ArrayList<ResearchEvent>(); | |
47 | |
48 List<SpeciesScientificResult> speciesInSummary = new ArrayList<SpeciesScientificResult>(); | |
49 List<SpeciesScientificResult> speciesInSuppleInf = new ArrayList<SpeciesScientificResult>(); | |
50 sortSpeciesByTextType(carrier.getSpeciesResults(), speciesInSummary, speciesInSuppleInf); | |
51 | |
52 List<LocationResult> locationsInSummary = new ArrayList<LocationResult>(); | |
53 List<LocationResult> locationsInSuppleInf = new ArrayList<LocationResult>(); | |
54 sortLocsByTextType(carrier.getLocationResults(), locationsInSummary, locationsInSuppleInf); | |
55 | |
56 | |
57 ResearchEvent event = null; | |
58 for (ApplicantResult appResult : appsInSummary) { | |
59 TextPart text = info.getSummaries().get( | |
60 appResult.getResult().getTextIdx()); | |
61 | |
62 int startSearchForNumberAt = appResult.getFinding().getStart() | |
63 + appResult.getFinding().getLength(); | |
64 | |
65 int endSearchForNumber = text.getText().length(); | |
66 if (appsInSummary.indexOf(appResult) < appsInSummary.size() - 1) | |
67 endSearchForNumber = appsInSummary | |
68 .get(appsInSummary.indexOf(appResult) + 1).getFinding() | |
69 .getStart(); | |
70 | |
71 String textAsString = text.getText(); | |
72 String textAfterApplicant = textAsString.substring( | |
73 startSearchForNumberAt, endSearchForNumber); | |
74 | |
75 // check if there is an application number | |
76 Pattern pattern = Pattern.compile(applicationNrPattern); | |
77 Matcher matcher = pattern.matcher(textAfterApplicant); | |
78 | |
79 List<LocationResult> locationsForApp = new ArrayList<LocationResult>(); | |
80 // find all locations between current applicant and next one | |
81 for (LocationResult locationResult : locationsInSummary) { | |
82 if (locationResult.getFinding().getStart() >= startSearchForNumberAt | |
83 && locationResult.getFinding().getStart() < endSearchForNumber) | |
84 locationsForApp.add(locationResult); | |
85 } | |
86 | |
87 List<SpeciesScientificResult> speciesForApp = new ArrayList<SpeciesScientificResult>(); | |
88 // find all species between current applicant and next one | |
89 for (SpeciesScientificResult speciesResult : speciesInSummary) { | |
90 if (speciesResult.getFinding().getStart() > startSearchForNumberAt | |
91 && speciesResult.getFinding().getStart() < endSearchForNumber) | |
92 speciesForApp.add(speciesResult); | |
93 } | |
94 | |
95 if (event != null) { | |
96 List<Applicant> applicantsInEvent = event.getApplicants(); | |
97 List<Applicant> applicantsWithoutLoc = new ArrayList<Applicant>(); | |
98 | |
99 Applicant newApplicant = createApplicant(appResult); | |
100 event.getApplicants().add(newApplicant); | |
101 | |
102 for (Applicant applicantInEvent : applicantsInEvent) | |
103 if (applicantInEvent.getApplicantInstitution().isEmpty() | |
104 && applicantInEvent.getLocation().isEmpty()) | |
105 applicantsWithoutLoc.add(applicantInEvent); | |
106 | |
107 setLocations(applicantsWithoutLoc, locationsForApp, event); | |
108 setSpecies(speciesForApp, event); | |
109 } | |
110 | |
111 if (event == null) | |
112 event = createEvent(appResult, info, locationsForApp, | |
113 speciesForApp); | |
114 | |
115 if (matcher.find()) { | |
116 event.setApplicationOrPermitNo(matcher.group(1)); | |
117 newEvents.add(event); | |
118 event = null; | |
119 } | |
120 } | |
121 | |
122 ResearchEvent eventForPara = null; | |
123 for (TextPart text : info.getSupplInfos()) { | |
124 for (Paragraph para : text.getParagraphs()) { | |
125 int startOfPara = text.getPositionInTextFromParagraph(para, 0); | |
126 int endOfPara = startOfPara + para.getParagraphText().length(); | |
127 | |
128 Pattern pattern = Pattern.compile(applicationNrPattern); | |
129 Matcher matcher = pattern.matcher(para.getParagraphText()); | |
130 | |
131 IfStatement: if (matcher.find()) { | |
132 String numberInPara = matcher.group(1); | |
1
7a4341c9f2e5
checking permit numbers for similarity if no direct match
jdamerow
parents:
0
diff
changeset
|
133 // check if found number exisits in events |
0 | 134 for (ResearchEvent ev : newEvents) { |
135 if (numberInPara.equals(ev.getApplicationOrPermitNo())) | |
136 { | |
137 eventForPara = ev; | |
138 break IfStatement; | |
139 } | |
140 } | |
1
7a4341c9f2e5
checking permit numbers for similarity if no direct match
jdamerow
parents:
0
diff
changeset
|
141 // if there is no event with permit number |
7a4341c9f2e5
checking permit numbers for similarity if no direct match
jdamerow
parents:
0
diff
changeset
|
142 // check if just something was clipped |
7a4341c9f2e5
checking permit numbers for similarity if no direct match
jdamerow
parents:
0
diff
changeset
|
143 for (ResearchEvent ev : newEvents) { |
7a4341c9f2e5
checking permit numbers for similarity if no direct match
jdamerow
parents:
0
diff
changeset
|
144 if (ev.getApplicationOrPermitNo().contains(numberInPara) || numberInPara.contains(ev.getApplicationOrPermitNo())) { |
7a4341c9f2e5
checking permit numbers for similarity if no direct match
jdamerow
parents:
0
diff
changeset
|
145 eventForPara = ev; |
7a4341c9f2e5
checking permit numbers for similarity if no direct match
jdamerow
parents:
0
diff
changeset
|
146 break IfStatement; |
7a4341c9f2e5
checking permit numbers for similarity if no direct match
jdamerow
parents:
0
diff
changeset
|
147 } |
7a4341c9f2e5
checking permit numbers for similarity if no direct match
jdamerow
parents:
0
diff
changeset
|
148 } |
7a4341c9f2e5
checking permit numbers for similarity if no direct match
jdamerow
parents:
0
diff
changeset
|
149 // if there is still no event found |
7a4341c9f2e5
checking permit numbers for similarity if no direct match
jdamerow
parents:
0
diff
changeset
|
150 // check for switched numbers |
7a4341c9f2e5
checking permit numbers for similarity if no direct match
jdamerow
parents:
0
diff
changeset
|
151 for (ResearchEvent ev : newEvents) { |
7a4341c9f2e5
checking permit numbers for similarity if no direct match
jdamerow
parents:
0
diff
changeset
|
152 JaroWinkler winkler = new JaroWinkler(); |
7a4341c9f2e5
checking permit numbers for similarity if no direct match
jdamerow
parents:
0
diff
changeset
|
153 double sim = winkler.getSimilarity(numberInPara, ev.getApplicationOrPermitNo()); |
7a4341c9f2e5
checking permit numbers for similarity if no direct match
jdamerow
parents:
0
diff
changeset
|
154 if (sim > 0.85) |
7a4341c9f2e5
checking permit numbers for similarity if no direct match
jdamerow
parents:
0
diff
changeset
|
155 { |
7a4341c9f2e5
checking permit numbers for similarity if no direct match
jdamerow
parents:
0
diff
changeset
|
156 eventForPara = ev; |
7a4341c9f2e5
checking permit numbers for similarity if no direct match
jdamerow
parents:
0
diff
changeset
|
157 break IfStatement; |
7a4341c9f2e5
checking permit numbers for similarity if no direct match
jdamerow
parents:
0
diff
changeset
|
158 } |
7a4341c9f2e5
checking permit numbers for similarity if no direct match
jdamerow
parents:
0
diff
changeset
|
159 } |
0 | 160 eventForPara = null; |
161 continue; | |
162 } | |
163 | |
164 if (eventForPara == null) | |
165 continue; | |
166 | |
167 for (LocationResult loc : locationsInSuppleInf) { | |
168 if (info.getSupplInfos().indexOf(text) != loc.getResult().getTextIdx()) { | |
169 continue; | |
170 } | |
171 | |
172 if (loc.getFinding().getStart() >= startOfPara && loc.getFinding().getStart() < endOfPara) { | |
173 if (loc.getPrediction() == PlaceClasses.RESEARCH_LOCATION) | |
174 eventForPara.getResearchLocations().add(createLocation(loc)); | |
175 } | |
176 } | |
177 | |
178 for (SpeciesScientificResult spec : speciesInSuppleInf) { | |
179 if (info.getSupplInfos().indexOf(text) != spec.getResult().getTextIdx()) { | |
180 continue; | |
181 } | |
182 | |
183 if (spec.getFinding().getStart() >= startOfPara && spec.getFinding().getStart() < endOfPara) { | |
184 eventForPara.getResearchedSpecies().add(createSpecies(spec)); | |
185 } | |
186 } | |
187 } | |
188 } | |
189 | |
190 events.addAll(newEvents); | |
191 } | |
192 | |
193 @Override | |
194 public int getRank() { | |
195 return 10; | |
196 } | |
197 | |
198 } |