annotate src/de/mpiwg/anteater/events/processors/PermitOrApplicantEventProcessor.java @ 0:036535fcd179

anteater
author jdamerow
date Fri, 14 Sep 2012 10:30:43 +0200
parents
children 7a4341c9f2e5
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
036535fcd179 anteater
jdamerow
parents:
diff changeset
1 package de.mpiwg.anteater.events.processors;
036535fcd179 anteater
jdamerow
parents:
diff changeset
2
036535fcd179 anteater
jdamerow
parents:
diff changeset
3 import java.util.ArrayList;
036535fcd179 anteater
jdamerow
parents:
diff changeset
4 import java.util.List;
036535fcd179 anteater
jdamerow
parents:
diff changeset
5 import java.util.regex.Matcher;
036535fcd179 anteater
jdamerow
parents:
diff changeset
6 import java.util.regex.Pattern;
036535fcd179 anteater
jdamerow
parents:
diff changeset
7
036535fcd179 anteater
jdamerow
parents:
diff changeset
8 import de.mpiwg.anteater.events.Applicant;
036535fcd179 anteater
jdamerow
parents:
diff changeset
9 import de.mpiwg.anteater.events.ResearchEvent;
036535fcd179 anteater
jdamerow
parents:
diff changeset
10 import de.mpiwg.anteater.ml.PlaceClasses;
036535fcd179 anteater
jdamerow
parents:
diff changeset
11 import de.mpiwg.anteater.results.ApplicantResult;
036535fcd179 anteater
jdamerow
parents:
diff changeset
12 import de.mpiwg.anteater.results.LocationResult;
036535fcd179 anteater
jdamerow
parents:
diff changeset
13 import de.mpiwg.anteater.results.ResultsCarrier;
036535fcd179 anteater
jdamerow
parents:
diff changeset
14 import de.mpiwg.anteater.results.SpeciesScientificResult;
036535fcd179 anteater
jdamerow
parents:
diff changeset
15 import de.mpiwg.anteater.text.Paragraph;
036535fcd179 anteater
jdamerow
parents:
diff changeset
16 import de.mpiwg.anteater.text.TextInformation;
036535fcd179 anteater
jdamerow
parents:
diff changeset
17 import de.mpiwg.anteater.text.TextPart;
036535fcd179 anteater
jdamerow
parents:
diff changeset
18
036535fcd179 anteater
jdamerow
parents:
diff changeset
19 public class PermitOrApplicantEventProcessor extends AEventProcessor {
036535fcd179 anteater
jdamerow
parents:
diff changeset
20
036535fcd179 anteater
jdamerow
parents:
diff changeset
21 private String applicationNrPattern = "No. ([A-Z0-9\\-]{2,})";
036535fcd179 anteater
jdamerow
parents:
diff changeset
22
036535fcd179 anteater
jdamerow
parents:
diff changeset
23 @Override
036535fcd179 anteater
jdamerow
parents:
diff changeset
24 public void processEvents(List<ResearchEvent> events, ResultsCarrier carrier) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
25 TextInformation info = carrier.getTextInfo();
036535fcd179 anteater
jdamerow
parents:
diff changeset
26
036535fcd179 anteater
jdamerow
parents:
diff changeset
27 // find how many distinct applicants there are
036535fcd179 anteater
jdamerow
parents:
diff changeset
28 List<ApplicantResult> distinctApplicants = getDistinctApplicants(carrier);
036535fcd179 anteater
jdamerow
parents:
diff changeset
29
036535fcd179 anteater
jdamerow
parents:
diff changeset
30 if (distinctApplicants.size() == 1) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
31 return;
036535fcd179 anteater
jdamerow
parents:
diff changeset
32 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
33
036535fcd179 anteater
jdamerow
parents:
diff changeset
34 // if there are several applicants start new event with each applicant
036535fcd179 anteater
jdamerow
parents:
diff changeset
35 List<ApplicantResult> appsInSummary = new ArrayList<ApplicantResult>();
036535fcd179 anteater
jdamerow
parents:
diff changeset
36 List<ApplicantResult> appsInSuppleInf = new ArrayList<ApplicantResult>();
036535fcd179 anteater
jdamerow
parents:
diff changeset
37
036535fcd179 anteater
jdamerow
parents:
diff changeset
38 sortByTextType(distinctApplicants, appsInSummary, appsInSuppleInf);
036535fcd179 anteater
jdamerow
parents:
diff changeset
39
036535fcd179 anteater
jdamerow
parents:
diff changeset
40 if (appsInSummary.isEmpty()) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
41 return;
036535fcd179 anteater
jdamerow
parents:
diff changeset
42 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
43
036535fcd179 anteater
jdamerow
parents:
diff changeset
44 List<ResearchEvent> newEvents = new ArrayList<ResearchEvent>();
036535fcd179 anteater
jdamerow
parents:
diff changeset
45
036535fcd179 anteater
jdamerow
parents:
diff changeset
46 List<SpeciesScientificResult> speciesInSummary = new ArrayList<SpeciesScientificResult>();
036535fcd179 anteater
jdamerow
parents:
diff changeset
47 List<SpeciesScientificResult> speciesInSuppleInf = new ArrayList<SpeciesScientificResult>();
036535fcd179 anteater
jdamerow
parents:
diff changeset
48 sortSpeciesByTextType(carrier.getSpeciesResults(), speciesInSummary, speciesInSuppleInf);
036535fcd179 anteater
jdamerow
parents:
diff changeset
49
036535fcd179 anteater
jdamerow
parents:
diff changeset
50 List<LocationResult> locationsInSummary = new ArrayList<LocationResult>();
036535fcd179 anteater
jdamerow
parents:
diff changeset
51 List<LocationResult> locationsInSuppleInf = new ArrayList<LocationResult>();
036535fcd179 anteater
jdamerow
parents:
diff changeset
52 sortLocsByTextType(carrier.getLocationResults(), locationsInSummary, locationsInSuppleInf);
036535fcd179 anteater
jdamerow
parents:
diff changeset
53
036535fcd179 anteater
jdamerow
parents:
diff changeset
54
036535fcd179 anteater
jdamerow
parents:
diff changeset
55 ResearchEvent event = null;
036535fcd179 anteater
jdamerow
parents:
diff changeset
56 for (ApplicantResult appResult : appsInSummary) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
57 TextPart text = info.getSummaries().get(
036535fcd179 anteater
jdamerow
parents:
diff changeset
58 appResult.getResult().getTextIdx());
036535fcd179 anteater
jdamerow
parents:
diff changeset
59
036535fcd179 anteater
jdamerow
parents:
diff changeset
60 int startSearchForNumberAt = appResult.getFinding().getStart()
036535fcd179 anteater
jdamerow
parents:
diff changeset
61 + appResult.getFinding().getLength();
036535fcd179 anteater
jdamerow
parents:
diff changeset
62
036535fcd179 anteater
jdamerow
parents:
diff changeset
63 int endSearchForNumber = text.getText().length();
036535fcd179 anteater
jdamerow
parents:
diff changeset
64 if (appsInSummary.indexOf(appResult) < appsInSummary.size() - 1)
036535fcd179 anteater
jdamerow
parents:
diff changeset
65 endSearchForNumber = appsInSummary
036535fcd179 anteater
jdamerow
parents:
diff changeset
66 .get(appsInSummary.indexOf(appResult) + 1).getFinding()
036535fcd179 anteater
jdamerow
parents:
diff changeset
67 .getStart();
036535fcd179 anteater
jdamerow
parents:
diff changeset
68
036535fcd179 anteater
jdamerow
parents:
diff changeset
69 String textAsString = text.getText();
036535fcd179 anteater
jdamerow
parents:
diff changeset
70 String textAfterApplicant = textAsString.substring(
036535fcd179 anteater
jdamerow
parents:
diff changeset
71 startSearchForNumberAt, endSearchForNumber);
036535fcd179 anteater
jdamerow
parents:
diff changeset
72
036535fcd179 anteater
jdamerow
parents:
diff changeset
73 // check if there is an application number
036535fcd179 anteater
jdamerow
parents:
diff changeset
74 Pattern pattern = Pattern.compile(applicationNrPattern);
036535fcd179 anteater
jdamerow
parents:
diff changeset
75 Matcher matcher = pattern.matcher(textAfterApplicant);
036535fcd179 anteater
jdamerow
parents:
diff changeset
76
036535fcd179 anteater
jdamerow
parents:
diff changeset
77 List<LocationResult> locationsForApp = new ArrayList<LocationResult>();
036535fcd179 anteater
jdamerow
parents:
diff changeset
78 // find all locations between current applicant and next one
036535fcd179 anteater
jdamerow
parents:
diff changeset
79 for (LocationResult locationResult : locationsInSummary) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
80 if (locationResult.getFinding().getStart() >= startSearchForNumberAt
036535fcd179 anteater
jdamerow
parents:
diff changeset
81 && locationResult.getFinding().getStart() < endSearchForNumber)
036535fcd179 anteater
jdamerow
parents:
diff changeset
82 locationsForApp.add(locationResult);
036535fcd179 anteater
jdamerow
parents:
diff changeset
83 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
84
036535fcd179 anteater
jdamerow
parents:
diff changeset
85 List<SpeciesScientificResult> speciesForApp = new ArrayList<SpeciesScientificResult>();
036535fcd179 anteater
jdamerow
parents:
diff changeset
86 // find all species between current applicant and next one
036535fcd179 anteater
jdamerow
parents:
diff changeset
87 for (SpeciesScientificResult speciesResult : speciesInSummary) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
88 if (speciesResult.getFinding().getStart() > startSearchForNumberAt
036535fcd179 anteater
jdamerow
parents:
diff changeset
89 && speciesResult.getFinding().getStart() < endSearchForNumber)
036535fcd179 anteater
jdamerow
parents:
diff changeset
90 speciesForApp.add(speciesResult);
036535fcd179 anteater
jdamerow
parents:
diff changeset
91 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
92
036535fcd179 anteater
jdamerow
parents:
diff changeset
93 if (event != null) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
94 List<Applicant> applicantsInEvent = event.getApplicants();
036535fcd179 anteater
jdamerow
parents:
diff changeset
95 List<Applicant> applicantsWithoutLoc = new ArrayList<Applicant>();
036535fcd179 anteater
jdamerow
parents:
diff changeset
96
036535fcd179 anteater
jdamerow
parents:
diff changeset
97 Applicant newApplicant = createApplicant(appResult);
036535fcd179 anteater
jdamerow
parents:
diff changeset
98 event.getApplicants().add(newApplicant);
036535fcd179 anteater
jdamerow
parents:
diff changeset
99
036535fcd179 anteater
jdamerow
parents:
diff changeset
100 for (Applicant applicantInEvent : applicantsInEvent)
036535fcd179 anteater
jdamerow
parents:
diff changeset
101 if (applicantInEvent.getApplicantInstitution().isEmpty()
036535fcd179 anteater
jdamerow
parents:
diff changeset
102 && applicantInEvent.getLocation().isEmpty())
036535fcd179 anteater
jdamerow
parents:
diff changeset
103 applicantsWithoutLoc.add(applicantInEvent);
036535fcd179 anteater
jdamerow
parents:
diff changeset
104
036535fcd179 anteater
jdamerow
parents:
diff changeset
105 setLocations(applicantsWithoutLoc, locationsForApp, event);
036535fcd179 anteater
jdamerow
parents:
diff changeset
106 setSpecies(speciesForApp, event);
036535fcd179 anteater
jdamerow
parents:
diff changeset
107 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
108
036535fcd179 anteater
jdamerow
parents:
diff changeset
109 if (event == null)
036535fcd179 anteater
jdamerow
parents:
diff changeset
110 event = createEvent(appResult, info, locationsForApp,
036535fcd179 anteater
jdamerow
parents:
diff changeset
111 speciesForApp);
036535fcd179 anteater
jdamerow
parents:
diff changeset
112
036535fcd179 anteater
jdamerow
parents:
diff changeset
113 if (matcher.find()) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
114 event.setApplicationOrPermitNo(matcher.group(1));
036535fcd179 anteater
jdamerow
parents:
diff changeset
115 newEvents.add(event);
036535fcd179 anteater
jdamerow
parents:
diff changeset
116 event = null;
036535fcd179 anteater
jdamerow
parents:
diff changeset
117 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
118 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
119
036535fcd179 anteater
jdamerow
parents:
diff changeset
120 ResearchEvent eventForPara = null;
036535fcd179 anteater
jdamerow
parents:
diff changeset
121 for (TextPart text : info.getSupplInfos()) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
122 for (Paragraph para : text.getParagraphs()) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
123 int startOfPara = text.getPositionInTextFromParagraph(para, 0);
036535fcd179 anteater
jdamerow
parents:
diff changeset
124 int endOfPara = startOfPara + para.getParagraphText().length();
036535fcd179 anteater
jdamerow
parents:
diff changeset
125
036535fcd179 anteater
jdamerow
parents:
diff changeset
126 Pattern pattern = Pattern.compile(applicationNrPattern);
036535fcd179 anteater
jdamerow
parents:
diff changeset
127 Matcher matcher = pattern.matcher(para.getParagraphText());
036535fcd179 anteater
jdamerow
parents:
diff changeset
128
036535fcd179 anteater
jdamerow
parents:
diff changeset
129 IfStatement: if (matcher.find()) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
130 String numberInPara = matcher.group(1);
036535fcd179 anteater
jdamerow
parents:
diff changeset
131 for (ResearchEvent ev : newEvents) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
132 if (numberInPara.equals(ev.getApplicationOrPermitNo()))
036535fcd179 anteater
jdamerow
parents:
diff changeset
133 {
036535fcd179 anteater
jdamerow
parents:
diff changeset
134 eventForPara = ev;
036535fcd179 anteater
jdamerow
parents:
diff changeset
135 break IfStatement;
036535fcd179 anteater
jdamerow
parents:
diff changeset
136 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
137 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
138 eventForPara = null;
036535fcd179 anteater
jdamerow
parents:
diff changeset
139 continue;
036535fcd179 anteater
jdamerow
parents:
diff changeset
140 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
141
036535fcd179 anteater
jdamerow
parents:
diff changeset
142 if (eventForPara == null)
036535fcd179 anteater
jdamerow
parents:
diff changeset
143 continue;
036535fcd179 anteater
jdamerow
parents:
diff changeset
144
036535fcd179 anteater
jdamerow
parents:
diff changeset
145 for (LocationResult loc : locationsInSuppleInf) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
146 if (info.getSupplInfos().indexOf(text) != loc.getResult().getTextIdx()) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
147 continue;
036535fcd179 anteater
jdamerow
parents:
diff changeset
148 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
149
036535fcd179 anteater
jdamerow
parents:
diff changeset
150 if (loc.getFinding().getStart() >= startOfPara && loc.getFinding().getStart() < endOfPara) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
151 if (loc.getPrediction() == PlaceClasses.RESEARCH_LOCATION)
036535fcd179 anteater
jdamerow
parents:
diff changeset
152 eventForPara.getResearchLocations().add(createLocation(loc));
036535fcd179 anteater
jdamerow
parents:
diff changeset
153 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
154 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
155
036535fcd179 anteater
jdamerow
parents:
diff changeset
156 for (SpeciesScientificResult spec : speciesInSuppleInf) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
157 if (info.getSupplInfos().indexOf(text) != spec.getResult().getTextIdx()) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
158 continue;
036535fcd179 anteater
jdamerow
parents:
diff changeset
159 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
160
036535fcd179 anteater
jdamerow
parents:
diff changeset
161 if (spec.getFinding().getStart() >= startOfPara && spec.getFinding().getStart() < endOfPara) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
162 eventForPara.getResearchedSpecies().add(createSpecies(spec));
036535fcd179 anteater
jdamerow
parents:
diff changeset
163 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
164 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
165 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
166 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
167
036535fcd179 anteater
jdamerow
parents:
diff changeset
168 events.addAll(newEvents);
036535fcd179 anteater
jdamerow
parents:
diff changeset
169 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
170
036535fcd179 anteater
jdamerow
parents:
diff changeset
171 @Override
036535fcd179 anteater
jdamerow
parents:
diff changeset
172 public int getRank() {
036535fcd179 anteater
jdamerow
parents:
diff changeset
173 return 10;
036535fcd179 anteater
jdamerow
parents:
diff changeset
174 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
175
036535fcd179 anteater
jdamerow
parents:
diff changeset
176 }