Mercurial > hg > anteater
diff src/de/mpiwg/anteater/events/processors/PermitOrApplicantEventProcessor.java @ 0:036535fcd179
anteater
author | jdamerow |
---|---|
date | Fri, 14 Sep 2012 10:30:43 +0200 |
parents | |
children | 7a4341c9f2e5 |
line wrap: on
line diff
// Origin: Mercurial repository "anteater", changeset 0:036535fcd179 (file added as
// src/de/mpiwg/anteater/events/processors/PermitOrApplicantEventProcessor.java).
package de.mpiwg.anteater.events.processors;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import de.mpiwg.anteater.events.Applicant;
import de.mpiwg.anteater.events.ResearchEvent;
import de.mpiwg.anteater.ml.PlaceClasses;
import de.mpiwg.anteater.results.ApplicantResult;
import de.mpiwg.anteater.results.LocationResult;
import de.mpiwg.anteater.results.ResultsCarrier;
import de.mpiwg.anteater.results.SpeciesScientificResult;
import de.mpiwg.anteater.text.Paragraph;
import de.mpiwg.anteater.text.TextInformation;
import de.mpiwg.anteater.text.TextPart;

/**
 * Event processor for texts that mention several distinct applicants.
 *
 * <p>The summaries are walked applicant by applicant. Consecutive applicants
 * are collected into one {@link ResearchEvent} until an application/permit
 * number is found in the text following an applicant; the event is then
 * closed and stored. In a second pass, paragraphs of the supplementary
 * information that carry a known number contribute their research locations
 * and species to the matching event.
 */
public class PermitOrApplicantEventProcessor extends AEventProcessor {

    /**
     * Matches an application/permit number; group 1 is the number itself.
     * Compiled once instead of once per applicant and paragraph.
     *
     * NOTE(review): the '.' after "No" is unescaped and therefore matches any
     * character, not only a literal period - confirm this is intended.
     */
    private static final Pattern APPLICATION_NR_PATTERN = Pattern
            .compile("No. ([A-Z0-9\\-]{2,})");

    /**
     * Creates one event per application/permit number found in the summaries
     * and appends the created events to {@code events}.
     *
     * <p>Returns without touching {@code events} when there is exactly one
     * distinct applicant or when no applicant was found in a summary.
     *
     * @param events  list the newly created events are appended to
     * @param carrier source of the text information and of the applicant,
     *                species and location results
     */
    @Override
    public void processEvents(List<ResearchEvent> events, ResultsCarrier carrier) {
        TextInformation info = carrier.getTextInfo();

        // find how many distinct applicants there are
        List<ApplicantResult> distinctApplicants = getDistinctApplicants(carrier);
        if (distinctApplicants.size() == 1) {
            return;
        }

        // if there are several applicants start new event with each applicant
        List<ApplicantResult> appsInSummary = new ArrayList<ApplicantResult>();
        List<ApplicantResult> appsInSuppleInf = new ArrayList<ApplicantResult>();
        sortByTextType(distinctApplicants, appsInSummary, appsInSuppleInf);
        if (appsInSummary.isEmpty()) {
            return;
        }

        List<SpeciesScientificResult> speciesInSummary = new ArrayList<SpeciesScientificResult>();
        List<SpeciesScientificResult> speciesInSuppleInf = new ArrayList<SpeciesScientificResult>();
        sortSpeciesByTextType(carrier.getSpeciesResults(), speciesInSummary,
                speciesInSuppleInf);

        List<LocationResult> locationsInSummary = new ArrayList<LocationResult>();
        List<LocationResult> locationsInSuppleInf = new ArrayList<LocationResult>();
        sortLocsByTextType(carrier.getLocationResults(), locationsInSummary,
                locationsInSuppleInf);

        List<ResearchEvent> newEvents = createEventsFromSummaries(info,
                appsInSummary, locationsInSummary, speciesInSummary);
        addSupplementaryFindings(info, newEvents, locationsInSuppleInf,
                speciesInSuppleInf);

        events.addAll(newEvents);
    }

    /**
     * Walks the applicants found in the summaries and builds the events.
     *
     * <p>For each applicant the "window" is the text between the end of the
     * applicant finding and the start of the next applicant in the same
     * summary (or the end of that summary). Locations and species inside the
     * window are attached to the currently open event; a number inside the
     * window closes the event.
     */
    private List<ResearchEvent> createEventsFromSummaries(TextInformation info,
            List<ApplicantResult> appsInSummary,
            List<LocationResult> locationsInSummary,
            List<SpeciesScientificResult> speciesInSummary) {
        List<ResearchEvent> newEvents = new ArrayList<ResearchEvent>();
        ResearchEvent event = null;

        // Index loop: the position of the next applicant is needed, and
        // indexOf() would be quadratic and dependent on equals().
        for (int i = 0; i < appsInSummary.size(); i++) {
            ApplicantResult appResult = appsInSummary.get(i);
            int textIdx = appResult.getResult().getTextIdx();
            String summaryText = info.getSummaries().get(textIdx).getText();

            int windowStart = appResult.getFinding().getStart()
                    + appResult.getFinding().getLength();
            int windowEnd = summaryText.length();
            if (i + 1 < appsInSummary.size()) {
                ApplicantResult nextApp = appsInSummary.get(i + 1);
                // Offsets of an applicant in another summary are meaningless
                // for this text, so only a neighbour in the same text may
                // bound the window.
                if (nextApp.getResult().getTextIdx() == textIdx) {
                    windowEnd = nextApp.getFinding().getStart();
                }
            }
            // Keep the window inside the text and non-negative in length so
            // substring() cannot fail on overlapping or unordered findings.
            windowStart = Math.max(0, Math.min(windowStart, summaryText.length()));
            windowEnd = Math.max(windowStart,
                    Math.min(windowEnd, summaryText.length()));

            // check if there is an application number
            Matcher matcher = APPLICATION_NR_PATTERN.matcher(summaryText
                    .substring(windowStart, windowEnd));

            // find all locations between current applicant and next one
            List<LocationResult> locationsForApp = new ArrayList<LocationResult>();
            for (LocationResult locationResult : locationsInSummary) {
                if (locationResult.getResult().getTextIdx() == textIdx
                        && locationResult.getFinding().getStart() >= windowStart
                        && locationResult.getFinding().getStart() < windowEnd) {
                    locationsForApp.add(locationResult);
                }
            }

            // find all species between current applicant and next one
            // NOTE(review): the lower bound is exclusive here but inclusive
            // for locations; kept as in the original - confirm which is meant.
            List<SpeciesScientificResult> speciesForApp = new ArrayList<SpeciesScientificResult>();
            for (SpeciesScientificResult speciesResult : speciesInSummary) {
                if (speciesResult.getResult().getTextIdx() == textIdx
                        && speciesResult.getFinding().getStart() > windowStart
                        && speciesResult.getFinding().getStart() < windowEnd) {
                    speciesForApp.add(speciesResult);
                }
            }

            if (event != null) {
                // An event is still open (no number seen yet): this applicant
                // joins it instead of starting a new one.
                List<Applicant> applicantsInEvent = event.getApplicants();
                List<Applicant> applicantsWithoutLoc = new ArrayList<Applicant>();

                Applicant newApplicant = createApplicant(appResult);
                event.getApplicants().add(newApplicant);

                for (Applicant applicantInEvent : applicantsInEvent) {
                    if (applicantInEvent.getApplicantInstitution().isEmpty()
                            && applicantInEvent.getLocation().isEmpty()) {
                        applicantsWithoutLoc.add(applicantInEvent);
                    }
                }

                setLocations(applicantsWithoutLoc, locationsForApp, event);
                setSpecies(speciesForApp, event);
            } else {
                event = createEvent(appResult, info, locationsForApp,
                        speciesForApp);
            }

            if (matcher.find()) {
                event.setApplicationOrPermitNo(matcher.group(1));
                newEvents.add(event);
                event = null;
            }
        }

        // NOTE(review): an event that is still open here (trailing applicants
        // without a number) is dropped, as in the original - confirm intended.
        return newEvents;
    }

    /**
     * Adds research locations and species from the supplementary information
     * to the events created from the summaries.
     *
     * <p>A paragraph containing a number selects the event with that number
     * (or no event if the number is unknown). Paragraphs without a number
     * keep the event selected by a preceding paragraph.
     */
    private void addSupplementaryFindings(TextInformation info,
            List<ResearchEvent> newEvents,
            List<LocationResult> locationsInSuppleInf,
            List<SpeciesScientificResult> speciesInSuppleInf) {
        // NOTE(review): the selected event deliberately survives across
        // texts, matching the original control flow - confirm intended.
        ResearchEvent eventForPara = null;

        for (TextPart text : info.getSupplInfos()) {
            // Findings refer to their text by index; look it up once per text.
            int supplIdx = info.getSupplInfos().indexOf(text);

            for (Paragraph para : text.getParagraphs()) {
                int startOfPara = text.getPositionInTextFromParagraph(para, 0);
                int endOfPara = startOfPara + para.getParagraphText().length();

                Matcher matcher = APPLICATION_NR_PATTERN.matcher(para
                        .getParagraphText());
                if (matcher.find()) {
                    eventForPara = findEventByNumber(newEvents, matcher.group(1));
                }
                if (eventForPara == null) {
                    continue;
                }

                for (LocationResult loc : locationsInSuppleInf) {
                    if (supplIdx != loc.getResult().getTextIdx()) {
                        continue;
                    }
                    if (loc.getFinding().getStart() >= startOfPara
                            && loc.getFinding().getStart() < endOfPara
                            && loc.getPrediction() == PlaceClasses.RESEARCH_LOCATION) {
                        eventForPara.getResearchLocations().add(
                                createLocation(loc));
                    }
                }

                for (SpeciesScientificResult spec : speciesInSuppleInf) {
                    if (supplIdx != spec.getResult().getTextIdx()) {
                        continue;
                    }
                    if (spec.getFinding().getStart() >= startOfPara
                            && spec.getFinding().getStart() < endOfPara) {
                        eventForPara.getResearchedSpecies().add(
                                createSpecies(spec));
                    }
                }
            }
        }
    }

    /**
     * Returns the first event whose application/permit number equals
     * {@code number}, or {@code null} if there is none.
     */
    private static ResearchEvent findEventByNumber(List<ResearchEvent> candidates,
            String number) {
        for (ResearchEvent candidate : candidates) {
            if (number.equals(candidate.getApplicationOrPermitNo())) {
                return candidate;
            }
        }
        return null;
    }

    /**
     * @return the rank of this processor, the constant 10
     */
    @Override
    public int getRank() {
        return 10;
    }

}