Mercurial > hg > anteater
view src/de/mpiwg/anteater/events/processors/PermitOrApplicantEventProcessor.java @ 1:7a4341c9f2e5
checking permit numbers for similarity if no direct match
author | jdamerow |
---|---|
date | Fri, 05 Oct 2012 18:52:14 -0700 |
parents | 036535fcd179 |
children |
line wrap: on
line source
package de.mpiwg.anteater.events.processors; import java.util.ArrayList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import uk.ac.shef.wit.simmetrics.similaritymetrics.JaroWinkler; import de.mpiwg.anteater.events.Applicant; import de.mpiwg.anteater.events.ResearchEvent; import de.mpiwg.anteater.ml.PlaceClasses; import de.mpiwg.anteater.results.ApplicantResult; import de.mpiwg.anteater.results.LocationResult; import de.mpiwg.anteater.results.ResultsCarrier; import de.mpiwg.anteater.results.SpeciesScientificResult; import de.mpiwg.anteater.text.Paragraph; import de.mpiwg.anteater.text.TextInformation; import de.mpiwg.anteater.text.TextPart; public class PermitOrApplicantEventProcessor extends AEventProcessor { private String applicationNrPattern = "No. ([A-Z0-9\\-]{2,})"; @Override public void processEvents(List<ResearchEvent> events, ResultsCarrier carrier) { TextInformation info = carrier.getTextInfo(); // find how many distinct applicants there are List<ApplicantResult> distinctApplicants = getDistinctApplicants(carrier); if (distinctApplicants.size() == 1) { return; } // if there are several applicants start new event with each applicant List<ApplicantResult> appsInSummary = new ArrayList<ApplicantResult>(); List<ApplicantResult> appsInSuppleInf = new ArrayList<ApplicantResult>(); sortByTextType(distinctApplicants, appsInSummary, appsInSuppleInf); if (appsInSummary.isEmpty()) { return; } List<ResearchEvent> newEvents = new ArrayList<ResearchEvent>(); List<SpeciesScientificResult> speciesInSummary = new ArrayList<SpeciesScientificResult>(); List<SpeciesScientificResult> speciesInSuppleInf = new ArrayList<SpeciesScientificResult>(); sortSpeciesByTextType(carrier.getSpeciesResults(), speciesInSummary, speciesInSuppleInf); List<LocationResult> locationsInSummary = new ArrayList<LocationResult>(); List<LocationResult> locationsInSuppleInf = new ArrayList<LocationResult>(); sortLocsByTextType(carrier.getLocationResults(), locationsInSummary, locationsInSuppleInf); ResearchEvent event = null; for (ApplicantResult appResult : appsInSummary) { TextPart text = info.getSummaries().get( appResult.getResult().getTextIdx()); int startSearchForNumberAt = appResult.getFinding().getStart() + appResult.getFinding().getLength(); int endSearchForNumber = text.getText().length(); if (appsInSummary.indexOf(appResult) < appsInSummary.size() - 1) endSearchForNumber = appsInSummary .get(appsInSummary.indexOf(appResult) + 1).getFinding() .getStart(); String textAsString = text.getText(); String textAfterApplicant = textAsString.substring( startSearchForNumberAt, endSearchForNumber); // check if there is an application number Pattern pattern = Pattern.compile(applicationNrPattern); Matcher matcher = pattern.matcher(textAfterApplicant); List<LocationResult> locationsForApp = new ArrayList<LocationResult>(); // find all locations between current applicant and next one for (LocationResult locationResult : locationsInSummary) { if (locationResult.getFinding().getStart() >= startSearchForNumberAt && locationResult.getFinding().getStart() < endSearchForNumber) locationsForApp.add(locationResult); } List<SpeciesScientificResult> speciesForApp = new ArrayList<SpeciesScientificResult>(); // find all species between current applicant and next one for (SpeciesScientificResult speciesResult : speciesInSummary) { if (speciesResult.getFinding().getStart() > startSearchForNumberAt && speciesResult.getFinding().getStart() < endSearchForNumber) speciesForApp.add(speciesResult); } if (event != null) { List<Applicant> applicantsInEvent = event.getApplicants(); List<Applicant> applicantsWithoutLoc = new ArrayList<Applicant>(); Applicant newApplicant = createApplicant(appResult); event.getApplicants().add(newApplicant); for (Applicant applicantInEvent : applicantsInEvent) if (applicantInEvent.getApplicantInstitution().isEmpty() && applicantInEvent.getLocation().isEmpty()) applicantsWithoutLoc.add(applicantInEvent); setLocations(applicantsWithoutLoc, locationsForApp, event); setSpecies(speciesForApp, event); } if (event == null) event = createEvent(appResult, info, locationsForApp, speciesForApp); if (matcher.find()) { event.setApplicationOrPermitNo(matcher.group(1)); newEvents.add(event); event = null; } } ResearchEvent eventForPara = null; for (TextPart text : info.getSupplInfos()) { for (Paragraph para : text.getParagraphs()) { int startOfPara = text.getPositionInTextFromParagraph(para, 0); int endOfPara = startOfPara + para.getParagraphText().length(); Pattern pattern = Pattern.compile(applicationNrPattern); Matcher matcher = pattern.matcher(para.getParagraphText()); IfStatement: if (matcher.find()) { String numberInPara = matcher.group(1); // check if found number exisits in events for (ResearchEvent ev : newEvents) { if (numberInPara.equals(ev.getApplicationOrPermitNo())) { eventForPara = ev; break IfStatement; } } // if there is no event with permit number // check if just something was clipped for (ResearchEvent ev : newEvents) { if (ev.getApplicationOrPermitNo().contains(numberInPara) || numberInPara.contains(ev.getApplicationOrPermitNo())) { eventForPara = ev; break IfStatement; } } // if there is still no event found // check for switched numbers for (ResearchEvent ev : newEvents) { JaroWinkler winkler = new JaroWinkler(); double sim = winkler.getSimilarity(numberInPara, ev.getApplicationOrPermitNo()); if (sim > 0.85) { eventForPara = ev; break IfStatement; } } eventForPara = null; continue; } if (eventForPara == null) continue; for (LocationResult loc : locationsInSuppleInf) { if (info.getSupplInfos().indexOf(text) != loc.getResult().getTextIdx()) { continue; } if (loc.getFinding().getStart() >= startOfPara && loc.getFinding().getStart() < endOfPara) { if (loc.getPrediction() == PlaceClasses.RESEARCH_LOCATION) eventForPara.getResearchLocations().add(createLocation(loc)); } } for (SpeciesScientificResult spec : speciesInSuppleInf) { if (info.getSupplInfos().indexOf(text) != spec.getResult().getTextIdx()) { continue; } if (spec.getFinding().getStart() >= startOfPara && spec.getFinding().getStart() < endOfPara) { eventForPara.getResearchedSpecies().add(createSpecies(spec)); } } } } events.addAll(newEvents); } @Override public int getRank() { return 10; } }