diff src/de/mpiwg/anteater/events/processors/PermitOrApplicantEventProcessor.java @ 0:036535fcd179

anteater
author jdamerow
date Fri, 14 Sep 2012 10:30:43 +0200
parents
children 7a4341c9f2e5
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/de/mpiwg/anteater/events/processors/PermitOrApplicantEventProcessor.java	Fri Sep 14 10:30:43 2012 +0200
@@ -0,0 +1,176 @@
+package de.mpiwg.anteater.events.processors;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import de.mpiwg.anteater.events.Applicant;
+import de.mpiwg.anteater.events.ResearchEvent;
+import de.mpiwg.anteater.ml.PlaceClasses;
+import de.mpiwg.anteater.results.ApplicantResult;
+import de.mpiwg.anteater.results.LocationResult;
+import de.mpiwg.anteater.results.ResultsCarrier;
+import de.mpiwg.anteater.results.SpeciesScientificResult;
+import de.mpiwg.anteater.text.Paragraph;
+import de.mpiwg.anteater.text.TextInformation;
+import de.mpiwg.anteater.text.TextPart;
+
+public class PermitOrApplicantEventProcessor extends AEventProcessor {
+
+	private String applicationNrPattern = "No. ([A-Z0-9\\-]{2,})";
+
+	@Override
+	public void processEvents(List<ResearchEvent> events, ResultsCarrier carrier) {
+		TextInformation info = carrier.getTextInfo();
+
+		// find how many distinct applicants there are
+		List<ApplicantResult> distinctApplicants = getDistinctApplicants(carrier);
+
+		if (distinctApplicants.size() == 1) {
+			return;
+		}
+
+		// if there are several applicants start new event with each applicant
+		List<ApplicantResult> appsInSummary = new ArrayList<ApplicantResult>();
+		List<ApplicantResult> appsInSuppleInf = new ArrayList<ApplicantResult>();
+
+		sortByTextType(distinctApplicants, appsInSummary, appsInSuppleInf);
+
+		if (appsInSummary.isEmpty()) {
+			return;
+		}
+
+		List<ResearchEvent> newEvents = new ArrayList<ResearchEvent>();
+		
+		List<SpeciesScientificResult> speciesInSummary = new ArrayList<SpeciesScientificResult>();
+		List<SpeciesScientificResult> speciesInSuppleInf = new ArrayList<SpeciesScientificResult>();
+		sortSpeciesByTextType(carrier.getSpeciesResults(), speciesInSummary, speciesInSuppleInf);
+		
+		List<LocationResult> locationsInSummary = new ArrayList<LocationResult>();
+		List<LocationResult> locationsInSuppleInf = new ArrayList<LocationResult>();
+		sortLocsByTextType(carrier.getLocationResults(), locationsInSummary, locationsInSuppleInf);
+
+
+		ResearchEvent event = null;
+		for (ApplicantResult appResult : appsInSummary) {
+			TextPart text = info.getSummaries().get(
+					appResult.getResult().getTextIdx());
+
+			int startSearchForNumberAt = appResult.getFinding().getStart()
+					+ appResult.getFinding().getLength();
+
+			int endSearchForNumber = text.getText().length();
+			if (appsInSummary.indexOf(appResult) < appsInSummary.size() - 1)
+				endSearchForNumber = appsInSummary
+						.get(appsInSummary.indexOf(appResult) + 1).getFinding()
+						.getStart();
+
+			String textAsString = text.getText();
+			String textAfterApplicant = textAsString.substring(
+					startSearchForNumberAt, endSearchForNumber);
+
+			// check if there is an application number
+			Pattern pattern = Pattern.compile(applicationNrPattern);
+			Matcher matcher = pattern.matcher(textAfterApplicant);
+
+			List<LocationResult> locationsForApp = new ArrayList<LocationResult>();
+			// find all locations between current applicant and next one
+			for (LocationResult locationResult : locationsInSummary) {
+				if (locationResult.getFinding().getStart() >= startSearchForNumberAt
+						&& locationResult.getFinding().getStart() < endSearchForNumber)
+					locationsForApp.add(locationResult);
+			}
+
+			List<SpeciesScientificResult> speciesForApp = new ArrayList<SpeciesScientificResult>();
+			// find all species between current applicant and next one
+			for (SpeciesScientificResult speciesResult : speciesInSummary) {
+				if (speciesResult.getFinding().getStart() > startSearchForNumberAt
+						&& speciesResult.getFinding().getStart() < endSearchForNumber)
+					speciesForApp.add(speciesResult);
+			}
+
+			if (event != null) {
+				List<Applicant> applicantsInEvent = event.getApplicants();
+				List<Applicant> applicantsWithoutLoc = new ArrayList<Applicant>();
+
+				Applicant newApplicant = createApplicant(appResult);
+				event.getApplicants().add(newApplicant);
+
+				for (Applicant applicantInEvent : applicantsInEvent)
+					if (applicantInEvent.getApplicantInstitution().isEmpty()
+							&& applicantInEvent.getLocation().isEmpty())
+						applicantsWithoutLoc.add(applicantInEvent);
+
+				setLocations(applicantsWithoutLoc, locationsForApp, event);
+				setSpecies(speciesForApp, event);
+			}
+
+			if (event == null)
+				event = createEvent(appResult, info, locationsForApp,
+						speciesForApp);
+
+			if (matcher.find()) {
+				event.setApplicationOrPermitNo(matcher.group(1));
+				newEvents.add(event);
+				event = null;
+			}
+		}
+		
+		ResearchEvent eventForPara = null;
+		for (TextPart text : info.getSupplInfos()) {
+			for (Paragraph para : text.getParagraphs()) {
+				int startOfPara = text.getPositionInTextFromParagraph(para, 0);
+				int endOfPara = startOfPara + para.getParagraphText().length();
+				
+				Pattern pattern = Pattern.compile(applicationNrPattern);
+				Matcher matcher = pattern.matcher(para.getParagraphText());
+
+				IfStatement: if (matcher.find()) {
+					String numberInPara = matcher.group(1);
+					for (ResearchEvent ev : newEvents) {
+						if (numberInPara.equals(ev.getApplicationOrPermitNo()))
+						{
+							eventForPara = ev;
+							break IfStatement;
+						}
+					}
+					eventForPara = null;
+					continue;
+				}
+				
+				if (eventForPara == null)
+					continue;
+				
+				for (LocationResult loc : locationsInSuppleInf) {
+					if (info.getSupplInfos().indexOf(text) != loc.getResult().getTextIdx()) {
+						continue;
+					}
+					
+					if (loc.getFinding().getStart() >= startOfPara && loc.getFinding().getStart() < endOfPara) {
+						if (loc.getPrediction() == PlaceClasses.RESEARCH_LOCATION)
+							eventForPara.getResearchLocations().add(createLocation(loc));
+					}
+				}
+				
+				for (SpeciesScientificResult spec : speciesInSuppleInf) {
+					if (info.getSupplInfos().indexOf(text) != spec.getResult().getTextIdx()) {
+						continue;
+					}
+					
+					if (spec.getFinding().getStart() >= startOfPara && spec.getFinding().getStart() < endOfPara) {
+						eventForPara.getResearchedSpecies().add(createSpecies(spec));
+					}
+				}
+			}
+		}
+		
+		events.addAll(newEvents);
+	}
+
+	@Override
+	public int getRank() {
+		return 10;
+	}
+
+}