/*
 * Source: src/de/mpiwg/anteater/events/processors/PermitOrApplicantEventProcessor.java
 * Revision: 1:7a4341c9f2e5 (parent: 036535fcd179)
 * Author: jdamerow
 * Date: Fri, 05 Oct 2012 18:52:14 -0700
 * Commit message: checking permit numbers for similarity if no direct match
 */

package de.mpiwg.anteater.events.processors;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import uk.ac.shef.wit.simmetrics.similaritymetrics.JaroWinkler;

import de.mpiwg.anteater.events.Applicant;
import de.mpiwg.anteater.events.ResearchEvent;
import de.mpiwg.anteater.ml.PlaceClasses;
import de.mpiwg.anteater.results.ApplicantResult;
import de.mpiwg.anteater.results.LocationResult;
import de.mpiwg.anteater.results.ResultsCarrier;
import de.mpiwg.anteater.results.SpeciesScientificResult;
import de.mpiwg.anteater.text.Paragraph;
import de.mpiwg.anteater.text.TextInformation;
import de.mpiwg.anteater.text.TextPart;

/**
 * Event processor that builds research events from the applicants found in
 * the summary texts and the application/permit numbers that follow them.
 * <p>
 * Applicants in the summaries are grouped into one event until a number
 * matching {@link #APPLICATION_NR_PATTERN} is found after an applicant; that
 * number closes the event. Afterwards the paragraphs of the supplementary
 * information are scanned: a paragraph that mentions a permit number selects
 * the matching event, and research locations and species found in that
 * paragraph (and in following paragraphs without a number of their own) are
 * added to the selected event.
 */
public class PermitOrApplicantEventProcessor extends AEventProcessor {

	/**
	 * Matches "No. " followed by an application/permit number of at least two
	 * upper-case letters, digits or dashes; group 1 is the number. The dot is
	 * escaped so that only a literal "No." introduces a number (an unescaped
	 * dot would also accept e.g. "Not AB" or "Nov 15").
	 */
	private static final Pattern APPLICATION_NR_PATTERN = Pattern
			.compile("No\\. ([A-Z0-9\\-]{2,})");

	/**
	 * Jaro-Winkler similarity a permit number has to exceed to be treated as a
	 * variant (e.g. switched characters) of a known number.
	 */
	private static final double MIN_PERMIT_NO_SIMILARITY = 0.85;

	/**
	 * Creates one event per application/permit number found in the summaries
	 * and appends the created events to <code>events</code>.
	 * <p>
	 * Nothing is added if there is exactly one distinct applicant or if no
	 * applicant was found in a summary text.
	 * 
	 * @param events
	 *            list the newly created events are appended to
	 * @param carrier
	 *            provides the text information and the applicant, species and
	 *            location results to work on
	 */
	@Override
	public void processEvents(List<ResearchEvent> events, ResultsCarrier carrier) {
		TextInformation info = carrier.getTextInfo();

		// find how many distinct applicants there are
		List<ApplicantResult> distinctApplicants = getDistinctApplicants(carrier);

		if (distinctApplicants.size() == 1) {
			return;
		}

		// if there are several applicants start new event with each applicant
		List<ApplicantResult> appsInSummary = new ArrayList<ApplicantResult>();
		List<ApplicantResult> appsInSuppleInf = new ArrayList<ApplicantResult>();
		sortByTextType(distinctApplicants, appsInSummary, appsInSuppleInf);

		if (appsInSummary.isEmpty()) {
			return;
		}

		List<SpeciesScientificResult> speciesInSummary = new ArrayList<SpeciesScientificResult>();
		List<SpeciesScientificResult> speciesInSuppleInf = new ArrayList<SpeciesScientificResult>();
		sortSpeciesByTextType(carrier.getSpeciesResults(), speciesInSummary,
				speciesInSuppleInf);

		List<LocationResult> locationsInSummary = new ArrayList<LocationResult>();
		List<LocationResult> locationsInSuppleInf = new ArrayList<LocationResult>();
		sortLocsByTextType(carrier.getLocationResults(), locationsInSummary,
				locationsInSuppleInf);

		List<ResearchEvent> newEvents = buildEventsFromSummaryApplicants(info,
				appsInSummary, locationsInSummary, speciesInSummary);

		enrichEventsFromSupplementaryInfo(info, newEvents,
				locationsInSuppleInf, speciesInSuppleInf);

		events.addAll(newEvents);
	}

	/**
	 * Walks over the applicants found in the summaries and groups them into
	 * events; an event is completed as soon as an application/permit number is
	 * found in the text following one of its applicants.
	 * 
	 * @return the events for which a number was found
	 */
	private List<ResearchEvent> buildEventsFromSummaryApplicants(
			TextInformation info, List<ApplicantResult> appsInSummary,
			List<LocationResult> locationsInSummary,
			List<SpeciesScientificResult> speciesInSummary) {
		List<ResearchEvent> newEvents = new ArrayList<ResearchEvent>();

		// NOTE(review): an event that is still open when the loop ends (no
		// number found after its last applicant) is not returned - confirm
		// that this is intended.
		ResearchEvent event = null;
		for (int i = 0; i < appsInSummary.size(); i++) {
			ApplicantResult appResult = appsInSummary.get(i);
			int textIdx = appResult.getResult().getTextIdx();
			String textAsString = info.getSummaries().get(textIdx).getText();

			int startSearchForNumberAt = appResult.getFinding().getStart()
					+ appResult.getFinding().getLength();

			// The search window ends at the next applicant, but only if that
			// applicant lies in the same text and behind the current one;
			// otherwise its offset is meaningless here and substring() could
			// throw, so the window runs to the end of the text.
			int endSearchForNumber = textAsString.length();
			if (i < appsInSummary.size() - 1) {
				ApplicantResult nextApp = appsInSummary.get(i + 1);
				int nextTextIdx = nextApp.getResult().getTextIdx();
				int nextStart = nextApp.getFinding().getStart();
				if (nextTextIdx == textIdx
						&& nextStart >= startSearchForNumberAt
						&& nextStart <= endSearchForNumber) {
					endSearchForNumber = nextStart;
				}
			}

			// find all locations between current applicant and next one
			List<LocationResult> locationsForApp = new ArrayList<LocationResult>();
			for (LocationResult locationResult : locationsInSummary) {
				int locTextIdx = locationResult.getResult().getTextIdx();
				int locStart = locationResult.getFinding().getStart();
				if (locTextIdx == textIdx
						&& locStart >= startSearchForNumberAt
						&& locStart < endSearchForNumber) {
					locationsForApp.add(locationResult);
				}
			}

			// find all species between current applicant and next one
			List<SpeciesScientificResult> speciesForApp = new ArrayList<SpeciesScientificResult>();
			for (SpeciesScientificResult speciesResult : speciesInSummary) {
				int specTextIdx = speciesResult.getResult().getTextIdx();
				int specStart = speciesResult.getFinding().getStart();
				if (specTextIdx == textIdx
						&& specStart >= startSearchForNumberAt
						&& specStart < endSearchForNumber) {
					speciesForApp.add(speciesResult);
				}
			}

			if (event == null) {
				event = createEvent(appResult, info, locationsForApp,
						speciesForApp);
			} else {
				// no number was found since the previous applicant, so this
				// applicant joins the event that is still open
				List<Applicant> applicantsInEvent = event.getApplicants();
				List<Applicant> applicantsWithoutLoc = new ArrayList<Applicant>();

				Applicant newApplicant = createApplicant(appResult);
				event.getApplicants().add(newApplicant);

				for (Applicant applicantInEvent : applicantsInEvent) {
					if (applicantInEvent.getApplicantInstitution().isEmpty()
							&& applicantInEvent.getLocation().isEmpty()) {
						applicantsWithoutLoc.add(applicantInEvent);
					}
				}

				setLocations(applicantsWithoutLoc, locationsForApp, event);
				setSpecies(speciesForApp, event);
			}

			// check if there is an application number; if so the event is
			// complete and the next applicant starts a new one
			String textAfterApplicant = textAsString.substring(
					startSearchForNumberAt, endSearchForNumber);
			Matcher matcher = APPLICATION_NR_PATTERN.matcher(textAfterApplicant);
			if (matcher.find()) {
				event.setApplicationOrPermitNo(matcher.group(1));
				newEvents.add(event);
				event = null;
			}
		}

		return newEvents;
	}

	/**
	 * Adds research locations and species found in the supplementary
	 * information to the event whose permit number is mentioned in the
	 * respective paragraph. A paragraph without a number is attributed to the
	 * event selected by the most recent paragraph that had one.
	 */
	private void enrichEventsFromSupplementaryInfo(TextInformation info,
			List<ResearchEvent> newEvents,
			List<LocationResult> locationsInSuppleInf,
			List<SpeciesScientificResult> speciesInSuppleInf) {
		ResearchEvent eventForPara = null;
		for (TextPart text : info.getSupplInfos()) {
			// index of this text; results refer to their text by this index
			int textIdx = info.getSupplInfos().indexOf(text);

			for (Paragraph para : text.getParagraphs()) {
				int startOfPara = text.getPositionInTextFromParagraph(para, 0);
				int endOfPara = startOfPara + para.getParagraphText().length();

				Matcher matcher = APPLICATION_NR_PATTERN.matcher(para
						.getParagraphText());
				if (matcher.find()) {
					// a number in the paragraph (re)selects the event; if no
					// event matches, the paragraph belongs to no event
					eventForPara = findEventByPermitNumber(newEvents,
							matcher.group(1));
				}

				if (eventForPara == null) {
					continue;
				}

				for (LocationResult loc : locationsInSuppleInf) {
					int locTextIdx = loc.getResult().getTextIdx();
					if (textIdx != locTextIdx) {
						continue;
					}

					int locStart = loc.getFinding().getStart();
					if (locStart >= startOfPara && locStart < endOfPara
							&& loc.getPrediction() == PlaceClasses.RESEARCH_LOCATION) {
						eventForPara.getResearchLocations().add(
								createLocation(loc));
					}
				}

				for (SpeciesScientificResult spec : speciesInSuppleInf) {
					int specTextIdx = spec.getResult().getTextIdx();
					if (textIdx != specTextIdx) {
						continue;
					}

					int specStart = spec.getFinding().getStart();
					if (specStart >= startOfPara && specStart < endOfPara) {
						eventForPara.getResearchedSpecies().add(
								createSpecies(spec));
					}
				}
			}
		}
	}

	/**
	 * Finds the event belonging to a permit number. Tried in this order: exact
	 * match, one number containing the other (something was clipped), and
	 * finally a Jaro-Winkler similarity above
	 * {@link #MIN_PERMIT_NO_SIMILARITY} (e.g. switched characters).
	 * 
	 * @return the first matching event or <code>null</code> if there is none
	 */
	private ResearchEvent findEventByPermitNumber(
			List<ResearchEvent> candidates, String permitNumber) {
		// check if found number exists in events
		for (ResearchEvent ev : candidates) {
			if (permitNumber.equals(ev.getApplicationOrPermitNo())) {
				return ev;
			}
		}

		// if there is no event with permit number
		// check if just something was clipped
		for (ResearchEvent ev : candidates) {
			String eventNumber = ev.getApplicationOrPermitNo();
			if (eventNumber.contains(permitNumber)
					|| permitNumber.contains(eventNumber)) {
				return ev;
			}
		}

		// if there is still no event found
		// check for switched numbers
		JaroWinkler winkler = new JaroWinkler();
		for (ResearchEvent ev : candidates) {
			double sim = winkler.getSimilarity(permitNumber,
					ev.getApplicationOrPermitNo());
			if (sim > MIN_PERMIT_NO_SIMILARITY) {
				return ev;
			}
		}

		return null;
	}

	/**
	 * Returns the rank of this processor (always 10).
	 * NOTE(review): presumably used to order the event processors - confirm
	 * against AEventProcessor.
	 */
	@Override
	public int getRank() {
		return 10;
	}

}