view src/de/mpiwg/anteater/persons/regex/ApplicantRegexFinder.java @ 0:036535fcd179

anteater
author jdamerow
date Fri, 14 Sep 2012 10:30:43 +0200
parents
children
line wrap: on
line source

package de.mpiwg.anteater.persons.regex;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import de.mpiwg.anteater.AnteaterConfiguration;
import de.mpiwg.anteater.persons.APerson;
import de.mpiwg.anteater.persons.Person;
import de.mpiwg.anteater.persons.PersonsExtraction;
import de.mpiwg.anteater.results.ApplicantResult;
import de.mpiwg.anteater.text.Paragraph;
import de.mpiwg.anteater.text.TextInformation;
import de.mpiwg.anteater.text.TextPart;
import de.mpiwg.anteater.text.TextType;

/**
 * Finds applicants in the summaries and supplementary-information parts of
 * texts by matching a regular expression anchored on the literal label
 * "Applicant:".
 */
public class ApplicantRegexFinder {
	
	public final static String COMPONENT_NAME = ApplicantRegexFinder.class.getSimpleName();
	
	/**
	 * Regular expression used to locate applicants.
	 * <ul>
	 * <li>Group 1 (optional): two upper-case letters, a '-', and text up to the next comma.</li>
	 * <li>Group 2: the applicant itself, i.e. text up to the next comma, optionally followed
	 * by "Inc" (plus one more character) or "LLC".</li>
	 * </ul>
	 * NOTE(review): the "(.)" after "Inc" is an unescaped dot and therefore matches any
	 * character, not only a literal period -- confirm whether "\\." was intended. The
	 * expression is deliberately left unchanged here.
	 */
	private static final String APPLICANT_REGEX = "Applicant:\\p{Blank}{0,1}([A-Z]{2}-.+?,){0,1}\\p{Blank}{0,1}(.+?,( {0,1}Inc(.){0,1}| {0,1}LLC){0,1})";
	
	/** Compiled once; the expression never changes, so there is no need to recompile it per paragraph. */
	private static final Pattern APPLICANT_PATTERN = Pattern.compile(APPLICANT_REGEX);
	
	/** Prediction value assigned to every result produced by this finder. */
	private static final double APPLICANT_PREDICTION = 2.0;
	
	private AnteaterConfiguration configuration;
	
	/**
	 * @param configuration configuration whose logger is used for progress messages
	 */
	public ApplicantRegexFinder(AnteaterConfiguration configuration) {
		this.configuration = configuration;
	}

	/**
	 * Searches the summaries and supplementary infos of all given texts for applicants.
	 * Persons that are found are also registered in the persons extractions of the
	 * respective text information (see {@link #findExtraction}).
	 * 
	 * @param infos the text informations to search
	 * @return one result per applicant occurrence that was found
	 */
	public List<ApplicantResult> findApplicants(List<TextInformation> infos) {
		configuration.getLogger().logMessage(COMPONENT_NAME, "Search for applicants with regular expressions.");
		
		List<ApplicantResult> results = new ArrayList<ApplicantResult>();
		
		for (TextInformation info : infos) {
			List<PersonsExtraction> extractions = info.getPersonsExtractions();
			
			// find applicants in summaries
			results.addAll(findApplicantsInParts(info, extractions, info.getSummaries(), TextType.TYPE_SUMMARY));
			
			// find applicants in supplementary infos
			results.addAll(findApplicantsInParts(info, extractions, info.getSupplInfos(), TextType.TYPE_SUPLINF));
		}
		
		configuration.getLogger().logMessage(COMPONENT_NAME, "Found " + results.size() + " applicant(s).");
		
		return results;
	}
	
	/**
	 * Searches every text part of one kind (summaries or supplementary infos) for applicants.
	 * 
	 * @param info the text information the parts belong to
	 * @param extractions the persons extractions of {@code info}
	 * @param parts the text parts to search
	 * @param textType the type constant identifying the kind of {@code parts}
	 * @return the results found in all given parts, in iteration order
	 */
	private List<ApplicantResult> findApplicantsInParts(TextInformation info, List<PersonsExtraction> extractions,
			List<TextPart> parts, int textType) {
		List<ApplicantResult> results = new ArrayList<ApplicantResult>();
		
		// Track the index with a counter instead of parts.indexOf(part): indexOf is a
		// linear scan per element and returns the first equal element, which yields
		// the wrong index if two parts compare equal.
		int partIdx = 0;
		for (TextPart part : parts) {
			PersonsExtraction pExtraction = findExtraction(extractions, textType, partIdx);
			results.addAll(findApplicantsInText(info, pExtraction, part));
			partIdx++;
		}
		
		return results;
	}
	
	/**
	 * Returns the extraction with the given type and text index. If the list contains
	 * no such extraction, a new one with an empty person list is created, appended to
	 * {@code extractions} and returned.
	 * 
	 * @param extractions the extractions to search; may be extended by this method
	 * @param textType the type the extraction has to have
	 * @param textIdx the text index the extraction has to have
	 * @return the existing or newly created extraction, never {@code null}
	 */
	private PersonsExtraction findExtraction(List<PersonsExtraction> extractions, int textType, int textIdx) {
		for (PersonsExtraction extr : extractions) {
			if (extr.getType() == textType && extr.getTextIdx() == textIdx) {
				return extr;
			}
		}
		
		PersonsExtraction pExtraction = new PersonsExtraction();
		pExtraction.setType(textType);
		pExtraction.setTextIdx(textIdx);
		// NOTE(review): the setter is named setPerson while the list is read through
		// getPersons elsewhere in this class -- confirm both refer to the same list.
		pExtraction.setPerson(new ArrayList<APerson>());
		extractions.add(pExtraction);
		
		return pExtraction;
	}

	/**
	 * Searches all paragraphs of a text part for applicants. For each applicant the person
	 * starting at the same position in the extraction is reused, or a new person is added
	 * to the extraction; in both cases its length and reference text are set to the match.
	 * 
	 * @param info the text information the text part belongs to
	 * @param pExtraction the extraction that collects the persons of this text part
	 * @param textPart the text part to search
	 * @return one result per applicant found in the text part
	 */
	private List<ApplicantResult> findApplicantsInText(TextInformation info, PersonsExtraction pExtraction,
			TextPart textPart) {
		List<ApplicantResult> results = new ArrayList<ApplicantResult>();
		
		for (Paragraph para : textPart.getParagraphs()) {
			Map<Integer, String> applicants = findApplicants(para.getParagraphText());
			for (Map.Entry<Integer, String> applicant : applicants.entrySet()) {
				String reference = applicant.getValue();
				int posInText = textPart.getPositionInTextFromParagraph(para, applicant.getKey());
				
				// find out if person was already found
				APerson person = null;
				for (APerson p : pExtraction.getPersons()) {
					if (p.getStart() == posInText) {
						person = p;
						break;
					}
				}
				
				if (person == null) {
					person = new Person();
					person.setStart(posInText);
					pExtraction.getPersons().add(person);
				}
				person.setLength(reference.length());
				person.setReferenceInText(reference);
				
				ApplicantResult result = new ApplicantResult();
				result.setPrediction(APPLICANT_PREDICTION);
				result.setFinding(person);
				result.setResult(pExtraction);
				result.setTextInfo(info);
				results.add(result);
			}
		}
		
		return results;
	}
	
	/**
	 * Finds all applicants in the given text.
	 * 
	 * @param text the text to search; {@code null} is treated like a text without applicants
	 * @return a map from the start offset of each applicant within {@code text} to the
	 *         applicant string (a single trailing comma is removed), iterating in order
	 *         of appearance; empty if nothing was found
	 */
	public Map<Integer, String> findApplicants(String text) {
		// LinkedHashMap keeps the applicants in the order they occur in the text.
		Map<Integer, String> applicants = new LinkedHashMap<Integer, String>();
		if (text == null) {
			return applicants;
		}
		
		Matcher match = APPLICANT_PATTERN.matcher(text);
		while (match.find()) {
			// Group 2 is not optional in the expression, so it is set for every match.
			String applicant = match.group(2);
			if (applicant.endsWith(",")) {
				applicant = applicant.substring(0, applicant.length() - 1);
			}
			applicants.put(match.start(2), applicant);
		}
		
		return applicants;
	}
}