/*
 * Source: src/de/mpiwg/anteater/persons/ml/preprocessing/ApplicantFeatureCalculator.java @ 0:036535fcd179
 * Repository: anteater
 * Author: jdamerow
 * Date: Fri, 14 Sep 2012 10:30:43 +0200
 * Parents: (none)
 * Children: 50aeb96a8ee9
 */

package de.mpiwg.anteater.persons.ml.preprocessing;

import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import de.mpiwg.anteater.core.Finding;
import de.mpiwg.anteater.ml.ITextParser;
import de.mpiwg.anteater.ml.SimilarityHelper;
import de.mpiwg.anteater.ml.impl.Word;
import de.mpiwg.anteater.ml.preprocessing.FeatureCalculator;
import de.mpiwg.anteater.persons.APerson;
import de.mpiwg.anteater.places.PlaceInformation;
import de.mpiwg.anteater.species.scientific.ScientificName;
import de.mpiwg.anteater.text.TextPart;

/**
 * Computes numeric features for person candidates ({@link APerson}) found in a
 * text. Most features are encoded as {@code int} flags (1 = yes, 0 = no) or as
 * {@code float} ratios.
 *
 * <p>The scientific names and places used by the overlap features must be
 * supplied through {@link #setNames(List)} and {@link #setPlaces(List)}; when
 * they have not been set, the corresponding features evaluate to 0.
 */
public class ApplicantFeatureCalculator extends FeatureCalculator {

	/** Places found in the text; may be {@code null} until {@link #setPlaces(List)} is called. */
	private List<PlaceInformation> places;

	/** Scientific names found in the text; may be {@code null} until {@link #setNames(List)} is called. */
	private List<ScientificName> names;

	/**
	 * Creates a calculator and hands the sentence list, the parser and the raw
	 * text of {@code text} to the superclass.
	 *
	 * @param sentenceList sentences passed through to the superclass
	 * @param parser parser passed through to the superclass
	 * @param text text part whose {@code getText()} value is passed to the superclass
	 */
	public ApplicantFeatureCalculator(List<String> sentenceList,
			ITextParser parser, TextPart text) {
		super(sentenceList, parser, text.getText());
	}

	/**
	 * @return the scientific names used by the name-overlap features, possibly {@code null}
	 */
	public List<ScientificName> getNames() {
		return names;
	}

	/**
	 * @param names the scientific names to use for the name-overlap features
	 */
	public void setNames(List<ScientificName> names) {
		this.names = names;
	}

	/**
	 * @param places the places to use for the place-overlap features
	 */
	public void setPlaces(List<PlaceInformation> places) {
		this.places = places;
	}

	/**
	 * @return the places used by the place-overlap features, possibly {@code null}
	 */
	public List<PlaceInformation> getPlaces() {
		return places;
	}

	/**
	 * Keyword feature for {@code KEYWORD_ISSUED}, evaluated for the candidate's
	 * start position via {@code getSentenceContainsKeyword}.
	 *
	 * @param candidate the person candidate
	 * @return the value reported by {@code getSentenceContainsKeyword}
	 */
	public int getSentenceContainsIssued(APerson candidate) {
		return getSentenceContainsKeyword(KEYWORD_ISSUED, candidate.getStart());
	}

	/**
	 * Keyword feature for {@code KEYWORD_APPLIED}, evaluated for the candidate's
	 * start position via {@code getSentenceContainsKeyword}.
	 *
	 * @param candidate the person candidate
	 * @return the value reported by {@code getSentenceContainsKeyword}
	 */
	public int getSentenceContainsApplied(APerson candidate) {
		return getSentenceContainsKeyword(KEYWORD_APPLIED, candidate.getStart());
	}

	/**
	 * Keyword feature for {@code KEYWORD_PERMIT}, evaluated for the candidate's
	 * start position via {@code getSentenceContainsKeyword}.
	 *
	 * @param candidate the person candidate
	 * @return the value reported by {@code getSentenceContainsKeyword}
	 */
	public int getSentenceContainsPermit(APerson candidate) {
		return getSentenceContainsKeyword(KEYWORD_PERMIT, candidate.getStart());
	}

	/**
	 * Keyword feature for {@code KEYWORD_COMMENT}, evaluated for the candidate's
	 * start position via {@code getSentenceContainsKeyword}.
	 *
	 * @param candidate the person candidate
	 * @return the value reported by {@code getSentenceContainsKeyword}
	 */
	public int getSentenceContainsComment(APerson candidate) {
		return getSentenceContainsKeyword(KEYWORD_COMMENT, candidate.getStart());
	}

	/**
	 * Keyword feature for {@code KEYWORD_APPLICANT}, evaluated for the
	 * candidate's start position via {@code getSentenceContainsKeyword}.
	 *
	 * @param candidate the person candidate
	 * @return the value reported by {@code getSentenceContainsKeyword}
	 */
	public int getSentenceContainsApplicant(APerson candidate) {
		return getSentenceContainsKeyword(KEYWORD_APPLICANT, candidate.getStart());
	}

	/**
	 * Distance between the first occurrence of {@code KEYWORD_APPLICANT} in the
	 * candidate's sentence and the candidate's start position.
	 *
	 * @param candidate the person candidate
	 * @return keyword position minus candidate start, or 0 when the applicant
	 *         keyword feature is 0, the sentence cannot be determined, or the
	 *         keyword is not literally present in the sentence
	 */
	public int getDistanceCandidateToApplicant(APerson candidate) {
		if (getSentenceContainsApplicant(candidate) == 0) {
			return 0;
		}

		String sentence = getSentenceContainingCandidate(candidate.getStart());
		if (sentence == null) {
			return 0;
		}

		int posOfApplicant = sentence.indexOf(KEYWORD_APPLICANT);
		if (posOfApplicant < 0) {
			// The keyword check may be more lenient than indexOf (e.g. case);
			// without a literal hit there is no meaningful distance.
			return 0;
		}

		// NOTE(review): posOfApplicant is relative to the sentence, whereas
		// candidate.getStart() is also used to look the sentence up and so looks
		// like an offset into the whole text - confirm both share one coordinate
		// system, otherwise this difference is skewed by the sentence's offset.
		return posOfApplicant - candidate.getStart();
	}

	/**
	 * Tells whether one of the subjects the parser reports for the candidate's
	 * sentence lies inside the candidate's span.
	 *
	 * @param candidate the person candidate
	 * @return 1 if a subject index falls within
	 *         {@code [start, start + length)} of the candidate, otherwise 0
	 *         (also when the sentence cannot be determined)
	 */
	public int getIsSubject(APerson candidate) {
		String sentence = getSentenceContainingCandidate(candidate.getStart());
		if (sentence == null) {
			return 0;
		}
		return containsWordInside(parser.getSubjects(sentence), candidate) ? 1 : 0;
	}

	/**
	 * Tells whether one of the abbreviations the parser reports for the
	 * candidate's sentence lies inside the candidate's span.
	 *
	 * @param candidate the person candidate
	 * @return 1 if an abbreviation index falls within
	 *         {@code [start, start + length)} of the candidate, otherwise 0
	 *         (also when the sentence cannot be determined)
	 */
	public int getIsAbbreviation(APerson candidate) {
		String sentence = getSentenceContainingCandidate(candidate.getStart());
		if (sentence == null) {
			return 0;
		}
		return containsWordInside(parser.getAbbreviations(sentence), candidate) ? 1 : 0;
	}

	/**
	 * Share of the candidate that is covered by the biggest substring it has in
	 * common with the scientific name whose span contains the candidate's start.
	 *
	 * @param candidate the person candidate
	 * @return common substring length divided by the candidate's length, or 0
	 *         when no scientific name covers the candidate's start
	 */
	public float getSimilarityPersonNameForPerson(APerson candidate) {
		ScientificName name = findNameAt(candidate.getStart());
		if (name == null) {
			return 0;
		}
		String substring = SimilarityHelper.getBiggestSubstring(
				candidate.getReferenceInText(), name.getReferenceInText());
		return ratio(substring.length(), candidate.getLength());
	}

	/**
	 * Share of the scientific name that is covered by the biggest substring it
	 * has in common with the candidate; the name considered is the one whose
	 * span contains the candidate's start.
	 *
	 * @param candidate the person candidate
	 * @return common substring length divided by the name's length, or 0 when
	 *         no scientific name covers the candidate's start
	 */
	public float getSimilarityPersonNameForName(APerson candidate) {
		ScientificName name = findNameAt(candidate.getStart());
		if (name == null) {
			return 0;
		}
		String substring = SimilarityHelper.getBiggestSubstring(
				candidate.getReferenceInText(), name.getReferenceInText());
		return ratio(substring.length(), name.getLength());
	}

	/**
	 * Tells whether the scientific name whose span contains the candidate's
	 * start begins at exactly the same index as the candidate.
	 *
	 * @param candidate the person candidate
	 * @return 1 if such a name exists and starts at the candidate's start, otherwise 0
	 */
	public int doPersonAndNameStartAtSameIdx(APerson candidate) {
		int index = candidate.getStart();
		ScientificName name = findNameAt(index);
		return (name != null && index == name.getStart()) ? 1 : 0;
	}

	/**
	 * Share of the candidate that is covered by the biggest substring it has in
	 * common with the place whose span contains the candidate's start.
	 *
	 * @param candidate the person candidate
	 * @return common substring length divided by the candidate's length, or 0
	 *         when no place covers the candidate's start
	 */
	public float getSimilarityPersonPlaceForPerson(APerson candidate) {
		Finding place = findPlaceAt(candidate.getStart());
		if (place == null) {
			return 0;
		}
		String substring = SimilarityHelper.getBiggestSubstring(
				candidate.getReferenceInText(), place.getReferenceInText());
		return ratio(substring.length(), candidate.getLength());
	}

	/**
	 * Share of the place that is covered by the biggest substring it has in
	 * common with the candidate; the place considered is the one whose span
	 * contains the candidate's start.
	 *
	 * @param candidate the person candidate
	 * @return common substring length divided by the place's length, or 0 when
	 *         no place covers the candidate's start
	 */
	public float getSimilarityPersonPlaceForPlace(APerson candidate) {
		Finding place = findPlaceAt(candidate.getStart());
		if (place == null) {
			return 0;
		}
		String substring = SimilarityHelper.getBiggestSubstring(
				candidate.getReferenceInText(), place.getReferenceInText());
		return ratio(substring.length(), place.getLength());
	}

	/**
	 * Tells whether the place whose span contains the candidate's start begins
	 * at exactly the same index as the candidate.
	 *
	 * @param candidate the person candidate
	 * @return 1 if such a place exists and starts at the candidate's start, otherwise 0
	 */
	public int doPersonAndPlaceStartAtSameIdx(APerson candidate) {
		int index = candidate.getStart();
		Finding place = findPlaceAt(index);
		return (place != null && index == place.getStart()) ? 1 : 0;
	}

	/**
	 * Tells whether the candidate's text occurs between an opening and a closing
	 * round bracket within its sentence.
	 *
	 * @param candidate the person candidate
	 * @return 1 if the sentence contains {@code "(" ... candidate ... ")"},
	 *         otherwise 0 (also when the sentence cannot be determined)
	 */
	public int isSurroundedByBrackets(APerson candidate) {
		String sentence = getSentenceContainingCandidate(candidate.getStart());
		if (sentence == null) {
			return 0;
		}

		// The brackets must be escaped to match literally (unescaped they only
		// form a regex group, which made every occurrence of the candidate
		// match), and the candidate text is quoted so that characters such as
		// '.' in "J. Smith" are not interpreted as regex syntax.
		// NOTE(review): ".*?" may still span an intermediate ")" or "(", so
		// "(a) Name (b)" counts as bracketed - tighten if that matters.
		Pattern pattern = Pattern.compile(
				"\\(.*?" + Pattern.quote(candidate.getReferenceInText()) + ".*?\\)");
		Matcher matcher = pattern.matcher(sentence);
		return matcher.find() ? 1 : 0;
	}

	/**
	 * Tells whether the candidate's text appears in its sentence preceded by
	 * {@code ", "} and directly followed by {@code ","}.
	 *
	 * @param candidate the person candidate
	 * @return 1 if so, otherwise 0 (also when the sentence cannot be determined)
	 */
	public int isSurroundedByCommata(APerson candidate) {
		String sentence = getSentenceContainingCandidate(candidate.getStart());
		if (sentence == null) {
			return 0;
		}
		return sentence.contains(", " + candidate.getReferenceInText() + ",") ? 1 : 0;
	}

	/**
	 * Tells whether the candidate's text appears in its sentence directly
	 * followed by {@code 's}.
	 *
	 * @param candidate the person candidate
	 * @return 1 if so, otherwise 0 (also when the sentence cannot be determined)
	 */
	public int isFollowedBy_s(APerson candidate) {
		String sentence = getSentenceContainingCandidate(candidate.getStart());
		if (sentence == null) {
			return 0;
		}
		return sentence.contains(candidate.getReferenceInText() + "'s") ? 1 : 0;
	}

	/** Returns the first scientific name whose span contains {@code index}, or {@code null}. */
	private ScientificName findNameAt(int index) {
		if (names == null) {
			return null;
		}
		for (ScientificName name : names) {
			if (index >= name.getStart() && index < (name.getStart() + name.getLength())) {
				return name;
			}
		}
		return null;
	}

	/** Returns the first place whose span contains {@code index}, or {@code null}. */
	private Finding findPlaceAt(int index) {
		if (places == null) {
			return null;
		}
		for (Finding place : places) {
			if (index >= place.getStart() && index < (place.getStart() + place.getLength())) {
				return place;
			}
		}
		return null;
	}

	/** Returns whether any word's index lies within the candidate's {@code [start, start + length)} span. */
	private static boolean containsWordInside(List<Word> words, APerson candidate) {
		if (words == null) {
			return false;
		}
		// NOTE(review): word indices come from parsing a single sentence while
		// candidate.getStart() looks like a text-wide offset - confirm both use
		// the same coordinate system.
		for (Word word : words) {
			if (word.getIndex() >= candidate.getStart()
					&& word.getIndex() < (candidate.getStart() + candidate.getLength())) {
				return true;
			}
		}
		return false;
	}

	/** Returns {@code part / whole} as a true floating-point ratio, or 0 when {@code whole} is 0. */
	private static float ratio(float part, float whole) {
		// Float parameters force floating-point division; dividing the integer
		// lengths directly would truncate the ratio to 0 or 1.
		return whole == 0 ? 0f : part / whole;
	}
}