diff src/de/mpiwg/anteater/persons/ml/preprocessing/ApplicantFeatureCalculator.java @ 0:036535fcd179

anteater
author jdamerow
date Fri, 14 Sep 2012 10:30:43 +0200
parents
children 50aeb96a8ee9
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/de/mpiwg/anteater/persons/ml/preprocessing/ApplicantFeatureCalculator.java	Fri Sep 14 10:30:43 2012 +0200
@@ -0,0 +1,201 @@
+package de.mpiwg.anteater.persons.ml.preprocessing;
+
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import de.mpiwg.anteater.core.Finding;
+import de.mpiwg.anteater.ml.ITextParser;
+import de.mpiwg.anteater.ml.SimilarityHelper;
+import de.mpiwg.anteater.ml.impl.Word;
+import de.mpiwg.anteater.ml.preprocessing.FeatureCalculator;
+import de.mpiwg.anteater.persons.APerson;
+import de.mpiwg.anteater.places.PlaceInformation;
+import de.mpiwg.anteater.species.scientific.ScientificName;
+import de.mpiwg.anteater.text.TextPart;
+
+/**
+ * Calculates features for person candidates ({@link APerson}) found in a
+ * text: keyword flags for the candidate's sentence, grammatical flags
+ * obtained from the parser, overlap measures against scientific names and
+ * places, and simple punctuation-context flags.
+ *
+ * Flag-style features are returned as <code>int</code> (0 or 1 where this
+ * class computes them itself), ratio-style features as <code>float</code>.
+ * The lists of names and places have to be supplied through
+ * {@link #setNames(List)} and {@link #setPlaces(List)}; while a list is
+ * unset the features depending on it evaluate to 0.
+ */
+public class ApplicantFeatureCalculator extends FeatureCalculator {
+
+	private List<PlaceInformation> places;
+	private List<ScientificName> names;
+
+	/**
+	 * @param sentenceList handed to the superclass unchanged
+	 * @param parser handed to the superclass unchanged
+	 * @param text text part whose {@link TextPart#getText()} result is
+	 *            handed to the superclass
+	 */
+	public ApplicantFeatureCalculator(List<String> sentenceList,
+			ITextParser parser, TextPart text) {
+		super(sentenceList, parser, text.getText());
+	}
+
+	/**
+	 * @return the scientific names used by the person/name features; may be
+	 *         <code>null</code> if never set
+	 */
+	public List<ScientificName> getNames() {
+		return names;
+	}
+
+	/**
+	 * @param names the scientific names used by the person/name features
+	 */
+	public void setNames(List<ScientificName> names) {
+		this.names = names;
+	}
+
+	/**
+	 * @param places the places used by the person/place features
+	 */
+	public void setPlaces(List<PlaceInformation> places) {
+		this.places = places;
+	}
+
+	/**
+	 * @return the places used by the person/place features; may be
+	 *         <code>null</code> if never set
+	 */
+	public List<PlaceInformation> getPlaces() {
+		return places;
+	}
+
+	/**
+	 * Keyword feature for KEYWORD_ISSUED.
+	 * 
+	 * @param candidate person candidate; only its start offset is used
+	 * @return result of getSentenceContainsKeyword for the candidate's start
+	 */
+	public int getSentenceContainsIssued(APerson candidate) {
+		return getSentenceContainsKeyword(KEYWORD_ISSUED, candidate.getStart());
+	}
+
+	/**
+	 * Keyword feature for KEYWORD_APPLIED.
+	 * 
+	 * @param candidate person candidate; only its start offset is used
+	 * @return result of getSentenceContainsKeyword for the candidate's start
+	 */
+	public int getSentenceContainsApplied(APerson candidate) {
+		return getSentenceContainsKeyword(KEYWORD_APPLIED, candidate.getStart());
+	}
+
+	/**
+	 * Keyword feature for KEYWORD_PERMIT.
+	 * 
+	 * @param candidate person candidate; only its start offset is used
+	 * @return result of getSentenceContainsKeyword for the candidate's start
+	 */
+	public int getSentenceContainsPermit(APerson candidate) {
+		return getSentenceContainsKeyword(KEYWORD_PERMIT, candidate.getStart());
+	}
+
+	/**
+	 * Keyword feature for KEYWORD_COMMENT.
+	 * 
+	 * @param candidate person candidate; only its start offset is used
+	 * @return result of getSentenceContainsKeyword for the candidate's start
+	 */
+	public int getSentenceContainsComment(APerson candidate) {
+		return getSentenceContainsKeyword(KEYWORD_COMMENT, candidate.getStart());
+	}
+
+	/**
+	 * Keyword feature for KEYWORD_APPLICANT.
+	 * 
+	 * @param candidate person candidate; only its start offset is used
+	 * @return result of getSentenceContainsKeyword for the candidate's start
+	 */
+	public int getSentenceContainsApplicant(APerson candidate) {
+		return getSentenceContainsKeyword(KEYWORD_APPLICANT, candidate.getStart());
+	}
+
+	/**
+	 * Difference between the position of KEYWORD_APPLICANT inside the
+	 * candidate's sentence and the candidate's start offset.
+	 * 
+	 * @param candidate person candidate
+	 * @return 0 if the applicant keyword feature is 0, if no sentence is
+	 *         available for the candidate, or if the keyword cannot be
+	 *         located in the sentence; otherwise keyword position minus
+	 *         candidate start (negative when the keyword comes first)
+	 */
+	public int getDistanceCandidateToApplicant(APerson candidate) {
+		if (getSentenceContainsApplicant(candidate) == 0) {
+			return 0;
+		}
+
+		String sentence = getSentenceContainingCandidate(candidate.getStart());
+		if (sentence == null) {
+			return 0;
+		}
+
+		int posOfApplicant = sentence.indexOf(KEYWORD_APPLICANT);
+		if (posOfApplicant < 0) {
+			// keyword was reported but is not literally present; a distance
+			// derived from -1 would be meaningless
+			return 0;
+		}
+		// NOTE(review): posOfApplicant is relative to the sentence, while
+		// candidate.getStart() is compared against name/place offsets
+		// elsewhere in this class - confirm both share the same origin.
+		return posOfApplicant - candidate.getStart();
+	}
+
+	/**
+	 * Checks whether one of the subjects the parser reports for the
+	 * candidate's sentence lies inside the candidate's span.
+	 * 
+	 * @param candidate person candidate
+	 * @return 1 if a subject's index is within
+	 *         [start, start + length) of the candidate, otherwise 0 (also
+	 *         when no sentence is available)
+	 */
+	public int getIsSubject(APerson candidate) {
+		String sentence = getSentenceContainingCandidate(candidate.getStart());
+		if (sentence == null) {
+			return 0;
+		}
+		return hasWordInsideCandidate(parser.getSubjects(sentence), candidate);
+	}
+
+	/**
+	 * Checks whether one of the abbreviations the parser reports for the
+	 * candidate's sentence lies inside the candidate's span.
+	 * 
+	 * @param candidate person candidate
+	 * @return 1 if an abbreviation's index is within
+	 *         [start, start + length) of the candidate, otherwise 0 (also
+	 *         when no sentence is available)
+	 */
+	public int getIsAbbreviation(APerson candidate) {
+		String sentence = getSentenceContainingCandidate(candidate.getStart());
+		if (sentence == null) {
+			return 0;
+		}
+		return hasWordInsideCandidate(parser.getAbbreviations(sentence), candidate);
+	}
+
+	/**
+	 * Share of the candidate's length that is covered by the string
+	 * SimilarityHelper.getBiggestSubstring returns for the candidate and the
+	 * first scientific name whose span contains the candidate's start.
+	 * 
+	 * @param candidate person candidate
+	 * @return substring length divided by candidate length; 0 if no name
+	 *         contains the candidate's start or the candidate length is 0
+	 */
+	public float getSimilarityPersonNameForPerson(APerson candidate) {
+		ScientificName name = findNameAt(candidate.getStart());
+		if (name == null) {
+			return 0;
+		}
+		String substring = SimilarityHelper.getBiggestSubstring(candidate.getReferenceInText(), name.getReferenceInText());
+		return ratio(substring.length(), candidate.getLength());
+	}
+
+	/**
+	 * Same as {@link #getSimilarityPersonNameForPerson(APerson)} but
+	 * relative to the length of the scientific name.
+	 * 
+	 * @param candidate person candidate
+	 * @return substring length divided by name length; 0 if no name contains
+	 *         the candidate's start or the name length is 0
+	 */
+	public float getSimilarityPersonNameForName(APerson candidate) {
+		ScientificName name = findNameAt(candidate.getStart());
+		if (name == null) {
+			return 0;
+		}
+		String substring = SimilarityHelper.getBiggestSubstring(candidate.getReferenceInText(), name.getReferenceInText());
+		return ratio(substring.length(), name.getLength());
+	}
+
+	/**
+	 * @param candidate person candidate
+	 * @return 1 if the first scientific name whose span contains the
+	 *         candidate's start begins exactly at that offset, otherwise 0
+	 */
+	public int doPersonAndNameStartAtSameIdx(APerson candidate) {
+		int index = candidate.getStart();
+		ScientificName name = findNameAt(index);
+		return (name != null && index == name.getStart()) ? 1 : 0;
+	}
+
+	/**
+	 * Share of the candidate's length that is covered by the string
+	 * SimilarityHelper.getBiggestSubstring returns for the candidate and the
+	 * first place whose span contains the candidate's start.
+	 * 
+	 * @param candidate person candidate
+	 * @return substring length divided by candidate length; 0 if no place
+	 *         contains the candidate's start or the candidate length is 0
+	 */
+	public float getSimilarityPersonPlaceForPerson(APerson candidate) {
+		Finding place = findPlaceAt(candidate.getStart());
+		if (place == null) {
+			return 0;
+		}
+		String substring = SimilarityHelper.getBiggestSubstring(candidate.getReferenceInText(), place.getReferenceInText());
+		return ratio(substring.length(), candidate.getLength());
+	}
+
+	/**
+	 * Same as {@link #getSimilarityPersonPlaceForPerson(APerson)} but
+	 * relative to the length of the place.
+	 * 
+	 * @param candidate person candidate
+	 * @return substring length divided by place length; 0 if no place
+	 *         contains the candidate's start or the place length is 0
+	 */
+	public float getSimilarityPersonPlaceForPlace(APerson candidate) {
+		Finding place = findPlaceAt(candidate.getStart());
+		if (place == null) {
+			return 0;
+		}
+		String substring = SimilarityHelper.getBiggestSubstring(candidate.getReferenceInText(), place.getReferenceInText());
+		return ratio(substring.length(), place.getLength());
+	}
+
+	/**
+	 * @param candidate person candidate
+	 * @return 1 if the first place whose span contains the candidate's start
+	 *         begins exactly at that offset, otherwise 0
+	 */
+	public int doPersonAndPlaceStartAtSameIdx(APerson candidate) {
+		int index = candidate.getStart();
+		Finding place = findPlaceAt(index);
+		return (place != null && index == place.getStart()) ? 1 : 0;
+	}
+
+	/**
+	 * Checks whether the candidate's text occurs in its sentence after an
+	 * opening and before a closing round bracket.
+	 * 
+	 * @param candidate person candidate
+	 * @return 1 if the sentence contains "(", later the candidate's
+	 *         reference text, and later ")"; otherwise 0 (also when no
+	 *         sentence is available)
+	 */
+	public int isSurroundedByBrackets(APerson candidate) {
+		String sentence = getSentenceContainingCandidate(candidate.getStart());
+		if (sentence == null) {
+			return 0;
+		}
+
+		// The brackets have to be escaped to be matched literally, and the
+		// reference text is quoted so that none of its characters is
+		// interpreted as regex syntax.
+		Pattern pattern = Pattern.compile("\\(.*?" + Pattern.quote(candidate.getReferenceInText()) + ".*?\\)");
+		Matcher matcher = pattern.matcher(sentence);
+		return matcher.find() ? 1 : 0;
+	}
+
+	/**
+	 * @param candidate person candidate
+	 * @return 1 if the sentence contains the candidate's reference text
+	 *         directly preceded by ", " and directly followed by ",";
+	 *         otherwise 0 (also when no sentence is available)
+	 */
+	public int isSurroundedByCommata(APerson candidate) {
+		String sentence = getSentenceContainingCandidate(candidate.getStart());
+		if (sentence == null) {
+			return 0;
+		}
+		return sentence.contains(", " + candidate.getReferenceInText() + ",") ? 1 : 0;
+	}
+
+	/**
+	 * @param candidate person candidate
+	 * @return 1 if the sentence contains the candidate's reference text
+	 *         directly followed by "'s"; otherwise 0 (also when no sentence
+	 *         is available)
+	 */
+	public int isFollowedBy_s(APerson candidate) {
+		String sentence = getSentenceContainingCandidate(candidate.getStart());
+		if (sentence == null) {
+			return 0;
+		}
+		return sentence.contains(candidate.getReferenceInText() + "'s") ? 1 : 0;
+	}
+
+	/**
+	 * Returns 1 if any of the given words has an index inside the
+	 * candidate's span, otherwise 0. A null list counts as empty.
+	 */
+	private static int hasWordInsideCandidate(List<Word> words, APerson candidate) {
+		if (words == null) {
+			return 0;
+		}
+		// NOTE(review): Word.getIndex() is compared directly with the
+		// candidate's offsets - confirm both use the same unit and origin.
+		for (Word word : words) {
+			if (word.getIndex() >= candidate.getStart() && word.getIndex() < (candidate.getStart() + candidate.getLength())) {
+				return 1;
+			}
+		}
+		return 0;
+	}
+
+	/**
+	 * Returns the first scientific name whose span [start, start + length)
+	 * contains the given index, or null if there is none or no names are set.
+	 */
+	private ScientificName findNameAt(int index) {
+		if (names == null) {
+			return null;
+		}
+		for (ScientificName name : names) {
+			if (index >= name.getStart() && index < (name.getStart() + name.getLength())) {
+				return name;
+			}
+		}
+		return null;
+	}
+
+	/**
+	 * Returns the first place whose span [start, start + length) contains
+	 * the given index, or null if there is none or no places are set.
+	 */
+	private Finding findPlaceAt(int index) {
+		if (places == null) {
+			return null;
+		}
+		for (Finding place : places) {
+			if (index >= place.getStart() && index < (place.getStart() + place.getLength())) {
+				return place;
+			}
+		}
+		return null;
+	}
+
+	/**
+	 * Divides part by whole using floating point arithmetic so that the
+	 * fractional part is kept; yields 0 for a non-positive whole.
+	 */
+	private static float ratio(int part, float whole) {
+		if (whole <= 0) {
+			return 0f;
+		}
+		return part / whole;
+	}
+}