Mercurial > hg > anteater
diff src/de/mpiwg/anteater/persons/ml/preprocessing/ApplicantFeatureCalculator.java @ 0:036535fcd179
anteater
author | jdamerow |
---|---|
date | Fri, 14 Sep 2012 10:30:43 +0200 |
parents | |
children | 50aeb96a8ee9 |
line wrap: on
line diff
package de.mpiwg.anteater.persons.ml.preprocessing;

import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import de.mpiwg.anteater.core.Finding;
import de.mpiwg.anteater.ml.ITextParser;
import de.mpiwg.anteater.ml.SimilarityHelper;
import de.mpiwg.anteater.ml.impl.Word;
import de.mpiwg.anteater.ml.preprocessing.FeatureCalculator;
import de.mpiwg.anteater.persons.APerson;
import de.mpiwg.anteater.places.PlaceInformation;
import de.mpiwg.anteater.species.scientific.ScientificName;
import de.mpiwg.anteater.text.TextPart;

/**
 * Calculates numeric features for a person candidate ({@link APerson}).
 *
 * <p>Most features are 0/1 flags. The similarity features are ratios. The
 * scientific names and places used by the overlap features must be supplied
 * through {@link #setNames(List)} and {@link #setPlaces(List)}; while they are
 * unset the corresponding features evaluate to 0.
 */
public class ApplicantFeatureCalculator extends FeatureCalculator {

    private List<PlaceInformation> places;
    private List<ScientificName> names;

    /**
     * @param sentenceList sentences handed through to the superclass
     * @param parser       parser handed through to the superclass
     * @param text         text part whose {@code getText()} value is handed
     *                     to the superclass
     */
    public ApplicantFeatureCalculator(List<String> sentenceList,
            ITextParser parser, TextPart text) {
        super(sentenceList, parser, text.getText());
    }

    public List<ScientificName> getNames() {
        return names;
    }

    public void setNames(List<ScientificName> names) {
        this.names = names;
    }

    public List<PlaceInformation> getPlaces() {
        return places;
    }

    public void setPlaces(List<PlaceInformation> places) {
        this.places = places;
    }

    /** Keyword feature for {@code KEYWORD_ISSUED}, evaluated at the candidate's start. */
    public int getSentenceContainsIssued(APerson candidate) {
        return getSentenceContainsKeyword(KEYWORD_ISSUED, candidate.getStart());
    }

    /** Keyword feature for {@code KEYWORD_APPLIED}, evaluated at the candidate's start. */
    public int getSentenceContainsApplied(APerson candidate) {
        return getSentenceContainsKeyword(KEYWORD_APPLIED, candidate.getStart());
    }

    /** Keyword feature for {@code KEYWORD_PERMIT}, evaluated at the candidate's start. */
    public int getSentenceContainsPermit(APerson candidate) {
        return getSentenceContainsKeyword(KEYWORD_PERMIT, candidate.getStart());
    }

    /** Keyword feature for {@code KEYWORD_COMMENT}, evaluated at the candidate's start. */
    public int getSentenceContainsComment(APerson candidate) {
        return getSentenceContainsKeyword(KEYWORD_COMMENT, candidate.getStart());
    }

    /** Keyword feature for {@code KEYWORD_APPLICANT}, evaluated at the candidate's start. */
    public int getSentenceContainsApplicant(APerson candidate) {
        return getSentenceContainsKeyword(KEYWORD_APPLICANT, candidate.getStart());
    }

    /**
     * Difference between the position of {@code KEYWORD_APPLICANT} in the
     * candidate's sentence and the candidate's start.
     *
     * @return 0 if the applicant keyword feature is 0 or no sentence is found,
     *         otherwise {@code sentence.indexOf(KEYWORD_APPLICANT) - candidate.getStart()}
     */
    public int getDistanceCandidateToApplicant(APerson candidate) {
        if (getSentenceContainsApplicant(candidate) == 0) {
            return 0;
        }

        String sentence = getSentenceContainingCandidate(candidate.getStart());
        if (sentence == null) {
            return 0;
        }

        // NOTE(review): indexOf is relative to the sentence, while
        // candidate.getStart() looks like an offset into the whole text.
        // Confirm both use the same coordinate system. Also confirm that the
        // keyword check above is case-sensitive, otherwise indexOf may be -1.
        int posOfApplicant = sentence.indexOf(KEYWORD_APPLICANT);
        return posOfApplicant - candidate.getStart();
    }

    /**
     * @return 1 if one of the subjects the parser reports for the candidate's
     *         sentence has an index inside the candidate's span, else 0
     */
    public int getIsSubject(APerson candidate) {
        String sentence = getSentenceContainingCandidate(candidate.getStart());
        if (sentence == null) {
            return 0;
        }
        return containsWordInside(parser.getSubjects(sentence), candidate);
    }

    /**
     * @return 1 if one of the abbreviations the parser reports for the
     *         candidate's sentence has an index inside the candidate's span,
     *         else 0
     */
    public int getIsAbbreviation(APerson candidate) {
        String sentence = getSentenceContainingCandidate(candidate.getStart());
        if (sentence == null) {
            return 0;
        }
        return containsWordInside(parser.getAbbreviations(sentence), candidate);
    }

    /**
     * Length of the biggest common substring of the candidate and the
     * scientific name covering the candidate's start, relative to the
     * candidate's length.
     *
     * @return the ratio, or 0 if no scientific name covers the candidate's start
     */
    public float getSimilarityPersonNameForPerson(APerson candidate) {
        ScientificName name = findNameCovering(candidate.getStart());
        if (name == null) {
            return 0;
        }
        String common = SimilarityHelper.getBiggestSubstring(
                candidate.getReferenceInText(), name.getReferenceInText());
        return ratio(common.length(), candidate.getLength());
    }

    /**
     * Length of the biggest common substring of the candidate and the
     * scientific name covering the candidate's start, relative to the
     * name's length.
     *
     * @return the ratio, or 0 if no scientific name covers the candidate's start
     */
    public float getSimilarityPersonNameForName(APerson candidate) {
        ScientificName name = findNameCovering(candidate.getStart());
        if (name == null) {
            return 0;
        }
        String common = SimilarityHelper.getBiggestSubstring(
                candidate.getReferenceInText(), name.getReferenceInText());
        return ratio(common.length(), name.getLength());
    }

    /**
     * @return 1 if the first scientific name covering the candidate's start
     *         begins exactly at that start, else 0
     */
    public int doPersonAndNameStartAtSameIdx(APerson candidate) {
        int index = candidate.getStart();
        ScientificName name = findNameCovering(index);
        return (name != null && index == name.getStart()) ? 1 : 0;
    }

    /**
     * Length of the biggest common substring of the candidate and the place
     * covering the candidate's start, relative to the candidate's length.
     *
     * @return the ratio, or 0 if no place covers the candidate's start
     */
    public float getSimilarityPersonPlaceForPerson(APerson candidate) {
        Finding place = findPlaceCovering(candidate.getStart());
        if (place == null) {
            return 0;
        }
        String common = SimilarityHelper.getBiggestSubstring(
                candidate.getReferenceInText(), place.getReferenceInText());
        return ratio(common.length(), candidate.getLength());
    }

    /**
     * Length of the biggest common substring of the candidate and the place
     * covering the candidate's start, relative to the place's length.
     *
     * @return the ratio, or 0 if no place covers the candidate's start
     */
    public float getSimilarityPersonPlaceForPlace(APerson candidate) {
        Finding place = findPlaceCovering(candidate.getStart());
        if (place == null) {
            return 0;
        }
        String common = SimilarityHelper.getBiggestSubstring(
                candidate.getReferenceInText(), place.getReferenceInText());
        return ratio(common.length(), place.getLength());
    }

    /**
     * @return 1 if the first place covering the candidate's start begins
     *         exactly at that start, else 0
     */
    public int doPersonAndPlaceStartAtSameIdx(APerson candidate) {
        int index = candidate.getStart();
        Finding place = findPlaceCovering(index);
        return (place != null && index == place.getStart()) ? 1 : 0;
    }

    /**
     * @return 1 if the candidate's text occurs between a literal "(" and a
     *         later literal ")" in its sentence, else 0
     */
    public int isSurroundedByBrackets(APerson candidate) {
        String sentence = getSentenceContainingCandidate(candidate.getStart());
        if (sentence == null) {
            return 0;
        }

        // The brackets must be escaped to be matched literally; unescaped they
        // only form a capture group. Pattern.quote keeps any regex
        // metacharacter in the candidate text from being interpreted.
        Pattern pattern = Pattern.compile(
                "\\(.*?" + Pattern.quote(candidate.getReferenceInText()) + ".*?\\)");
        Matcher matcher = pattern.matcher(sentence);
        return matcher.find() ? 1 : 0;
    }

    /**
     * @return 1 if the sentence contains the candidate's text preceded by
     *         ", " and directly followed by ",", else 0
     */
    public int isSurroundedByCommata(APerson candidate) {
        String sentence = getSentenceContainingCandidate(candidate.getStart());
        if (sentence == null) {
            return 0;
        }
        return sentence.contains(", " + candidate.getReferenceInText() + ",") ? 1 : 0;
    }

    /**
     * @return 1 if the sentence contains the candidate's text directly
     *         followed by "'s", else 0
     */
    public int isFollowedBy_s(APerson candidate) {
        String sentence = getSentenceContainingCandidate(candidate.getStart());
        if (sentence == null) {
            return 0;
        }
        return sentence.contains(candidate.getReferenceInText() + "'s") ? 1 : 0;
    }

    /** Returns 1 if any word's index lies in [start, start + length) of the candidate, else 0. */
    private static int containsWordInside(List<Word> words, APerson candidate) {
        int start = candidate.getStart();
        int end = start + candidate.getLength();
        for (Word word : words) {
            if (word.getIndex() >= start && word.getIndex() < end) {
                return 1;
            }
        }
        return 0;
    }

    /** Returns the first scientific name whose span covers {@code index}, or null (also when no names are set). */
    private ScientificName findNameCovering(int index) {
        if (names == null) {
            return null;
        }
        for (ScientificName name : names) {
            if (index >= name.getStart() && index < (name.getStart() + name.getLength())) {
                return name;
            }
        }
        return null;
    }

    /** Returns the first place whose span covers {@code index}, or null (also when no places are set). */
    private Finding findPlaceCovering(int index) {
        if (places == null) {
            return null;
        }
        for (Finding place : places) {
            if (index >= place.getStart() && index < (place.getStart() + place.getLength())) {
                return place;
            }
        }
        return null;
    }

    /** Floating-point {@code part / whole}; 0 when {@code whole} is 0 so an empty span cannot cause a division error. */
    private static float ratio(float part, float whole) {
        return whole == 0 ? 0f : part / whole;
    }
}