Mercurial > hg > anteater
view src/de/mpiwg/anteater/persons/ml/preprocessing/ApplicantFeatureCalculator.java @ 0:036535fcd179
anteater
author | jdamerow |
---|---|
date | Fri, 14 Sep 2012 10:30:43 +0200 |
parents | |
children | 50aeb96a8ee9 |
line wrap: on
line source
package de.mpiwg.anteater.persons.ml.preprocessing;

import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import de.mpiwg.anteater.core.Finding;
import de.mpiwg.anteater.ml.ITextParser;
import de.mpiwg.anteater.ml.SimilarityHelper;
import de.mpiwg.anteater.ml.impl.Word;
import de.mpiwg.anteater.ml.preprocessing.FeatureCalculator;
import de.mpiwg.anteater.persons.APerson;
import de.mpiwg.anteater.places.PlaceInformation;
import de.mpiwg.anteater.species.scientific.ScientificName;
import de.mpiwg.anteater.text.TextPart;

/**
 * Computes numeric features for a person candidate ({@link APerson}).
 *
 * <p>Features are derived from the sentence containing the candidate (keyword
 * presence, grammatical role, surrounding punctuation) and from the scientific
 * names and places set on this calculator whose text range covers the
 * candidate's start offset.
 *
 * <p>Yes/no features are reported as {@code 1} / {@code 0}. The lists of names
 * and places must be supplied through {@link #setNames(List)} and
 * {@link #setPlaces(List)}; while they are unset the corresponding features
 * evaluate to {@code 0}.
 */
public class ApplicantFeatureCalculator extends FeatureCalculator {

    /** Places found in the text; {@code null} until {@link #setPlaces(List)} is called. */
    private List<PlaceInformation> places;

    /** Scientific names found in the text; {@code null} until {@link #setNames(List)} is called. */
    private List<ScientificName> names;

    /**
     * Creates a calculator and forwards the sentence list, the parser and the
     * plain text of {@code text} to the superclass constructor.
     *
     * @param sentenceList sentences of the text
     * @param parser parser used for the subject and abbreviation features
     * @param text text part whose {@code getText()} value is analysed
     */
    public ApplicantFeatureCalculator(List<String> sentenceList, ITextParser parser, TextPart text) {
        super(sentenceList, parser, text.getText());
    }

    /** @return the scientific names used by the name features, possibly {@code null} */
    public List<ScientificName> getNames() {
        return names;
    }

    /** @param names the scientific names to use for the name features */
    public void setNames(List<ScientificName> names) {
        this.names = names;
    }

    /** @return the places used by the place features, possibly {@code null} */
    public List<PlaceInformation> getPlaces() {
        return places;
    }

    /** @param places the places to use for the place features */
    public void setPlaces(List<PlaceInformation> places) {
        this.places = places;
    }

    /**
     * @param candidate person candidate
     * @return result of {@code getSentenceContainsKeyword} for {@code KEYWORD_ISSUED}
     *         at the candidate's start offset
     */
    public int getSentenceContainsIssued(APerson candidate) {
        return getSentenceContainsKeyword(KEYWORD_ISSUED, candidate.getStart());
    }

    /**
     * @param candidate person candidate
     * @return result of {@code getSentenceContainsKeyword} for {@code KEYWORD_APPLIED}
     *         at the candidate's start offset
     */
    public int getSentenceContainsApplied(APerson candidate) {
        return getSentenceContainsKeyword(KEYWORD_APPLIED, candidate.getStart());
    }

    /**
     * @param candidate person candidate
     * @return result of {@code getSentenceContainsKeyword} for {@code KEYWORD_PERMIT}
     *         at the candidate's start offset
     */
    public int getSentenceContainsPermit(APerson candidate) {
        return getSentenceContainsKeyword(KEYWORD_PERMIT, candidate.getStart());
    }

    /**
     * @param candidate person candidate
     * @return result of {@code getSentenceContainsKeyword} for {@code KEYWORD_COMMENT}
     *         at the candidate's start offset
     */
    public int getSentenceContainsComment(APerson candidate) {
        return getSentenceContainsKeyword(KEYWORD_COMMENT, candidate.getStart());
    }

    /**
     * @param candidate person candidate
     * @return result of {@code getSentenceContainsKeyword} for {@code KEYWORD_APPLICANT}
     *         at the candidate's start offset
     */
    public int getSentenceContainsApplicant(APerson candidate) {
        return getSentenceContainsKeyword(KEYWORD_APPLICANT, candidate.getStart());
    }

    /**
     * Returns the position of {@code KEYWORD_APPLICANT} in the candidate's
     * sentence minus the candidate's start offset.
     *
     * @param candidate person candidate
     * @return the difference described above, or {@code 0} when
     *         {@link #getSentenceContainsApplicant(APerson)} reports {@code 0},
     *         no sentence is found, or the keyword does not occur literally in
     *         the sentence
     */
    public int getDistanceCandidateToApplicant(APerson candidate) {
        if (getSentenceContainsApplicant(candidate) == 0) {
            return 0;
        }
        String sentence = getSentenceContainingCandidate(candidate.getStart());
        if (sentence == null) {
            return 0;
        }
        int posOfApplicant = sentence.indexOf(KEYWORD_APPLICANT);
        if (posOfApplicant < 0) {
            // Without this guard a missing literal match would yield (-1 - start).
            return 0;
        }
        // NOTE(review): posOfApplicant is an offset inside the sentence, whereas
        // candidate.getStart() is compared against name/place offsets elsewhere in
        // this class. If getStart() is a document offset the two values are in
        // different coordinate systems -- confirm against FeatureCalculator.
        return posOfApplicant - candidate.getStart();
    }

    /**
     * Checks whether one of the subjects the parser reports for the candidate's
     * sentence has an index inside the candidate's range.
     *
     * @param candidate person candidate
     * @return {@code 1} if such a subject exists, otherwise {@code 0} (also when
     *         no sentence is found)
     */
    public int getIsSubject(APerson candidate) {
        String sentence = getSentenceContainingCandidate(candidate.getStart());
        if (sentence == null) {
            return 0;
        }
        return hasWordInsideCandidate(parser.getSubjects(sentence), candidate);
    }

    /**
     * Checks whether one of the abbreviations the parser reports for the
     * candidate's sentence has an index inside the candidate's range.
     *
     * @param candidate person candidate
     * @return {@code 1} if such an abbreviation exists, otherwise {@code 0}
     *         (also when no sentence is found)
     */
    public int getIsAbbreviation(APerson candidate) {
        String sentence = getSentenceContainingCandidate(candidate.getStart());
        if (sentence == null) {
            return 0;
        }
        return hasWordInsideCandidate(parser.getAbbreviations(sentence), candidate);
    }

    /**
     * Length of the biggest substring shared by the candidate and the first
     * scientific name covering the candidate's start, relative to the
     * candidate's length.
     *
     * @param candidate person candidate
     * @return the ratio, or {@code 0} if no name covers the candidate's start
     *         or the candidate's length is zero
     */
    public float getSimilarityPersonNameForPerson(APerson candidate) {
        ScientificName name = findNameCovering(candidate.getStart());
        if (name == null) {
            return 0;
        }
        return sharedSubstringRatio(candidate.getReferenceInText(), name.getReferenceInText(),
                candidate.getLength());
    }

    /**
     * Length of the biggest substring shared by the candidate and the first
     * scientific name covering the candidate's start, relative to the name's
     * length.
     *
     * @param candidate person candidate
     * @return the ratio, or {@code 0} if no name covers the candidate's start
     *         or the name's length is zero
     */
    public float getSimilarityPersonNameForName(APerson candidate) {
        ScientificName name = findNameCovering(candidate.getStart());
        if (name == null) {
            return 0;
        }
        return sharedSubstringRatio(candidate.getReferenceInText(), name.getReferenceInText(),
                name.getLength());
    }

    /**
     * @param candidate person candidate
     * @return {@code 1} if the first scientific name covering the candidate's
     *         start begins at exactly that offset, otherwise {@code 0}
     */
    public int doPersonAndNameStartAtSameIdx(APerson candidate) {
        int index = candidate.getStart();
        ScientificName name = findNameCovering(index);
        return (name != null && index == name.getStart()) ? 1 : 0;
    }

    /**
     * Length of the biggest substring shared by the candidate and the first
     * place covering the candidate's start, relative to the candidate's length.
     *
     * @param candidate person candidate
     * @return the ratio, or {@code 0} if no place covers the candidate's start
     *         or the candidate's length is zero
     */
    public float getSimilarityPersonPlaceForPerson(APerson candidate) {
        Finding place = findPlaceCovering(candidate.getStart());
        if (place == null) {
            return 0;
        }
        return sharedSubstringRatio(candidate.getReferenceInText(), place.getReferenceInText(),
                candidate.getLength());
    }

    /**
     * Length of the biggest substring shared by the candidate and the first
     * place covering the candidate's start, relative to the place's length.
     *
     * @param candidate person candidate
     * @return the ratio, or {@code 0} if no place covers the candidate's start
     *         or the place's length is zero
     */
    public float getSimilarityPersonPlaceForPlace(APerson candidate) {
        Finding place = findPlaceCovering(candidate.getStart());
        if (place == null) {
            return 0;
        }
        return sharedSubstringRatio(candidate.getReferenceInText(), place.getReferenceInText(),
                place.getLength());
    }

    /**
     * @param candidate person candidate
     * @return {@code 1} if the first place covering the candidate's start
     *         begins at exactly that offset, otherwise {@code 0}
     */
    public int doPersonAndPlaceStartAtSameIdx(APerson candidate) {
        int index = candidate.getStart();
        Finding place = findPlaceCovering(index);
        return (place != null && index == place.getStart()) ? 1 : 0;
    }

    /**
     * Checks whether the candidate's sentence contains an opening bracket,
     * later the candidate's text, and later a closing bracket.
     *
     * @param candidate person candidate
     * @return {@code 1} if that pattern is found, otherwise {@code 0} (also
     *         when no sentence is found)
     */
    public int isSurroundedByBrackets(APerson candidate) {
        String sentence = getSentenceContainingCandidate(candidate.getStart());
        if (sentence == null) {
            return 0;
        }
        // The brackets must be escaped to be matched literally; unescaped they
        // form a regex group and every sentence containing the text would match.
        // Pattern.quote keeps metacharacters in the candidate text (".", "[", ...)
        // from being interpreted as regex syntax.
        Pattern pattern = Pattern.compile(
                "\\(.*?" + Pattern.quote(candidate.getReferenceInText()) + ".*?\\)");
        Matcher matcher = pattern.matcher(sentence);
        return matcher.find() ? 1 : 0;
    }

    /**
     * Checks whether the candidate's sentence contains the candidate's text
     * preceded by {@code ", "} and directly followed by {@code ","}.
     *
     * @param candidate person candidate
     * @return {@code 1} if so, otherwise {@code 0} (also when no sentence is found)
     */
    public int isSurroundedByCommata(APerson candidate) {
        String sentence = getSentenceContainingCandidate(candidate.getStart());
        if (sentence == null) {
            return 0;
        }
        return sentence.contains(", " + candidate.getReferenceInText() + ",") ? 1 : 0;
    }

    /**
     * Checks whether the candidate's sentence contains the candidate's text
     * directly followed by {@code 's}.
     *
     * @param candidate person candidate
     * @return {@code 1} if so, otherwise {@code 0} (also when no sentence is found)
     */
    public int isFollowedBy_s(APerson candidate) {
        String sentence = getSentenceContainingCandidate(candidate.getStart());
        if (sentence == null) {
            return 0;
        }
        return sentence.contains(candidate.getReferenceInText() + "'s") ? 1 : 0;
    }

    /** Returns the first scientific name whose range [start, start + length) contains {@code index}, or {@code null}. */
    private ScientificName findNameCovering(int index) {
        if (names == null) {
            return null;
        }
        for (ScientificName name : names) {
            if (index >= name.getStart() && index < (name.getStart() + name.getLength())) {
                return name;
            }
        }
        return null;
    }

    /** Returns the first place whose range [start, start + length) contains {@code index}, or {@code null}. */
    private Finding findPlaceCovering(int index) {
        if (places == null) {
            return null;
        }
        for (Finding place : places) {
            if (index >= place.getStart() && index < (place.getStart() + place.getLength())) {
                return place;
            }
        }
        return null;
    }

    /** Returns 1 if any word's index lies in [candidate start, candidate start + length), otherwise 0. */
    private static int hasWordInsideCandidate(List<Word> words, APerson candidate) {
        if (words == null) {
            return 0;
        }
        // NOTE(review): Word.getIndex() is compared directly with the candidate's
        // start offset; confirm both use the same unit and origin.
        for (Word word : words) {
            if (word.getIndex() >= candidate.getStart()
                    && word.getIndex() < (candidate.getStart() + candidate.getLength())) {
                return 1;
            }
        }
        return 0;
    }

    /**
     * Returns the length of the biggest common substring of the two texts
     * divided by {@code total}, or 0 when {@code total} is not positive.
     */
    private static float sharedSubstringRatio(String candidateText, String otherText, float total) {
        if (total <= 0) {
            return 0f;
        }
        String common = SimilarityHelper.getBiggestSubstring(candidateText, otherText);
        if (common == null) {
            return 0f;
        }
        // Divide in floating point: integer division would truncate every
        // partial overlap to 0.
        return common.length() / total;
    }
}