comparison src/de/mpiwg/anteater/persons/ml/preprocessing/ApplicantFeatureCalculator.java @ 0:036535fcd179

anteater
author jdamerow
date Fri, 14 Sep 2012 10:30:43 +0200
parents
children 50aeb96a8ee9
comparison
equal deleted inserted replaced
-1:000000000000 0:036535fcd179
1 package de.mpiwg.anteater.persons.ml.preprocessing;
2
3 import java.util.List;
4 import java.util.regex.Matcher;
5 import java.util.regex.Pattern;
6
7 import de.mpiwg.anteater.core.Finding;
8 import de.mpiwg.anteater.ml.ITextParser;
9 import de.mpiwg.anteater.ml.SimilarityHelper;
10 import de.mpiwg.anteater.ml.impl.Word;
11 import de.mpiwg.anteater.ml.preprocessing.FeatureCalculator;
12 import de.mpiwg.anteater.persons.APerson;
13 import de.mpiwg.anteater.places.PlaceInformation;
14 import de.mpiwg.anteater.species.scientific.ScientificName;
15 import de.mpiwg.anteater.text.TextPart;
16
17 public class ApplicantFeatureCalculator extends FeatureCalculator {
18
19 public ApplicantFeatureCalculator(List<String> sentenceList,
20 ITextParser parser, TextPart text) {
21 super(sentenceList, parser, text.getText());
22 }
23
24 private List<PlaceInformation> places;
25 private List<ScientificName> names;
26
27 public List<ScientificName> getNames() {
28 return names;
29 }
30
31 public void setNames(List<ScientificName> names) {
32 this.names = names;
33 }
34
35 public int getSentenceContainsIssued(APerson candidate) {
36 return getSentenceContainsKeyword(KEYWORD_ISSUED, candidate.getStart());
37 }
38
39 public int getSentenceContainsApplied(APerson candidate) {
40 return getSentenceContainsKeyword(KEYWORD_APPLIED, candidate.getStart());
41 }
42
43 public int getSentenceContainsPermit(APerson candidate) {
44 return getSentenceContainsKeyword(KEYWORD_PERMIT, candidate.getStart());
45 }
46
47 public int getSentenceContainsComment(APerson candidate) {
48 return getSentenceContainsKeyword(KEYWORD_COMMENT, candidate.getStart());
49 }
50 public int getSentenceContainsApplicant(APerson candidate) {
51 return getSentenceContainsKeyword(KEYWORD_APPLICANT, candidate.getStart());
52 }
53
54 public int getDistanceCandidateToApplicant(APerson candidate) {
55 if (getSentenceContainsApplicant(candidate) == 0)
56 return 0;
57
58 String sentence = getSentenceContainingCandidate(candidate.getStart());
59 int posOfApplicant = sentence.indexOf(KEYWORD_APPLICANT);
60 return posOfApplicant - candidate.getStart();
61 }
62
63 public int getIsSubject(APerson candidate) {
64 String sentence = getSentenceContainingCandidate(candidate.getStart());
65 if (sentence == null)
66 return 0;
67
68 List<Word> subjects = parser.getSubjects(sentence);
69 for (Word subj : subjects) {
70 if (subj.getIndex() >= candidate.getStart() && subj.getIndex() < (candidate.getStart() + candidate.getLength()))
71 return 1;
72 }
73 return 0;
74 }
75
76 public int getIsAbbreviation(APerson candidate) {
77 String sentence = getSentenceContainingCandidate(candidate.getStart());
78 if (sentence == null)
79 return 0;
80
81 List<Word> abbrevs = parser.getAbbreviations(sentence);
82 for (Word abbr : abbrevs) {
83 if (abbr.getIndex() >= candidate.getStart() && abbr.getIndex() < (candidate.getStart() + candidate.getLength()))
84 return 1;
85 }
86 return 0;
87 }
88
89 /**
90 * matching substring/person name
91 * @param candidate
92 * @return
93 */
94 public float getSimilarityPersonNameForPerson(APerson candidate) {
95 int index = candidate.getStart();
96 for (ScientificName name : names) {
97 if (index >= name.getStart() && index < (name.getStart() + name.getLength())) {
98 String substring = SimilarityHelper.getBiggestSubstring(candidate.getReferenceInText(), name.getReferenceInText());
99 return substring.length()/candidate.getLength();
100 }
101 }
102 return 0;
103 }
104
105 public float getSimilarityPersonNameForName(APerson candidate) {
106 int index = candidate.getStart();
107 for (ScientificName name : names) {
108 if (index >= name.getStart() && index < (name.getStart() + name.getLength())) {
109 String substring = SimilarityHelper.getBiggestSubstring(candidate.getReferenceInText(), name.getReferenceInText());
110 return substring.length()/name.getLength();
111 }
112 }
113 return 0;
114 }
115
116 public int doPersonAndNameStartAtSameIdx(APerson candidate) {
117 int index = candidate.getStart();
118 for (ScientificName name : names) {
119 if (index >= name.getStart() && index < (name.getStart() + name.getLength())) {
120 if (index == name.getStart())
121 return 1;
122 return 0;
123 }
124 }
125 return 0;
126 }
127
128 public float getSimilarityPersonPlaceForPerson(APerson candidate) {
129 int index = candidate.getStart();
130 for (Finding place : places) {
131 if (index >= place.getStart() && index < (place.getStart() + place.getLength())) {
132 String substring = SimilarityHelper.getBiggestSubstring(candidate.getReferenceInText(), place.getReferenceInText());
133 return substring.length()/candidate.getLength();
134 }
135 }
136 return 0;
137 }
138
139 public float getSimilarityPersonPlaceForPlace(APerson candidate) {
140 int index = candidate.getStart();
141 for (Finding place : places) {
142 if (index >= place.getStart() && index < (place.getStart() + place.getLength())) {
143 String substring = SimilarityHelper.getBiggestSubstring(candidate.getReferenceInText(), place.getReferenceInText());
144 return substring.length()/place.getLength();
145 }
146 }
147 return 0;
148 }
149
150 public int doPersonAndPlaceStartAtSameIdx(APerson candidate) {
151 int index = candidate.getStart();
152 for (Finding place : places) {
153 if (index >= place.getStart() && index < (place.getStart() + place.getLength())) {
154 if (index == place.getStart())
155 return 1;
156 return 0;
157 }
158 }
159 return 0;
160 }
161
162 public int isSurroundedByBrackets(APerson candidate) {
163 String sentence = getSentenceContainingCandidate(candidate.getStart());
164 if (sentence == null)
165 return 0;
166
167 Pattern pattern = Pattern.compile("(.*?" + candidate.getReferenceInText().replace("(", "\\(").replace(")", "\\)") + ".*?)");
168 Matcher matcher = pattern.matcher(sentence);
169 if (matcher.find())
170 return 1;
171 return 0;
172 }
173
174 public int isSurroundedByCommata(APerson candidate) {
175 String sentence = getSentenceContainingCandidate(candidate.getStart());
176 if (sentence == null)
177 return 0;
178
179 if (sentence.contains(", " + candidate.getReferenceInText() + ","))
180 return 1;
181 return 0;
182 }
183
184 public int isFollowedBy_s(APerson candidate) {
185 String sentence = getSentenceContainingCandidate(candidate.getStart());
186 if (sentence == null)
187 return 0;
188
189 if (sentence.contains(candidate.getReferenceInText() + "'s"))
190 return 1;
191 return 0;
192 }
193
194 public void setPlaces(List<PlaceInformation> places) {
195 this.places = places;
196 }
197
198 public List<PlaceInformation> getPlaces() {
199 return places;
200 }
201 }