Mercurial > hg > anteater
comparison src/de/mpiwg/anteater/persons/ml/preprocessing/ApplicantFeatureCalculator.java @ 0:036535fcd179
anteater
author | jdamerow |
---|---|
date | Fri, 14 Sep 2012 10:30:43 +0200 |
parents | |
children | 50aeb96a8ee9 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:036535fcd179 |
---|---|
1 package de.mpiwg.anteater.persons.ml.preprocessing; | |
2 | |
3 import java.util.List; | |
4 import java.util.regex.Matcher; | |
5 import java.util.regex.Pattern; | |
6 | |
7 import de.mpiwg.anteater.core.Finding; | |
8 import de.mpiwg.anteater.ml.ITextParser; | |
9 import de.mpiwg.anteater.ml.SimilarityHelper; | |
10 import de.mpiwg.anteater.ml.impl.Word; | |
11 import de.mpiwg.anteater.ml.preprocessing.FeatureCalculator; | |
12 import de.mpiwg.anteater.persons.APerson; | |
13 import de.mpiwg.anteater.places.PlaceInformation; | |
14 import de.mpiwg.anteater.species.scientific.ScientificName; | |
15 import de.mpiwg.anteater.text.TextPart; | |
16 | |
17 public class ApplicantFeatureCalculator extends FeatureCalculator { | |
18 | |
19 public ApplicantFeatureCalculator(List<String> sentenceList, | |
20 ITextParser parser, TextPart text) { | |
21 super(sentenceList, parser, text.getText()); | |
22 } | |
23 | |
24 private List<PlaceInformation> places; | |
25 private List<ScientificName> names; | |
26 | |
27 public List<ScientificName> getNames() { | |
28 return names; | |
29 } | |
30 | |
31 public void setNames(List<ScientificName> names) { | |
32 this.names = names; | |
33 } | |
34 | |
35 public int getSentenceContainsIssued(APerson candidate) { | |
36 return getSentenceContainsKeyword(KEYWORD_ISSUED, candidate.getStart()); | |
37 } | |
38 | |
39 public int getSentenceContainsApplied(APerson candidate) { | |
40 return getSentenceContainsKeyword(KEYWORD_APPLIED, candidate.getStart()); | |
41 } | |
42 | |
43 public int getSentenceContainsPermit(APerson candidate) { | |
44 return getSentenceContainsKeyword(KEYWORD_PERMIT, candidate.getStart()); | |
45 } | |
46 | |
47 public int getSentenceContainsComment(APerson candidate) { | |
48 return getSentenceContainsKeyword(KEYWORD_COMMENT, candidate.getStart()); | |
49 } | |
50 public int getSentenceContainsApplicant(APerson candidate) { | |
51 return getSentenceContainsKeyword(KEYWORD_APPLICANT, candidate.getStart()); | |
52 } | |
53 | |
54 public int getDistanceCandidateToApplicant(APerson candidate) { | |
55 if (getSentenceContainsApplicant(candidate) == 0) | |
56 return 0; | |
57 | |
58 String sentence = getSentenceContainingCandidate(candidate.getStart()); | |
59 int posOfApplicant = sentence.indexOf(KEYWORD_APPLICANT); | |
60 return posOfApplicant - candidate.getStart(); | |
61 } | |
62 | |
63 public int getIsSubject(APerson candidate) { | |
64 String sentence = getSentenceContainingCandidate(candidate.getStart()); | |
65 if (sentence == null) | |
66 return 0; | |
67 | |
68 List<Word> subjects = parser.getSubjects(sentence); | |
69 for (Word subj : subjects) { | |
70 if (subj.getIndex() >= candidate.getStart() && subj.getIndex() < (candidate.getStart() + candidate.getLength())) | |
71 return 1; | |
72 } | |
73 return 0; | |
74 } | |
75 | |
76 public int getIsAbbreviation(APerson candidate) { | |
77 String sentence = getSentenceContainingCandidate(candidate.getStart()); | |
78 if (sentence == null) | |
79 return 0; | |
80 | |
81 List<Word> abbrevs = parser.getAbbreviations(sentence); | |
82 for (Word abbr : abbrevs) { | |
83 if (abbr.getIndex() >= candidate.getStart() && abbr.getIndex() < (candidate.getStart() + candidate.getLength())) | |
84 return 1; | |
85 } | |
86 return 0; | |
87 } | |
88 | |
89 /** | |
90 * matching substring/person name | |
91 * @param candidate | |
92 * @return | |
93 */ | |
94 public float getSimilarityPersonNameForPerson(APerson candidate) { | |
95 int index = candidate.getStart(); | |
96 for (ScientificName name : names) { | |
97 if (index >= name.getStart() && index < (name.getStart() + name.getLength())) { | |
98 String substring = SimilarityHelper.getBiggestSubstring(candidate.getReferenceInText(), name.getReferenceInText()); | |
99 return substring.length()/candidate.getLength(); | |
100 } | |
101 } | |
102 return 0; | |
103 } | |
104 | |
105 public float getSimilarityPersonNameForName(APerson candidate) { | |
106 int index = candidate.getStart(); | |
107 for (ScientificName name : names) { | |
108 if (index >= name.getStart() && index < (name.getStart() + name.getLength())) { | |
109 String substring = SimilarityHelper.getBiggestSubstring(candidate.getReferenceInText(), name.getReferenceInText()); | |
110 return substring.length()/name.getLength(); | |
111 } | |
112 } | |
113 return 0; | |
114 } | |
115 | |
116 public int doPersonAndNameStartAtSameIdx(APerson candidate) { | |
117 int index = candidate.getStart(); | |
118 for (ScientificName name : names) { | |
119 if (index >= name.getStart() && index < (name.getStart() + name.getLength())) { | |
120 if (index == name.getStart()) | |
121 return 1; | |
122 return 0; | |
123 } | |
124 } | |
125 return 0; | |
126 } | |
127 | |
128 public float getSimilarityPersonPlaceForPerson(APerson candidate) { | |
129 int index = candidate.getStart(); | |
130 for (Finding place : places) { | |
131 if (index >= place.getStart() && index < (place.getStart() + place.getLength())) { | |
132 String substring = SimilarityHelper.getBiggestSubstring(candidate.getReferenceInText(), place.getReferenceInText()); | |
133 return substring.length()/candidate.getLength(); | |
134 } | |
135 } | |
136 return 0; | |
137 } | |
138 | |
139 public float getSimilarityPersonPlaceForPlace(APerson candidate) { | |
140 int index = candidate.getStart(); | |
141 for (Finding place : places) { | |
142 if (index >= place.getStart() && index < (place.getStart() + place.getLength())) { | |
143 String substring = SimilarityHelper.getBiggestSubstring(candidate.getReferenceInText(), place.getReferenceInText()); | |
144 return substring.length()/place.getLength(); | |
145 } | |
146 } | |
147 return 0; | |
148 } | |
149 | |
150 public int doPersonAndPlaceStartAtSameIdx(APerson candidate) { | |
151 int index = candidate.getStart(); | |
152 for (Finding place : places) { | |
153 if (index >= place.getStart() && index < (place.getStart() + place.getLength())) { | |
154 if (index == place.getStart()) | |
155 return 1; | |
156 return 0; | |
157 } | |
158 } | |
159 return 0; | |
160 } | |
161 | |
162 public int isSurroundedByBrackets(APerson candidate) { | |
163 String sentence = getSentenceContainingCandidate(candidate.getStart()); | |
164 if (sentence == null) | |
165 return 0; | |
166 | |
167 Pattern pattern = Pattern.compile("(.*?" + candidate.getReferenceInText().replace("(", "\\(").replace(")", "\\)") + ".*?)"); | |
168 Matcher matcher = pattern.matcher(sentence); | |
169 if (matcher.find()) | |
170 return 1; | |
171 return 0; | |
172 } | |
173 | |
174 public int isSurroundedByCommata(APerson candidate) { | |
175 String sentence = getSentenceContainingCandidate(candidate.getStart()); | |
176 if (sentence == null) | |
177 return 0; | |
178 | |
179 if (sentence.contains(", " + candidate.getReferenceInText() + ",")) | |
180 return 1; | |
181 return 0; | |
182 } | |
183 | |
184 public int isFollowedBy_s(APerson candidate) { | |
185 String sentence = getSentenceContainingCandidate(candidate.getStart()); | |
186 if (sentence == null) | |
187 return 0; | |
188 | |
189 if (sentence.contains(candidate.getReferenceInText() + "'s")) | |
190 return 1; | |
191 return 0; | |
192 } | |
193 | |
194 public void setPlaces(List<PlaceInformation> places) { | |
195 this.places = places; | |
196 } | |
197 | |
198 public List<PlaceInformation> getPlaces() { | |
199 return places; | |
200 } | |
201 } |