0
|
1 package de.mpiwg.anteater.places.ml.preprocessing;
|
|
2
|
|
3 import java.util.HashMap;
|
|
4 import java.util.List;
|
|
5 import java.util.Map;
|
|
6 import java.util.regex.Matcher;
|
|
7 import java.util.regex.Pattern;
|
|
8
|
|
9 import de.mpiwg.anteater.ml.ITextParser;
|
|
10 import de.mpiwg.anteater.ml.preprocessing.FeatureCalculator;
|
|
11 import de.mpiwg.anteater.persons.APerson;
|
|
12 import de.mpiwg.anteater.places.Place;
|
|
13 import de.mpiwg.anteater.places.PlaceInformation;
|
|
14 import de.mpiwg.anteater.species.scientific.ScientificName;
|
|
15 import de.mpiwg.anteater.text.Paragraph;
|
|
16 import de.mpiwg.anteater.text.TextPart;
|
|
17
|
|
18 public class LocationFeatureCalculator extends FeatureCalculator {
|
|
19
|
|
20 private final String _university = "university";
|
|
21 private final String _study = "study";
|
|
22 private final String _studies = "studies";
|
|
23 private final String _in = " in ";
|
|
24 private final String _at = " at ";
|
|
25 private final String _survey = "survey";
|
|
26 private final String _species = "species";
|
|
27
|
|
28 private List<ScientificName> foundNames;
|
|
29 private List<APerson> foundApplicants;
|
|
30 private TextPart text;
|
|
31 private Map<String, Integer> typeMap;
|
|
32
|
|
33 public LocationFeatureCalculator(List<String> sentenceList,
|
|
34 ITextParser parser, TextPart text) {
|
|
35 super(sentenceList, parser, text.getText());
|
|
36 this.text = text;
|
|
37
|
|
38 typeMap = new HashMap<String, Integer>();
|
|
39 typeMap.put(Place.TOWN, 1);
|
|
40 typeMap.put(Place.COUNTY, 2);
|
|
41 typeMap.put(Place.STATE, 3);
|
|
42 typeMap.put(Place.COUNTRY, 4);
|
|
43 typeMap.put(Place.SUBURB, 5);
|
|
44 typeMap.put(Place.POI, 6);
|
|
45 typeMap.put(Place.ZIP, 7);
|
|
46 typeMap.put(Place.OCEAN, 8);
|
|
47 }
|
|
48
|
|
49 public double getNumberWordRelation(String[] placeParts) {
|
|
50 int numbers = 0;
|
|
51 int words = placeParts.length;
|
|
52
|
|
53 for (String part : placeParts) {
|
|
54 if (part.matches("[0-9]{1}.*"))
|
|
55 numbers++;
|
|
56 }
|
|
57
|
|
58 return 1.0 * numbers / words;
|
|
59 }
|
|
60
|
|
61 public double getUppercasedWordsToAllRelation(String[] placeParts) {
|
|
62 int uppercaseWords = 0;
|
|
63 int words = placeParts.length;
|
|
64
|
|
65 for (String part : placeParts) {
|
|
66 if (part.matches("[A-Z]{1}.*"))
|
|
67 uppercaseWords++;
|
|
68 }
|
|
69
|
|
70 return 1.0 * uppercaseWords / words;
|
|
71 }
|
|
72
|
|
73 public int contains2UppercaseCharacterWord(String[] placeParts) {
|
|
74 for (String part : placeParts) {
|
|
75 if (part.matches("[A-Z][A-Z]"))
|
|
76 return 1;
|
|
77 }
|
|
78 return 0;
|
|
79 }
|
|
80
|
|
81 public int containsUniversity(String[] parts) {
|
|
82 for (String part : parts) {
|
|
83 if (part.trim().toLowerCase().equals(_university))
|
|
84 return 1;
|
|
85 }
|
|
86 return 0;
|
|
87 }
|
|
88
|
|
89 public int isPreceededByAnd(PlaceInformation candidate) {
|
|
90 String sentence = getSentenceContainingCandidate(candidate.getStart());
|
|
91 if (sentence == null)
|
|
92 return 0;
|
|
93
|
|
94 int offset = getStartOfSentenceContainingCandidiate(candidate.getStart());
|
|
95
|
|
96 if (sentence.substring(0, candidate.getStart() - offset).trim()
|
|
97 .endsWith("and"))
|
|
98 return 1;
|
|
99 return 0;
|
|
100 }
|
|
101
|
|
102 public int isPreceededByThe(PlaceInformation candidate) {
|
|
103 String sentence = getSentenceContainingCandidate(candidate.getStart());
|
|
104 if (sentence == null)
|
|
105 return 0;
|
|
106
|
|
107 int offset = getStartOfSentenceContainingCandidiate(candidate.getStart());
|
|
108
|
|
109 if (sentence.substring(0, candidate.getStart() - offset).trim()
|
|
110 .endsWith("the"))
|
|
111 return 1;
|
|
112 return 0;
|
|
113 }
|
|
114
|
|
115 public int isSurroundedByBrackets(PlaceInformation candidate) {
|
|
116 String sentence = getSentenceContainingCandidate(candidate.getStart());
|
|
117 if (sentence == null)
|
|
118 return 0;
|
|
119
|
|
120 Pattern pattern = Pattern.compile("\\("
|
|
121 + candidate.getReferenceInText().replace("(", "\\(").replace(")","\\)") + "\\)");
|
|
122 Matcher matcher = pattern.matcher(sentence);
|
|
123 if (matcher.find())
|
|
124 return 1;
|
|
125 return 0;
|
|
126 }
|
|
127
|
|
128 public int isSurroundedByCommata(PlaceInformation candidate) {
|
|
129 String sentence = getSentenceContainingCandidate(candidate.getStart());
|
|
130 if (sentence == null)
|
|
131 return 0;
|
|
132
|
|
133 if (sentence.contains(", " + candidate.getReferenceInText() + ","))
|
|
134 return 1;
|
|
135 return 0;
|
|
136 }
|
|
137
|
|
138 public int getCharsToLastSpeciesInParagraph(PlaceInformation candidate) {
|
|
139
|
|
140 Paragraph paragraph = getParagraphOfCandidate(candidate);
|
|
141 int textcounter = text.getPositionInTextFromParagraph(paragraph, 0);
|
|
142
|
|
143 // find last species name
|
|
144 ScientificName closestName = null;
|
|
145 for (ScientificName name : foundNames) {
|
|
146 // if species is before candiddate
|
|
147 if (name.getStart() < candidate.getStart()
|
|
148 && name.getStart() > textcounter) {
|
|
149 // if it's closer than other last species replace lastName
|
|
150 if (closestName == null
|
|
151 || closestName.getStart() < name.getStart())
|
|
152 closestName = name;
|
|
153 }
|
|
154 }
|
|
155
|
|
156 if (closestName == null)
|
|
157 return -1;
|
|
158
|
|
159 return candidate.getStart() - (closestName.getStart() + closestName.getLength());
|
|
160 }
|
|
161
|
|
162 public int getCharsToNextSpeciesInParagraph(PlaceInformation candidate) {
|
|
163
|
|
164 Paragraph paragraph = getParagraphOfCandidate(candidate);
|
|
165 int textcounter = text.getPositionInTextFromParagraph(paragraph, 0);
|
|
166
|
|
167 // find last species name
|
|
168 ScientificName closestName = null;
|
|
169 for (ScientificName name : foundNames) {
|
|
170 // if species is before candiddate
|
|
171 if (name.getStart() > candidate.getStart() + candidate.getLength()
|
|
172 && name.getStart() < textcounter + paragraph.getParagraphText().length()) {
|
|
173 // if it's closer than other last species replace lastName
|
|
174 if (closestName == null
|
|
175 || closestName.getStart() > name.getStart())
|
|
176 closestName = name;
|
|
177 }
|
|
178 }
|
|
179
|
|
180 if (closestName == null)
|
|
181 return -1;
|
|
182
|
|
183 return closestName.getStart() - (candidate.getStart() + candidate.getLength());
|
|
184 }
|
|
185
|
|
186 public int getCharsToLastApplicantInParagraph(PlaceInformation candidate) {
|
|
187
|
|
188 Paragraph paragraph = getParagraphOfCandidate(candidate);
|
|
189 int textcounter = text.getPositionInTextFromParagraph(paragraph, 0);
|
|
190
|
|
191 // find last species name
|
|
192 APerson closestApplicant = null;
|
|
193 for (APerson person : foundApplicants) {
|
|
194 // if species is before candiddate
|
|
195 if (person.getStart() < candidate.getStart()
|
|
196 && person.getStart() > textcounter) {
|
|
197 // if it's closer than other last species replace lastName
|
|
198 if (closestApplicant == null
|
|
199 || closestApplicant.getStart() < person.getStart())
|
|
200 closestApplicant = person;
|
|
201 }
|
|
202 }
|
|
203
|
|
204 if (closestApplicant == null)
|
|
205 return -1;
|
|
206
|
|
207 return candidate.getStart() - (closestApplicant.getStart() + closestApplicant.getLength());
|
|
208 }
|
|
209
|
|
210 public int getCharToStudyInParagraph(PlaceInformation candidate) {
|
|
211 Paragraph paragraph = getParagraphOfCandidate(candidate);
|
|
212 int posInPara = text.getPositionInParagraphFromText(candidate.getStart());
|
|
213
|
|
214 return getOffsetToClosestWord(posInPara, candidate.getLength(), _study, paragraph.getParagraphText());
|
|
215 }
|
|
216
|
|
217 public int getCharToStudiesInParagraph(PlaceInformation candidate) {
|
|
218 Paragraph paragraph = getParagraphOfCandidate(candidate);
|
|
219 int posInPara = text.getPositionInParagraphFromText(candidate.getStart());
|
|
220
|
|
221 return getOffsetToClosestWord(posInPara, candidate.getLength(), _studies, paragraph.getParagraphText());
|
|
222 }
|
|
223
|
|
224 public int getCharToSurveyInSentence(PlaceInformation candidate) {
|
|
225 String sentence = getSentenceContainingCandidate(candidate.getStart());
|
|
226
|
|
227 return getOffsetToClosestWord(getPosInSentence(candidate), candidate.getLength(), _survey, sentence);
|
|
228 }
|
|
229
|
|
230 public int getCharToSpeciesInSentence(PlaceInformation candidate) {
|
|
231 String sentence = getSentenceContainingCandidate(candidate.getStart());
|
|
232
|
|
233 return getOffsetToClosestWord(getPosInSentence(candidate), candidate.getLength(), _species, sentence);
|
|
234 }
|
|
235
|
|
236 public int hasComma(PlaceInformation candidate) {
|
|
237 if (candidate.getReferenceInText().contains(","))
|
|
238 return 1;
|
|
239 return 0;
|
|
240 }
|
|
241
|
|
242 public int hasBracket(PlaceInformation candidate) {
|
|
243 if (candidate.getReferenceInText().contains(")") || candidate.getReferenceInText().contains("("))
|
|
244 return 1;
|
|
245 return 0;
|
|
246 }
|
|
247
|
|
248 public int getCharToInInSentence(PlaceInformation candidate) {
|
|
249 String sentence = getSentenceContainingCandidate(candidate.getStart());
|
|
250
|
|
251 return getOffsetToClosestWord(getPosInSentence(candidate), candidate.getLength(), _in, sentence);
|
|
252 }
|
|
253
|
|
254 public int getCharToAtInSentence(PlaceInformation candidate) {
|
|
255 String sentence = getSentenceContainingCandidate(candidate.getStart());
|
|
256
|
|
257 return getOffsetToClosestWord(getPosInSentence(candidate), candidate.getLength(), _at, sentence);
|
|
258 }
|
|
259
|
|
260 private int getPosInSentence(PlaceInformation candidate) {
|
|
261 Paragraph paragraph = getParagraphOfCandidate(candidate);
|
|
262 String sentence = getSentenceContainingCandidate(candidate.getStart());
|
|
263 int indexOfSentence = paragraph.getParagraphText().indexOf(sentence);
|
|
264 int posInPara = text.getPositionInParagraphFromText(candidate.getStart());
|
|
265 return posInPara - indexOfSentence;
|
|
266 }
|
|
267
|
|
268 private Paragraph getParagraphOfCandidate(PlaceInformation candidate) {
|
|
269 int textcounter = 0;
|
|
270 Paragraph paragraph = null;
|
|
271 for (Paragraph para : text.getParagraphs()) {
|
|
272 if (candidate.getStart() >= textcounter
|
|
273 && candidate.getStart() < textcounter
|
|
274 + para.getParagraphText().length()) {
|
|
275 paragraph = para;
|
|
276 break;
|
|
277 }
|
|
278 textcounter += para.getParagraphText().length() + 1;
|
|
279 }
|
|
280 return paragraph;
|
|
281 }
|
|
282
|
|
283 public int getType(Place candidate) {
|
|
284 String type = candidate.getType();
|
|
285
|
|
286 if (typeMap.get(type) == null)
|
|
287 return 0;
|
|
288
|
|
289 return typeMap.get(type);
|
|
290 }
|
|
291
|
|
292 public List<ScientificName> getFoundSpecies() {
|
|
293 return foundNames;
|
|
294 }
|
|
295
|
|
296 public void setFoundSpecies(List<ScientificName> foundSpecies) {
|
|
297 this.foundNames = foundSpecies;
|
|
298 }
|
|
299
|
|
300 public List<APerson> getFoundApplicant() {
|
|
301 return foundApplicants;
|
|
302 }
|
|
303
|
|
304 public void setFoundApplicant(List<APerson> foundApplicant) {
|
|
305 this.foundApplicants = foundApplicant;
|
|
306 }
|
|
307 }
|