comparison src/de/mpiwg/anteater/places/ml/preprocessing/LocationFeatureCalculator.java @ 0:036535fcd179

anteater
author jdamerow
date Fri, 14 Sep 2012 10:30:43 +0200
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:036535fcd179
1 package de.mpiwg.anteater.places.ml.preprocessing;
2
3 import java.util.HashMap;
4 import java.util.List;
5 import java.util.Map;
6 import java.util.regex.Matcher;
7 import java.util.regex.Pattern;
8
9 import de.mpiwg.anteater.ml.ITextParser;
10 import de.mpiwg.anteater.ml.preprocessing.FeatureCalculator;
11 import de.mpiwg.anteater.persons.APerson;
12 import de.mpiwg.anteater.places.Place;
13 import de.mpiwg.anteater.places.PlaceInformation;
14 import de.mpiwg.anteater.species.scientific.ScientificName;
15 import de.mpiwg.anteater.text.Paragraph;
16 import de.mpiwg.anteater.text.TextPart;
17
18 public class LocationFeatureCalculator extends FeatureCalculator {
19
20 private final String _university = "university";
21 private final String _study = "study";
22 private final String _studies = "studies";
23 private final String _in = " in ";
24 private final String _at = " at ";
25 private final String _survey = "survey";
26 private final String _species = "species";
27
28 private List<ScientificName> foundNames;
29 private List<APerson> foundApplicants;
30 private TextPart text;
31 private Map<String, Integer> typeMap;
32
33 public LocationFeatureCalculator(List<String> sentenceList,
34 ITextParser parser, TextPart text) {
35 super(sentenceList, parser, text.getText());
36 this.text = text;
37
38 typeMap = new HashMap<String, Integer>();
39 typeMap.put(Place.TOWN, 1);
40 typeMap.put(Place.COUNTY, 2);
41 typeMap.put(Place.STATE, 3);
42 typeMap.put(Place.COUNTRY, 4);
43 typeMap.put(Place.SUBURB, 5);
44 typeMap.put(Place.POI, 6);
45 typeMap.put(Place.ZIP, 7);
46 typeMap.put(Place.OCEAN, 8);
47 }
48
49 public double getNumberWordRelation(String[] placeParts) {
50 int numbers = 0;
51 int words = placeParts.length;
52
53 for (String part : placeParts) {
54 if (part.matches("[0-9]{1}.*"))
55 numbers++;
56 }
57
58 return 1.0 * numbers / words;
59 }
60
61 public double getUppercasedWordsToAllRelation(String[] placeParts) {
62 int uppercaseWords = 0;
63 int words = placeParts.length;
64
65 for (String part : placeParts) {
66 if (part.matches("[A-Z]{1}.*"))
67 uppercaseWords++;
68 }
69
70 return 1.0 * uppercaseWords / words;
71 }
72
73 public int contains2UppercaseCharacterWord(String[] placeParts) {
74 for (String part : placeParts) {
75 if (part.matches("[A-Z][A-Z]"))
76 return 1;
77 }
78 return 0;
79 }
80
81 public int containsUniversity(String[] parts) {
82 for (String part : parts) {
83 if (part.trim().toLowerCase().equals(_university))
84 return 1;
85 }
86 return 0;
87 }
88
89 public int isPreceededByAnd(PlaceInformation candidate) {
90 String sentence = getSentenceContainingCandidate(candidate.getStart());
91 if (sentence == null)
92 return 0;
93
94 int offset = getStartOfSentenceContainingCandidiate(candidate.getStart());
95
96 if (sentence.substring(0, candidate.getStart() - offset).trim()
97 .endsWith("and"))
98 return 1;
99 return 0;
100 }
101
102 public int isPreceededByThe(PlaceInformation candidate) {
103 String sentence = getSentenceContainingCandidate(candidate.getStart());
104 if (sentence == null)
105 return 0;
106
107 int offset = getStartOfSentenceContainingCandidiate(candidate.getStart());
108
109 if (sentence.substring(0, candidate.getStart() - offset).trim()
110 .endsWith("the"))
111 return 1;
112 return 0;
113 }
114
115 public int isSurroundedByBrackets(PlaceInformation candidate) {
116 String sentence = getSentenceContainingCandidate(candidate.getStart());
117 if (sentence == null)
118 return 0;
119
120 Pattern pattern = Pattern.compile("\\("
121 + candidate.getReferenceInText().replace("(", "\\(").replace(")","\\)") + "\\)");
122 Matcher matcher = pattern.matcher(sentence);
123 if (matcher.find())
124 return 1;
125 return 0;
126 }
127
128 public int isSurroundedByCommata(PlaceInformation candidate) {
129 String sentence = getSentenceContainingCandidate(candidate.getStart());
130 if (sentence == null)
131 return 0;
132
133 if (sentence.contains(", " + candidate.getReferenceInText() + ","))
134 return 1;
135 return 0;
136 }
137
138 public int getCharsToLastSpeciesInParagraph(PlaceInformation candidate) {
139
140 Paragraph paragraph = getParagraphOfCandidate(candidate);
141 int textcounter = text.getPositionInTextFromParagraph(paragraph, 0);
142
143 // find last species name
144 ScientificName closestName = null;
145 for (ScientificName name : foundNames) {
146 // if species is before candiddate
147 if (name.getStart() < candidate.getStart()
148 && name.getStart() > textcounter) {
149 // if it's closer than other last species replace lastName
150 if (closestName == null
151 || closestName.getStart() < name.getStart())
152 closestName = name;
153 }
154 }
155
156 if (closestName == null)
157 return -1;
158
159 return candidate.getStart() - (closestName.getStart() + closestName.getLength());
160 }
161
162 public int getCharsToNextSpeciesInParagraph(PlaceInformation candidate) {
163
164 Paragraph paragraph = getParagraphOfCandidate(candidate);
165 int textcounter = text.getPositionInTextFromParagraph(paragraph, 0);
166
167 // find last species name
168 ScientificName closestName = null;
169 for (ScientificName name : foundNames) {
170 // if species is before candiddate
171 if (name.getStart() > candidate.getStart() + candidate.getLength()
172 && name.getStart() < textcounter + paragraph.getParagraphText().length()) {
173 // if it's closer than other last species replace lastName
174 if (closestName == null
175 || closestName.getStart() > name.getStart())
176 closestName = name;
177 }
178 }
179
180 if (closestName == null)
181 return -1;
182
183 return closestName.getStart() - (candidate.getStart() + candidate.getLength());
184 }
185
186 public int getCharsToLastApplicantInParagraph(PlaceInformation candidate) {
187
188 Paragraph paragraph = getParagraphOfCandidate(candidate);
189 int textcounter = text.getPositionInTextFromParagraph(paragraph, 0);
190
191 // find last species name
192 APerson closestApplicant = null;
193 for (APerson person : foundApplicants) {
194 // if species is before candiddate
195 if (person.getStart() < candidate.getStart()
196 && person.getStart() > textcounter) {
197 // if it's closer than other last species replace lastName
198 if (closestApplicant == null
199 || closestApplicant.getStart() < person.getStart())
200 closestApplicant = person;
201 }
202 }
203
204 if (closestApplicant == null)
205 return -1;
206
207 return candidate.getStart() - (closestApplicant.getStart() + closestApplicant.getLength());
208 }
209
210 public int getCharToStudyInParagraph(PlaceInformation candidate) {
211 Paragraph paragraph = getParagraphOfCandidate(candidate);
212 int posInPara = text.getPositionInParagraphFromText(candidate.getStart());
213
214 return getOffsetToClosestWord(posInPara, candidate.getLength(), _study, paragraph.getParagraphText());
215 }
216
217 public int getCharToStudiesInParagraph(PlaceInformation candidate) {
218 Paragraph paragraph = getParagraphOfCandidate(candidate);
219 int posInPara = text.getPositionInParagraphFromText(candidate.getStart());
220
221 return getOffsetToClosestWord(posInPara, candidate.getLength(), _studies, paragraph.getParagraphText());
222 }
223
224 public int getCharToSurveyInSentence(PlaceInformation candidate) {
225 String sentence = getSentenceContainingCandidate(candidate.getStart());
226
227 return getOffsetToClosestWord(getPosInSentence(candidate), candidate.getLength(), _survey, sentence);
228 }
229
230 public int getCharToSpeciesInSentence(PlaceInformation candidate) {
231 String sentence = getSentenceContainingCandidate(candidate.getStart());
232
233 return getOffsetToClosestWord(getPosInSentence(candidate), candidate.getLength(), _species, sentence);
234 }
235
236 public int hasComma(PlaceInformation candidate) {
237 if (candidate.getReferenceInText().contains(","))
238 return 1;
239 return 0;
240 }
241
242 public int hasBracket(PlaceInformation candidate) {
243 if (candidate.getReferenceInText().contains(")") || candidate.getReferenceInText().contains("("))
244 return 1;
245 return 0;
246 }
247
248 public int getCharToInInSentence(PlaceInformation candidate) {
249 String sentence = getSentenceContainingCandidate(candidate.getStart());
250
251 return getOffsetToClosestWord(getPosInSentence(candidate), candidate.getLength(), _in, sentence);
252 }
253
254 public int getCharToAtInSentence(PlaceInformation candidate) {
255 String sentence = getSentenceContainingCandidate(candidate.getStart());
256
257 return getOffsetToClosestWord(getPosInSentence(candidate), candidate.getLength(), _at, sentence);
258 }
259
260 private int getPosInSentence(PlaceInformation candidate) {
261 Paragraph paragraph = getParagraphOfCandidate(candidate);
262 String sentence = getSentenceContainingCandidate(candidate.getStart());
263 int indexOfSentence = paragraph.getParagraphText().indexOf(sentence);
264 int posInPara = text.getPositionInParagraphFromText(candidate.getStart());
265 return posInPara - indexOfSentence;
266 }
267
268 private Paragraph getParagraphOfCandidate(PlaceInformation candidate) {
269 int textcounter = 0;
270 Paragraph paragraph = null;
271 for (Paragraph para : text.getParagraphs()) {
272 if (candidate.getStart() >= textcounter
273 && candidate.getStart() < textcounter
274 + para.getParagraphText().length()) {
275 paragraph = para;
276 break;
277 }
278 textcounter += para.getParagraphText().length() + 1;
279 }
280 return paragraph;
281 }
282
283 public int getType(Place candidate) {
284 String type = candidate.getType();
285
286 if (typeMap.get(type) == null)
287 return 0;
288
289 return typeMap.get(type);
290 }
291
292 public List<ScientificName> getFoundSpecies() {
293 return foundNames;
294 }
295
296 public void setFoundSpecies(List<ScientificName> foundSpecies) {
297 this.foundNames = foundSpecies;
298 }
299
300 public List<APerson> getFoundApplicant() {
301 return foundApplicants;
302 }
303
304 public void setFoundApplicant(List<APerson> foundApplicant) {
305 this.foundApplicants = foundApplicant;
306 }
307 }