Mercurial > hg > anteater
comparison src/de/mpiwg/anteater/places/ml/preprocessing/LocationFeatureCalculator.java @ 0:036535fcd179
anteater
author | jdamerow |
---|---|
date | Fri, 14 Sep 2012 10:30:43 +0200 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:036535fcd179 |
---|---|
1 package de.mpiwg.anteater.places.ml.preprocessing; | |
2 | |
3 import java.util.HashMap; | |
4 import java.util.List; | |
5 import java.util.Map; | |
6 import java.util.regex.Matcher; | |
7 import java.util.regex.Pattern; | |
8 | |
9 import de.mpiwg.anteater.ml.ITextParser; | |
10 import de.mpiwg.anteater.ml.preprocessing.FeatureCalculator; | |
11 import de.mpiwg.anteater.persons.APerson; | |
12 import de.mpiwg.anteater.places.Place; | |
13 import de.mpiwg.anteater.places.PlaceInformation; | |
14 import de.mpiwg.anteater.species.scientific.ScientificName; | |
15 import de.mpiwg.anteater.text.Paragraph; | |
16 import de.mpiwg.anteater.text.TextPart; | |
17 | |
18 public class LocationFeatureCalculator extends FeatureCalculator { | |
19 | |
20 private final String _university = "university"; | |
21 private final String _study = "study"; | |
22 private final String _studies = "studies"; | |
23 private final String _in = " in "; | |
24 private final String _at = " at "; | |
25 private final String _survey = "survey"; | |
26 private final String _species = "species"; | |
27 | |
28 private List<ScientificName> foundNames; | |
29 private List<APerson> foundApplicants; | |
30 private TextPart text; | |
31 private Map<String, Integer> typeMap; | |
32 | |
33 public LocationFeatureCalculator(List<String> sentenceList, | |
34 ITextParser parser, TextPart text) { | |
35 super(sentenceList, parser, text.getText()); | |
36 this.text = text; | |
37 | |
38 typeMap = new HashMap<String, Integer>(); | |
39 typeMap.put(Place.TOWN, 1); | |
40 typeMap.put(Place.COUNTY, 2); | |
41 typeMap.put(Place.STATE, 3); | |
42 typeMap.put(Place.COUNTRY, 4); | |
43 typeMap.put(Place.SUBURB, 5); | |
44 typeMap.put(Place.POI, 6); | |
45 typeMap.put(Place.ZIP, 7); | |
46 typeMap.put(Place.OCEAN, 8); | |
47 } | |
48 | |
49 public double getNumberWordRelation(String[] placeParts) { | |
50 int numbers = 0; | |
51 int words = placeParts.length; | |
52 | |
53 for (String part : placeParts) { | |
54 if (part.matches("[0-9]{1}.*")) | |
55 numbers++; | |
56 } | |
57 | |
58 return 1.0 * numbers / words; | |
59 } | |
60 | |
61 public double getUppercasedWordsToAllRelation(String[] placeParts) { | |
62 int uppercaseWords = 0; | |
63 int words = placeParts.length; | |
64 | |
65 for (String part : placeParts) { | |
66 if (part.matches("[A-Z]{1}.*")) | |
67 uppercaseWords++; | |
68 } | |
69 | |
70 return 1.0 * uppercaseWords / words; | |
71 } | |
72 | |
73 public int contains2UppercaseCharacterWord(String[] placeParts) { | |
74 for (String part : placeParts) { | |
75 if (part.matches("[A-Z][A-Z]")) | |
76 return 1; | |
77 } | |
78 return 0; | |
79 } | |
80 | |
81 public int containsUniversity(String[] parts) { | |
82 for (String part : parts) { | |
83 if (part.trim().toLowerCase().equals(_university)) | |
84 return 1; | |
85 } | |
86 return 0; | |
87 } | |
88 | |
89 public int isPreceededByAnd(PlaceInformation candidate) { | |
90 String sentence = getSentenceContainingCandidate(candidate.getStart()); | |
91 if (sentence == null) | |
92 return 0; | |
93 | |
94 int offset = getStartOfSentenceContainingCandidiate(candidate.getStart()); | |
95 | |
96 if (sentence.substring(0, candidate.getStart() - offset).trim() | |
97 .endsWith("and")) | |
98 return 1; | |
99 return 0; | |
100 } | |
101 | |
102 public int isPreceededByThe(PlaceInformation candidate) { | |
103 String sentence = getSentenceContainingCandidate(candidate.getStart()); | |
104 if (sentence == null) | |
105 return 0; | |
106 | |
107 int offset = getStartOfSentenceContainingCandidiate(candidate.getStart()); | |
108 | |
109 if (sentence.substring(0, candidate.getStart() - offset).trim() | |
110 .endsWith("the")) | |
111 return 1; | |
112 return 0; | |
113 } | |
114 | |
115 public int isSurroundedByBrackets(PlaceInformation candidate) { | |
116 String sentence = getSentenceContainingCandidate(candidate.getStart()); | |
117 if (sentence == null) | |
118 return 0; | |
119 | |
120 Pattern pattern = Pattern.compile("\\(" | |
121 + candidate.getReferenceInText().replace("(", "\\(").replace(")","\\)") + "\\)"); | |
122 Matcher matcher = pattern.matcher(sentence); | |
123 if (matcher.find()) | |
124 return 1; | |
125 return 0; | |
126 } | |
127 | |
128 public int isSurroundedByCommata(PlaceInformation candidate) { | |
129 String sentence = getSentenceContainingCandidate(candidate.getStart()); | |
130 if (sentence == null) | |
131 return 0; | |
132 | |
133 if (sentence.contains(", " + candidate.getReferenceInText() + ",")) | |
134 return 1; | |
135 return 0; | |
136 } | |
137 | |
138 public int getCharsToLastSpeciesInParagraph(PlaceInformation candidate) { | |
139 | |
140 Paragraph paragraph = getParagraphOfCandidate(candidate); | |
141 int textcounter = text.getPositionInTextFromParagraph(paragraph, 0); | |
142 | |
143 // find last species name | |
144 ScientificName closestName = null; | |
145 for (ScientificName name : foundNames) { | |
146 // if species is before candiddate | |
147 if (name.getStart() < candidate.getStart() | |
148 && name.getStart() > textcounter) { | |
149 // if it's closer than other last species replace lastName | |
150 if (closestName == null | |
151 || closestName.getStart() < name.getStart()) | |
152 closestName = name; | |
153 } | |
154 } | |
155 | |
156 if (closestName == null) | |
157 return -1; | |
158 | |
159 return candidate.getStart() - (closestName.getStart() + closestName.getLength()); | |
160 } | |
161 | |
162 public int getCharsToNextSpeciesInParagraph(PlaceInformation candidate) { | |
163 | |
164 Paragraph paragraph = getParagraphOfCandidate(candidate); | |
165 int textcounter = text.getPositionInTextFromParagraph(paragraph, 0); | |
166 | |
167 // find last species name | |
168 ScientificName closestName = null; | |
169 for (ScientificName name : foundNames) { | |
170 // if species is before candiddate | |
171 if (name.getStart() > candidate.getStart() + candidate.getLength() | |
172 && name.getStart() < textcounter + paragraph.getParagraphText().length()) { | |
173 // if it's closer than other last species replace lastName | |
174 if (closestName == null | |
175 || closestName.getStart() > name.getStart()) | |
176 closestName = name; | |
177 } | |
178 } | |
179 | |
180 if (closestName == null) | |
181 return -1; | |
182 | |
183 return closestName.getStart() - (candidate.getStart() + candidate.getLength()); | |
184 } | |
185 | |
186 public int getCharsToLastApplicantInParagraph(PlaceInformation candidate) { | |
187 | |
188 Paragraph paragraph = getParagraphOfCandidate(candidate); | |
189 int textcounter = text.getPositionInTextFromParagraph(paragraph, 0); | |
190 | |
191 // find last species name | |
192 APerson closestApplicant = null; | |
193 for (APerson person : foundApplicants) { | |
194 // if species is before candiddate | |
195 if (person.getStart() < candidate.getStart() | |
196 && person.getStart() > textcounter) { | |
197 // if it's closer than other last species replace lastName | |
198 if (closestApplicant == null | |
199 || closestApplicant.getStart() < person.getStart()) | |
200 closestApplicant = person; | |
201 } | |
202 } | |
203 | |
204 if (closestApplicant == null) | |
205 return -1; | |
206 | |
207 return candidate.getStart() - (closestApplicant.getStart() + closestApplicant.getLength()); | |
208 } | |
209 | |
210 public int getCharToStudyInParagraph(PlaceInformation candidate) { | |
211 Paragraph paragraph = getParagraphOfCandidate(candidate); | |
212 int posInPara = text.getPositionInParagraphFromText(candidate.getStart()); | |
213 | |
214 return getOffsetToClosestWord(posInPara, candidate.getLength(), _study, paragraph.getParagraphText()); | |
215 } | |
216 | |
217 public int getCharToStudiesInParagraph(PlaceInformation candidate) { | |
218 Paragraph paragraph = getParagraphOfCandidate(candidate); | |
219 int posInPara = text.getPositionInParagraphFromText(candidate.getStart()); | |
220 | |
221 return getOffsetToClosestWord(posInPara, candidate.getLength(), _studies, paragraph.getParagraphText()); | |
222 } | |
223 | |
224 public int getCharToSurveyInSentence(PlaceInformation candidate) { | |
225 String sentence = getSentenceContainingCandidate(candidate.getStart()); | |
226 | |
227 return getOffsetToClosestWord(getPosInSentence(candidate), candidate.getLength(), _survey, sentence); | |
228 } | |
229 | |
230 public int getCharToSpeciesInSentence(PlaceInformation candidate) { | |
231 String sentence = getSentenceContainingCandidate(candidate.getStart()); | |
232 | |
233 return getOffsetToClosestWord(getPosInSentence(candidate), candidate.getLength(), _species, sentence); | |
234 } | |
235 | |
236 public int hasComma(PlaceInformation candidate) { | |
237 if (candidate.getReferenceInText().contains(",")) | |
238 return 1; | |
239 return 0; | |
240 } | |
241 | |
242 public int hasBracket(PlaceInformation candidate) { | |
243 if (candidate.getReferenceInText().contains(")") || candidate.getReferenceInText().contains("(")) | |
244 return 1; | |
245 return 0; | |
246 } | |
247 | |
248 public int getCharToInInSentence(PlaceInformation candidate) { | |
249 String sentence = getSentenceContainingCandidate(candidate.getStart()); | |
250 | |
251 return getOffsetToClosestWord(getPosInSentence(candidate), candidate.getLength(), _in, sentence); | |
252 } | |
253 | |
254 public int getCharToAtInSentence(PlaceInformation candidate) { | |
255 String sentence = getSentenceContainingCandidate(candidate.getStart()); | |
256 | |
257 return getOffsetToClosestWord(getPosInSentence(candidate), candidate.getLength(), _at, sentence); | |
258 } | |
259 | |
260 private int getPosInSentence(PlaceInformation candidate) { | |
261 Paragraph paragraph = getParagraphOfCandidate(candidate); | |
262 String sentence = getSentenceContainingCandidate(candidate.getStart()); | |
263 int indexOfSentence = paragraph.getParagraphText().indexOf(sentence); | |
264 int posInPara = text.getPositionInParagraphFromText(candidate.getStart()); | |
265 return posInPara - indexOfSentence; | |
266 } | |
267 | |
268 private Paragraph getParagraphOfCandidate(PlaceInformation candidate) { | |
269 int textcounter = 0; | |
270 Paragraph paragraph = null; | |
271 for (Paragraph para : text.getParagraphs()) { | |
272 if (candidate.getStart() >= textcounter | |
273 && candidate.getStart() < textcounter | |
274 + para.getParagraphText().length()) { | |
275 paragraph = para; | |
276 break; | |
277 } | |
278 textcounter += para.getParagraphText().length() + 1; | |
279 } | |
280 return paragraph; | |
281 } | |
282 | |
283 public int getType(Place candidate) { | |
284 String type = candidate.getType(); | |
285 | |
286 if (typeMap.get(type) == null) | |
287 return 0; | |
288 | |
289 return typeMap.get(type); | |
290 } | |
291 | |
292 public List<ScientificName> getFoundSpecies() { | |
293 return foundNames; | |
294 } | |
295 | |
296 public void setFoundSpecies(List<ScientificName> foundSpecies) { | |
297 this.foundNames = foundSpecies; | |
298 } | |
299 | |
300 public List<APerson> getFoundApplicant() { | |
301 return foundApplicants; | |
302 } | |
303 | |
304 public void setFoundApplicant(List<APerson> foundApplicant) { | |
305 this.foundApplicants = foundApplicant; | |
306 } | |
307 } |