annotate src/de/mpiwg/anteater/persons/ml/preprocessing/ApplicantDataCreator.java @ 10:70510ec97f4a default tip

annotate texts with results and build events with linnaeus
author jdamerow
date Mon, 19 Nov 2012 16:36:54 -0700
parents 036535fcd179
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
036535fcd179 anteater
jdamerow
parents:
diff changeset
1 package de.mpiwg.anteater.persons.ml.preprocessing;
036535fcd179 anteater
jdamerow
parents:
diff changeset
2
036535fcd179 anteater
jdamerow
parents:
diff changeset
3 import java.util.ArrayList;
036535fcd179 anteater
jdamerow
parents:
diff changeset
4 import java.util.List;
036535fcd179 anteater
jdamerow
parents:
diff changeset
5
036535fcd179 anteater
jdamerow
parents:
diff changeset
6 import de.mpiwg.anteater.AnteaterConfiguration;
036535fcd179 anteater
jdamerow
parents:
diff changeset
7 import de.mpiwg.anteater.ml.ITextParser;
036535fcd179 anteater
jdamerow
parents:
diff changeset
8 import de.mpiwg.anteater.ml.preprocessing.DataCreator;
036535fcd179 anteater
jdamerow
parents:
diff changeset
9 import de.mpiwg.anteater.persons.APerson;
036535fcd179 anteater
jdamerow
parents:
diff changeset
10 import de.mpiwg.anteater.persons.Location;
036535fcd179 anteater
jdamerow
parents:
diff changeset
11 import de.mpiwg.anteater.persons.Organization;
036535fcd179 anteater
jdamerow
parents:
diff changeset
12 import de.mpiwg.anteater.persons.Person;
036535fcd179 anteater
jdamerow
parents:
diff changeset
13 import de.mpiwg.anteater.persons.PersonsExtraction;
036535fcd179 anteater
jdamerow
parents:
diff changeset
14 import de.mpiwg.anteater.places.PlaceInformation;
036535fcd179 anteater
jdamerow
parents:
diff changeset
15 import de.mpiwg.anteater.places.PlacesExtraction;
036535fcd179 anteater
jdamerow
parents:
diff changeset
16 import de.mpiwg.anteater.species.scientific.ScientificName;
036535fcd179 anteater
jdamerow
parents:
diff changeset
17 import de.mpiwg.anteater.species.scientific.ScientificNamesExtraction;
036535fcd179 anteater
jdamerow
parents:
diff changeset
18 import de.mpiwg.anteater.text.Paragraph;
036535fcd179 anteater
jdamerow
parents:
diff changeset
19 import de.mpiwg.anteater.text.TextInformation;
036535fcd179 anteater
jdamerow
parents:
diff changeset
20 import de.mpiwg.anteater.text.TextPart;
036535fcd179 anteater
jdamerow
parents:
diff changeset
21
036535fcd179 anteater
jdamerow
parents:
diff changeset
22 public class ApplicantDataCreator extends DataCreator {
036535fcd179 anteater
jdamerow
parents:
diff changeset
23
036535fcd179 anteater
jdamerow
parents:
diff changeset
24 public ApplicantDataCreator(AnteaterConfiguration configuration) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
25 super(configuration, "APPLICANT_");
036535fcd179 anteater
jdamerow
parents:
diff changeset
26 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
27
036535fcd179 anteater
jdamerow
parents:
diff changeset
28 @Override
036535fcd179 anteater
jdamerow
parents:
diff changeset
29 public void createFileContents(TextInformation info, StringBuffer arffContents, ITextParser textParser) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
30 List<PersonsExtraction> results = info.getPersonsExtractions();
036535fcd179 anteater
jdamerow
parents:
diff changeset
31
036535fcd179 anteater
jdamerow
parents:
diff changeset
32 for (PersonsExtraction pResult : results) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
33 List<APerson> candidates = pResult.getPersons();
036535fcd179 anteater
jdamerow
parents:
diff changeset
34 TextPart text = null;
036535fcd179 anteater
jdamerow
parents:
diff changeset
35 switch(pResult.getType()) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
36 // summary
036535fcd179 anteater
jdamerow
parents:
diff changeset
37 case 1: text = info.getSummaries().get(pResult.getTextIdx()); break;
036535fcd179 anteater
jdamerow
parents:
diff changeset
38 // supplementary information
036535fcd179 anteater
jdamerow
parents:
diff changeset
39 case 2: text = info.getSupplInfos().get(pResult.getTextIdx());
036535fcd179 anteater
jdamerow
parents:
diff changeset
40 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
41
036535fcd179 anteater
jdamerow
parents:
diff changeset
42 if (text == null)
036535fcd179 anteater
jdamerow
parents:
diff changeset
43 continue;
036535fcd179 anteater
jdamerow
parents:
diff changeset
44
036535fcd179 anteater
jdamerow
parents:
diff changeset
45 List<String> sentences = new ArrayList<String>();
036535fcd179 anteater
jdamerow
parents:
diff changeset
46
036535fcd179 anteater
jdamerow
parents:
diff changeset
47 //ITextParser icuParser = new ICUTextParser();
036535fcd179 anteater
jdamerow
parents:
diff changeset
48
036535fcd179 anteater
jdamerow
parents:
diff changeset
49 long start = System.currentTimeMillis();
036535fcd179 anteater
jdamerow
parents:
diff changeset
50 for (Paragraph p : text.getParagraphs()) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
51 sentences.addAll(textParser.getSentences(p.getParagraphText()));
036535fcd179 anteater
jdamerow
parents:
diff changeset
52 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
53 long end = System.currentTimeMillis();
036535fcd179 anteater
jdamerow
parents:
diff changeset
54 configuration.getLogger().logMessage(COMPONENT_NAME, "Splitting text into sentences: " + (end - start) + "ms");
036535fcd179 anteater
jdamerow
parents:
diff changeset
55
036535fcd179 anteater
jdamerow
parents:
diff changeset
56 ApplicantFeatureCalculator calculator = new ApplicantFeatureCalculator(sentences, textParser, text);
036535fcd179 anteater
jdamerow
parents:
diff changeset
57 List<PlaceInformation> places = new ArrayList<PlaceInformation>();
036535fcd179 anteater
jdamerow
parents:
diff changeset
58 List<ScientificName> names = new ArrayList<ScientificName>();
036535fcd179 anteater
jdamerow
parents:
diff changeset
59
036535fcd179 anteater
jdamerow
parents:
diff changeset
60 for (PlacesExtraction r : info.getPlacesExtractions()) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
61 if (r.getTextIdx() == pResult.getTextIdx())
036535fcd179 anteater
jdamerow
parents:
diff changeset
62 places = r.getPlaceInformation();
036535fcd179 anteater
jdamerow
parents:
diff changeset
63 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
64
036535fcd179 anteater
jdamerow
parents:
diff changeset
65 for (ScientificNamesExtraction r : info.getScientificNamesExtractions()) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
66 if (r.getTextIdx() == pResult.getTextIdx())
036535fcd179 anteater
jdamerow
parents:
diff changeset
67 names = r.getNames();
036535fcd179 anteater
jdamerow
parents:
diff changeset
68 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
69 calculator.setNames(names);
036535fcd179 anteater
jdamerow
parents:
diff changeset
70 calculator.setPlaces(places);
036535fcd179 anteater
jdamerow
parents:
diff changeset
71
036535fcd179 anteater
jdamerow
parents:
diff changeset
72 for (APerson candidate : candidates) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
73 StringBuffer dataPoint = new StringBuffer();
036535fcd179 anteater
jdamerow
parents:
diff changeset
74
036535fcd179 anteater
jdamerow
parents:
diff changeset
75 // unknown class
036535fcd179 anteater
jdamerow
parents:
diff changeset
76 dataPoint.append(UNKNOWN_CLASS_SYMBOL);
036535fcd179 anteater
jdamerow
parents:
diff changeset
77 dataPoint.append(",");
036535fcd179 anteater
jdamerow
parents:
diff changeset
78
036535fcd179 anteater
jdamerow
parents:
diff changeset
79 // add text type
036535fcd179 anteater
jdamerow
parents:
diff changeset
80 dataPoint.append(pResult.getType());
036535fcd179 anteater
jdamerow
parents:
diff changeset
81 dataPoint.append(",");
036535fcd179 anteater
jdamerow
parents:
diff changeset
82 // name_length
036535fcd179 anteater
jdamerow
parents:
diff changeset
83 dataPoint.append(candidate.getReferenceInText().length());
036535fcd179 anteater
jdamerow
parents:
diff changeset
84 dataPoint.append(",");
036535fcd179 anteater
jdamerow
parents:
diff changeset
85 // contains issued
036535fcd179 anteater
jdamerow
parents:
diff changeset
86 dataPoint.append(calculator.getSentenceContainsIssued(candidate));
036535fcd179 anteater
jdamerow
parents:
diff changeset
87 dataPoint.append(",");
036535fcd179 anteater
jdamerow
parents:
diff changeset
88 // contains applied
036535fcd179 anteater
jdamerow
parents:
diff changeset
89 dataPoint.append(calculator.getSentenceContainsApplied(candidate));
036535fcd179 anteater
jdamerow
parents:
diff changeset
90 dataPoint.append(",");
036535fcd179 anteater
jdamerow
parents:
diff changeset
91 // contains permit
036535fcd179 anteater
jdamerow
parents:
diff changeset
92 dataPoint.append(calculator.getSentenceContainsPermit(candidate));
036535fcd179 anteater
jdamerow
parents:
diff changeset
93 dataPoint.append(",");
036535fcd179 anteater
jdamerow
parents:
diff changeset
94 // contains comment
036535fcd179 anteater
jdamerow
parents:
diff changeset
95 dataPoint.append(calculator.getSentenceContainsComment(candidate));
036535fcd179 anteater
jdamerow
parents:
diff changeset
96 dataPoint.append(",");
036535fcd179 anteater
jdamerow
parents:
diff changeset
97 // is subject
036535fcd179 anteater
jdamerow
parents:
diff changeset
98 start = System.currentTimeMillis();
036535fcd179 anteater
jdamerow
parents:
diff changeset
99 dataPoint.append(calculator.getIsSubject(candidate));
036535fcd179 anteater
jdamerow
parents:
diff changeset
100 dataPoint.append(",");
036535fcd179 anteater
jdamerow
parents:
diff changeset
101 end = System.currentTimeMillis();
036535fcd179 anteater
jdamerow
parents:
diff changeset
102 configuration.getLogger().logMessage(COMPONENT_NAME, "Determining subject: " + (end - start) + "ms");
036535fcd179 anteater
jdamerow
parents:
diff changeset
103
036535fcd179 anteater
jdamerow
parents:
diff changeset
104 // contains applicant
036535fcd179 anteater
jdamerow
parents:
diff changeset
105 dataPoint.append(calculator.getSentenceContainsApplicant(candidate));
036535fcd179 anteater
jdamerow
parents:
diff changeset
106 dataPoint.append(",");
036535fcd179 anteater
jdamerow
parents:
diff changeset
107 // distance term to applicant
036535fcd179 anteater
jdamerow
parents:
diff changeset
108 dataPoint.append(calculator.getDistanceCandidateToApplicant(candidate));
036535fcd179 anteater
jdamerow
parents:
diff changeset
109 dataPoint.append(",");
036535fcd179 anteater
jdamerow
parents:
diff changeset
110 // person, location, organization
036535fcd179 anteater
jdamerow
parents:
diff changeset
111 if (candidate instanceof Person)
036535fcd179 anteater
jdamerow
parents:
diff changeset
112 dataPoint.append(1);
036535fcd179 anteater
jdamerow
parents:
diff changeset
113 else if (candidate instanceof Organization)
036535fcd179 anteater
jdamerow
parents:
diff changeset
114 dataPoint.append(2);
036535fcd179 anteater
jdamerow
parents:
diff changeset
115 else if (candidate instanceof Location)
036535fcd179 anteater
jdamerow
parents:
diff changeset
116 dataPoint.append(3);
036535fcd179 anteater
jdamerow
parents:
diff changeset
117 else
036535fcd179 anteater
jdamerow
parents:
diff changeset
118 continue;
036535fcd179 anteater
jdamerow
parents:
diff changeset
119 dataPoint.append(",");
036535fcd179 anteater
jdamerow
parents:
diff changeset
120
036535fcd179 anteater
jdamerow
parents:
diff changeset
121 // get similarity to speciies names
036535fcd179 anteater
jdamerow
parents:
diff changeset
122 dataPoint.append(calculator.getSimilarityPersonNameForPerson(candidate));
036535fcd179 anteater
jdamerow
parents:
diff changeset
123 dataPoint.append(",");
036535fcd179 anteater
jdamerow
parents:
diff changeset
124 dataPoint.append(calculator.getSimilarityPersonNameForName(candidate));
036535fcd179 anteater
jdamerow
parents:
diff changeset
125 dataPoint.append(",");
036535fcd179 anteater
jdamerow
parents:
diff changeset
126 dataPoint.append(calculator.doPersonAndNameStartAtSameIdx(candidate));
036535fcd179 anteater
jdamerow
parents:
diff changeset
127 dataPoint.append(",");
036535fcd179 anteater
jdamerow
parents:
diff changeset
128
036535fcd179 anteater
jdamerow
parents:
diff changeset
129 // get similarity to places names
036535fcd179 anteater
jdamerow
parents:
diff changeset
130 dataPoint.append(calculator.getSimilarityPersonPlaceForPerson(candidate));
036535fcd179 anteater
jdamerow
parents:
diff changeset
131 dataPoint.append(",");
036535fcd179 anteater
jdamerow
parents:
diff changeset
132 dataPoint.append(calculator.getSimilarityPersonPlaceForPlace(candidate));
036535fcd179 anteater
jdamerow
parents:
diff changeset
133 dataPoint.append(",");
036535fcd179 anteater
jdamerow
parents:
diff changeset
134 dataPoint.append(calculator.doPersonAndPlaceStartAtSameIdx(candidate));
036535fcd179 anteater
jdamerow
parents:
diff changeset
135 dataPoint.append(",");
036535fcd179 anteater
jdamerow
parents:
diff changeset
136
036535fcd179 anteater
jdamerow
parents:
diff changeset
137 // is surrounded by brackets
036535fcd179 anteater
jdamerow
parents:
diff changeset
138 dataPoint.append(calculator.isSurroundedByBrackets(candidate));
036535fcd179 anteater
jdamerow
parents:
diff changeset
139 dataPoint.append(",");
036535fcd179 anteater
jdamerow
parents:
diff changeset
140
036535fcd179 anteater
jdamerow
parents:
diff changeset
141 // is surrounded by commata
036535fcd179 anteater
jdamerow
parents:
diff changeset
142 dataPoint.append(calculator.isSurroundedByCommata(candidate));
036535fcd179 anteater
jdamerow
parents:
diff changeset
143 dataPoint.append(",");
036535fcd179 anteater
jdamerow
parents:
diff changeset
144 // followed by 's
036535fcd179 anteater
jdamerow
parents:
diff changeset
145 dataPoint.append(calculator.isFollowedBy_s(candidate));
036535fcd179 anteater
jdamerow
parents:
diff changeset
146 dataPoint.append(",");
036535fcd179 anteater
jdamerow
parents:
diff changeset
147
036535fcd179 anteater
jdamerow
parents:
diff changeset
148 // is abbreviation
036535fcd179 anteater
jdamerow
parents:
diff changeset
149 dataPoint.append(calculator.getIsAbbreviation(candidate));
036535fcd179 anteater
jdamerow
parents:
diff changeset
150
036535fcd179 anteater
jdamerow
parents:
diff changeset
151 dataPoint.append("\n");
036535fcd179 anteater
jdamerow
parents:
diff changeset
152
036535fcd179 anteater
jdamerow
parents:
diff changeset
153 arffContents.append(dataPoint);
036535fcd179 anteater
jdamerow
parents:
diff changeset
154 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
155 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
156 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
157
036535fcd179 anteater
jdamerow
parents:
diff changeset
158 }