0
|
1 package de.mpiwg.anteater.persons.ml.preprocessing;
|
|
2
|
|
3 import java.util.ArrayList;
|
|
4 import java.util.List;
|
|
5
|
|
6 import de.mpiwg.anteater.AnteaterConfiguration;
|
|
7 import de.mpiwg.anteater.ml.ITextParser;
|
|
8 import de.mpiwg.anteater.ml.preprocessing.DataCreator;
|
|
9 import de.mpiwg.anteater.persons.APerson;
|
|
10 import de.mpiwg.anteater.persons.Location;
|
|
11 import de.mpiwg.anteater.persons.Organization;
|
|
12 import de.mpiwg.anteater.persons.Person;
|
|
13 import de.mpiwg.anteater.persons.PersonsExtraction;
|
|
14 import de.mpiwg.anteater.places.PlaceInformation;
|
|
15 import de.mpiwg.anteater.places.PlacesExtraction;
|
|
16 import de.mpiwg.anteater.species.scientific.ScientificName;
|
|
17 import de.mpiwg.anteater.species.scientific.ScientificNamesExtraction;
|
|
18 import de.mpiwg.anteater.text.Paragraph;
|
|
19 import de.mpiwg.anteater.text.TextInformation;
|
|
20 import de.mpiwg.anteater.text.TextPart;
|
|
21
|
|
22 public class ApplicantDataCreator extends DataCreator {
|
|
23
|
|
24 public ApplicantDataCreator(AnteaterConfiguration configuration) {
|
|
25 super(configuration, "APPLICANT_");
|
|
26 }
|
|
27
|
|
28 @Override
|
|
29 public void createFileContents(TextInformation info, StringBuffer arffContents, ITextParser textParser) {
|
|
30 List<PersonsExtraction> results = info.getPersonsExtractions();
|
|
31
|
|
32 for (PersonsExtraction pResult : results) {
|
|
33 List<APerson> candidates = pResult.getPersons();
|
|
34 TextPart text = null;
|
|
35 switch(pResult.getType()) {
|
|
36 // summary
|
|
37 case 1: text = info.getSummaries().get(pResult.getTextIdx()); break;
|
|
38 // supplementary information
|
|
39 case 2: text = info.getSupplInfos().get(pResult.getTextIdx());
|
|
40 }
|
|
41
|
|
42 if (text == null)
|
|
43 continue;
|
|
44
|
|
45 List<String> sentences = new ArrayList<String>();
|
|
46
|
|
47 //ITextParser icuParser = new ICUTextParser();
|
|
48
|
|
49 long start = System.currentTimeMillis();
|
|
50 for (Paragraph p : text.getParagraphs()) {
|
|
51 sentences.addAll(textParser.getSentences(p.getParagraphText()));
|
|
52 }
|
|
53 long end = System.currentTimeMillis();
|
|
54 configuration.getLogger().logMessage(COMPONENT_NAME, "Splitting text into sentences: " + (end - start) + "ms");
|
|
55
|
|
56 ApplicantFeatureCalculator calculator = new ApplicantFeatureCalculator(sentences, textParser, text);
|
|
57 List<PlaceInformation> places = new ArrayList<PlaceInformation>();
|
|
58 List<ScientificName> names = new ArrayList<ScientificName>();
|
|
59
|
|
60 for (PlacesExtraction r : info.getPlacesExtractions()) {
|
|
61 if (r.getTextIdx() == pResult.getTextIdx())
|
|
62 places = r.getPlaceInformation();
|
|
63 }
|
|
64
|
|
65 for (ScientificNamesExtraction r : info.getScientificNamesExtractions()) {
|
|
66 if (r.getTextIdx() == pResult.getTextIdx())
|
|
67 names = r.getNames();
|
|
68 }
|
|
69 calculator.setNames(names);
|
|
70 calculator.setPlaces(places);
|
|
71
|
|
72 for (APerson candidate : candidates) {
|
|
73 StringBuffer dataPoint = new StringBuffer();
|
|
74
|
|
75 // unknown class
|
|
76 dataPoint.append(UNKNOWN_CLASS_SYMBOL);
|
|
77 dataPoint.append(",");
|
|
78
|
|
79 // add text type
|
|
80 dataPoint.append(pResult.getType());
|
|
81 dataPoint.append(",");
|
|
82 // name_length
|
|
83 dataPoint.append(candidate.getReferenceInText().length());
|
|
84 dataPoint.append(",");
|
|
85 // contains issued
|
|
86 dataPoint.append(calculator.getSentenceContainsIssued(candidate));
|
|
87 dataPoint.append(",");
|
|
88 // contains applied
|
|
89 dataPoint.append(calculator.getSentenceContainsApplied(candidate));
|
|
90 dataPoint.append(",");
|
|
91 // contains permit
|
|
92 dataPoint.append(calculator.getSentenceContainsPermit(candidate));
|
|
93 dataPoint.append(",");
|
|
94 // contains comment
|
|
95 dataPoint.append(calculator.getSentenceContainsComment(candidate));
|
|
96 dataPoint.append(",");
|
|
97 // is subject
|
|
98 start = System.currentTimeMillis();
|
|
99 dataPoint.append(calculator.getIsSubject(candidate));
|
|
100 dataPoint.append(",");
|
|
101 end = System.currentTimeMillis();
|
|
102 configuration.getLogger().logMessage(COMPONENT_NAME, "Determining subject: " + (end - start) + "ms");
|
|
103
|
|
104 // contains applicant
|
|
105 dataPoint.append(calculator.getSentenceContainsApplicant(candidate));
|
|
106 dataPoint.append(",");
|
|
107 // distance term to applicant
|
|
108 dataPoint.append(calculator.getDistanceCandidateToApplicant(candidate));
|
|
109 dataPoint.append(",");
|
|
110 // person, location, organization
|
|
111 if (candidate instanceof Person)
|
|
112 dataPoint.append(1);
|
|
113 else if (candidate instanceof Organization)
|
|
114 dataPoint.append(2);
|
|
115 else if (candidate instanceof Location)
|
|
116 dataPoint.append(3);
|
|
117 else
|
|
118 continue;
|
|
119 dataPoint.append(",");
|
|
120
|
|
121 // get similarity to speciies names
|
|
122 dataPoint.append(calculator.getSimilarityPersonNameForPerson(candidate));
|
|
123 dataPoint.append(",");
|
|
124 dataPoint.append(calculator.getSimilarityPersonNameForName(candidate));
|
|
125 dataPoint.append(",");
|
|
126 dataPoint.append(calculator.doPersonAndNameStartAtSameIdx(candidate));
|
|
127 dataPoint.append(",");
|
|
128
|
|
129 // get similarity to places names
|
|
130 dataPoint.append(calculator.getSimilarityPersonPlaceForPerson(candidate));
|
|
131 dataPoint.append(",");
|
|
132 dataPoint.append(calculator.getSimilarityPersonPlaceForPlace(candidate));
|
|
133 dataPoint.append(",");
|
|
134 dataPoint.append(calculator.doPersonAndPlaceStartAtSameIdx(candidate));
|
|
135 dataPoint.append(",");
|
|
136
|
|
137 // is surrounded by brackets
|
|
138 dataPoint.append(calculator.isSurroundedByBrackets(candidate));
|
|
139 dataPoint.append(",");
|
|
140
|
|
141 // is surrounded by commata
|
|
142 dataPoint.append(calculator.isSurroundedByCommata(candidate));
|
|
143 dataPoint.append(",");
|
|
144 // followed by 's
|
|
145 dataPoint.append(calculator.isFollowedBy_s(candidate));
|
|
146 dataPoint.append(",");
|
|
147
|
|
148 // is abbreviation
|
|
149 dataPoint.append(calculator.getIsAbbreviation(candidate));
|
|
150
|
|
151 dataPoint.append("\n");
|
|
152
|
|
153 arffContents.append(dataPoint);
|
|
154 }
|
|
155 }
|
|
156 }
|
|
157
|
|
158 }
|