annotate src/de/mpiwg/anteater/results/ResultController.java @ 0:036535fcd179

anteater
author jdamerow
date Fri, 14 Sep 2012 10:30:43 +0200
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
036535fcd179 anteater
jdamerow
parents:
diff changeset
1 package de.mpiwg.anteater.results;
036535fcd179 anteater
jdamerow
parents:
diff changeset
2
036535fcd179 anteater
jdamerow
parents:
diff changeset
3 import java.io.File;
036535fcd179 anteater
jdamerow
parents:
diff changeset
4 import java.util.ArrayList;
036535fcd179 anteater
jdamerow
parents:
diff changeset
5 import java.util.Collections;
036535fcd179 anteater
jdamerow
parents:
diff changeset
6 import java.util.Comparator;
036535fcd179 anteater
jdamerow
parents:
diff changeset
7 import java.util.List;
036535fcd179 anteater
jdamerow
parents:
diff changeset
8 import java.util.Map;
036535fcd179 anteater
jdamerow
parents:
diff changeset
9 import java.util.Stack;
036535fcd179 anteater
jdamerow
parents:
diff changeset
10
036535fcd179 anteater
jdamerow
parents:
diff changeset
11 import org.apache.commons.lang3.StringEscapeUtils;
036535fcd179 anteater
jdamerow
parents:
diff changeset
12
036535fcd179 anteater
jdamerow
parents:
diff changeset
13 import de.mpiwg.anteater.AnteaterConfiguration;
036535fcd179 anteater
jdamerow
parents:
diff changeset
14 import de.mpiwg.anteater.results.filter.FilterController;
036535fcd179 anteater
jdamerow
parents:
diff changeset
15 import de.mpiwg.anteater.results.impl.ApplicantResultFinder;
036535fcd179 anteater
jdamerow
parents:
diff changeset
16 import de.mpiwg.anteater.results.impl.ApplicantResultManager;
036535fcd179 anteater
jdamerow
parents:
diff changeset
17 import de.mpiwg.anteater.results.impl.LocationResultFinder;
036535fcd179 anteater
jdamerow
parents:
diff changeset
18 import de.mpiwg.anteater.results.impl.LocationResultManager;
036535fcd179 anteater
jdamerow
parents:
diff changeset
19 import de.mpiwg.anteater.results.impl.ScientificNameResultFinder;
036535fcd179 anteater
jdamerow
parents:
diff changeset
20 import de.mpiwg.anteater.results.impl.ScientificNameResultManager;
036535fcd179 anteater
jdamerow
parents:
diff changeset
21 import de.mpiwg.anteater.text.Paragraph;
036535fcd179 anteater
jdamerow
parents:
diff changeset
22 import de.mpiwg.anteater.text.TextInformation;
036535fcd179 anteater
jdamerow
parents:
diff changeset
23 import de.mpiwg.anteater.text.TextPart;
036535fcd179 anteater
jdamerow
parents:
diff changeset
24 import de.mpiwg.anteater.xml.IResultFileManager;
036535fcd179 anteater
jdamerow
parents:
diff changeset
25 import de.mpiwg.anteater.xml.impl.ResultXMLManager;
036535fcd179 anteater
jdamerow
parents:
diff changeset
26
036535fcd179 anteater
jdamerow
parents:
diff changeset
27 public class ResultController {
036535fcd179 anteater
jdamerow
parents:
diff changeset
28
036535fcd179 anteater
jdamerow
parents:
diff changeset
29 public final static String SUMMARY_TAG = "<summary>";
036535fcd179 anteater
jdamerow
parents:
diff changeset
30 public final static String SUMMARY_TAG_CLOSE = "</summary>";
036535fcd179 anteater
jdamerow
parents:
diff changeset
31 public final static String SUPPLINFO_TAG = "<supplInfo>";
036535fcd179 anteater
jdamerow
parents:
diff changeset
32 public final static String SUPPLINFO_TAG_CLOSE = "</supplInfo>";
036535fcd179 anteater
jdamerow
parents:
diff changeset
33
036535fcd179 anteater
jdamerow
parents:
diff changeset
34 public final static String COMPONENT_NAME = ResultController.class.getSimpleName();
036535fcd179 anteater
jdamerow
parents:
diff changeset
35
036535fcd179 anteater
jdamerow
parents:
diff changeset
36 private AnteaterConfiguration configuration;
036535fcd179 anteater
jdamerow
parents:
diff changeset
37
036535fcd179 anteater
jdamerow
parents:
diff changeset
38 public ResultController(AnteaterConfiguration configuration) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
39 this.configuration = configuration;
036535fcd179 anteater
jdamerow
parents:
diff changeset
40 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
41
036535fcd179 anteater
jdamerow
parents:
diff changeset
42 public List<ResultsCarrier> saveResults(List<TextInformation> infos) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
43
036535fcd179 anteater
jdamerow
parents:
diff changeset
44 String resultFolder = configuration.getResultPath();
036535fcd179 anteater
jdamerow
parents:
diff changeset
45 List<ResultsCarrier> resultCarriers = new ArrayList<ResultsCarrier>();
036535fcd179 anteater
jdamerow
parents:
diff changeset
46
036535fcd179 anteater
jdamerow
parents:
diff changeset
47 ApplicantResultManager applicantManager = new ApplicantResultManager(configuration);
036535fcd179 anteater
jdamerow
parents:
diff changeset
48 ScientificNameResultManager speciesManager = new ScientificNameResultManager(configuration);
036535fcd179 anteater
jdamerow
parents:
diff changeset
49 LocationResultManager locationManager = new LocationResultManager(configuration);
036535fcd179 anteater
jdamerow
parents:
diff changeset
50
036535fcd179 anteater
jdamerow
parents:
diff changeset
51 // find applicants
036535fcd179 anteater
jdamerow
parents:
diff changeset
52 IResultFinder<ApplicantResult> applicantsFinder = new ApplicantResultFinder(configuration);
036535fcd179 anteater
jdamerow
parents:
diff changeset
53 List<ApplicantResult> applicants = applicantsFinder.getResults(infos);
036535fcd179 anteater
jdamerow
parents:
diff changeset
54 Map<TextInformation, List<ApplicantResult>> sortedApplicants = applicantManager.sortResultsByText(applicants);
036535fcd179 anteater
jdamerow
parents:
diff changeset
55
036535fcd179 anteater
jdamerow
parents:
diff changeset
56 // find species
036535fcd179 anteater
jdamerow
parents:
diff changeset
57 IResultFinder<SpeciesScientificResult> namesFinder = new ScientificNameResultFinder();
036535fcd179 anteater
jdamerow
parents:
diff changeset
58 List<SpeciesScientificResult> names = namesFinder.getResults(infos);
036535fcd179 anteater
jdamerow
parents:
diff changeset
59 Map<TextInformation, List<SpeciesScientificResult>> sortedNames = speciesManager.sortResultsByText(names);
036535fcd179 anteater
jdamerow
parents:
diff changeset
60
036535fcd179 anteater
jdamerow
parents:
diff changeset
61 // find locations
036535fcd179 anteater
jdamerow
parents:
diff changeset
62 IResultFinder<LocationResult> locationFinder = new LocationResultFinder(configuration, names, applicants);
036535fcd179 anteater
jdamerow
parents:
diff changeset
63 List<LocationResult> locations = locationFinder.getResults(infos);
036535fcd179 anteater
jdamerow
parents:
diff changeset
64 Map<TextInformation, List<LocationResult>> sortedPlaces = locationManager.sortResultsByText(locations);
036535fcd179 anteater
jdamerow
parents:
diff changeset
65
036535fcd179 anteater
jdamerow
parents:
diff changeset
66 FilterController filterController = new FilterController();
036535fcd179 anteater
jdamerow
parents:
diff changeset
67
036535fcd179 anteater
jdamerow
parents:
diff changeset
68 for (TextInformation info : infos) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
69 File textfile = new File(info.getFilepath());
036535fcd179 anteater
jdamerow
parents:
diff changeset
70 String resultFile = resultFolder + File.separator + textfile.getName();
036535fcd179 anteater
jdamerow
parents:
diff changeset
71 IResultFileManager resultManager = new ResultXMLManager(resultFile);
036535fcd179 anteater
jdamerow
parents:
diff changeset
72
036535fcd179 anteater
jdamerow
parents:
diff changeset
73 // annotate summaries
036535fcd179 anteater
jdamerow
parents:
diff changeset
74 List<TextPart> summaries = info.getSummaries();
036535fcd179 anteater
jdamerow
parents:
diff changeset
75 List<ApplicantResult> applicantResults = applicantManager.getPredictedResults(sortedApplicants.get(info));
036535fcd179 anteater
jdamerow
parents:
diff changeset
76 List<SpeciesScientificResult> namesResults = speciesManager.getPredictedResults(sortedNames.get(info));
036535fcd179 anteater
jdamerow
parents:
diff changeset
77 List<LocationResult> locResults = locationManager.getPredictedResults(sortedPlaces.get(info));
036535fcd179 anteater
jdamerow
parents:
diff changeset
78
036535fcd179 anteater
jdamerow
parents:
diff changeset
79 filterController.runFilters(info, applicantResults, namesResults, locResults);
036535fcd179 anteater
jdamerow
parents:
diff changeset
80
036535fcd179 anteater
jdamerow
parents:
diff changeset
81 // create carrier
036535fcd179 anteater
jdamerow
parents:
diff changeset
82 ResultsCarrier carrier = new ResultsCarrier(info, applicantResults, namesResults, locResults);
036535fcd179 anteater
jdamerow
parents:
diff changeset
83 resultCarriers.add(carrier);
036535fcd179 anteater
jdamerow
parents:
diff changeset
84
036535fcd179 anteater
jdamerow
parents:
diff changeset
85 for (TextPart sum : summaries) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
86
036535fcd179 anteater
jdamerow
parents:
diff changeset
87 int textIndex = sum.getTextIdx();
036535fcd179 anteater
jdamerow
parents:
diff changeset
88 List<AnnotationTag> tags = applicantManager.getSummaryTags(applicantResults, textIndex);
036535fcd179 anteater
jdamerow
parents:
diff changeset
89 tags.addAll(speciesManager.getSummaryTags(namesResults, textIndex));
036535fcd179 anteater
jdamerow
parents:
diff changeset
90 tags.addAll(locationManager.getSummaryTags(locResults, textIndex));
036535fcd179 anteater
jdamerow
parents:
diff changeset
91 tags.addAll(getParagraphTags(sum));
036535fcd179 anteater
jdamerow
parents:
diff changeset
92
036535fcd179 anteater
jdamerow
parents:
diff changeset
93 StringBuffer annotatedSummary = new StringBuffer(sum.getText());
036535fcd179 anteater
jdamerow
parents:
diff changeset
94 annotatedSummary = tagText(annotatedSummary, tags);
036535fcd179 anteater
jdamerow
parents:
diff changeset
95
036535fcd179 anteater
jdamerow
parents:
diff changeset
96 resultManager.addSummary(SUMMARY_TAG + annotatedSummary.toString() + SUMMARY_TAG_CLOSE);
036535fcd179 anteater
jdamerow
parents:
diff changeset
97 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
98
036535fcd179 anteater
jdamerow
parents:
diff changeset
99 // annotate supplementary information
036535fcd179 anteater
jdamerow
parents:
diff changeset
100 List<TextPart> suppleInfs = info.getSupplInfos();
036535fcd179 anteater
jdamerow
parents:
diff changeset
101
036535fcd179 anteater
jdamerow
parents:
diff changeset
102 for (TextPart suppleInf : suppleInfs) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
103 int textIndex = suppleInf.getTextIdx();
036535fcd179 anteater
jdamerow
parents:
diff changeset
104 List<AnnotationTag> tags = applicantManager.getSuppleInfTags(applicantResults, textIndex);
036535fcd179 anteater
jdamerow
parents:
diff changeset
105 tags.addAll(speciesManager.getSuppleInfTags(namesResults, textIndex));
036535fcd179 anteater
jdamerow
parents:
diff changeset
106 tags.addAll(locationManager.getSuppleInfTags(locResults, textIndex));
036535fcd179 anteater
jdamerow
parents:
diff changeset
107 tags.addAll(getParagraphTags(suppleInf));
036535fcd179 anteater
jdamerow
parents:
diff changeset
108
036535fcd179 anteater
jdamerow
parents:
diff changeset
109 StringBuffer annotatedSuppleInf = new StringBuffer(suppleInf.getText());
036535fcd179 anteater
jdamerow
parents:
diff changeset
110 annotatedSuppleInf = tagText(annotatedSuppleInf, tags);
036535fcd179 anteater
jdamerow
parents:
diff changeset
111
036535fcd179 anteater
jdamerow
parents:
diff changeset
112 resultManager.addSupplInf(SUPPLINFO_TAG + annotatedSuppleInf.toString() + SUPPLINFO_TAG_CLOSE);
036535fcd179 anteater
jdamerow
parents:
diff changeset
113 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
114 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
115
036535fcd179 anteater
jdamerow
parents:
diff changeset
116 return resultCarriers;
036535fcd179 anteater
jdamerow
parents:
diff changeset
117 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
118
036535fcd179 anteater
jdamerow
parents:
diff changeset
119
036535fcd179 anteater
jdamerow
parents:
diff changeset
120 protected void checkTags(List<AnnotationTag> tags) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
121 AnnotationTag lastTag = null;
036535fcd179 anteater
jdamerow
parents:
diff changeset
122
036535fcd179 anteater
jdamerow
parents:
diff changeset
123 List<AnnotationTag> toBeRemoved = new ArrayList<AnnotationTag>();
036535fcd179 anteater
jdamerow
parents:
diff changeset
124 for (AnnotationTag tag : tags) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
125 if (lastTag == null)
036535fcd179 anteater
jdamerow
parents:
diff changeset
126 {
036535fcd179 anteater
jdamerow
parents:
diff changeset
127 lastTag = tag;
036535fcd179 anteater
jdamerow
parents:
diff changeset
128 continue;
036535fcd179 anteater
jdamerow
parents:
diff changeset
129 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
130 if ((tag.getClosingPosition() < lastTag.getPosition()) || (tag.getPosition() >= lastTag.getPosition() && tag.getClosingPosition() <= lastTag.getClosingPosition())) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
131 lastTag = tag;
036535fcd179 anteater
jdamerow
parents:
diff changeset
132 continue;
036535fcd179 anteater
jdamerow
parents:
diff changeset
133 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
134 if (tag.getTag().startsWith("<p"))
036535fcd179 anteater
jdamerow
parents:
diff changeset
135 toBeRemoved.add(lastTag);
036535fcd179 anteater
jdamerow
parents:
diff changeset
136 else {
036535fcd179 anteater
jdamerow
parents:
diff changeset
137 lastTag.setTag("<error type=\"" + lastTag.getType() + "\">");
036535fcd179 anteater
jdamerow
parents:
diff changeset
138 lastTag.setClosingTag("</error>");
036535fcd179 anteater
jdamerow
parents:
diff changeset
139 lastTag.setPosition(tag.getClosingPosition());
036535fcd179 anteater
jdamerow
parents:
diff changeset
140 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
141 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
142
036535fcd179 anteater
jdamerow
parents:
diff changeset
143 tags.removeAll(toBeRemoved);
036535fcd179 anteater
jdamerow
parents:
diff changeset
144 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
145
036535fcd179 anteater
jdamerow
parents:
diff changeset
146 protected StringBuffer tagText(StringBuffer text, List<AnnotationTag> tags) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
147 Collections.sort(tags, new Comparator<AnnotationTag>() {
036535fcd179 anteater
jdamerow
parents:
diff changeset
148
036535fcd179 anteater
jdamerow
parents:
diff changeset
149 @Override
036535fcd179 anteater
jdamerow
parents:
diff changeset
150 public int compare(AnnotationTag o1, AnnotationTag o2) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
151 if (o2.getClosingPosition() - o1.getClosingPosition() != 0)
036535fcd179 anteater
jdamerow
parents:
diff changeset
152 return o2.getClosingPosition() - o1.getClosingPosition();
036535fcd179 anteater
jdamerow
parents:
diff changeset
153 return o1.getPosition() - o2.getPosition();
036535fcd179 anteater
jdamerow
parents:
diff changeset
154 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
155 });
036535fcd179 anteater
jdamerow
parents:
diff changeset
156
036535fcd179 anteater
jdamerow
parents:
diff changeset
157 checkTags(tags);
036535fcd179 anteater
jdamerow
parents:
diff changeset
158
036535fcd179 anteater
jdamerow
parents:
diff changeset
159 Stack<AnnotationTag> tagStack = new Stack<AnnotationTag>();
036535fcd179 anteater
jdamerow
parents:
diff changeset
160 StringBuffer finalText = new StringBuffer();
036535fcd179 anteater
jdamerow
parents:
diff changeset
161
036535fcd179 anteater
jdamerow
parents:
diff changeset
162 int end = text.length();
036535fcd179 anteater
jdamerow
parents:
diff changeset
163 for (AnnotationTag tag : tags) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
164 if (text.length() >= tag.getPosition()) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
165 while (!tagStack.isEmpty() && tagStack.peek().getPosition() >= tag.getClosingPosition()) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
166 AnnotationTag topTag = tagStack.pop();
036535fcd179 anteater
jdamerow
parents:
diff changeset
167 if (topTag.getPosition() < end) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
168 finalText.insert(0, StringEscapeUtils.escapeXml(text.substring(topTag.getPosition(), end)));
036535fcd179 anteater
jdamerow
parents:
diff changeset
169 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
170 finalText.insert(0, topTag.getTag());
036535fcd179 anteater
jdamerow
parents:
diff changeset
171 end = topTag.getPosition();
036535fcd179 anteater
jdamerow
parents:
diff changeset
172 // text.insert(topTag.getPosition(), topTag.getTag());
036535fcd179 anteater
jdamerow
parents:
diff changeset
173 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
174 if (tag.getClosingPosition() < end) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
175 finalText.insert(0, StringEscapeUtils.escapeXml(text.substring(tag.getClosingPosition(), end)));
036535fcd179 anteater
jdamerow
parents:
diff changeset
176 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
177 finalText.insert(0, tag.getClosingTag());
036535fcd179 anteater
jdamerow
parents:
diff changeset
178 end = tag.getClosingPosition();
036535fcd179 anteater
jdamerow
parents:
diff changeset
179 //text.insert(tag.getClosingPosition(), tag.getClosingTag());
036535fcd179 anteater
jdamerow
parents:
diff changeset
180 tagStack.push(tag);
036535fcd179 anteater
jdamerow
parents:
diff changeset
181 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
182 else
036535fcd179 anteater
jdamerow
parents:
diff changeset
183 configuration.getLogger().logMessage(COMPONENT_NAME,"Couldn't insert into summary: " + tag.getTag() + " at " + tag.getPosition());
036535fcd179 anteater
jdamerow
parents:
diff changeset
184 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
185 while (!tagStack.isEmpty()) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
186 AnnotationTag topTag = tagStack.pop();
036535fcd179 anteater
jdamerow
parents:
diff changeset
187 //text.insert(topTag.getPosition(), topTag.getTag());
036535fcd179 anteater
jdamerow
parents:
diff changeset
188 if (topTag.getPosition() < end) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
189 finalText.insert(0, StringEscapeUtils.escapeXml(text.substring(topTag.getPosition(), end)));
036535fcd179 anteater
jdamerow
parents:
diff changeset
190 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
191 finalText.insert(0, topTag.getTag());
036535fcd179 anteater
jdamerow
parents:
diff changeset
192 end = topTag.getPosition();
036535fcd179 anteater
jdamerow
parents:
diff changeset
193 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
194 finalText.insert(0, text.substring(0, end));
036535fcd179 anteater
jdamerow
parents:
diff changeset
195 return finalText;
036535fcd179 anteater
jdamerow
parents:
diff changeset
196 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
197
036535fcd179 anteater
jdamerow
parents:
diff changeset
198
036535fcd179 anteater
jdamerow
parents:
diff changeset
199 protected List<AnnotationTag> getParagraphTags(TextPart part) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
200 int counter = 0;
036535fcd179 anteater
jdamerow
parents:
diff changeset
201 List<AnnotationTag> tags = new ArrayList<AnnotationTag>();
036535fcd179 anteater
jdamerow
parents:
diff changeset
202
036535fcd179 anteater
jdamerow
parents:
diff changeset
203 for (Paragraph para : part.getParagraphsOfInterest()) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
204 AnnotationTag opentag = new AnnotationTag("<p type=\"" + para.getParagraphType() + "\">", counter, "</p>", counter + para.getParagraphText().length(), "paragraph");
036535fcd179 anteater
jdamerow
parents:
diff changeset
205 tags.add(opentag);
036535fcd179 anteater
jdamerow
parents:
diff changeset
206 // add 1 for "\n"
036535fcd179 anteater
jdamerow
parents:
diff changeset
207 counter += para.getParagraphText().length() + 1;
036535fcd179 anteater
jdamerow
parents:
diff changeset
208 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
209
036535fcd179 anteater
jdamerow
parents:
diff changeset
210 return tags;
036535fcd179 anteater
jdamerow
parents:
diff changeset
211 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
212
036535fcd179 anteater
jdamerow
parents:
diff changeset
213 }