Mercurial > hg > anteater
changeset 3:ae96e4bc7fb2
save found species to analysis files
author | jdamerow |
---|---|
date | Mon, 22 Oct 2012 14:21:14 -0700 |
parents | 1c2b4f5e2c05 |
children | dcc35f89dce3 |
files | analysis/00-18565.xml analysis/01-14522.xml analysis/01-19062.xml analysis/2010-23822.xml src/de/mpiwg/anteater/AnteaterController.java src/de/mpiwg/anteater/species/common/CommonNameFindController.java src/de/mpiwg/anteater/species/common/impl/LinnaeusNameFinder.java src/de/mpiwg/anteater/text/TextInformation.java src/de/mpiwg/anteater/xml/impl/AnalysisXMLManager.java src/de/mpiwg/anteater/xml/impl/templates/analysisFile.xml |
diffstat | 10 files changed, 165 insertions(+), 7 deletions(-) [+] |
line wrap: on
line diff
--- a/analysis/00-18565.xml Mon Oct 22 13:46:54 2012 -0700 +++ b/analysis/00-18565.xml Mon Oct 22 14:21:14 2012 -0700 @@ -85,6 +85,8 @@ </result> </supplementary_information> </scientificNames> + <commonNames> + </commonNames> <places> <summaries> <contentlocation xmlns="http://wherein.yahooapis.com/v1/schema" xmlns:yahoo="http://www.yahooapis.com/v1/base.rng" xml:lang="en">
--- a/analysis/01-14522.xml Mon Oct 22 13:46:54 2012 -0700 +++ b/analysis/01-14522.xml Mon Oct 22 14:21:14 2012 -0700 @@ -81,6 +81,22 @@ </result> </supplementary_information> </scientificNames> + <commonNames> + <summaries> + <linnaeus /> + <linnaeus /> + </summaries> + <supplementary_information> + <linnaeus> + <species id="species:ncbi:34886" start="1651" end="1668" text="Steller sea lions" /> + <species id="species:ncbi:34886" start="1671" end="1689" text="Eumetopias jubatus" /> + <species id="species:ncbi:34886" start="2561" end="2577" text="Steller Sea Lion" /> + <species id="species:ncbi:34886" start="7324" end="7341" text="Steller sea lions" /> + <species id="species:ncbi:34886" start="7344" end="7362" text="Eumetopias jubatus" /> + <species id="species:ncbi:34886" start="8234" end="8250" text="Steller Sea Lion" /> + </linnaeus> + </supplementary_information> + </commonNames> <places> <summaries> <contentlocation xmlns="http://wherein.yahooapis.com/v1/schema" xmlns:yahoo="http://www.yahooapis.com/v1/base.rng" xml:lang="en">
--- a/analysis/01-19062.xml Mon Oct 22 13:46:54 2012 -0700 +++ b/analysis/01-19062.xml Mon Oct 22 14:21:14 2012 -0700 @@ -89,6 +89,29 @@ </result> </supplementary_information> </scientificNames> + <commonNames> + <summaries> + <linnaeus /> + </summaries> + <supplementary_information> + <linnaeus> + <species id="species:ncbi:34886" start="755" end="772" text="Steller sea lions" /> + <species id="species:ncbi:34886" start="868" end="885" text="Steller sea lions" /> + <species id="species:ncbi:9742" start="2299" end="2315" text="harbor porpoises" /> + <species id="species:ncbi:9742" start="2318" end="2335" text="Phocoena phocoena" /> + <species id="species:ncbi:27606" start="2934" end="2961" text="North Atlantic right whales" /> + <species id="species:ncbi:27606" start="2964" end="2983" text="Eubalaena glacialis" /> + <species id="species:ncbi:27606" start="3241" end="3268" text="North Atlantic right whales" /> + <species id="species:ncbi:34886" start="6627" end="6644" text="Steller sea lions" /> + <species id="species:ncbi:34886" start="6740" end="6757" text="Steller sea lions" /> + <species id="species:ncbi:9742" start="8171" end="8187" text="harbor porpoises" /> + <species id="species:ncbi:9742" start="8190" end="8207" text="Phocoena phocoena" /> + <species id="species:ncbi:27606" start="8806" end="8833" text="North Atlantic right whales" /> + <species id="species:ncbi:27606" start="8836" end="8855" text="Eubalaena glacialis" /> + <species id="species:ncbi:27606" start="9113" end="9140" text="North Atlantic right whales" /> + </linnaeus> + </supplementary_information> + </commonNames> <places> <summaries> <contentlocation xmlns="http://wherein.yahooapis.com/v1/schema" xmlns:yahoo="http://www.yahooapis.com/v1/base.rng" xml:lang="en">
--- a/analysis/2010-23822.xml Mon Oct 22 13:46:54 2012 -0700 +++ b/analysis/2010-23822.xml Mon Oct 22 14:21:14 2012 -0700 @@ -132,6 +132,44 @@ </result> </supplementary_information> </scientificNames> + <commonNames> + <summaries> + <linnaeus> + <species id="species:ncbi:36500" start="27" end="34" text="S. Fish" /> + <species id="species:ncbi:36500" start="494" end="501" text="S. Fish" /> + </linnaeus> + </summaries> + <supplementary_information> + <linnaeus> + <species id="species:ncbi:30601" start="3169" end="3190" text="golden-crowned sifaka" /> + <species id="species:ncbi:30601" start="3193" end="3216" text="Propithecus tattersalli" /> + <species id="species:ncbi:48399" start="3525" end="3543" text="American crocodile" /> + <species id="species:ncbi:48399" start="3546" end="3563" text="Crocodylus acutus" /> + <species id="species:ncbi:184237" start="3571" end="3590" text="Morelet's crocodile" /> + <species id="species:ncbi:37171" start="3901" end="3909" text="bontebok" /> + <species id="species:ncbi:36500" start="4280" end="4287" text="S. 
Fish" /> + <species id="species:ncbi:29073" start="4445" end="4456" text="polar bears" /> + <species id="species:ncbi:29073" start="4459" end="4474" text="Ursus maritimus" /> + <species id="species:ncbi:9708" start="4773" end="4787" text="Pacific walrus" /> + <species id="species:ncbi:9708" start="4791" end="4818" text="Odobenus rosmarus divergens" /> + <species id="species:ncbi:9778" start="5132" end="5148" text="Florida manatees" /> + <species id="species:ncbi:9778" start="5151" end="5169" text="Trichechus manatus" /> + <species id="species:ncbi:30601" start="8687" end="8708" text="golden-crowned sifaka" /> + <species id="species:ncbi:30601" start="8711" end="8734" text="Propithecus tattersalli" /> + <species id="species:ncbi:48399" start="9043" end="9061" text="American crocodile" /> + <species id="species:ncbi:48399" start="9064" end="9081" text="Crocodylus acutus" /> + <species id="species:ncbi:184237" start="9089" end="9108" text="Morelet's crocodile" /> + <species id="species:ncbi:37171" start="9419" end="9427" text="bontebok" /> + <species id="species:ncbi:36500" start="9798" end="9805" text="S. Fish" /> + <species id="species:ncbi:29073" start="9963" end="9974" text="polar bears" /> + <species id="species:ncbi:29073" start="9977" end="9992" text="Ursus maritimus" /> + <species id="species:ncbi:9708" start="10291" end="10305" text="Pacific walrus" /> + <species id="species:ncbi:9708" start="10309" end="10336" text="Odobenus rosmarus divergens" /> + <species id="species:ncbi:9778" start="10650" end="10666" text="Florida manatees" /> + <species id="species:ncbi:9778" start="10669" end="10687" text="Trichechus manatus" /> + </linnaeus> + </supplementary_information> + </commonNames> <places> <summaries> <contentlocation xmlns="http://wherein.yahooapis.com/v1/schema" xmlns:yahoo="http://www.yahooapis.com/v1/base.rng" xml:lang="en">
--- a/src/de/mpiwg/anteater/AnteaterController.java Mon Oct 22 13:46:54 2012 -0700 +++ b/src/de/mpiwg/anteater/AnteaterController.java Mon Oct 22 14:21:14 2012 -0700 @@ -12,6 +12,8 @@ import de.mpiwg.anteater.places.PlacesExtraction; import de.mpiwg.anteater.results.ResultController; import de.mpiwg.anteater.results.ResultsCarrier; +import de.mpiwg.anteater.species.common.CommonNameFindController; +import de.mpiwg.anteater.species.common.CommonNamesExtraction; import de.mpiwg.anteater.species.scientific.ScientificNameFindController; import de.mpiwg.anteater.species.scientific.ScientificNamesExtraction; import de.mpiwg.anteater.text.TextInformation; @@ -39,6 +41,7 @@ ScientificNameFindController scienceNameFindController = new ScientificNameFindController(configuration); PlaceFinderController placesController = new PlaceFinderController(configuration); PersonFinderController personsController = new PersonFinderController(configuration); + CommonNameFindController commonNameFindController = new CommonNameFindController(configuration); List<TextInformation> textInformations = new ArrayList<TextInformation>(); TextManager textManager = new TextManager(configuration); @@ -55,6 +58,10 @@ List<ScientificNamesExtraction> scienNameResults = scienceNameFindController.findScientificNamesInXML(info); info.setScientificNamesExtractions(scienNameResults); + // get common and scientific names + List<CommonNamesExtraction> commonNamesResults = commonNameFindController.findCommonNamesInXML(info); + info.setCommonNamesExtractions(commonNamesResults); + // get places List<PlacesExtraction> placesResults = placesController.findPlacesInXML(info); info.setPlacesExtractions(placesResults);
--- a/src/de/mpiwg/anteater/species/common/CommonNameFindController.java Mon Oct 22 13:46:54 2012 -0700 +++ b/src/de/mpiwg/anteater/species/common/CommonNameFindController.java Mon Oct 22 14:21:14 2012 -0700 @@ -5,8 +5,12 @@ import java.util.List; import de.mpiwg.anteater.AnteaterConfiguration; +import de.mpiwg.anteater.species.common.impl.LinnaeusNameFinder; +import de.mpiwg.anteater.species.scientific.IScientificNamesFinder; import de.mpiwg.anteater.species.scientific.ScientificNamesExtraction; +import de.mpiwg.anteater.species.scientific.impl.GNRDNameFinder; import de.mpiwg.anteater.text.TextInformation; +import de.mpiwg.anteater.text.TextPart; import de.mpiwg.anteater.xml.impl.AnalysisXMLManager; public class CommonNameFindController { @@ -19,7 +23,7 @@ this.configuration = configuration; } - public List<ScientificNamesExtraction> findCommonNamesInXML(TextInformation info) { + public List<CommonNamesExtraction> findCommonNamesInXML(TextInformation info) { List<CommonNamesExtraction> results = new ArrayList<CommonNamesExtraction>(); List<String> summaryAnalysisResults = new ArrayList<String>(); List<String> supplinfAnalysisResults = new ArrayList<String>(); @@ -40,6 +44,40 @@ configuration.getLogger().logMessage("found " + supplinfAnalysisResults.size() + " result(s)."); } + ICommonNameFinder nameFinder = new LinnaeusNameFinder(configuration.getLogger()); + + // if there are no results for summaries, ask the Linnaeus name finder. 
+ if (summaryAnalysisResults.size() == 0) { + configuration.getLogger().logMessage(COMPONENT_NAME, "No results found for summaries, so will ask LinnaeusNameFinder."); + + for (TextPart sum : info.getSummaries()) { + String sumResult = nameFinder.findCommonNames(sum.getText()); + if (sumResult != null) { + summaryAnalysisResults.add(sumResult); + + // if there is an analysis folder, add result to analysis file + if (analysisManager != null) + analysisManager.addSummaryCommonNamesResult(sumResult); + } + } + } + + // if there are no results for supplementary information, ask the Linnaeus name finder + if (supplinfAnalysisResults.size() == 0) { + configuration.getLogger().logMessage(COMPONENT_NAME, "No results found for supplementary information, so will ask LinnaeusNameFinder."); + + for (TextPart sInf : info.getSupplInfos()) { + String supinfResult = nameFinder.findCommonNames(sInf.getText()); + if (supinfResult != null) { + supplinfAnalysisResults.add(supinfResult); + + // if there is an analysis folder, add result to analysis file + if (analysisManager != null) + analysisManager.addSupplInfCommonNamesResult(supinfResult); + } + } + } + return null; }
--- a/src/de/mpiwg/anteater/species/common/impl/LinnaeusNameFinder.java Mon Oct 22 13:46:54 2012 -0700 +++ b/src/de/mpiwg/anteater/species/common/impl/LinnaeusNameFinder.java Mon Oct 22 14:21:14 2012 -0700 @@ -2,12 +2,11 @@ import java.io.InputStream; import java.util.HashMap; -import java.util.HashSet; import java.util.List; import java.util.Map; -import java.util.logging.Logger; -import martin.common.ArgParser; +import org.apache.commons.lang3.StringEscapeUtils; + import uk.ac.man.documentparser.dataholders.Document; import uk.ac.man.entitytagger.Mention; import uk.ac.man.entitytagger.doc.TaggedDocument; @@ -48,9 +47,20 @@ Document doc = new Document("id", "title", "", text, text, Document.Text_raw_type.TEXT, "", null, Document.Type.OTHER, null, "", "", "", "", null); TaggedDocument tagged = MatchOperations.matchDocument(matcher, doc); List<Mention> species = tagged.getAllMatches(); - for (Mention s : species) - System.out.println("found " + s.getMostProbableID() + ": " + s.getText() + " at " + s.getStart()); - return null; + StringBuffer sb = new StringBuffer(); + sb.append("<linnaeus>"); + + for (Mention s : species) { + sb.append("<species id=\"" + StringEscapeUtils.escapeXml(s.getMostProbableID()) + "\" "); + sb.append("start=\"" + s.getStart() + "\" "); + sb.append("end=\"" + s.getEnd() + "\" "); + sb.append("text=\"" + StringEscapeUtils.escapeXml(s.getText()) + "\" "); + sb.append("/>"); + } + + sb.append("</linnaeus>"); + System.out.println(sb.toString()); + return sb.toString(); } public Postprocessor getPostprocessor(Map<String, String> comments,
--- a/src/de/mpiwg/anteater/text/TextInformation.java Mon Oct 22 13:46:54 2012 -0700 +++ b/src/de/mpiwg/anteater/text/TextInformation.java Mon Oct 22 14:21:14 2012 -0700 @@ -4,6 +4,7 @@ import de.mpiwg.anteater.persons.PersonsExtraction; import de.mpiwg.anteater.places.PlacesExtraction; +import de.mpiwg.anteater.species.common.CommonNamesExtraction; import de.mpiwg.anteater.species.scientific.ScientificNamesExtraction; public class TextInformation { @@ -13,6 +14,7 @@ private List<ScientificNamesExtraction> scientificNamesExtractions; private List<PlacesExtraction> placesExtractions; private List<PersonsExtraction> personsExtractions; + private List<CommonNamesExtraction> commonNamesExtractions; private String filepath; public void setSummaries(List<TextPart> summaries) { @@ -51,4 +53,10 @@ public List<PersonsExtraction> getPersonsExtractions() { return personsExtractions; } + public void setCommonNamesExtractions(List<CommonNamesExtraction> commonNamesExtractions) { + this.commonNamesExtractions = commonNamesExtractions; + } + public List<CommonNamesExtraction> getCommonNamesExtractions() { + return commonNamesExtractions; + } }
--- a/src/de/mpiwg/anteater/xml/impl/AnalysisXMLManager.java Mon Oct 22 13:46:54 2012 -0700 +++ b/src/de/mpiwg/anteater/xml/impl/AnalysisXMLManager.java Mon Oct 22 14:21:14 2012 -0700 @@ -124,6 +124,14 @@ public void addSupplInfNamesResult(String result) { addElementToDoc(result, "/analysis/scientificNames/supplementary_information"); } + + public void addSummaryCommonNamesResult(String result) { + addElementToDoc(result, "/analysis/commonNames/summaries"); + } + + public void addSupplInfCommonNamesResult(String result) { + addElementToDoc(result, "/analysis/commonNames/supplementary_information"); + } private void addElementToDoc(String result, String xpath) { IXMLParser parser = new JDOMParser(result, false);
--- a/src/de/mpiwg/anteater/xml/impl/templates/analysisFile.xml Mon Oct 22 13:46:54 2012 -0700 +++ b/src/de/mpiwg/anteater/xml/impl/templates/analysisFile.xml Mon Oct 22 14:21:14 2012 -0700 @@ -15,6 +15,14 @@ </supplementary_information> </scientificNames> + <commonNames> + <summaries> + + </summaries> + <supplementary_information> + + </supplementary_information> + </commonNames> <places> <summaries>