changeset 3:ae96e4bc7fb2

save found species to analysis files
author jdamerow
date Mon, 22 Oct 2012 14:21:14 -0700
parents 1c2b4f5e2c05
children dcc35f89dce3
files analysis/00-18565.xml analysis/01-14522.xml analysis/01-19062.xml analysis/2010-23822.xml src/de/mpiwg/anteater/AnteaterController.java src/de/mpiwg/anteater/species/common/CommonNameFindController.java src/de/mpiwg/anteater/species/common/impl/LinnaeusNameFinder.java src/de/mpiwg/anteater/text/TextInformation.java src/de/mpiwg/anteater/xml/impl/AnalysisXMLManager.java src/de/mpiwg/anteater/xml/impl/templates/analysisFile.xml
diffstat 10 files changed, 165 insertions(+), 7 deletions(-) [+]
line wrap: on
line diff
--- a/analysis/00-18565.xml	Mon Oct 22 13:46:54 2012 -0700
+++ b/analysis/00-18565.xml	Mon Oct 22 14:21:14 2012 -0700
@@ -85,6 +85,8 @@
       </result>
     </supplementary_information>
   </scientificNames>
+  <commonNames>
+  </commonNames>
   <places>
     <summaries>
       <contentlocation xmlns="http://wherein.yahooapis.com/v1/schema" xmlns:yahoo="http://www.yahooapis.com/v1/base.rng" xml:lang="en">
--- a/analysis/01-14522.xml	Mon Oct 22 13:46:54 2012 -0700
+++ b/analysis/01-14522.xml	Mon Oct 22 14:21:14 2012 -0700
@@ -81,6 +81,22 @@
       </result>
     </supplementary_information>
   </scientificNames>
+  <commonNames>
+    <summaries>
+      <linnaeus />
+      <linnaeus />
+    </summaries>
+    <supplementary_information>
+      <linnaeus>
+        <species id="species:ncbi:34886" start="1651" end="1668" text="Steller sea lions" />
+        <species id="species:ncbi:34886" start="1671" end="1689" text="Eumetopias jubatus" />
+        <species id="species:ncbi:34886" start="2561" end="2577" text="Steller Sea Lion" />
+        <species id="species:ncbi:34886" start="7324" end="7341" text="Steller sea lions" />
+        <species id="species:ncbi:34886" start="7344" end="7362" text="Eumetopias jubatus" />
+        <species id="species:ncbi:34886" start="8234" end="8250" text="Steller Sea Lion" />
+      </linnaeus>
+    </supplementary_information>
+  </commonNames>
   <places>
     <summaries>
       <contentlocation xmlns="http://wherein.yahooapis.com/v1/schema" xmlns:yahoo="http://www.yahooapis.com/v1/base.rng" xml:lang="en">
--- a/analysis/01-19062.xml	Mon Oct 22 13:46:54 2012 -0700
+++ b/analysis/01-19062.xml	Mon Oct 22 14:21:14 2012 -0700
@@ -89,6 +89,29 @@
       </result>
     </supplementary_information>
   </scientificNames>
+  <commonNames>
+    <summaries>
+      <linnaeus />
+    </summaries>
+    <supplementary_information>
+      <linnaeus>
+        <species id="species:ncbi:34886" start="755" end="772" text="Steller sea lions" />
+        <species id="species:ncbi:34886" start="868" end="885" text="Steller sea lions" />
+        <species id="species:ncbi:9742" start="2299" end="2315" text="harbor porpoises" />
+        <species id="species:ncbi:9742" start="2318" end="2335" text="Phocoena phocoena" />
+        <species id="species:ncbi:27606" start="2934" end="2961" text="North Atlantic right whales" />
+        <species id="species:ncbi:27606" start="2964" end="2983" text="Eubalaena glacialis" />
+        <species id="species:ncbi:27606" start="3241" end="3268" text="North Atlantic right whales" />
+        <species id="species:ncbi:34886" start="6627" end="6644" text="Steller sea lions" />
+        <species id="species:ncbi:34886" start="6740" end="6757" text="Steller sea lions" />
+        <species id="species:ncbi:9742" start="8171" end="8187" text="harbor porpoises" />
+        <species id="species:ncbi:9742" start="8190" end="8207" text="Phocoena phocoena" />
+        <species id="species:ncbi:27606" start="8806" end="8833" text="North Atlantic right whales" />
+        <species id="species:ncbi:27606" start="8836" end="8855" text="Eubalaena glacialis" />
+        <species id="species:ncbi:27606" start="9113" end="9140" text="North Atlantic right whales" />
+      </linnaeus>
+    </supplementary_information>
+  </commonNames>
   <places>
     <summaries>
       <contentlocation xmlns="http://wherein.yahooapis.com/v1/schema" xmlns:yahoo="http://www.yahooapis.com/v1/base.rng" xml:lang="en">
--- a/analysis/2010-23822.xml	Mon Oct 22 13:46:54 2012 -0700
+++ b/analysis/2010-23822.xml	Mon Oct 22 14:21:14 2012 -0700
@@ -132,6 +132,44 @@
       </result>
     </supplementary_information>
   </scientificNames>
+  <commonNames>
+    <summaries>
+      <linnaeus>
+        <species id="species:ncbi:36500" start="27" end="34" text="S. Fish" />
+        <species id="species:ncbi:36500" start="494" end="501" text="S. Fish" />
+      </linnaeus>
+    </summaries>
+    <supplementary_information>
+      <linnaeus>
+        <species id="species:ncbi:30601" start="3169" end="3190" text="golden-crowned sifaka" />
+        <species id="species:ncbi:30601" start="3193" end="3216" text="Propithecus tattersalli" />
+        <species id="species:ncbi:48399" start="3525" end="3543" text="American crocodile" />
+        <species id="species:ncbi:48399" start="3546" end="3563" text="Crocodylus acutus" />
+        <species id="species:ncbi:184237" start="3571" end="3590" text="Morelet's crocodile" />
+        <species id="species:ncbi:37171" start="3901" end="3909" text="bontebok" />
+        <species id="species:ncbi:36500" start="4280" end="4287" text="S. Fish" />
+        <species id="species:ncbi:29073" start="4445" end="4456" text="polar bears" />
+        <species id="species:ncbi:29073" start="4459" end="4474" text="Ursus maritimus" />
+        <species id="species:ncbi:9708" start="4773" end="4787" text="Pacific walrus" />
+        <species id="species:ncbi:9708" start="4791" end="4818" text="Odobenus rosmarus divergens" />
+        <species id="species:ncbi:9778" start="5132" end="5148" text="Florida manatees" />
+        <species id="species:ncbi:9778" start="5151" end="5169" text="Trichechus manatus" />
+        <species id="species:ncbi:30601" start="8687" end="8708" text="golden-crowned sifaka" />
+        <species id="species:ncbi:30601" start="8711" end="8734" text="Propithecus tattersalli" />
+        <species id="species:ncbi:48399" start="9043" end="9061" text="American crocodile" />
+        <species id="species:ncbi:48399" start="9064" end="9081" text="Crocodylus acutus" />
+        <species id="species:ncbi:184237" start="9089" end="9108" text="Morelet's crocodile" />
+        <species id="species:ncbi:37171" start="9419" end="9427" text="bontebok" />
+        <species id="species:ncbi:36500" start="9798" end="9805" text="S. Fish" />
+        <species id="species:ncbi:29073" start="9963" end="9974" text="polar bears" />
+        <species id="species:ncbi:29073" start="9977" end="9992" text="Ursus maritimus" />
+        <species id="species:ncbi:9708" start="10291" end="10305" text="Pacific walrus" />
+        <species id="species:ncbi:9708" start="10309" end="10336" text="Odobenus rosmarus divergens" />
+        <species id="species:ncbi:9778" start="10650" end="10666" text="Florida manatees" />
+        <species id="species:ncbi:9778" start="10669" end="10687" text="Trichechus manatus" />
+      </linnaeus>
+    </supplementary_information>
+  </commonNames>
   <places>
     <summaries>
       <contentlocation xmlns="http://wherein.yahooapis.com/v1/schema" xmlns:yahoo="http://www.yahooapis.com/v1/base.rng" xml:lang="en">
--- a/src/de/mpiwg/anteater/AnteaterController.java	Mon Oct 22 13:46:54 2012 -0700
+++ b/src/de/mpiwg/anteater/AnteaterController.java	Mon Oct 22 14:21:14 2012 -0700
@@ -12,6 +12,8 @@
 import de.mpiwg.anteater.places.PlacesExtraction;
 import de.mpiwg.anteater.results.ResultController;
 import de.mpiwg.anteater.results.ResultsCarrier;
+import de.mpiwg.anteater.species.common.CommonNameFindController;
+import de.mpiwg.anteater.species.common.CommonNamesExtraction;
 import de.mpiwg.anteater.species.scientific.ScientificNameFindController;
 import de.mpiwg.anteater.species.scientific.ScientificNamesExtraction;
 import de.mpiwg.anteater.text.TextInformation;
@@ -39,6 +41,7 @@
 		ScientificNameFindController scienceNameFindController = new ScientificNameFindController(configuration);
 		PlaceFinderController placesController = new PlaceFinderController(configuration);
 		PersonFinderController personsController = new PersonFinderController(configuration);
+		CommonNameFindController commonNameFindController = new CommonNameFindController(configuration);
 		
 		List<TextInformation> textInformations = new ArrayList<TextInformation>();
 		TextManager textManager = new TextManager(configuration);
@@ -55,6 +58,10 @@
 			List<ScientificNamesExtraction> scienNameResults = scienceNameFindController.findScientificNamesInXML(info);
 			info.setScientificNamesExtractions(scienNameResults);
 			
+			// get common and scientific names
+			List<CommonNamesExtraction> commonNamesResults = commonNameFindController.findCommonNamesInXML(info);
+			info.setCommonNamesExtractions(commonNamesResults);
+			
 			// get places
 			List<PlacesExtraction> placesResults = placesController.findPlacesInXML(info);
 			info.setPlacesExtractions(placesResults);
--- a/src/de/mpiwg/anteater/species/common/CommonNameFindController.java	Mon Oct 22 13:46:54 2012 -0700
+++ b/src/de/mpiwg/anteater/species/common/CommonNameFindController.java	Mon Oct 22 14:21:14 2012 -0700
@@ -5,8 +5,12 @@
 import java.util.List;
 
 import de.mpiwg.anteater.AnteaterConfiguration;
+import de.mpiwg.anteater.species.common.impl.LinnaeusNameFinder;
+import de.mpiwg.anteater.species.scientific.IScientificNamesFinder;
 import de.mpiwg.anteater.species.scientific.ScientificNamesExtraction;
+import de.mpiwg.anteater.species.scientific.impl.GNRDNameFinder;
 import de.mpiwg.anteater.text.TextInformation;
+import de.mpiwg.anteater.text.TextPart;
 import de.mpiwg.anteater.xml.impl.AnalysisXMLManager;
 
 public class CommonNameFindController {
@@ -19,7 +23,7 @@
 		this.configuration = configuration;
 	}
 	
-	public List<ScientificNamesExtraction> findCommonNamesInXML(TextInformation info) {
+	public List<CommonNamesExtraction> findCommonNamesInXML(TextInformation info) {
 		List<CommonNamesExtraction> results = new ArrayList<CommonNamesExtraction>();
 		List<String> summaryAnalysisResults = new ArrayList<String>();
 		List<String> supplinfAnalysisResults = new ArrayList<String>();
@@ -40,6 +44,40 @@
 			configuration.getLogger().logMessage("found " + supplinfAnalysisResults.size() + " result(s).");
 		}
 		
+		ICommonNameFinder nameFinder = new LinnaeusNameFinder(configuration.getLogger());
+		
+		// if there are no results for summaries, ask the Linnaeus name finder.
+		if (summaryAnalysisResults.size() == 0) {
+			configuration.getLogger().logMessage(COMPONENT_NAME, "No results found for summaries, so will ask LinnaeusNameFinder.");
+						
+			for (TextPart sum : info.getSummaries()) {
+				String sumResult = nameFinder.findCommonNames(sum.getText());
+				if (sumResult != null) {
+					summaryAnalysisResults.add(sumResult);
+					
+					// if there is an analysis folder, add result to analysis file
+					if (analysisManager != null)
+						analysisManager.addSummaryCommonNamesResult(sumResult);
+				}
+			}
+		}
+		
+		// if there are no results for supplementary information, ask the Linnaeus name finder
+		if (supplinfAnalysisResults.size() == 0) {
+			configuration.getLogger().logMessage(COMPONENT_NAME, "No results found for supplementary information, so will ask LinnaeusNameFinder.");
+			
+			for (TextPart sInf : info.getSupplInfos()) {
+				String supinfResult = nameFinder.findCommonNames(sInf.getText());
+				if (supinfResult != null) {
+					supplinfAnalysisResults.add(supinfResult);
+					
+					// if there is an analysis folder, add result to analysis file
+					if (analysisManager != null)
+						analysisManager.addSupplInfCommonNamesResult(supinfResult);
+				}
+			}
+		}
+		
 		return null;
 	}
 	
--- a/src/de/mpiwg/anteater/species/common/impl/LinnaeusNameFinder.java	Mon Oct 22 13:46:54 2012 -0700
+++ b/src/de/mpiwg/anteater/species/common/impl/LinnaeusNameFinder.java	Mon Oct 22 14:21:14 2012 -0700
@@ -2,12 +2,11 @@
 
 import java.io.InputStream;
 import java.util.HashMap;
-import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
-import java.util.logging.Logger;
 
-import martin.common.ArgParser;
+import org.apache.commons.lang3.StringEscapeUtils;
+
 import uk.ac.man.documentparser.dataholders.Document;
 import uk.ac.man.entitytagger.Mention;
 import uk.ac.man.entitytagger.doc.TaggedDocument;
@@ -48,9 +47,20 @@
 		Document doc = new Document("id", "title", "", text, text, Document.Text_raw_type.TEXT, "", null, Document.Type.OTHER, null, "", "", "", "", null);
 		TaggedDocument tagged = MatchOperations.matchDocument(matcher, doc);
 		List<Mention> species = tagged.getAllMatches();
-		for (Mention s : species)
-			System.out.println("found " + s.getMostProbableID() + ": " + s.getText() + " at " + s.getStart());
-		return null;
+		StringBuffer sb = new StringBuffer();
+		sb.append("<linnaeus>");
+		
+		for (Mention s : species) {
+			sb.append("<species id=\"" + StringEscapeUtils.escapeXml(s.getMostProbableID()) + "\" ");
+			sb.append("start=\"" + s.getStart() + "\" ");
+			sb.append("end=\"" + s.getEnd() + "\" ");
+			sb.append("text=\"" + StringEscapeUtils.escapeXml(s.getText()) + "\" ");
+			sb.append("/>");
+		}
+		
+		sb.append("</linnaeus>");
+		System.out.println(sb.toString());
+		return sb.toString();
 	}
 
 	public Postprocessor getPostprocessor(Map<String, String> comments,
--- a/src/de/mpiwg/anteater/text/TextInformation.java	Mon Oct 22 13:46:54 2012 -0700
+++ b/src/de/mpiwg/anteater/text/TextInformation.java	Mon Oct 22 14:21:14 2012 -0700
@@ -4,6 +4,7 @@
 
 import de.mpiwg.anteater.persons.PersonsExtraction;
 import de.mpiwg.anteater.places.PlacesExtraction;
+import de.mpiwg.anteater.species.common.CommonNamesExtraction;
 import de.mpiwg.anteater.species.scientific.ScientificNamesExtraction;
 
 public class TextInformation {
@@ -13,6 +14,7 @@
 	private List<ScientificNamesExtraction> scientificNamesExtractions;
 	private List<PlacesExtraction> placesExtractions;
 	private List<PersonsExtraction> personsExtractions;
+	private List<CommonNamesExtraction> commonNamesExtractions;
 	private String filepath;
 	
 	public void setSummaries(List<TextPart> summaries) {
@@ -51,4 +53,10 @@
 	public List<PersonsExtraction> getPersonsExtractions() {
 		return personsExtractions;
 	}
+	public void setCommonNamesExtractions(List<CommonNamesExtraction> commonNamesExtractions) {
+		this.commonNamesExtractions = commonNamesExtractions;
+	}
+	public List<CommonNamesExtraction> getCommonNamesExtractions() {
+		return commonNamesExtractions;
+	}
 }
--- a/src/de/mpiwg/anteater/xml/impl/AnalysisXMLManager.java	Mon Oct 22 13:46:54 2012 -0700
+++ b/src/de/mpiwg/anteater/xml/impl/AnalysisXMLManager.java	Mon Oct 22 14:21:14 2012 -0700
@@ -124,6 +124,14 @@
 	public void addSupplInfNamesResult(String result) {
 		addElementToDoc(result, "/analysis/scientificNames/supplementary_information");
 	}
+	
+	public void addSummaryCommonNamesResult(String result) {
+		addElementToDoc(result, "/analysis/commonNames/summaries");
+	}
+	
+	public void addSupplInfCommonNamesResult(String result) {
+		addElementToDoc(result, "/analysis/commonNames/supplementary_information");
+	}
 
 	private void addElementToDoc(String result, String xpath) {
 		IXMLParser parser = new JDOMParser(result, false);
--- a/src/de/mpiwg/anteater/xml/impl/templates/analysisFile.xml	Mon Oct 22 13:46:54 2012 -0700
+++ b/src/de/mpiwg/anteater/xml/impl/templates/analysisFile.xml	Mon Oct 22 14:21:14 2012 -0700
@@ -15,6 +15,14 @@
 			
 		</supplementary_information>
 	</scientificNames>
+	<commonNames>
+		<summaries>
+			
+		</summaries>
+		<supplementary_information>
+			
+		</supplementary_information>
+    </commonNames>
 	<places>
 		<summaries>